## Using BERT model for multi-class sentiment analysis

Bak Kallemov, Insight AI Fellowship program AICV20B, June 2020



In [None]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from utils import metrics

import random
# Seed every random number generator used by the notebook with the same value.
seed_val = 1
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Training hyper-parameters shared by the cells below.
batch_size = 8
epochs = 10
max_length = 300  # passed as `max_length` to the tokenizer calls

# Sentiment name -> integer class id, and the reverse lookup.
polarity_dict = {'positive': 0, 'negative': 1, 'neutral': 2}
inverse_dict = {class_id: name for name, class_id in polarity_dict.items()}

# Checkpoint loaded by load_model() for the inference cells.
saved_model = 'notebook_data/finetuned_BERT_epoch_10.model'

In [None]:
# Read the Twitter polarity CSV and add an integer `label` column derived
# from the `sentiment` column via polarity_dict.
df = pd.read_csv(
    'data/polarity3_data/twitter_polarity3.csv',
    encoding='latin-1',
)
df['label'] = df['sentiment'].replace(polarity_dict)
# df = df.sample(n=10000, axis=0)  # optional: work on a 10k-row subsample
df.shape

In [None]:
# Pre-trained BERT with a classification head sized to the number of
# sentiment classes, placed on the selected device. Binding the result of
# .to() (which returns the module itself) keeps the cell output empty.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(polarity_dict),
    output_attentions=True,
    output_hidden_states=False,
).to(device)

In [None]:
def load_model(saved_model):
    """Build a BERT classifier and load fine-tuned weights from disk.

    Args:
        saved_model: path to a state dict written with ``torch.save``.

    Returns:
        The model, placed on the global ``device``, configured to return
        attentions and hidden states.
    """
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(polarity_dict),
        output_attentions=True,
        output_hidden_states=True)
    # Map the checkpoint onto the device selected at start-up rather than a
    # hard-coded 'cuda', so a GPU-trained checkpoint also loads on CPU-only hosts.
    state_dict = torch.load(saved_model, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    return model

def train(dataloader_train, dataloader_validation):
    """Fine-tune the global ``model`` for ``epochs`` epochs.

    Uses the global ``optimizer`` and ``scheduler``. After every epoch the
    state dict is saved under ``checkpoints/`` and the validation loss,
    weighted F1 score and classification report are printed.

    Args:
        dataloader_train: yields ``(input_ids, attention_mask, labels)`` batches.
        dataloader_validation: same layout, passed to ``evaluate``.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('checkpoints', exist_ok=True)

    for epoch in tqdm(range(1, epochs+1)):

        model.train()

        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            batch = tuple(b.to(device) for b in batch)

            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }

            outputs = model(**inputs)

            loss = outputs[0]
            batch_loss = loss.item()
            loss_train_total += batch_loss
            loss.backward()

            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Show the loss as returned by the model. The previous code divided
            # by len(batch), which is the number of tensors in the tuple (3),
            # not the number of samples, so the displayed value was wrong.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

        torch.save(model.state_dict(), f'checkpoints/finetuned_BERT_epoch_{epoch}.model')

        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        tqdm.write(f'Validation loss: {val_loss}')

        tqdm.write(f'F1 Score (Weighted): {metrics.f1_score_func(predictions, true_vals)}')
        tqdm.write(metrics.classification_report_func(predictions, true_vals,polarity_dict.keys()))

def evaluate(dataloader_val):
    """Run the global ``model`` over a labelled dataloader without gradients.

    Args:
        dataloader_val: yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(mean_loss, logits, labels)``: the loss averaged over batches, and
        NumPy arrays of all logits and all label ids concatenated on axis 0.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)
    return mean_loss, all_logits, all_labels

def predict(inputs):
    """Classify a collection of raw texts with the global ``model``.

    Args:
        inputs: iterable of strings (list or NumPy array).

    Returns:
        ``(predictions, total_score, outputs)`` where
        ``predictions`` is a list of ``(class_id, probability)`` pairs, one per
        text, with the probability rounded to 4 decimals;
        ``total_score`` is a list with one entry per class holding the summed
        winning probabilities assigned to that class;
        ``outputs`` is the raw model output of the LAST batch only, or ``None``
        when ``inputs`` is empty.
    """
    model.eval()

    texts = list(inputs)
    predictions = []
    total_score = [0] * len(polarity_dict)
    outputs = None
    if not texts:
        # Nothing to tokenize; keep the three-value return shape.
        return predictions, total_score, outputs

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    encoded_data_predict = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=max_length,
        return_tensors='pt'
    )

    dataset_predict = TensorDataset(encoded_data_predict['input_ids'],
                                    encoded_data_predict['attention_mask'])
    dataloader_predict = DataLoader(dataset_predict,
                                    sampler=SequentialSampler(dataset_predict),
                                    batch_size=batch_size)

    for batch in dataloader_predict:
        # Tensors are moved to the device per batch.
        batch = tuple(b.to(device) for b in batch)

        # Named differently from the `inputs` parameter so it is not shadowed.
        model_inputs = {'input_ids':      batch[0],
                        'attention_mask': batch[1],
                        }

        with torch.no_grad():
            outputs = model(**model_inputs)

        probabilities = torch.softmax(outputs[0], dim=1)
        confidences, class_ids = torch.max(probabilities, dim=1)
        # tolist() yields plain Python numbers, so total_score holds floats
        # rather than a mix of ints and device tensors.
        for class_id, confidence in zip(class_ids.tolist(), confidences.tolist()):
            predictions.append((class_id, round(confidence, 4)))
            total_score[class_id] += confidence

    return predictions, total_score, outputs
    
def predict_single(sentense):
    """Classify one sentence with the global ``model`` and ``tokenizer``.

    The sentence is encoded with padding up to ``max_length``.

    Returns:
        ``(best, input_ids, outputs)``: the ``torch.max`` result (probability,
        class index) over the softmaxed first model output, the encoded input
        ids, and the raw model outputs.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        sentense,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoding.to(device)

    with torch.no_grad():
        outputs = model(**encoding)

    probabilities = torch.nn.Softmax(dim=1)(outputs[0])
    best = torch.max(probabilities, axis=1)
    return best, encoding['input_ids'], outputs

In [None]:
# Hyper-parameters and label maps (same values as the configuration cell above).
batch_size = 8
epochs = 10
max_length = 300
polarity_dict = {'positive': 0, 'negative': 1, 'neutral': 2}
inverse_dict = {class_id: name for name, class_id in polarity_dict.items()}
saved_model = 'notebook_data/finetuned_BERT_epoch_10.model'

# Hold out 10% of the rows for validation, stratified by label.
X_train, X_val, y_train, y_val = train_test_split(
    df.text.values,
    df.label.values,
    test_size=0.10,
    random_state=1,
    stratify=df.label.values,
)


In [None]:
# Lower-casing word-piece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# --- Training split: tokenize, pad to max_length, wrap as a TensorDataset ---
encoded_data_train = tokenizer.batch_encode_plus(
    X_train,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=max_length,
    return_tensors='pt',
)
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train)
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# --- Validation split: identical encoding options ---
encoded_data_val = tokenizer.batch_encode_plus(
    X_val,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=max_length,
    return_tensors='pt',
)
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# Validation batches keep dataset order.
dataloader_validation = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size,
)

In [None]:
# AdamW over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
)

# Linear schedule without warm-up, spanning one scheduler step per training
# batch across all epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs,
)

In [None]:
# Fine-tune the model; train() saves a checkpoint and prints validation metrics after every epoch.
train(dataloader_train, dataloader_validation)

In [None]:
# Re-evaluate on the validation split (the loss is discarded) and print the
# classification report produced by the project's metrics helper.
_, predictions, true_vals = evaluate(dataloader_validation)
report=metrics.classification_report_func(predictions, true_vals, polarity_dict.keys())
print(report)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def show_confusion_matrix(confusion_matrix):
    """Draw an annotated integer heat map of a confusion-matrix DataFrame.

    Rows are labelled as true sentiment and columns as predicted sentiment.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Right-align tick labels on both axes; only the x labels are rotated.
    for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')


# Turn logits into class ids, then plot true labels against predictions.
preds_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_vals.flatten()
cm = confusion_matrix(labels_flat, preds_flat)
df_cm = pd.DataFrame(cm, index=polarity_dict.keys(), columns=polarity_dict.keys())
show_confusion_matrix(df_cm)

In [None]:
# Hand-picked example sentences used to exercise predict() in the next cell.
inputs=['Imagine all the people living life in peace',
        'I love this restaurant but the line is too long',
        'we should be making a profit by this time next year.',
        'She earned her PhD in physics before becoming a postdoctoral fellow at Rockefeller University, where she worked on developing and implementing an underwater touchscreen for dolphins.',
        'I did smile today!',
        'Applies the Softmax function to an n-dimensional input Tensor', 
        'I have not slept enough today', 
        'the weather is crazy hot today',
        'Listen Morty, I hate to break it to you, but what people call “love” is just a chemical reaction that compels animals to breed. It hits hard, Morty, then it slowly fades, leaving you stranded in a failing marriage. I did it. Your parents are gonna do it. Break the cycle, Morty. Rise above. Focus on science.',
        'I am a scientist, because I invent, transform, create, and destroy for a living, and when I do not like something about the world, I change it',
        'I turned myself into a pickle, Morty! I’m Pickle Rick!',
        'Do you wanna develop an app?',
        'Honey, stop raising your father cholesterol so you can take a hot funeral selfie.',
        'To live is to risk it all; otherwise you are just an inert chunk of randomly assembled molecules drifting wherever the universe blows you...'
        ]

In [None]:
# Load the fine-tuned checkpoint and classify the example sentences.
model = load_model(saved_model)
scores, overall_score, outputs = predict(inputs)

# Display (label name, probability to 3 decimals, first 50 characters of the text).
[
    (
        inverse_dict[int(class_id)],
        round(float(probability), 3),
        text[:50] + ('...' if len(text) > 50 else ''),
    )
    for (class_id, probability), text in zip(scores, inputs)
]

In [None]:
# Shape of the first tensor in the second model output; `outputs` comes from
# the last batch processed by predict().
outputs[1][0].shape

In [None]:
# Tokenize the first example sentence, padded to 50 tokens, and show its
# word pieces up to (not including) the first padding token.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
encoding = tokenizer.encode_plus(
    inputs[0],
    max_length=50,
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
    return_token_type_ids=True,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',           # Return PyTorch tensors
)
out = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
pad_index = out.index('[PAD]')
out[:pad_index]

In [None]:
# Classify every sentence stored under the 'sentences' key of test.json.
df=pd.read_json('test.json')
df=pd.DataFrame(df['sentences'].to_list())
inputs=df.text.values
# predict() returns three values (predictions, per-class totals, last-batch
# outputs); unpacking only two raised "too many values to unpack".
scores, overall_score, outputs = predict(inputs)
[(inverse_dict[int(s[0])],round(float(s[1]),3), inputs[i][:50]+ ('...' if len(inputs[i])>50 else '')) for i,s in enumerate(scores)]

In [None]:
# Classify a single sentence; `score` is the torch.max result, so score[0] is
# the winning probability and score[1] the winning class index.
score, input_ids, output = predict_single('Today is not an awesome day one two three')
inverse_dict[int(score[1])],round(float(score[0]),3)

In [None]:
# Tokens of the sentence just passed to predict_single(). The previous code
# read the stale `encoding` of a different sentence, so pad_index did not
# match the `output` tensors it is used to slice in the next cell.
out = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
# Fall back to the full length when the sequence contains no padding.
pad_index = out.index('[PAD]') if '[PAD]' in out else len(out)
print(pad_index,out[:pad_index])


In [None]:
# Shape of the first tensor in predict_single's second output, cut along
# axis 1 to the positions before pad_index.
output[1][0][:,:pad_index,:].shape

# CAPTUM experiments


In [1]:
import os
import torch
import numpy as np
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from captum.attr import IntegratedGradients
from captum.attr import visualization


import random
# Use one seed for Python, NumPy and PyTorch (CPU and all CUDA devices).
seed_val = 1
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Select the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [2]:
# Settings for the attribution experiments.
max_length = 300
polarity_dict = {'positive': 0, 'negative': 1, 'neutral': 2}
inverse_dict = {class_id: name for name, class_id in polarity_dict.items()}
saved_model = 'notebook_data/finetuned_BERT_epoch_10.model'

# Sentence whose prediction is explained below, and the tokenizer used to encode it.
sentence = 'Imagine all the people living life in peace.'
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def load_model(saved_model):
    """Build a BERT classifier and load fine-tuned weights from disk.

    Args:
        saved_model: path to a state dict written with ``torch.save``.

    Returns:
        The model, placed on the global ``device``, configured to return
        attentions and hidden states.
    """
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(polarity_dict),
        output_attentions=True,
        output_hidden_states=True)
    # Map the checkpoint onto the device selected at start-up rather than a
    # hard-coded 'cuda', so the notebook also runs on CPU-only hosts.
    state_dict = torch.load(saved_model, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    return model
def predict_single(sentense):
    """Classify one sentence with the global ``model`` and ``tokenizer``.

    The sentence is encoded without padding.

    Returns:
        ``(best, input_ids, outputs)``: the ``torch.max`` result (probability,
        class index) over the softmaxed first model output, the encoded input
        ids, and the raw model outputs.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        sentense,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=False,
        max_length=max_length,
        return_tensors='pt',
    )
    encoding.to(device)

    with torch.no_grad():
        outputs = model(**encoding)

    probabilities = torch.nn.Softmax(dim=1)(outputs[0])
    best = torch.max(probabilities, axis=1)
    return best, encoding['input_ids'], outputs

In [3]:
# Load the fine-tuned checkpoint named by `saved_model`.
model=load_model(saved_model)

In [4]:
def compute_bert_outputs(model_bert, embedding_output, attention_mask=None, head_mask=None):
    """Run BERT's encoder and pooler starting from pre-computed embeddings.

    Args:
        model_bert: object exposing ``encoder``, ``pooler``, ``parameters()``
            and ``config.num_hidden_layers``.
        embedding_output: tensor whose first two dimensions are used as the
            attention-mask shape.
        attention_mask: optional mask; defaults to all ones, created with the
            dtype and device of ``embedding_output``.
        head_mask: optional 1-D or 2-D tensor; a 1-D mask is expanded across
            all layers.

    Returns:
        ``(sequence_output, pooled_output) + encoder_outputs[1:]``.
    """
    if attention_mask is None:
        attention_mask = torch.ones(embedding_output.shape[0], embedding_output.shape[1]).to(embedding_output)

    param_dtype = next(model_bert.parameters()).dtype  # fp16 compatibility

    # Insert two singleton axes after the first, then convert the 1/0 mask
    # into an additive 0/-10000 mask.
    extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
    extended_attention_mask = extended_attention_mask.to(dtype=param_dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.expand(model_bert.config.num_hidden_layers, -1, -1, -1, -1)
        elif head_mask.dim() == 2:
            # A 2-D mask already carries one row per layer.
            head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
        head_mask = head_mask.to(dtype=param_dtype)
    else:
        head_mask = [None] * model_bert.config.num_hidden_layers

    encoder_outputs = model_bert.encoder(embedding_output,
                                         extended_attention_mask,
                                         head_mask=head_mask)
    sequence_output = encoder_outputs[0]
    pooled_output = model_bert.pooler(sequence_output)
    # The former `x.to(device)` statements were dropped: Tensor.to() returns a
    # new tensor and its result was discarded, so they had no effect.
    # Any extra encoder outputs are appended after the first two entries.
    return (sequence_output, pooled_output,) + encoder_outputs[1:]


class BertModelWrapper(torch.nn.Module):
    """Expose a BERT classifier as a function of its input embeddings.

    ``forward`` returns the softmax probability of a single class, which is
    the scalar-per-example output that the attribution code operates on.

    Args:
        model: classifier exposing ``bert`` and ``classifier`` sub-modules.
        target_class: index of the class whose probability is returned.
            Defaults to 1 ('negative' in ``polarity_dict``), the value that
            was previously hard-coded.
    """

    def __init__(self, model, target_class=1):
        super(BertModelWrapper, self).__init__()
        self.model = model
        self.target_class = target_class

    def forward(self, embeddings):
        """Return the ``target_class`` probability with a trailing singleton axis."""
        outputs = compute_bert_outputs(self.model.bert, embeddings)

        pooled_output = outputs[1]
        # The classifier's dropout layer is deliberately not applied here.
        logits = self.model.classifier(pooled_output)
        return torch.softmax(logits, dim=1)[:, self.target_class].unsqueeze(1)

In [5]:
# Wrap the classifier so it can be called on embeddings, move it to the
# selected device, and build the Integrated Gradients attributor around it.
bert_model_wrapper = BertModelWrapper(model)
bert_model_wrapper.to(device)
ig = IntegratedGradients(bert_model_wrapper)

In [6]:
# Classify the demo sentence and keep the first tensor of the second model
# output as the embedding fed to the attribution step.
# NOTE(review): presumably output[1] is the hidden-states tuple (the model is
# loaded with output_hidden_states=True) — confirm against the installed
# transformers version.
scores,input_ids,output = predict_single(sentence)
input_embedding=output[1][0]

In [13]:
import time

# Accumulates one Captum VisualizationDataRecord per interpreted sentence.
vis_data_records_ig = []

def interpret_sentence(model_wrapper, sentence, label=0):
    """Attribute the wrapper's prediction for ``sentence`` to its tokens.

    The sentence is tokenized and embedded here. Previously that code was
    disabled inside a string literal, so ``sentence`` was ignored and the
    function silently used the notebook globals ``input_ids``,
    ``input_embedding`` and ``ig``.

    Args:
        model_wrapper: ``BertModelWrapper`` around the classifier.
        sentence: raw text to explain.
        label: true label stored in the visualization record.

    Returns:
        ``(tokens, attributions)``: the word pieces WITHOUT the leading and
        trailing special tokens, and the normalized attribution array, which
        still has one entry per encoded token (its first and last entries are
        zeroed). Slice it with ``[1:-1]`` to align it with ``tokens``.
    """
    start = time.time()
    model_wrapper.eval()
    model_wrapper.zero_grad()

    encoded_data = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=False,
        max_length=max_length,
        return_tensors='pt'
    )
    input_ids = encoded_data['input_ids'].to(device)
    # Gradients are not needed to build the embedding that is attributed to.
    with torch.no_grad():
        input_embedding = model_wrapper.model.bert.embeddings(input_ids)

    print('shape= ', input_ids.shape, input_embedding.shape)
    # predict
    pred = model_wrapper(input_embedding).item()
    pred_ind = round(pred)
    end = time.time()
    print('embedding time=', end - start)
    start = time.time()

    # Compute attributions and the approximation delta with Integrated
    # Gradients, bound to the wrapper that was passed in.
    attribution_method = IntegratedGradients(model_wrapper)
    attributions_ig, delta = attribution_method.attribute(
        input_embedding, n_steps=100, return_convergence_delta=True)
    end = time.time()
    print('ig time = ', end - start)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].detach().cpu().numpy().tolist())
    print(tokens[1:-1])
    attributions = add_attributions_to_visualizer(attributions_ig, tokens,
                                                  pred, pred_ind, label, delta,
                                                  vis_data_records_ig)

    return tokens[1:-1], attributions
def add_attributions_to_visualizer(attributions, tokens, pred, pred_ind, label, delta, vis_data_records):
    """Collapse, normalize and record per-token attributions.

    Attributions are summed over axis 2 and the leading axis is squeezed,
    then scaled by their norm. The first and last entries are zeroed and the
    result is rescaled so its largest value is 1. A visualization record that
    omits the first and last token is appended to ``vis_data_records``.

    Args:
        attributions: attribution tensor with at least three axes.
        tokens: token strings for the full encoded sequence.
        pred: predicted probability stored in the record.
        pred_ind: predicted class stored in the record.
        label: true label stored in the record.
        delta: convergence delta stored in the record.
        vis_data_records: list the new record is appended to.

    Returns:
        The normalized NumPy attribution array, with one entry per token in
        ``tokens`` (first and last entries are 0).
    """
    attributions = attributions.sum(dim=2).squeeze(0)
    norm = torch.norm(attributions)
    # Skip scaling when everything is zero to avoid producing NaNs.
    if norm > 0:
        attributions = attributions / norm
    attributions = attributions.detach().cpu().numpy()
    attributions[0] = 0
    attributions[-1] = 0
    # The zeroed endpoints make the maximum >= 0; only rescale when a positive
    # attribution exists, otherwise the division would be by zero.
    peak = attributions.max()
    if peak > 0:
        attributions /= peak
    print(attributions)
    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions[1:-1],
                            pred,
                            pred_ind,
                            label,
                            "label",
                            attributions.sum(),
                            tokens[1:-1],
                            delta
                            ))
    return attributions
    
# Explain the demo sentence, then render every accumulated record with Captum's text view.
tokens, attributions=interpret_sentence(bert_model_wrapper, sentence=sentence, label=0)
atts=visualization.visualize_text(vis_data_records_ig)

shape=  torch.Size([1, 11]) torch.Size([1, 11, 768])
embedding time= 0.021704673767089844
ig time =  0.17499494552612305
['imagine', 'all', 'the', 'people', 'living', 'life', 'in', 'peace', '.']
[ 0.          1.          0.29191798 -0.05807778 -0.24827234 -0.1239868
 -0.27984333 -0.01044027  0.23614582  0.22888951  0.        ]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (0.32),label,1.04,imagine all the people living life in peace .
,,,,


In [14]:
def format_special_tokens(token):
    """Return ``token`` unchanged.

    The rewrite of "<...>" tokens to "#..." is currently disabled:
        if token.startswith("<") and token.endswith(">"):
            return "#" + token.strip("<>")
    """
    return token

def _get_color(attr):
    # clip values to prevent CSS errors (Values should be from [-1,1])
    attr = max(-1, min(1, attr))
    if attr > 0:
        hue = 120
        sat = 75
        lig = 100 - int(50 * attr)
    else:
        hue = 0
        sat = 75
        lig = 100 - int(-40 * attr)
    return "hsl({}, {}%, {}%)".format(hue, sat, lig)

def format_word_importances(words, importances):
    """Render words as one HTML table cell, each word coloured by its score.

    Args:
        words: sequence of token strings.
        importances: sequence of scores, at least as long as ``words``;
            entries beyond ``len(words)`` are ignored.

    Returns:
        ``"<td></td>"`` when ``importances`` is ``None`` or empty, otherwise a
        ``<td>`` element containing one ``<mark>`` per word.

    Raises:
        AssertionError: if there are more words than importances.
    """
    import html  # local import keeps this cell self-contained

    if importances is None or len(importances) == 0:
        return "<td></td>"
    assert len(words) <= len(importances)
    tags = ["<td>"]
    for word, importance in zip(words, importances[: len(words)]):
        # Escape the token so text containing '<', '>' or '&' is displayed
        # literally instead of being parsed as markup.
        word = html.escape(format_special_tokens(word))
        color = _get_color(importance)
        unwrapped_tag = '<mark style="background-color: {color}; opacity:1.0; \
                    line-height:1.75"><font color="black"> {word}\
                    </font></mark>'.format(
            color=color, word=word
        )
        tags.append(unwrapped_tag)
    tags.append("</td>")
    return "".join(tags)

In [18]:
# IPython.display is the public import location for these helpers.
from IPython.display import display, HTML

# `tokens` excludes the first and last (special) tokens, while `attributions`
# still has their zeroed slots. Drop them so every word is coloured by its own
# score instead of its left neighbour's.
word_attributions = attributions[1:-1]

dom = ['<table style="width: 100%">']
rows = [
    "<tr><th>Word Importance</th></tr>",
    "".join(["<tr>", format_word_importances(tokens, word_attributions), "</tr>"]),
]

dom.append("".join(rows))
dom.append("</table>")
display(HTML("".join(dom)))
