# Project  TDDE09
## Hugo Bjork || Jakob Berggren || Martin Forsberg

For this project we will need to run it on the GPU to optimize speed.

In [1]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
#!pip install nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jakobberggren/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## The data

The data used fo this project is movie reviews from Imdb. The data set consists of 50 000 reviews labeled positive or negative. Our first course of action was to slim the data set down to only the revies with less than 128 words in them. This is done in order to be able to train the model within a resonable timeframe and avoid needing to chop up the reviews into chunks due to BERTs max length of 512.


In [4]:
#!pip install pandas
import pandas as pd

df = pd.read_csv('imdb.csv')
print(df.shape)
df = df[df['review'].str.split().apply(len) <= 128]
print(df.shape)
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
print((df[df['sentiment']==1].count()[0]/len(df))*100)

(50000, 2)
(13274, 2)
51.94364923911405


The data is loaded and preproccessed into a smaller set of max review length of 128.

In [5]:
def preVader(data):
    vadScore = [sid.polarity_scores(a) for a in data.review]
    columns = list(vadScore[0].keys())
    tens = torch.empty(len(vadScore), len(columns)).to(device)

    # All reviews for traning where columns is neg&neu&pos&comp
    for i, score in enumerate(vadScore):
        for j, key in enumerate(columns):
            tens[i, j] = score[key]
    return tens

## Baseline

Our first task is to create our baseline by fine tuning the BERT model to our IMDB data set.

We need two classes from the Transformers library:

In [6]:
#!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

#Instantiating both classes with the pre-trained bert-base-uncased model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In the tensorize function the data is preproccessed to fit the bert modle requirements by translating the reviews to token ids, masking the padding tokens and finaly a tensor with the labels correspinding to each reviews. These are returned by a TensorDataset so it can easily be split by a dataloader.

In [8]:
from torch.utils.data import TensorDataset

MAX_LENGTH = 130

def tensorize(reviews):
    input_ids = []
    labels = []
    attention_masks = []
    for index, rev in reviews.iterrows():
        encoded = tokenizer.encode_plus(
                    rev[0].split(), 
                    add_special_tokens=True, 
                    max_length=MAX_LENGTH,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(rev[1])
    return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels))


Below we train our baseline model

In [9]:
#!pip install scikit-learn
#!pip install tqdm
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import *

def train_bert(n_epochs=1, batch_size=32):
    train = df[0:int(len(df)*0.75)]
    test = df[int(len(df)*0.75):]
    
    train = tensorize(train)
    test = tensorize(test)
        
    model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False).to(device)
    
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    
    # set model in training mode
    model.train()
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            pbar.set_description(f'Epoch {i + 1}')
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)
                
                # Backward pass; propagates the loss and computes the gradients
                loss = outputs.loss
                loss.backward()

                # Update the parameters of the model
                optimizer.step()
                
                # Update diagnostics
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
    model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        y,p = [], [] # init list of actuals and predictions
        with tqdm(total=len(list(test))) as pbar:
            pbar.set_description(f'test')
            for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
                
                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                                labels=b_labels)
                
                loss = outputs.loss     
                pred = torch.argmax(outputs[1], dim=1)
                                
                y.extend(b_labels.cpu().detach().numpy())
                p.extend(pred.cpu().detach().numpy())
                
                # Update diagnostics
                pbar.update(batch_size)

            # using sklearn to compute performance metrics
            cm = confusion_matrix(y, p)
            acc = accuracy_score(y, p)
            prec = precision_score(y, p)
            rec = recall_score(y, p)
            f1 = f1_score(y, p)
            
            # update status bar with metrics
            pbar.set_postfix(accuracy=float(acc)*100,
                              precision=float(prec)*100,
                              recall=float(rec)*100,
                              f1_score=float(f1)*100)
            print(f"\n{cm}, CONFUSION MATRIX")
    return model

In [11]:
bert=train_bert(n_epochs=1, batch_size=32)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


[[305 154]
 [ 33 548]], CONFUSION MATRIX





Next we will look at the perfomance of only using VADER. This will not be used as baseline, but its important information when interpreting later results.

In [34]:
train = df[0:int(len(df)*0.75)]
test = df[len(train):]

vader_tr = preVader(train)
vad_pos = vader_tr[:, 2]
vad_neg = vader_tr[:, 0]
vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=1)

vader_te = preVader(test)
vad_pos = vader_te[:, 2]
vad_neg = vader_te[:, 0]
vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=1)

train_labels = torch.tensor(list(train.sentiment))
test_labels = torch.tensor(list(test.sentiment))

loss_train = F.cross_entropy(vader_tr, train_labels)
loss_test = F.cross_entropy(vader_te, test_labels)

pred_train = torch.argmax(vader_tr, dim=1)
pred_test = torch.argmax(vader_te, dim=1)

# using sklearn to compute performance metrics
cm = confusion_matrix(train_labels, pred_train)
acc = accuracy_score(train_labels, pred_train)
prec = precision_score(train_labels, pred_train)
rec = recall_score(train_labels, pred_train)
f1 = f1_score(train_labels, pred_train)
print("--------Train-----------")
print(f"{acc}, acc")
print(f"{prec}, prec")
print(f"{rec}, recall")
print(f"{f1}, f1")
print(f"{cm}, conf matrix")


cm = confusion_matrix(test_labels, pred_test)
acc = accuracy_score(test_labels, pred_test)
prec = precision_score(test_labels, pred_test)
rec = recall_score(test_labels, pred_test)
f1 = f1_score(test_labels, pred_test)
print("--------Test-----------")
print(f"{acc}, acc")
print(f"{prec}, prec")
print(f"{rec}, recall")
print(f"{f1}, f1")
print(f"{cm}, conf matrix")

--------Train-----------
0.7293822199899548, acc
0.6845141251294187, prec
0.8918866833686645, recall
0.7745606694560669, f1
[[2633 2133]
 [ 561 4628]], conf matrix
--------Test-----------
0.724013257005122, acc
0.6758682101513802, prec
0.8898007033997656, recall
0.7682186234817814, f1
[[ 885  728]
 [ 188 1518]], conf matrix


## Method 1 - MLP architecture

Method 1 combines outcomes from the fine tuned BERT model with VAD-score sentiment values from the VADER model by implementing a Multi Layer Perceptron. First, we build the architecture for the multi layer perceptron, which will be used to align our data in the same vector space. Later, the last linear layer in this model will be trained.

In [10]:
import torch.nn as nn

class MultilayerPerceptron(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq = nn.Sequential(
                nn.LazyLinear(hidden_dim),
                torch.nn.ReLU(),
                nn.Linear(hidden_dim, output_dim),
       )
    def forward(self, bert, vader):
        x = torch.cat((bert, vader), dim=1)
        x =  self.seq(x)
        return x

In [11]:
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-5):
    train = df[0:int(len(df)*0.75)]
    test = df[len(train):]

    vader_tr = preVader(train).to(device)
    vad_pos = vader_tr[:, 2]
    vad_neg = vader_tr[:, 0]
    vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))

    vader_te = preVader(test).to(device)
    vad_pos = vader_te[:, 2]
    vad_neg = vader_te[:, 0]
    vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))

    train = tensorize(train)
    test = tensorize(test)
    
    # init multi layer perceptron
    mlp = MultilayerPerceptron(250, 2).to(device)
        
    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(mlp.parameters(), lr=learning_rate)
    mlp.train()
    
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            pbar.set_description(f'Epoch {i + 1}')
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)

                # combine features of bert and vader using the mlp
                mlp_out = mlp.forward(outputs.logits, vader_tr[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) 
             
                # Backward pass; propagates the loss and computes the gradients
                loss = F.cross_entropy(mlp_out, b_labels)
                loss.backward()

                # Update the parameters of the model
                optimizer.step()

                # Update diagnostics
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
    model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        y,p = [], [] # init list of actuals and predictions
        with tqdm(total=len(list(test))) as pbar:
            pbar.set_description(f'test')
            for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
              
                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                                labels=b_labels)
                
                mlp_out = mlp.forward(outputs.logits, vader_te[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) 
                
                loss = F.cross_entropy(mlp_out, b_labels)
                pred = torch.argmax(mlp_out, dim=1)
                
                y.extend(b_labels.cpu().detach().numpy())
                p.extend(pred.cpu().detach().numpy())

                # Update diagnostics
                pbar.update(batch_size)

            # using sklearn to compute performance metrics
            cm = confusion_matrix(y, p)
            acc = accuracy_score(y, p)
            prec = precision_score(y, p)
            rec = recall_score(y, p)
            f1 = f1_score(y, p)
            
            # update status bar with metrics
            pbar.set_postfix(accuracy=float(acc)*100,
                              precision=float(prec)*100,
                              recall=float(rec)*100,
                              f1_score=float(f1)*100)
            print(f"\n{cm}, CONFUSION MATRIX")
    return model

In [14]:
mlp_model = train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-4)

  vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))
  vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))
Epoch 1: : 3136it [00:39, 79.72it/s, loss=0.383]
test: : 1056it [00:05, 200.97it/s, accuracy=83, f1_score=86, precision=79.6, recall=93.5]


[[320 139]
 [ 38 543]], CONFUSION MATRIX





## Method 2 - Weights

In method two, the respective outputs from BERT and VADER are assigned a weight and then added together. To try different weights, we simply implement a loop. Results are thus not optimal, but show an approximation of the relationship between the models.

In [35]:
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_weights(bert,w, n_epochs=1, batch_size=32):
    train = df[0:int(len(df)*0.75)]
    test = df[len(train):]
    vader = preVader(test).to(device)
    
    train = tensorize(train)
    test = tensorize(test)

    y,p = [], [] # init list of actuals and predictions
    with tqdm(total=len(list(test))) as pbar:
        pbar.set_description(f'test')
        for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
                  
            b_ids = batch[0].to(device)
            b_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                            labels=b_labels)
            
            vad_pos = vader[batch_size*index:batch_size*index+len(b_ids), 2]
            vad_neg = vader[batch_size*index:batch_size*index+len(b_ids), 0]

            pred_vad = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=0)
            pred_bert = F.softmax(outputs.logits, dim=0)

            pred = torch.argmax(torch.add(w*pred_vad, (1-w)*pred_bert), dim=1)

            y.extend(b_labels.cpu().detach().numpy())
            p.extend(pred.cpu().detach().numpy())

            # Update diagnostics
            pbar.update(batch_size)

        # using sklearn to compute performance metrics
        cm = confusion_matrix(y, p)
        acc = accuracy_score(y, p)
        prec = precision_score(y, p)
        rec = recall_score(y, p)
        f1 = f1_score(y, p)
        
        # update status bar with metrics
        pbar.set_postfix(accuracy=float(acc)*100,
                          precision=float(prec)*100,
                          recall=float(rec)*100,
                          f1_score=float(f1)*100)
        print(f"\n{cm}, CONFUSION MATRIX")
    return ['Accuracy',acc*100, 'Precision', prec*100,'recall', rec*100,'F1-score', f1, 'CM', cm]

In [36]:
acc_list = {}
for w in np.arange(1, 1.05, 0.05):
    acc = train_weights(bert,w, n_epochs=1, batch_size=32)
    acc_list['Vader', int(w*100), 'bert', int((1-w)*100)] = acc
print(acc_list)

NameError: name 'bert' is not defined