# Project  TDDE09
## Hugo Bjork || Jakob Berggren || Martin Forsberg

For this project we will need to run it on the GPU to optimize speed.

In [86]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# If on mac:
# setting up mps as device. 
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

Using device: mps


In [87]:
#!pip install nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## The data

The data used fo this project is movie reviews from Imdb. The data set consists of 50 000 reviews labeled positive or negative. Our first course of action was to slim the data set down to only the revies with less than 128 words in them. This is done in order to be able to train the model within a resonable timeframe and avoid needing to chop up the reviews into chunks due to BERTs max length of 512.


In [88]:
#!pip install pandas
import pandas as pd

df = pd.read_csv('imdb.csv')
print(df.shape)
df = df[df['review'].str.split().apply(len) <= 128]
print(df.shape)
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
print((df[df['sentiment']==1].count()[0]/len(df))*100)

train = df[0:int(len(df)*0.075)]
test = df[int(len(df)*0.075):int(len(df)*0.1)]
vadScore = [sid.polarity_scores(a) for a in train.review]

(50000, 2)
(13274, 2)
51.94364923911405


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].map(label_map)


The data is loaded and preproccessed into a smaller set of max review length of 128.

In [89]:
def preVader(data):
    vadScore = [sid.polarity_scores(a) for a in data.review]
    columns = list(vadScore[0].keys())
    tens = torch.empty(len(vadScore), len(columns)).to(device)

    # All reviews for traning where columns is neg&neu&pos&comp
    for i, score in enumerate(vadScore):
        for j, key in enumerate(columns):
            tens[i, j] = score[key]
    return tens

In [90]:
tens = preVader(train)
print(tens[:4,:3])

tensor([[0.0170, 0.7580, 0.2250],
        [0.0940, 0.5310, 0.3750],
        [0.0840, 0.6960, 0.2210],
        [0.0860, 0.7950, 0.1200]], device='cuda:0')


## Baseline

Our first task is to create our baseline by fine tuning the BERT model to our IMDB data set.

We need two classes from the Transformers library:

In [91]:
#!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

#Instantiating both classes with the pre-trained bert-base-uncased model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In the tensorize function the data is preproccessed to fit the bert modle requirements by translating the reviews to token ids, masking the padding tokens and finaly a tensor with the labels correspinding to each reviews. These are returned by a TensorDataset so it can easily be split by a dataloader.

In [92]:
from torch.utils.data import TensorDataset

MAX_LENGTH = 130

def tensorize(reviews):
    input_ids = []
    labels = []
    attention_masks = []
    for index, rev in reviews.iterrows():
        encoded = tokenizer.encode_plus(
                    rev[0].split(), 
                    add_special_tokens=True, 
                    max_length=MAX_LENGTH,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(rev[1])
    return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels))


Below we will make a dataloader class

In [93]:
from torch.utils.data import DataLoader
dataset = tensorize(train.iloc[:11,:]) 
datalord = DataLoader(dataset, batch_size=32, shuffle=True)

In [94]:
#!pip install scikit-learn
#!pip install tqdm
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import *

def train_bert(n_epochs=1, batch_size=32):
    train = df[0:int(len(df)*0.75)]
    test = df[int(len(df)*0.75):]
    
    train = tensorize(train)
    test = tensorize(test)
        
    model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False).to(device)
    
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    
    # set model in training mode
    model.train()
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)
                
                # Backward pass; propagates the loss and computes the gradients
                loss = outputs.loss
                loss.backward()

                # Update the parameters of the model
                optimizer.step()
                
                # Update diagnostics
                pbar.set_description(f'Epoch {i + 1}')
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
    model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        y,p = [], [] # init list of actuals and predictions
        with tqdm(total=len(list(test))) as pbar:
            for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
                
                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                                labels=b_labels)
                
                loss = outputs.loss     
                pred = torch.argmax(outputs[1], dim=1)
                                
                y.extend(b_labels.cpu().detach().numpy())
                p.extend(pred.cpu().detach().numpy())
                
                # Update diagnostics
                pbar.set_description(f'test')
                pbar.update(batch_size)

            # using sklearn to compute performance metrics
            cm = confusion_matrix(y, p)
            acc = accuracy_score(y, p)
            prec = precision_score(y, p)
            rec = recall_score(y, p)
            f1 = f1_score(y, p)
            
            # update status bar with metrics
            pbar.set_postfix(accuracy=float(acc)*100,
                              precision=float(prec)*100,
                              recall=float(rec)*100,
                              f1_score=float(f1)*100)
            print(f"\n{cm}, CONFUSION MATRIX")
    return model

In [96]:
bert=train_bert(n_epochs=1, batch_size=32)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


[[1324  289]
 [ 124 1582]], CONFUSION MATRIX





## Method 1 - MLP architecture

Next, we will build the architecture for a multi layer perceptron, which will be used to align our data in the same vector space. Later, the last linear layer in this model will be trained.

In [97]:
import torch.nn as nn

class MultilayerPerceptron(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq = nn.Sequential(
                nn.LazyLinear(hidden_dim),
                torch.nn.ReLU(),
                nn.Linear(hidden_dim, output_dim),
       )
    def forward(self, bert, vader):
        x = torch.cat((bert, vader), dim=1)
        x =  self.seq(x)
        return x

In [98]:
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-5):
    train = df[0:int(len(df)*0.75)]
    test = df[len(train):]

    vader_tr = preVader(train).to(device)
    vad_pos = vader_tr[:, 2]
    vad_neg = vader_tr[:, 0]
    vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))

    vader_te = preVader(test).to(device)
    vad_pos = vader_te[:, 2]
    vad_neg = vader_te[:, 0]
    vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))

    train = tensorize(train)
    test = tensorize(test)
    
    # init multi layer perceptron
    mlp = MultilayerPerceptron(250, 2).to(device)
        
    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(mlp.parameters(), lr=learning_rate)
    mlp.train()
    
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)

                # combine features of bert and vader using the mlp
                mlp_out = mlp.forward(outputs.logits, vader_tr[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) 
             
                # Backward pass; propagates the loss and computes the gradients
                loss = F.cross_entropy(mlp_out, b_labels)
                loss.backward()

                # Update the parameters of the model
                optimizer.step()

                # Update diagnostics
                pbar.set_description(f'Epoch {i + 1}')
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
    model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        y,p = [], [] # init list of actuals and predictions
        with tqdm(total=len(list(test))) as pbar:
            for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
              
                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                                labels=b_labels)
                
                mlp_out = mlp.forward(outputs.logits, vader_te[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) 
                
                loss = F.cross_entropy(mlp_out, b_labels)
                pred = torch.argmax(mlp_out, dim=1)
                
                y.extend(b_labels.cpu().detach().numpy())
                p.extend(pred.cpu().detach().numpy())

                # Update diagnostics
                pbar.set_description(f'test')
                pbar.update(batch_size)

            # using sklearn to compute performance metrics
            cm = confusion_matrix(y, p)
            acc = accuracy_score(y, p)
            prec = precision_score(y, p)
            rec = recall_score(y, p)
            f1 = f1_score(y, p)
            
            # update status bar with metrics
            pbar.set_postfix(accuracy=float(acc)*100,
                              precision=float(prec)*100,
                              recall=float(rec)*100,
                              f1_score=float(f1)*100)
            print(f"\n{cm}, CONFUSION MATRIX")
    return model

In [100]:
mlp_model = train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-4)

  vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))
  vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))
Epoch 1: : 9984it [03:14, 51.25it/s, loss=0.394]                        
test: : 3328it [00:23, 144.18it/s, accuracy=88.4, f1_score=88.9, precision=87.7, recall=90.2]


[[1397  216]
 [ 168 1538]], CONFUSION MATRIX





In [None]:
import pickle
# Save the model to a file
# Denna kodsnutt laddar ner modellen till report, tryck refresh i colab så ser man den, sen kan man importa till django
with open('./model.pkl', 'wb') as f:
    pickle.dump(model, f)