# Project  TDDE09
## Hugo Bjork || Jakob Berggren || Martin Forsberg

For this project we will need to run it on the GPU to optimize speed.

In [14]:
#!pip install nltk
import torch
import nltk

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jakobberggren/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## The data

The data used fo this project is movie reviews from Imdb. The data set consists of 50 000 reviews labeled positive or negative. Our first course of action was to slim the data set down to only the revies with less than 120 words in them. This is done in order to be able to train the model within a resonable timeframe and avoid needing to chop up the reviews into chunks due to BERTs max length of 512.


In [16]:
#!pip install pandas
import pandas as pd

df = pd.read_csv('imdb.csv')
print(df.shape)
df = df[df['review'].str.split().apply(len) <= 120]
print(df.shape)
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
print((df[df['sentiment']==1].count()[0]/len(df))*100)

train = df[0:int(len(df)*0.075)]
test = df[int(len(df)*0.075):int(len(df)*0.1)]
vadScore = [sid.polarity_scores(a) for a in train.review]

(50000, 2)
(10494, 2)
52.81113016962073


The data is loaded and preproccessed into a smaller set of max review length of 120.

Her is an example from the data set

In [17]:
train.iloc[0:1,:]
vadScore[1]

{'neg': 0.094, 'neu': 0.531, 'pos': 0.375, 'compound': 0.9149}

In [18]:
def preVader(data):
    vadScore = [sid.polarity_scores(a) for a in data.review]
    columns = list(vadScore[0].keys())
    tens = torch.empty(len(vadScore), len(columns)).to(device)

    # All reviews for traning where columns is neg&neu&pos&comp
    for i, score in enumerate(vadScore):
        for j, key in enumerate(columns):
            tens[i, j] = score[key]
    return tens

In [19]:
tens= preVader(train)
print(tens[:4,:3])

tensor([[0.0170, 0.7580, 0.2250],
        [0.0940, 0.5310, 0.3750],
        [0.0840, 0.6960, 0.2210],
        [0.0860, 0.7950, 0.1200]])


## Baseline

Our first task is to create our baseline by fine tuning the BERT model to our IMDB data set.

We need two classes from the Transformers library:

In [20]:
#!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

#Instantiating both classes with the pre-trained bert-base-uncased model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
import torch.nn as nn

class MultilayerPerceptron(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq = nn.Sequential(
                nn.LazyLinear(hidden_dim),
                torch.nn.ReLU(),
                nn.LazyLinear(output_dim),
       )
    def forward(self, bert, vader): 
        x = torch.cat((bert, vader), dim=1)
        x =  self.seq(x)
        print(x.size())
        return x

In the tensorize function the data is preproccessed to fit the bert modle requirements by translating the reviews to token ids, masking the padding tokens and finaly a tensor with the labels correspinding to each reviews. These are returned by a TensorDataset so it can easily be split by a dataloader.

In [22]:
from torch.utils.data import TensorDataset

def tensorize(reviews):
    input_ids = []
    labels = []
    attention_masks = []
    for index, rev in reviews.iterrows():
        encoded = tokenizer.encode_plus(
                    rev[0].split(), 
                    add_special_tokens=True, 
                    max_length=122,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt', 
       )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(rev[1])
    return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels))


Below I will make a dataloader class that has batch size of 32 and using list compehension pre computes all the inputs, masks and labels.

In [23]:
from torch.utils.data import DataLoader
dataset= tensorize(train.iloc[:11,:]) 
datalord = DataLoader(dataset, batch_size=10, shuffle=True)

for i,data in enumerate(datalord):
    print('BATCH', i)
    print(data)


BATCH 0
[tensor([[ 101,  100, 1037,  ...,    0,    0,    0],
        [ 101,  100,  100,  ...,    0,    0,    0],
        [ 101,  100,  100,  ...,    0,    0,    0],
        ...,
        [ 101,  100, 1996,  ...,    0,    0,    0],
        [ 101,  100, 3459,  ...,    0,    0,    0],
        [ 101,  100, 2017,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 1, 1, 0, 1, 0, 0, 0, 1])]
BATCH 1
[tensor([[  101,   100,  2003,  2019,   100,  3185,  3005,  2069,  7494,  4519,
          2003,  2008,  2009,  3340,   100,   100,  2247,  2007,  1037,  2204,
          3459,  1997,  4637,   100,   100,  2466, 19223,  2105,  1037,   100,
         18901, 13877,  2040,  4150,  3297,  2044, 21089,  7494,  1996,  2166,
          1997,  2019,   100,   100,   100,  2466,  3632, 19448,  8576,  2083,
 

In [30]:
#!pip install scikit-learn
#!pip install tqdm
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_bert(n_epochs=1, batch_size=10):
    train = df[0:int(len(df)*0.75)]
    vader = preVader(train).to(device)
    test = df[int(len(df)*0.75):]
    
    train = tensorize(train)
    test = tensorize(test)
    
    n=len(train)
    
    mlp = MultilayerPerceptron(50,2).to(device)
    
    model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False).to(device)
    true = 0
    counter=0
    size_last_batch=(len(test) % batch_size)
    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    model.train()
    counter = 0
    tot_loss=0
    total_loss = 0
    losses=[0]
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)
                
                loss = outputs.loss

                loss.backward()

                # Update the parameters of the model
                optimizer.step()
                
                ## Update diagnostics
                tot_loss+=loss.item()
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
            model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        TP_FP,TP,FN, y,p=0,0,0,[],[]
        for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): 
            counter +=1 
            b_ids = batch[0].to(device)
            b_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            outputs = model(input_ids=b_ids, attention_mask=b_mask,
                            labels=b_labels)
            loss = outputs.loss
            pred = torch.argmax(outputs[1], dim=1 )
            TP_FP +=sum(pred==1)
            TP+=torch.sum((pred==1) & (b_labels==1))
            FN +=torch.sum((pred==0) & (b_labels==1))
            true += sum(pred == b_labels)
            y.extend(b_labels.cpu().detach().numpy())
            p.extend(pred.cpu().detach().numpy())


        cm = confusion_matrix(y, p)
        Precision =TP/TP_FP
        Recall = TP/(TP + FN)
        F1_score=(2 * Precision * Recall)/(Precision + Recall)
        acc = true/(((counter-1)*batch_size)+size_last_batch)
        print(float(acc)*100, ' ACCURACY')
        print(float(Precision)*100, ' Precision')
        print(float(Recall)*100, ' Recall')
        print(float(F1_score)*100, ' F1_score')
        print(cm, 'confusion matrix')
    return model

In [31]:
bert=train_bert(n_epochs=1, batch_size=10)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

KeyboardInterrupt: 

In [35]:
#!pip install scikit-learn
#!pip install tqdm
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_mlp(bert, n_epochs=1, batch_size=10):
    train = df[0:int(len(df)*0.75)]
    vader = preVader(train).to(device)
    test = df[int(len(df)*0.75):]
    
    train = tensorize(train)
    test = tensorize(test)
    
    n=len(train)
    
    mlp = MultilayerPerceptron(50,2).to(device)
    
    true = 0
    counter=0
    size_last_batch=(len(test) % batch_size)
    
    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(mlp.parameters(), lr=1e-5)
    mlp.train()
    
    counter = 0
    tot_loss=0
    total_loss = 0
    losses=[0]
    
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): 

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                
                outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)
                
                mlp_out = mlp.forward(outputs.logits, vader[index*batch_size:index*batch_size+batch_size,:3])
                
                loss = F.cross_entropy(mlp_out, b_labels)
                loss.backward()

                # Update the parameters of the model
                optimizer.step()
                
                ## Update diagnostics
                tot_loss+=loss.item()
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)
                
            model.eval()  # Sets the model to evaluation mode
    with torch.no_grad():  # Blocks the accumulation of gradients
        TP_FP,TP,FN, y,p=0,0,0,[],[]
        for index, batch in enumerate(DataLoader(test, batch_size, shuffle=True)): 
            
            counter +=1 
            
            b_ids = batch[0].to(device)
            b_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            outputs = bert(input_ids=b_ids, attention_mask=b_mask,
                            labels=b_labels)
            
            mlp_out = mlp.forward(outputs.logits, vader[index*batch_size:index*batch_size+batch_size,:3])    
            loss = F.cross_entropy(mlp_out, b_labels)
            
            pred = torch.argmax(mlp_out[1], dim=1 )
            
            TP_FP +=sum(pred==1)
            TP+=torch.sum((pred==1) & (b_labels==1))
            FN +=torch.sum((pred==0) & (b_labels==1))
            true += sum(pred == b_labels)
            y.extend(b_labels.cpu().detach().numpy())
            p.extend(pred.cpu().detach().numpy())

        cm = confusion_matrix(y, p)
        Precision =TP/TP_FP
        Recall = TP/(TP + FN)
        F1_score=(2 * Precision * Recall)/(Precision + Recall)
        acc = true/(((counter-1)*batch_size)+size_last_batch)
        print(float(acc)*100, ' ACCURACY')
        print(float(Precision)*100, ' Precision')
        print(float(Recall)*100, ' Recall')
        print(float(F1_score)*100, ' F1_score')
        print(cm, 'confusion matrix')
    return model

In [37]:
mlp_model = train_mlp(bert, n_epochs=1, batch_size=10)

NameError: name 'bert' is not defined

In [None]:
import pickle
# Save the model to a file
# Denna kodsnutt laddar ner modellen till report, tryck refresh i colab så ser man den, sen kan man importa till django
with open('./model.pkl', 'wb') as f:
    pickle.dump(model, f)