# Project  Baseline TDDE09
## Hugo Bjork || Jakob Berggren || Martin Forsberg

For this project we will need to run it on the GPU to optimize speed.

In [3]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# if cuda memory is full, run this
import torch, gc
gc.collect()
torch.cuda.empty_cache()

## The data

The data used for this project is movie reviews from Imdb. The data set consists of 50 000 reviews labeled positive or negative. We have chosen to slim down the dataset to only include reviews with 256 words or less in them. This is done in order to train the model within a reasonable time frame since and avoid the need to chop up reviews into chunks due to BERTs max length of 512.

In [4]:
#!pip install pandas
import pandas as pd

df = pd.read_csv('imdb.csv')
print(df.shape)
df = df[df['review'].str.split().apply(len) <= 256]
print(df.shape)
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
print((df[df['sentiment']==1].count()[0]/len(df))*100)

train = df[0:int(len(df)*0.75)]
test = df[int(len(df)*0.75):int(len(df))]

(50000, 2)
(35711, 2)
49.83618492901347


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].map(label_map)


The data is loaded and preproccessed into a smaller set of max review length of 256.

Here is an example from the data set

In [6]:
train.iloc[0:1,:]

Unnamed: 0,review,sentiment
1,A wonderful little production. <br /><br />The...,1


## Baseline

Our first task is to create our baseline by fine tuning the BERT model to our IMDB data set.

We need two classes from the Transformers library:

In [10]:
#!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification

#Instantiating both classes with the pre-trained bert-base-uncased model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In the tensorize function the data is preproccessed to fit the bert modle requirements by translating the reviews to token ids, masking the padding tokens and finaly a tensor with the labels correspinding to each reviews. These are returned by a TensorDataset so it can easily be split by a dataloader.

In [11]:
from torch.utils.data import TensorDataset

def tensorize(reviews):
    input_ids = []
    labels = []
    attention_masks = []
    for index, rev in reviews.iterrows():
        encoded = tokenizer.encode_plus(
                    rev[0].split(), 
                    add_special_tokens=True, 
                    max_length=258,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt', 
       )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(rev[1])
    return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels))


A dataloader class is created, which pre computes inputs, masks and labels. We chose a batch size of 32.

In [12]:
from torch.utils.data import DataLoader
dataset= tensorize(train.iloc[:11,:]) 
datalord = DataLoader(dataset, batch_size=32, shuffle=True)

for i,data in enumerate(datalord):
    print('BATCH', i)
    print(data)


BATCH 0
[tensor([[ 101,  100, 2265,  ...,    0,    0,    0],
        [ 101,  100, 2469,  ...,    0,    0,    0],
        [ 101,  100, 2017,  ...,    0,    0,    0],
        ...,
        [ 101,  100, 2245,  ...,    0,    0,    0],
        [ 101,  100, 2011,  ...,    0,    0,    0],
        [ 101,  100,  100,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1])]


In [13]:
#!pip install scikit-learn
#!pip install tqdm
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

def train_bert(n_epochs=1, batch_size=32):
    train = df[0:int(len(df)*0.75)]
    test = df[int(len(df)*0.75):int(len(df))]

    train = tensorize(train)
    test= tensorize(test)

    model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 2, output_attentions = False,output_hidden_states = False).to(device)

    true = 0
    counter=0
    size_last_batch=(len(test) % batch_size)

    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    model.train()

    counter = 0
    for i in range(n_epochs):
        with tqdm(total=len(list(train))) as pbar:
            for index, batch in enumerate(DataLoader(train, batch_size, shuffle=True)):

               # Reset the accumulated gradients
                optimizer.zero_grad()

                b_ids = batch[0].to(device)
                b_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                outputs = model(input_ids=b_ids, attention_mask=b_mask,
                        labels=b_labels)

                # Backward pass; propagates the loss and computes the gradients
                loss = outputs.loss
                loss.backward()

                # Update the parameters of the model
                optimizer.step()
                
                # Update diagnostics
                pbar.set_description(f'Epoch {i + 1}')
                pbar.set_postfix(loss=loss.item())
                pbar.update(batch_size)

    with torch.no_grad():  # Blocks the accumulation of gradients
        TP_FP,TP,FN, y,p=0,0,0,[],[]
        with tqdm(total=len(list(test))) as pbar:
          for index, batch in enumerate(DataLoader(test, batch_size, shuffle=True)): 
              counter +=1 
              b_ids = batch[0].to(device)
              b_mask = batch[1].to(device)
              b_labels = batch[2].to(device)
              outputs = model(input_ids=b_ids, attention_mask=b_mask,
                              labels=b_labels)
              loss = outputs.loss
              pred = torch.argmax(outputs[1], dim=1 )
              TP_FP +=sum(pred==1)
              TP+=torch.sum((pred==1) & (b_labels==1))
              FN +=torch.sum((pred==0) & (b_labels==1))
              true += sum(pred == b_labels)
              y.extend(b_labels.cpu().detach().numpy())
              p.extend(pred.cpu().detach().numpy())

              # Update diagnostics
              pbar.set_description(f'training')
              pbar.update(batch_size)

          cm = confusion_matrix(y, p)
          Precision =TP/TP_FP
          Recall = TP/(TP + FN)
          F1_score=(2 * Precision * Recall)/(Precision + Recall)
          acc = true/(((counter-1)*batch_size)+size_last_batch)
          pbar.set_postfix(accuracy=float(acc)*100,
                           precision=float(Precision)*100,
                           recall=float(Recall)*100,
                           f1_score=float(F1_score)*100)
          print(f"\n{cm}, CONFUSION MATRIX")
    return model

In [14]:
model=train_bert(n_epochs=1, batch_size=32)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


[[3941  551]
 [ 400 4036]], CONFUSION MATRIX





In [16]:
import pickle
# Save the model to a file
with open('./model.pkl', 'wb') as f:
    pickle.dump(model, f)