In [1]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
import re

from tqdm.auto import tqdm

from transformers import BertTokenizer, AutoModelForSequenceClassification, get_scheduler

import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
pd.set_option('display.max_colwidth', 200)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
DATA_PATH = 'IMDB Dataset.csv'
SAVED_MODEL_PATH = '../model'
BERT_CHECKPOINT = 'bert-base-uncased'

MAX_LEN = 128
BATCH_SIZE = 64
NUM_CLASSES = 2
LEARNING_RATE = 2e-5
NUM_EPOCHS= 5

In [5]:

def clean_text(text):
    """Removes extra whitespaces and html tags from text."""
    # remove weird spaces
    text =  " ".join(text.split())
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    return text


# Class for custom dataset
class CustomDataset(Dataset):
    def __init__(self, review, target, tokenizer, max_len, clean_text=None):
        self.clean_text = clean_text
        self.review = review
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    def __getitem__(self, idx):
        y = torch.tensor(self.target[idx], dtype=torch.long)
        X = str(self.review[idx])
        if self.clean_text:
            X = self.clean_text(X)
        
        encoded_X = self.tokenizer(
            X, 
            return_tensors = 'pt', 
            max_length = self.max_len, 
            truncation=True,
            padding = 'max_length'
            )

        return {'input_ids': encoded_X['input_ids'].squeeze(),
                'attention_mask': encoded_X['attention_mask'].squeeze(),
                'labels': y}



# Traing loop for one epoch
def train_epoch(model, dataloader, optimizer, scheduler, device, progress_bar):

    losses = []
    accuracies = []

    model.train()
    for batch in dataloader:

        optimizer.zero_grad()
        batch = {k:v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        preds = torch.argmax(outputs.logits, dim=1)
        acc = torch.sum(preds == batch['labels']) / len(preds)
        accuracies.append(acc)
        losses.append(loss)

        progress_bar.update(1)
    
    return torch.tensor(losses, dtype=torch.float).mean().item(), torch.tensor(accuracies).mean().item()


# Evaluation loop
def eval_epoch(model, dataloader, device):
    losses = []
    accuracies = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:

            batch = {k:v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            preds = torch.argmax(outputs.logits, dim=1)
            acc = torch.sum(preds == batch['labels']) / len(preds)
            accuracies.append(acc)
            losses.append(loss)
        
        return torch.tensor(losses, dtype=torch.float).mean().item(), torch.tensor(accuracies).mean().item()
    
    


# For final evaluation on test set
def test(model, dataloader, device):
    y_preds = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:

            batch = {k:v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
        
         
            y_preds.extend( torch.argmax(outputs.logits, dim=1) )
            y_true.extend( batch['labels'])
            
        return y_preds, y_true

In [6]:
data = (pd.read_csv(DATA_PATH).drop_duplicates())

print(f'Numbers of samples: {len(data)}')
data.head()

Numbers of samples: 49582


Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue i...",positive
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenl...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what mone...",positive


In [7]:
fraction = 0.3
data = data.sample(frac=fraction, random_state = 42).reset_index(drop=True)

In [8]:
data['sentiment'] = data['sentiment'].apply(lambda x: 0 if x == "negative" else 1)
data.head()

Unnamed: 0,review,sentiment
0,"""Soul Plane"" is a horrible attempt at comedy that only should appeal people with thick skulls, bloodshot eyes and furry pawns. <br /><br />The plot is not only incoherent but also non-existent, ac...",0
1,"Guest from the Future tells a fascinating story of time travel, friendship, battle of good and evil -- all with a small budget, child actors, and few special effects. Something for Spielberg and L...",1
2,"""National Treasure"" (2004) is a thoroughly misguided hodge-podge of plot entanglements that borrow from nearly every cloak and dagger government conspiracy cliché that has ever been written. The f...",0
3,"OK. First said, I just wanted to check whether this movie has an average rating below or exactly -1. But 5,9. This is sicker than any of the killers' proceedings -,- . That made me curious what pe...",0
4,"I haven't always been a fan, but the show grew on me. It wasn't until after season 5 that I started to see the richness of the show. They finally brought Daniel Jackson's search for his wife to an...",1


In [9]:

train_df, test_val_df = train_test_split(data, test_size=0.2, stratify=data['sentiment'], random_state=20)

val_df, test_df = train_test_split(test_val_df, test_size=0.5, stratify=test_val_df['sentiment'], random_state=20)

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(f'Number of samples in train set: {len(train_df)}')
print(f'Number of samples in validation set: {len(val_df)}')
print(f'Number of samples in test set: {len(test_df)}')

Number of samples in train set: 11900
Number of samples in validation set: 1487
Number of samples in test set: 1488


In [10]:
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [11]:
dfs = {'train':train_df, 'val':val_df, 'test':test_df}
dataloaders = {}

for df in dfs:
    should_shuffle = True if df == 'train' else False
    dataloaders[df] = DataLoader(
    CustomDataset(dfs[df]['review'],  dfs[df]['sentiment'], tokenizer=tokenizer, max_len=MAX_LEN, clean_text=clean_text),
    batch_size=BATCH_SIZE, shuffle=should_shuffle
    )
# Testing if batch loads properly
for batch in dataloaders['train']:
    print({k:v.shape for k, v in batch.items()})
    break

{'input_ids': torch.Size([64, 128]), 'attention_mask': torch.Size([64, 128]), 'labels': torch.Size([64])}


In [12]:
# SETUP

# model
model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=NUM_CLASSES)

model.to(device)
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

# scheduler
num_training_steps = NUM_EPOCHS * len(dataloaders['train'])
scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Test 
outputs = model(**{k:v.to(device) for k, v in batch.items()})
print(outputs.loss, outputs.logits.shape)

tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([64, 2])


In [14]:
!CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Training, evaluation

progress_bar = tqdm(range(num_training_steps))
history = {'train_loss':[], 'train_acc':[], 'val_loss':[], 'val_acc': []}

best_accuracy = 0
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, dataloaders['train'], optimizer, scheduler, device, progress_bar)
    print(f'Train Loss: {train_loss :.4f} | Accuracy: {train_acc*100 :.2f}')

    val_loss, val_acc = eval_epoch(model, dataloaders['val'], device)
    print(f'Eval Loss: {val_loss :.4f} | Accuracy: {val_acc*100 :.2f}')

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)

    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # save best model
    if val_acc > best_accuracy:
        model.save_pretrained(SAVED_MODEL_PATH)
        best_accuracy = val_acc
        
    print('-'*50)

        

  0%|          | 0/930 [00:00<?, ?it/s]

Train Loss: 0.3789 | Accuracy: 82.95
Eval Loss: 0.2992 | Accuracy: 86.13
--------------------------------------------------


In [None]:
# Plots
x_epochs = list(range(NUM_EPOCHS))
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.plot(x_epochs, history['train_loss'], marker='o', label='train')
plt.plot(x_epochs, history['val_loss'], marker='o', label='val')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(x_epochs, history['train_acc'], marker='o', label='train')
plt.plot(x_epochs, history['val_acc'], marker='o', label='val')
plt.axhline(best_accuracy, c='grey', ls='--',
            label=f'Best_accuracy({best_accuracy*100 :.2f}%)')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()



In [None]:
def make_plots():
    plt.plot(history['train_acc'])
    plt.plot(history['val_acc'])

    plt.title('Accuracy')
    plt.ylabel('Value')
    plt.xlabel('Epoch')
    plt.legend(['train','val'], loc='upper left')
    plt.show()

    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])

    plt.title('Binary Crossentropy')
    plt.ylabel('Value')
    plt.xlabel('epoch')
    plt.legend(['train','val'], loc='upper left')
    plt.show()

make_plots()

In [None]:
y_preds, y_true = test(model,dataloaders['test'], device)
_ , ax = plt.subplots(figsize=(8,8))
ConfusionMatrixDisplay.from_predictions(torch.tensor(y_true), torch.tensor(y_preds), ax = ax, colorbar = False);


In [None]:
print(classification_report(torch.tensor(y_true), torch.tensor(y_preds)))

In [None]:
from sklearn.metrics import f1_score
f1_score(torch.tensor(y_true), torch.tensor(y_preds))