In [None]:
import nltk
import platform
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler
from torch.utils.data import Dataset, DataLoader

In [None]:
# --- Pretrained checkpoint and its tokenizer ---
BERT_MODEL = 'bert-base-uncased'
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_MODEL,
    do_lower_case=True,
)
MAX_LEN = 64  # passed to the tokenizer as max_length for every review

# --- Input data ---
FILE_NAME = '../data/IMDB Dataset.csv'

# --- Optimisation hyper-parameters ---
NB_EPOCHS = 5
LR = 1e-5
TRAIN_BS = 2  # training batch size
VALID_BS = 2  # validation batch size

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes one review per access and pairs it with its label.

    Args:
        review: Indexable sequence of review texts.
        target: Indexable sequence of labels, aligned with ``review``.

    The tokenizer and maximum sequence length are taken from the module-level
    ``TOKENIZER`` and ``MAX_LEN`` settings.
    """

    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return it with its label.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids``
            (each ``max_len`` long) and a float scalar tensor ``targets``.
        """
        # Collapse every run of whitespace (including newlines) to one space.
        review = ' '.join(str(self.review[idx]).split())

        # Pad short reviews and truncate long ones so every sample has exactly
        # max_len tokens; otherwise the default collate cannot stack a batch.
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            # Index with the integer position, not the literal string 'idx'.
            'targets': torch.tensor(self.target[idx], dtype=torch.float),
        }

In [None]:
# Trainer: runs the training and validation loops for the model
class Trainer:
    """Runs training and validation epochs for a single-logit binary classifier.

    Args:
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``
            that returns one logit per sample.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        train_dataloader: Sized iterable of batch dicts with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        valid_dataloader: Same structure, used for validation.
        device: Optional device the batches are moved to. Defaults to the
            device of the model's first parameter (CPU if it has none).
    """

    def __init__(
        self,
        model,
        optimizer,
        scheduler,
        train_dataloader,
        valid_dataloader,
        device=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_data = train_dataloader
        self.valid_data = valid_dataloader
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Follow the model's own placement instead of relying on a global.
        if device is None:
            first_param = next(model.parameters(), None)
            device = first_param.device if first_param is not None else torch.device('cpu')
        self.device = device

    def _to_device(self, inputs):
        """Move one batch dict to ``self.device`` and return its four tensors."""
        ids = inputs['ids'].to(self.device, dtype=torch.long)
        mask = inputs['mask'].to(self.device, dtype=torch.long)
        ttis = inputs['token_type_ids'].to(self.device, dtype=torch.long)
        targets = inputs['targets'].to(self.device, dtype=torch.float)
        return ids, mask, ttis, targets

    def train_one_epoch(self):
        """Run one pass over the training data, updating weights every batch."""
        prog_bar = tqdm(self.train_data, total=len(self.train_data))
        self.model.train()
        for inputs in prog_bar:
            ids, mask, ttis, targets = self._to_device(inputs)

            self.optimizer.zero_grad()
            outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis)

            # Flatten both sides: the loss requires logits and targets to have
            # the same shape, and the model may emit (batch, 1) logits.
            loss = self.loss_fn(outputs.reshape(-1), targets.reshape(-1))
            prog_bar.set_description('loss: {:.4f}'.format(loss.item()))

            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

    def valid_one_epoch(self):
        """Evaluate on the validation data.

        Returns:
            Accuracy (as returned by ``accuracy_score``) of the predictions
            thresholded at a probability of 0.5.
        """
        prog_bar = tqdm(self.valid_data, total=len(self.valid_data))
        self.model.eval()
        all_targets = []
        all_predictions = []
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for inputs in prog_bar:
                ids, mask, ttis, targets = self._to_device(inputs)

                outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis)

                all_targets.extend(targets.reshape(-1).cpu().numpy().tolist())
                all_predictions.extend(
                    torch.sigmoid(outputs).reshape(-1).cpu().numpy().tolist()
                )

        # Threshold on an array (a plain list cannot be compared with a float)
        # and cast both sides to int so the label types match.
        predicted_labels = (np.asarray(all_predictions) >= 0.5).astype(int)
        true_labels = np.asarray(all_targets).astype(int)
        val_accuracy = accuracy_score(true_labels, predicted_labels)
        print('Validation Accuracy: {:.2f}'.format(val_accuracy))

        return val_accuracy

    def get_model(self):
        """Return the model being trained."""
        return self.model

In [None]:
# Model
class BERTModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a single-logit linear head."""

    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_MODEL)
        self.drop = nn.Dropout(0.3)
        # Size the head from the loaded checkpoint instead of hard-coding 768,
        # so a different BERT_MODEL does not cause a shape mismatch.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per sample.

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor.
            token_type_ids: Segment id tensor.
        """
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take the pooled output by position: this works for the tuple returned
        # by older transformers releases and for the dict-like output of newer
        # ones, where tuple-unpacking would yield the key names instead.
        pooled = outputs[1]
        return self.out(self.drop(pooled))

In [None]:
# Training Code
if __name__ == '__main__':
    # Pick the compute device once; the model is moved to it below.
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}".format(torch.cuda.get_device_name()))
        DEVICE = torch.device('cuda:0')
    else:
        print("[INFO] GPU not found. Using CPU: {}".format(platform.processor()))
        DEVICE = torch.device('cpu')

    # Load and shuffle the reviews, then encode the labels as 1 / 0.
    data = pd.read_csv(FILE_NAME)
    data = data.sample(frac=1).reset_index(drop=True)
    data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})
    # .map() turns any label outside the mapping into NaN, which would silently
    # produce NaN losses later; fail early instead.
    if data['sentiment'].isna().any():
        raise ValueError("Unexpected sentiment label: expected only 'positive' or 'negative'")

    # The first 45000 shuffled rows are used for training, the rest for validation.
    train_data = data[:45000].sample(frac=1).reset_index(drop=True)
    valid_data = data[45000:].sample(frac=1).reset_index(drop=True)
    print(f"[INFO] Training on: {train_data.shape[0]} samples")
    print(f"[INFO] Validation on: {valid_data.shape[0]} samples")

    train_set = BERTDataset(
        review = train_data['review'].values,
        target = train_data['sentiment'].values
    )

    valid_set = BERTDataset(
        review = valid_data['review'].values,
        target = valid_data['sentiment'].values
    )

    train = DataLoader(
        train_set,
        batch_size = TRAIN_BS,
        shuffle = True,
        num_workers = 4
    )

    valid = DataLoader(
        valid_set,
        batch_size = VALID_BS,
        shuffle = False,
        num_workers = 2
    )

    print("[INFO] Created Dataloaders!")

    model = BERTModel().to(DEVICE)
    # The scheduler is stepped once per batch, so count batches (including a
    # final partial one) rather than dividing the sample count.
    nb_train_steps = len(train) * NB_EPOCHS
    # transformers.AdamW is deprecated/removed in recent releases; the PyTorch
    # optimizer is the documented replacement. eps and weight_decay are set to
    # the values the transformers implementation used by default.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=nb_train_steps
    )

    trainer = Trainer(model, optimizer, scheduler, train, valid)
    print("[INFO] Initialized Trainer and Models, Starting training...")

    best_accuracy = 0
    # Defined up front so the final report works even if no epoch runs.
    current_accuracy = 0
    for epoch in range(1, NB_EPOCHS+1):
        print(f"{'='*20} EPOCH: {epoch} {'='*20}")

        # Train for 1 epoch
        trainer.train_one_epoch()

        # Validate for 1 epoch
        current_accuracy = trainer.valid_one_epoch()

        # Checkpoint only when validation accuracy improves.
        if current_accuracy > best_accuracy:
            # Accuracy is a 0-1 fraction; scale it so the '%' sign is truthful.
            print(f"Saving the Model for Best Accuracy: {current_accuracy * 100:.2f}%")
            torch.save(trainer.get_model().state_dict(), "BERT_BASE_UNCASED_MODEL.pt")
            best_accuracy = current_accuracy
    print("Model Finished Training!")
    print(f"Best Accuracy was: {best_accuracy * 100:.2f}%")
    print(f"Final Accuracy was: {current_accuracy * 100:.2f}%")