## In this notebook, we will Fine Tune a **RoBERTa** Transformer for Sentiment Analysis


#### Importing Python Libraries and Setting Up the Environment

In this step, we will import the necessary libraries and modules to execute our script. The libraries include:
* Pandas
* PyTorch
* PyTorch Utils for Dataset and Dataloader
* Transformers
* RoBERTa Model and Tokenizer


In [81]:
import pandas as pd
import numpy as np
import torch
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

import logging
logging.basicConfig(level=logging.ERROR)

#### Configuring the Device for GPU Usage:


In [82]:
# Pick the compute device once; the model and every batch are moved to it later.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#### Loading Training Data from the 'train.tsv' File:


In [83]:
# Tab-separated Kaggle training file (pandas reads the zipped TSV directly).
# NOTE: the name `train` is rebound later in this notebook by `def train(epoch)`,
# so cells that use this DataFrame must run before that definition.
train = pd.read_csv('/kaggle/input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip', delimiter='\t')

In [84]:
train.shape

(156060, 4)

In [85]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [86]:
train['Sentiment'].unique()

array([1, 2, 3, 4, 0])

In [87]:
train.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


#### We will keep the two columns 'Phrase' (attribute) and 'Sentiment' (target):


In [89]:
# Keep only the input text and its label; the id columns are not used for modelling.
new_df = train.loc[:, ['Phrase', 'Sentiment']]
new_df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


#### Defining Key Variables to be Used Later in Training and Validation:


In [67]:
# Fixed sequence length: the dataset pads/truncates every phrase to this many tokens.
MAX_LEN = 52

# Batch size used by the training DataLoader.
TRAIN_BATCH_SIZE = 64

# Batch size used by the validation DataLoader.
VALID_BATCH_SIZE = 32

# Learning rate handed to the optimizer.
LEARNING_RATE = 5e-05

# NOTE(review): truncation is requested again per call in the dataset's `encode_plus`,
# and `do_lower_case` presumably has no effect on RoBERTa's byte-level BPE tokenizer —
# confirm against the Transformers docs before relying on either keyword here.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

#### Preparing the Dataset and Dataloader :
This class is defined to accept the Dataframe as input and generate tokenized output that is used by the Roberta model for training.


In [68]:
class SentimentData(Dataset):
    """Map-style dataset that tokenizes one phrase per item.

    Args:
        dataframe: pandas DataFrame with a ``Phrase`` (text) column and a
            ``Sentiment`` (label) column.
        tokenizer: object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every sequence is padded/truncated to.

    Each item is a dict of tensors with keys ``ids``, ``mask``,
    ``token_type_ids`` (all ``torch.long``) and ``targets`` (``torch.float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # DataLoader samplers hand out positions 0..len-1, so look rows up by
        # position. Label-based `series[index]` only works when the frame's
        # index happens to be 0..n-1 and fails (or returns the wrong row) for
        # a frame that was sampled/filtered without `reset_index`.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for backward compatibility; consumers that feed a
            # class-index loss must cast to long themselves.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

#### Fraction of Data Used for Training and Validation :


In [69]:
# Hold out the rows that are not sampled for training; the fixed seed keeps the
# split reproducible. Both frames get a fresh 0..n-1 index.
train_size = 0.8
sampled_rows = new_df.sample(frac=train_size, random_state=200)
val_data = new_df.drop(index=sampled_rows.index).reset_index(drop=True)
train_data = sampled_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("VALIDATION Dataset: {}".format(val_data.shape))

# Wrap each split so phrases are tokenized lazily, one item at a time.
training_set = SentimentData(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
validation_set = SentimentData(dataframe=val_data, tokenizer=tokenizer, max_len=MAX_LEN)


FULL Dataset: (156060, 2)
TRAIN Dataset: (124848, 2)
TEST Dataset: (31212, 2)


#### Configuring Training and Validation Parameters with Creation of Corresponding Data Loaders


In [70]:

# Training batches are reshuffled every epoch.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 2,
}

# Validation order has no effect on the aggregate loss/accuracy, so the loader
# is left unshuffled to make evaluation deterministic from run to run.
val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 2,
}

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)

#### Creating the Neural Network for Fine Tuning :

In [71]:
class RobertaClass(torch.nn.Module):
    """Pre-trained ``roberta-base`` encoder with a two-layer classification head.

    Args:
        num_classes: number of output logits. Defaults to 5, the value that was
            previously hard-coded.
        dropout: dropout probability applied between the two head layers.
            Defaults to 0.3, the value that was previously hard-coded.

    ``forward`` returns raw (un-normalised) logits, one row per input sequence.
    """

    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # Read the width from the loaded encoder instead of hard-coding 768 so
        # the head always matches the checkpoint.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output is the per-token hidden states.
        hidden_state = encoder_out[0]
        # Use the vector at sequence position 0 as the sequence summary.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [72]:
# Build the classifier and move its parameters to the selected device.
model = RobertaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

#### Loss Function and Optimizer :


In [None]:
# Cross-entropy over the class logits, optimised with Adam at the configured rate.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [74]:
def calculate_accuracy(preds, targets):
    """Return the number of positions where ``preds`` equals ``targets``.

    Despite the name, this is a raw count of correct predictions, not a ratio;
    callers divide by the number of examples themselves.
    """
    matches = preds == targets
    return matches.sum().item()

#### Training Function for RoBERTa Sentiment Analysis Model
Here we define a training function that trains the model on the training dataset


In [75]:
def train(epoch):
    """Run one training epoch and print running and end-of-epoch metrics.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: epoch number; used only in the printed summary.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.train()

    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and display a total and an ETA.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # The loss below takes class indices, so targets are cast to long.
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Predicted class = index of the largest logit; detached because this
        # is bookkeeping only and must not take part in autograd.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calculate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [76]:
# Number of full passes over the training loader.
EPOCHS = 4
for epoch in range(EPOCHS):
    train(epoch)

1it [00:00,  4.21it/s]

Training Loss per 5000 steps: 1.603122591972351
Training Accuracy per 5000 steps: 20.3125


1951it [09:42,  3.35it/s]

The Total Accuracy for Epoch 0: 65.29059336152761
Training Loss Epoch: 0.8403654154418738
Training Accuracy Epoch: 65.29059336152761



1it [00:00,  4.92it/s]

Training Loss per 5000 steps: 0.7435369491577148
Training Accuracy per 5000 steps: 70.3125


1951it [09:43,  3.34it/s]

The Total Accuracy for Epoch 1: 70.32471485326157
Training Loss Epoch: 0.7146328512764417
Training Accuracy Epoch: 70.32471485326157



1it [00:00,  4.76it/s]

Training Loss per 5000 steps: 0.7087920904159546
Training Accuracy per 5000 steps: 71.875


1951it [09:43,  3.34it/s]

The Total Accuracy for Epoch 2: 72.70040369088812
Training Loss Epoch: 0.6553729299793849
Training Accuracy Epoch: 72.70040369088812



1it [00:00,  4.85it/s]

Training Loss per 5000 steps: 0.6167944669723511
Training Accuracy per 5000 steps: 73.4375


1951it [09:43,  3.34it/s]

The Total Accuracy for Epoch 3: 74.81657695758042
Training Loss Epoch: 0.6079179163256527
Training Accuracy Epoch: 74.81657695758042





#### Validation Function for RoBERTa Sentiment Analysis Model
During the validation stage, we pass unseen data (the validation dataset) to the model. This step determines how well the model performs on data it was not trained on.

In [77]:
def valid(model, validation_loader, log_every=5000):
    """Evaluate ``model`` on ``validation_loader`` without updating weights.

    Uses the notebook-level ``device``, ``loss_function`` and
    ``calculate_accuracy``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class logits per example.
        validation_loader: iterable of batches with keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        log_every: print running metrics every this many steps (default 5000,
            the interval the original condition used).

    Returns:
        Accuracy over the whole loader, as a percentage.
    """
    model.eval()

    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    with torch.no_grad():
        # Wrap the loader itself so tqdm can show a total and an ETA.
        for step, data in enumerate(tqdm(validation_loader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # Deliberately no `.squeeze()`: with a final batch of one example it
            # would collapse the logits to 1-D and break both the loss call and
            # the dim=1 reduction below.
            outputs = model(ids, mask, token_type_ids)

            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_idx = outputs.argmax(dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % log_every == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                # The label now reports the real interval (it used to say 100).
                print(f"Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [78]:
# Evaluate on the held-out validation split. `validation_loader` is the loader
# this notebook actually builds; no `testing_loader` is ever defined.
acc = valid(model, validation_loader)
print("Accuracy on validation data = %0.2f%%" % acc)

0it [00:00, ?it/s]

Validation Loss per 100 steps: 0.6581960916519165
Validation Accuracy per 100 steps: 71.875


976it [00:49, 19.76it/s]

Validation Loss Epoch: 0.7687701086406825
Validation Accuracy Epoch: 69.11444316288608
Accuracy on test data = 69.11%



