<a href="https://colab.research.google.com/github/learneverythingai/Shivam-Modi-Data-Science-Analytics-Course/blob/main/Advanced%20Deep%20Learning%20Course/bert_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Author and Instructor of this Notebook is **Shivam Modi**.
## LinkedIn: https://www.linkedin.com/in/shivam-modi-datascientist/

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as scheduler
import transformers
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
import numpy as np
from tqdm.autonotebook import tqdm
from sklearn.model_selection import StratifiedKFold

In [None]:
class Config:
    """Run-wide settings shared by the cells of this notebook."""

    # --- runtime ---------------------------------------------------------
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    seed = 5

    # --- data ------------------------------------------------------------
    n_splits = 5            # number of stratified folds
    train_batch_size = 8
    val_batch_size = 8

    # --- model -----------------------------------------------------------
    # Input width of the first Linear layer of the classification head.
    pooler_output_dim = 768

    # --- optimisation ----------------------------------------------------
    epochs = 5
    lr = 1e-4
    step_size = 10          # StepLR: scheduler steps between decays
    gamma = 0.1             # StepLR: multiplicative decay factor

    # --- evaluation ------------------------------------------------------
    metric = f1_score

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the encoder are both
# loaded from it so that vocabulary and weights match.
checkpoint = "bert-base-cased"
bert_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
bert = transformers.AutoModel.from_pretrained(checkpoint)

In [None]:
# Load the reviews and encode the label column as integers:
# "positive" -> 1, anything else -> 0.
data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
data['sentiment'] = (data['sentiment'] == "positive").astype(int)


from sklearn.model_selection import StratifiedKFold

# Shuffle the rows once, reproducibly. DataFrame.sample returns a NEW frame,
# so the result has to be assigned back -- the previous code discarded it and
# the folds were built from the unshuffled file order.
data = data.sample(frac=1, random_state=Config.seed).reset_index(drop=True)

# -1 marks "not assigned yet"; every row gets a real fold id in the loop below.
data['kfold'] = -1
labels = data['sentiment'].values
skf = StratifiedKFold(n_splits=Config.n_splits)

# skf.split yields positional indices; after reset_index(drop=True) they
# coincide with the index labels, so .loc addresses the intended rows.
for fold, (_, val_idx) in enumerate(skf.split(X=data, y=labels)):
    data.loc[val_idx, 'kfold'] = fold

data.to_csv('/kaggle/working/folds.csv', index=False)

In [None]:
class ReviewData:
    """Indexable collection of tokenized reviews and their sentiment labels.

    Wraps a DataFrame that has a ``review`` and a ``sentiment`` column. Each
    item is a dict ``{'x': <tokenizer outputs>, 'y': <label tensor>}`` whose
    tensors have already been moved to ``Config.device``.
    """

    def __init__(self, data):
        # Re-number the rows from 0 so integer lookups in __getitem__ work
        # even when ``data`` is a filtered slice of a larger frame.
        frame = data.reset_index(drop=True)
        self.data = frame

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        text = self.data['review'][idx]
        label = self.data['sentiment'][idx]

        # Truncate or pad every review to exactly 300 tokens.
        encoded = bert_tokenizer(
            text,
            truncation=True,
            max_length=300,
            padding='max_length',
            return_tensors='pt',
        )

        features = {}
        for name, tensor in encoded.items():
            features[name] = tensor.to(device=Config.device)

        target = torch.tensor(label, dtype=torch.long).to(device=Config.device)
        return {'x': features, 'y': target}

    
    
class Classif_Model(nn.Module):
    """Encoder followed by a small feed-forward classification head.

    Args:
        bert_model: Callable encoder whose output supports
            ``out['pooler_output']``.
        hidden_shape: Width of the hidden layer of the head.
        out_shape: Number of output logits (classes).
    """

    def __init__(self, bert_model, hidden_shape, out_shape):
        super().__init__()
        self.bert_model = bert_model
        self.hidden_shape = hidden_shape
        self.out_shape = out_shape
        self.layers = nn.Sequential(
            nn.Linear(Config.pooler_output_dim, hidden_shape),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.Linear(hidden_shape, out_shape),
        )

    def forward(self, batch, attention_mask=None):
        """Return class logits for a batch of token ids.

        Args:
            batch: Tensor of input token ids passed to the encoder.
            attention_mask: Optional mask (1 = real token, 0 = padding).
                When omitted it is derived from the encoder's
                ``config.pad_token_id`` if the encoder exposes one, so that
                padded positions are not attended to. Callers that pass only
                ``batch`` keep working unchanged.

        Returns:
            Output of the classification head applied to ``pooler_output``.
        """
        if attention_mask is None:
            encoder_config = getattr(self.bert_model, 'config', None)
            pad_id = getattr(encoder_config, 'pad_token_id', None)
            if pad_id is not None:
                # Inputs are padded to a fixed length upstream; without a mask
                # the encoder would treat the padding as real content.
                attention_mask = (batch != pad_id).long()

        if attention_mask is None:
            # No mask available (encoder has no pad id): previous behaviour.
            out = self.bert_model(batch)
        else:
            out = self.bert_model(batch, attention_mask=attention_mask)

        out = out['pooler_output']
        return self.layers(out)
    
# Binary sentiment classifier: pretrained encoder + 100-unit hidden layer + 2 logits.
model = Classif_Model(bert_model=bert, hidden_shape=100, out_shape=2)

In [None]:
class Train:
    """Bundles a model with its loss, optimizer and LR scheduler.

    Batches are expected to be dicts of the form
    ``{'x': {'input_ids': tensor}, 'y': tensor}``; ``input_ids`` carries a
    singleton dimension at position 1 which is squeezed away before the
    forward pass.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler attached to ``optimizer``.
    """

    def __init__(self, model, criterion, optimizer, scheduler):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler

    @staticmethod
    def logits_to_prediction(output):
        """Convert logits to class ids, returned on the CPU with shape (1, N)."""
        # softmax is monotonic, so argmax over the raw logits selects the
        # same class without the extra computation.
        preds = torch.argmax(output.detach().to('cpu'), dim=-1)
        return preds.view(1, -1)

    @staticmethod
    def return_metric(preds, labels):
        """Score predictions against labels with ``Config.metric``.

        Both tensors are flattened to 1-D first: handing 2-D 0/1 arrays to
        scikit-learn makes it treat every column as a separate label of a
        multilabel problem. The metric is called as ``metric(y_true, y_pred)``
        -- the order matters for the support-weighted average.
        """
        y_pred = preds.detach().to('cpu').reshape(-1).numpy()
        y_true = labels.detach().to('cpu').reshape(-1).numpy()
        return Config.metric(y_true, y_pred, average='weighted')

    def fit(self, data_loader):
        """Run one optimisation pass over ``data_loader``.

        The scheduler is stepped once per call (i.e. once per pass over the
        data), not once per batch.
        """
        self.model.train()
        for batch in data_loader:
            # Clear stale gradients BEFORE the backward pass. Clearing them
            # between backward() and step() throws away the gradients that
            # were just computed, so the weights would never change.
            self.optimizer.zero_grad()
            out = self.model(batch['x']['input_ids'].squeeze(1))
            loss = self.criterion(out, batch['y'])
            loss.backward()
            self.optimizer.step()
        # Stepping per batch would shrink the learning rate by ``gamma``
        # every ``step_size`` batches and drive it to ~0 almost immediately.
        self.scheduler.step()

    def predict_metric(self, data_loader, type_='train'):
        """Evaluate the model on ``data_loader`` without updating it.

        Args:
            data_loader: Iterable of batches (see class docstring).
            type_: Kept for backward compatibility; no longer needed because
                predictions are gathered per batch, so the last batch may be
                smaller than the others.

        Returns:
            Tuple ``(metric, mean_loss)`` where ``mean_loss`` is the average
            of the per-batch losses.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        total_loss = 0.0
        n_batches = 0
        all_preds = []
        all_labels = []

        self.model.eval()
        with torch.no_grad():
            for batch in data_loader:
                out = self.model(batch['x']['input_ids'].squeeze(1))
                total_loss += self.criterion(out, batch['y']).item()
                all_preds.append(self.logits_to_prediction(out).view(-1))
                all_labels.append(batch['y'].detach().to('cpu').view(-1))
                n_batches += 1

        if n_batches == 0:
            raise ValueError("predict_metric received an empty data_loader")

        metric = self.return_metric(torch.cat(all_preds), torch.cat(all_labels))
        return metric, total_loss / n_batches
    
             
            
import copy

folds = pd.read_csv("/kaggle/working/folds.csv")
model = model.to(device=Config.device)
criterion = nn.CrossEntropyLoss()

# Snapshot of the starting weights. Every fold restarts from this state:
# training one model across all folds means each fold's validation rows were
# already used for training in other folds, which inflates validation scores.
initial_state = copy.deepcopy(model.state_dict())

# history[name][fold, epoch]
history = {
    name: np.zeros((Config.n_splits, Config.epochs))
    for name in ('train_loss', 'val_loss', 'train_f1', 'val_f1')
}

for fold in tqdm(range(Config.n_splits)):
    # Fresh weights, optimizer state and LR schedule for this fold.
    model.load_state_dict(initial_state)
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=Config.step_size, gamma=Config.gamma)
    train = Train(model, criterion, optimizer, scheduler)

    # Rows of the current fold are held out for validation; the rest train.
    # drop_last=True keeps every batch at the configured size.
    train_data = DataLoader(ReviewData(folds.loc[folds['kfold'] != fold]), batch_size=Config.train_batch_size, drop_last=True)
    val_data = DataLoader(ReviewData(folds.loc[folds['kfold'] == fold]), batch_size=Config.val_batch_size, drop_last=True)

    for epoch in range(Config.epochs):
        train.fit(train_data)

        t_f1, t_loss = train.predict_metric(train_data, 'train')
        v_f1, v_loss = train.predict_metric(val_data, 'val')

        history['train_loss'][fold, epoch] = t_loss
        history['train_f1'][fold, epoch] = t_f1
        history['val_loss'][fold, epoch] = v_loss
        history['val_f1'][fold, epoch] = v_f1

# One value per epoch, averaged over the folds.
epoch_train_loss = history['train_loss'].mean(axis=0).tolist()
epoch_val_loss = history['val_loss'].mean(axis=0).tolist()
epoch_train_f1 = history['train_f1'].mean(axis=0).tolist()
epoch_val_f1 = history['val_f1'].mean(axis=0).tolist()


In [None]:
import matplotlib.pyplot as plt

# Draw the four per-epoch curves on one set of axes, in the same order as
# the legend entries below.
epoch_axis = range(Config.epochs)
for curve in (epoch_train_loss, epoch_val_loss, epoch_train_f1, epoch_val_f1):
    plt.plot(epoch_axis, curve)

# Anchor the legend outside the plotting area, to the right of the axes.
plt.legend(
    ['train_loss', 'val_loss', 'train_f1', 'val_f1'],
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
);