<a href="https://www.kaggle.com/code/kartikeysharmaah/1tr720-notebook-3?scriptVersionId=232070118" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Fine-tuning (for sentiment analysis)
* TPU utilization discarded😭
* load model weights
* load token bag
* load train dataset
* load valid dataset
* remove the last mlp block
* add another linear layer and sigmoid
* add dropout (optional)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline 

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

**Dataset and token**

In [4]:
class SentimentDataset(Dataset): ## Custom dataset copy
    def __init__(self, text, target):
        super().__init__()
        self.text = text
        self.target = target
    def __len__(self):
        return len(self.target)
    def __getitem__(self,idx):
        return self.text[idx],self.target[idx]

train_dataset = torch.load('/kaggle/input/twitter-text-dataset/train_dataset.pt')
valid_dataset = torch.load('/kaggle/input/twitter-text-dataset/valid_dataset.pt')

  train_dataset = torch.load('/kaggle/input/twitter-text-dataset/train_dataset.pt')
  valid_dataset = torch.load('/kaggle/input/twitter-text-dataset/valid_dataset.pt')


In [5]:
print(f'Train dataset size: {len(train_dataset)}')
print(f'Valid dataset size: {len(valid_dataset)}')

Train dataset size: 1436967
Valid dataset size: 159663


In [6]:
token_bag = torch.load('/kaggle/input/twitter-text-dataset/token_bag.pt')
token_length = len(token_bag)

  token_bag = torch.load('/kaggle/input/twitter-text-dataset/token_bag.pt')


**Model weights**   
Define custom model class (redundancy)

In [7]:
class MultiHeadSelfAttention(nn.Module): ## Multi-head
    def __init__(self, embedding_dimension, num_heads):
        super().__init__()
        assert embedding_dimension % num_heads == 0

        self.dim = embedding_dimension
        self.num_heads = num_heads
        self.head_dim = embedding_dimension//num_heads

        self.K = nn.Linear(self.dim,self.dim)
        self.V = nn.Linear(self.dim,self.dim)
        self.Q = nn.Linear(self.dim,self.dim)
        self.projection = nn.Linear(self.dim,self.dim,bias=False)

    def forward(self, x):

        B,N,D = x.shape

        Kx = self.K(x) ## BxNxD
        Qx = self.Q(x)
        Vx = self.V(x)

        Kx = torch.reshape(Kx,(B,N,self.num_heads,self.head_dim)) ## BxNxHx(D/H)
        Qx = torch.reshape(Qx,(B,N,self.num_heads,self.head_dim))
        Vx = torch.reshape(Vx,(B,N,self.num_heads,self.head_dim))

        Attx = nn.Softmax(dim=3)((1/np.sqrt(self.head_dim))*torch.transpose(Qx,1,2)@torch.transpose(torch.transpose(Kx,1,2),2,3)) ## BxHxNxN
        Satx = torch.transpose(Attx@torch.transpose(Vx,1,2),1, 2) ## BxNxHx(D/H)

        return self.projection(torch.reshape(Satx,(B,N,self.dim)))## BxNxD

In [8]:
class MLP(nn.Module):
    def __init__(self, input_dimension):
        super().__init__()
        self.dim = input_dimension

        self.gelu = torch.nn.GELU(approximate='tanh')
        self.l1 = nn.Linear(self.dim,self.dim*4)
        self.l2 = nn.Linear(self.dim*4,self.dim)

    def forward(self, x):
        return self.l2(self.gelu(self.l1(x))) ## MLP!

In [9]:
class Transformer(nn.Module):
    def __init__(self, embedding_dimension, num_heads):
        super().__init__()
        
        self.dim = embedding_dimension
        self.num_heads = num_heads

        self.mhsa = MultiHeadSelfAttention(self.dim,self.num_heads)
        self.ln1  = nn.LayerNorm(self.dim)
        self.ln2  = nn.LayerNorm(self.dim)
        self.mlp  = MLP(self.dim)

    def forward(self, x):
        y1 = self.mhsa(x)
        y2 = x + y1
        y3 = self.ln1(y2)
        y4 = self.mlp(y3)
        y5 = y3 + y4
        y6 = self.ln2(y5)
        return y6 ## mhsa -> ln1->mlp->ln2

In [10]:
class Encoder(nn.Module):
    def __init__(self, embedding_dimension, num_heads, label, tweet_length=400):
        super().__init__()

        ## Encoder block contains 2 transformer layers,
        ## followed by a linear layer that outputs one 
        ## Inputs have tokens and postional embeddings.
        
        self.dim = embedding_dimension
        self.num_heads = num_heads
        self.label = label

        self.t1 = Transformer(self.dim,self.num_heads)
        self.t2 = Transformer(self.dim,self.num_heads)
        self.te = nn.Embedding(token_length, self.dim)
        self.pe = nn.Embedding(tweet_length, self.dim)

        self.pipelines = nn.Sequential(nn.Linear(self.dim,4*self.dim),nn.ReLU(),
                                       nn.Linear(4*self.dim,token_length))

    def forward(self, x):

        N = x.shape[1]
        embed = self.te(x) + self.pe(torch.arange(N,device=x.device))
        embed = self.t1(embed)
        embed = self.t2(embed)
        embed = self.pipelines(embed)
        return embed

In [11]:
## HYPERPARAMETERS
embedding_dimension = 256
num_heads = 8
learning_rate = 0.001

In [12]:
encoder_model = Encoder(embedding_dimension,num_heads,1)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [14]:
## save of GPU, load on GPU
encoder_model.load_state_dict(torch.load('/kaggle/input/bert-encoder-model/pytorch/alpha/1/encoder_model.pth',weights_only=True))
encoder_model = encoder_model.to(device=device)

In [15]:
for layer in encoder_model.state_dict():
    print(f'Layer: {layer}')
    print(f'Shape: {encoder_model.state_dict()[layer].shape}')

Layer: t1.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.K.bias
Shape: torch.Size([256])
Layer: t1.mhsa.V.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.V.bias
Shape: torch.Size([256])
Layer: t1.mhsa.Q.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.Q.bias
Shape: torch.Size([256])
Layer: t1.mhsa.projection.weight
Shape: torch.Size([256, 256])
Layer: t1.ln1.weight
Shape: torch.Size([256])
Layer: t1.ln1.bias
Shape: torch.Size([256])
Layer: t1.ln2.weight
Shape: torch.Size([256])
Layer: t1.ln2.bias
Shape: torch.Size([256])
Layer: t1.mlp.l1.weight
Shape: torch.Size([1024, 256])
Layer: t1.mlp.l1.bias
Shape: torch.Size([1024])
Layer: t1.mlp.l2.weight
Shape: torch.Size([256, 1024])
Layer: t1.mlp.l2.bias
Shape: torch.Size([256])
Layer: t2.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: t2.mhsa.K.bias
Shape: torch.Size([256])
Layer: t2.mhsa.V.weight
Shape: torch.Size([256, 256])
Layer: t2.mhsa.V.bias
Shape: torch.Size([256])
Layer: t2.mhsa.Q.weight
Shape: torch.Size([256,

**Need to drop 'pipeline' layers**

In [16]:
encoder_model.pipelines = nn.Identity() ## identity takes and sends the same value

---

In [17]:
## Create module class that takes output from Encoder
## model and pass it into a the linear layer. Finally
## add the  sigmoid function. We put sigmoid on first
## embedding which is the <cls> tag. Threshold is 0.5

class UpdatedEncoder(nn.Module):
    def __init__(self, model, embedding_dimension):
        super().__init__()
        self.model = model
        self.dim   = embedding_dimension

        self.stack = nn.Sequential(nn.Linear(self.dim,4*self.dim),nn.ReLU(),nn.Dropout(0.3),
                                   nn.Linear(4*self.dim,1)) ## single logit
    def forward(self, x):
        return self.stack(self.model(x))

In [18]:
updated_encoder_model = UpdatedEncoder(encoder_model, embedding_dimension).to(device=device)

In [19]:
for layer in updated_encoder_model.state_dict():
    print(f'Layer: {layer}')
    print(f'Shape: {updated_encoder_model.state_dict()[layer].shape}') ## gets model overview

Layer: model.t1.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: model.t1.mhsa.K.bias
Shape: torch.Size([256])
Layer: model.t1.mhsa.V.weight
Shape: torch.Size([256, 256])
Layer: model.t1.mhsa.V.bias
Shape: torch.Size([256])
Layer: model.t1.mhsa.Q.weight
Shape: torch.Size([256, 256])
Layer: model.t1.mhsa.Q.bias
Shape: torch.Size([256])
Layer: model.t1.mhsa.projection.weight
Shape: torch.Size([256, 256])
Layer: model.t1.ln1.weight
Shape: torch.Size([256])
Layer: model.t1.ln1.bias
Shape: torch.Size([256])
Layer: model.t1.ln2.weight
Shape: torch.Size([256])
Layer: model.t1.ln2.bias
Shape: torch.Size([256])
Layer: model.t1.mlp.l1.weight
Shape: torch.Size([1024, 256])
Layer: model.t1.mlp.l1.bias
Shape: torch.Size([1024])
Layer: model.t1.mlp.l2.weight
Shape: torch.Size([256, 1024])
Layer: model.t1.mlp.l2.bias
Shape: torch.Size([256])
Layer: model.t2.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: model.t2.mhsa.K.bias
Shape: torch.Size([256])
Layer: model.t2.mhsa.V.weight
Shape: torch.Siz

**test model**

In [20]:
tempx = torch.tensor([[1,345,4324,66,46930,8665,637,2,9748,10]],device=device)
tempy = updated_encoder_model(tempx)[0,0,0] ## considering only <cls> gets tag
print(tempy)

tensor(-0.0084, device='cuda:0', grad_fn=<SelectBackward0>)


In [21]:
optimizer = torch.optim.SGD(updated_encoder_model.parameters(),lr=learning_rate,momentum=0.9)
loss = nn.BCEWithLogitsLoss()

---

**Training**

In [22]:
def text_embedding(text):
    embedding = []
    token_list = str(text).lower().split(' ')
    for token in token_list:
        
        if len(token) == 0:
            continue
        if token[0] == '@':
            continue

        token = token.strip('.')
        yoken = ''
        for chars in token:
            if chars =='.':
                if yoken in token_bag.keys():
                    embedding.append(token_bag[yoken])
                yoken = ''
            else:
                yoken = yoken + chars
        if yoken in token_bag.keys():
            embedding.append(token_bag[yoken])

    return torch.tensor([embedding])

In [24]:
updated_encoder_model.train()
## fine-tuning: train model

step = 0
loss_avg = []
for text,target in train_dataset:
    x = text_embedding(text)
    if x.shape[1] == 0:
        continue

    ## add <cls> tag to the
    ## text, that maps to 1
    
    x = torch.cat((torch.tensor([[1]]),x),dim=1)
    y = torch.tensor(target,dtype=float)

    ## Training
    optimizer.zero_grad() ## set gradients to 0
    x = x.to(device=device)
    y = y.to(device=device)
    logits =  updated_encoder_model(x)[0, 0, 0]
    loss_value = loss(logits,y)
    loss_avg.append(loss_value)
    loss_value.backward()
    optimizer.step()

    if step%100000 == 0:
        loss_mean = torch.mean(torch.tensor(loss_avg)) ## way to capture progress
        print(f'Step: {step+1} \t Mean Loss: {loss_mean:.2f}')
        loss_avg = []
    step += 1

Step: 1 	 Mean Loss: 0.18
Step: 100001 	 Mean Loss: 0.50
Step: 200001 	 Mean Loss: 0.49
Step: 300001 	 Mean Loss: 0.49
Step: 400001 	 Mean Loss: 0.49
Step: 500001 	 Mean Loss: 0.49
Step: 600001 	 Mean Loss: 0.49
Step: 700001 	 Mean Loss: 0.49
Step: 800001 	 Mean Loss: 0.48
Step: 900001 	 Mean Loss: 0.48
Step: 1000001 	 Mean Loss: 0.48
Step: 1100001 	 Mean Loss: 0.48
Step: 1200001 	 Mean Loss: 0.48
Step: 1300001 	 Mean Loss: 0.48
Step: 1400001 	 Mean Loss: 0.48


---

**Predict**

In [25]:
sigmoid = nn.Sigmoid()

In [26]:
def compute_stats(dataset): ## accuracy and average loss
    
    updated_encoder_model.eval()
    ## evaluation mode, No grad
    ## update required
    
    count = 0
    valid_loss = []
    
    with torch.no_grad(): ## No gradients are calculated
        for text, target in dataset:
            x = text_embedding(text)
            if x.shape[1] == 0:
                continue

            x = torch.cat((torch.tensor([[1]]),x),dim=1)
            y = torch.tensor(target,dtype=float)
            x = x.to(device=device)
            y = y.to(device=device)
            
            logits = updated_encoder_model(x)[0, 0, 0]
            valid_loss.append(loss(logits,y))
    
            p = 1.0 if sigmoid(logits) >= 0.5 else 0.0
    
            if p == y:
                count = count + 1
    return (count*100.00)/len(dataset),torch.mean(torch.tensor(valid_loss))

In [27]:
train_accuracy,train_loss = compute_stats(train_dataset)
print(f'Train Accuracy: {train_accuracy:.2f}% \t Train Loss: {train_loss:.2f}')

Train Accuracy: 77.36% 	 Train Loss: 0.47


In [29]:
valid_accuracy,valid_loss = compute_stats(valid_dataset)
print(f'Valid Accuracy: {valid_accuracy:.2f}% \t Valid Loss: {valid_loss:.2f}')

Valid Accuracy: 76.81% 	 Valid Loss: 0.48


---
**END🫡**