## Fine-tuning (for sentiment analysis)
* TPU utilization discarded😭
* load model weights
* load token bag
* load train dataset
* load valid dataset
* remove the last mlp block
* add another linear layer and sigmoid
* add dropout (optional)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline 

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

**Dataset and token**

In [3]:
class SentimentDataset(Dataset): ## Custom dataset copy
    def __init__(self, text, target):
        super().__init__()
        self.text = text
        self.target = target
    def __len__(self):
        return len(self.target)
    def __getitem__(self,idx):
        return self.text[idx],self.target[idx]

train_dataset = torch.load('/kaggle/input/twitter-text-dataset/train_dataset.pt')
valid_dataset = torch.load('/kaggle/input/twitter-text-dataset/valid_dataset.pt')

  train_dataset = torch.load('/kaggle/input/twitter-text-dataset/train_dataset.pt')
  valid_dataset = torch.load('/kaggle/input/twitter-text-dataset/valid_dataset.pt')


In [4]:
print(f'Train dataset size: {len(train_dataset)}')
print(f'Valid dataset size: {len(valid_dataset)}')

Train dataset size: 1436967
Valid dataset size: 159663


In [5]:
token_bag = torch.load('/kaggle/input/twitter-text-dataset/token_bag.pt')
token_length = len(token_bag)

  token_bag = torch.load('/kaggle/input/twitter-text-dataset/token_bag.pt')


**Model weights**   
Define custom model class (redundancy)

In [6]:
class MultiHeadSelfAttention(nn.Module): ## Multi-head
    def __init__(self, embedding_dimension, num_heads):
        super().__init__()
        assert embedding_dimension % num_heads == 0

        self.dim = embedding_dimension
        self.num_heads = num_heads
        self.head_dim = embedding_dimension//num_heads

        self.K = nn.Linear(self.dim,self.dim)
        self.V = nn.Linear(self.dim,self.dim)
        self.Q = nn.Linear(self.dim,self.dim)
        self.projection = nn.Linear(self.dim,self.dim,bias=False)

    def forward(self, x):

        B,N,D = x.shape

        Kx = self.K(x) ## BxNxD
        Qx = self.Q(x)
        Vx = self.V(x)

        Kx = torch.reshape(Kx,(B,N,self.num_heads,self.head_dim)) ## BxNxHx(D/H)
        Qx = torch.reshape(Qx,(B,N,self.num_heads,self.head_dim))
        Vx = torch.reshape(Vx,(B,N,self.num_heads,self.head_dim))

        Attx = nn.Softmax(dim=3)((1/np.sqrt(self.head_dim))*torch.transpose(Qx,1,2)@torch.transpose(torch.transpose(Kx,1,2),2,3)) ## BxHxNxN
        Satx = torch.transpose(Attx@torch.transpose(Vx,1,2),1, 2) ## BxNxHx(D/H)

        return self.projection(torch.reshape(Satx,(B,N,self.dim)))## BxNxD

In [7]:
class MLP(nn.Module):
    def __init__(self, input_dimension):
        super().__init__()
        self.dim = input_dimension

        self.gelu = torch.nn.GELU(approximate='tanh')
        self.l1 = nn.Linear(self.dim,self.dim*4)
        self.l2 = nn.Linear(self.dim*4,self.dim)

    def forward(self, x):
        return self.l2(self.gelu(self.l1(x))) ## MLP!

In [8]:
class Transformer(nn.Module):
    def __init__(self, embedding_dimension, num_heads):
        super().__init__()
        
        self.dim = embedding_dimension
        self.num_heads = num_heads

        self.mhsa = MultiHeadSelfAttention(self.dim,self.num_heads)
        self.ln1  = nn.LayerNorm(self.dim)
        self.ln2  = nn.LayerNorm(self.dim)
        self.mlp  = MLP(self.dim)

    def forward(self, x):
        y1 = self.mhsa(x)
        y2 = x + y1
        y3 = self.ln1(y2)
        y4 = self.mlp(y3)
        y5 = y3 + y4
        y6 = self.ln2(y5)
        return y6 ## mhsa -> ln1->mlp->ln2

In [9]:
class Encoder(nn.Module):
    def __init__(self, embedding_dimension, num_heads, label, tweet_length=400):
        super().__init__()

        ## Encoder block contains 2 transformer layers,
        ## followed by a linear layer that outputs one 
        ## Inputs have tokens and postional embeddings.
        
        self.dim = embedding_dimension
        self.num_heads = num_heads
        self.label = label

        self.t1 = Transformer(self.dim,self.num_heads)
        self.t2 = Transformer(self.dim,self.num_heads)
        self.te = nn.Embedding(token_length, self.dim)
        self.pe = nn.Embedding(tweet_length, self.dim)

        self.pipelines = nn.Sequential(nn.Linear(self.dim,4*self.dim),nn.ReLU(),
                                       nn.Linear(4*self.dim,token_length))

    def forward(self, x):

        N = x.shape[1]
        embed = self.te(x) + self.pe(torch.arange(N,device=x.device))
        embed = self.t1(embed)
        embed = self.t2(embed)
        embed = self.pipelines(embed)
        return embed

In [10]:
## HYPERPARAMETERS
embedding_dimension = 256
num_heads = 8
learning_rate = 0.001

In [11]:
encoder_model = Encoder(embedding_dimension,num_heads,1)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [13]:
## save of GPU, load on CPU
encoder_model.load_state_dict(torch.load('/kaggle/input/bert-encoder-model/pytorch/alpha/1/encoder_model.pth',map_location=device,weights_only=True))

<All keys matched successfully>

In [14]:
for layer in encoder_model.state_dict():
    print(f'Layer: {layer}')
    print(f'Shape: {encoder_model.state_dict()[layer].shape}')

Layer: t1.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.K.bias
Shape: torch.Size([256])
Layer: t1.mhsa.V.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.V.bias
Shape: torch.Size([256])
Layer: t1.mhsa.Q.weight
Shape: torch.Size([256, 256])
Layer: t1.mhsa.Q.bias
Shape: torch.Size([256])
Layer: t1.mhsa.projection.weight
Shape: torch.Size([256, 256])
Layer: t1.ln1.weight
Shape: torch.Size([256])
Layer: t1.ln1.bias
Shape: torch.Size([256])
Layer: t1.ln2.weight
Shape: torch.Size([256])
Layer: t1.ln2.bias
Shape: torch.Size([256])
Layer: t1.mlp.l1.weight
Shape: torch.Size([1024, 256])
Layer: t1.mlp.l1.bias
Shape: torch.Size([1024])
Layer: t1.mlp.l2.weight
Shape: torch.Size([256, 1024])
Layer: t1.mlp.l2.bias
Shape: torch.Size([256])
Layer: t2.mhsa.K.weight
Shape: torch.Size([256, 256])
Layer: t2.mhsa.K.bias
Shape: torch.Size([256])
Layer: t2.mhsa.V.weight
Shape: torch.Size([256, 256])
Layer: t2.mhsa.V.bias
Shape: torch.Size([256])
Layer: t2.mhsa.Q.weight
Shape: torch.Size([256,

**Need to drop 'pipeline' layers**