## CHOOSE DATASET


In [55]:
# df = ["asc", "dsc", "news"]
df = "dsc"

## INITIALIZE LOGGER

In [65]:
# !pip3 install wandb

In [4]:
import wandb
wandb.login(key="3db31cd19d063689e924d07069de6c7a1670642b")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
wandb.init(
    # set the wandb project where this run will be logged
    project="sequential_meta_classifier",
    name="FINALRUNS",
)

[34m[1mwandb[0m: Currently logged in as: [33manushka16[0m ([33msequential_meta_classifier[0m). Use [1m`wandb login --relogin`[0m to force relogin


## IMPORTS

In [62]:
# !pip3 install datasets

In [32]:
import os
import json
import math
from collections import OrderedDict
import torch
from torch import nn, Tensor
from typing import Union, Tuple, List, Iterable, Dict
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
import random
import numpy as np
import gzip, csv
import pandas as pd
from tqdm.auto import tqdm
import torch.nn.init as init
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

torch.manual_seed(0)
np.random.seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Activation Function

In [8]:
def gelu(x):
    """Implementation of the gelu activation function."""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


## POSITIONAL ENCODING LAYER

In [9]:
class PositionalEncoding(nn.Module):

    def __init__(self, embed_dim: int, drop_rate=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=drop_rate)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        
        Returns:
            torch.Tensor: Output tensor after adding positional encodings and applying dropout.
                 It has the same shape as the input tensor [batch_size, seq_len, embedding_dim].
                 The positional encodings are added to the input tensor along the sequence length dimension,
                 and dropout is applied to the combined tensor.
        
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

## ATTENTION MECHANISM

In [10]:
def scaled_dot_product(q, k, v, attn_drop_rate=0.1):
    """
    Args:
      q: query, shape: (batch, # heads, seq len, head dimension)
      k: keys, shape: (batch, # heads, seq len, head dimension)
      v: value, shape: (batch, # heads, seq len, head dimension)
      attn_drop_rate: probability of an element to be zeroed,
      mask: the optional masking of specific entries in the attention matrix.
              shape: (batch, seq len)
    
     Returns:
        torch.Tensor: Output tensor after scaled dot product attention computation.
           Shape: (batch, # heads, seq len, head dimension).
    
    """

    d_k = q.shape[-1]
    attn_logits = torch.matmul(q, k.transpose(-1, -2))
    attn_logits = attn_logits/math.sqrt(d_k)
    attention = F.softmax(attn_logits, dim=-1)
    attention = F.dropout(attention, p=attn_drop_rate)
    values = torch.matmul(attention,v)
    return values

In [11]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, n_heads, attn_drop_rate):
        super().__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads
        self.attn_drop_rate = attn_drop_rate
        self.query = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.key = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.value = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.o_proj = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self._reset_parameters()

    def _reset_parameters(self):
      nn.init.xavier_uniform_(self.query.weight)
      self.query.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.key.weight)
      self.key.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.value.weight)
      self.value.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.o_proj.weight)
      self.o_proj.bias.data.fill_(0)

    def split_heads(self, tensor):
       new_shape = tensor.size()[:-1] + (self.n_heads, self.head_dim)
       tensor = tensor.view(*new_shape)
       tensor = tensor.permute(0, 2, 1, 3).contiguous()
       return tensor

    def merge_heads(self, tensor, batch_size, seq_length):
       tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)
       return tensor

    def forward(self, embedding):
      """
       Args:
        embedding (torch.Tensor): 
            A tensor of shape (batch_size, seq_length, embed_dim) representing the input embeddings.
            - `batch_size`: The number of samples in the batch.
            - `seq_length`: The number of tokens (or time steps) in each sequence.
            - `embed_dim`: The dimension of the embedding for each token.
       
       Returns:
        torch.Tensor: 
            A tensor of shape (batch_size, seq_length, embed_dim) representing the attended embeddings.
            - `batch_size`: The number of samples in the batch.
            - `seq_length`: The number of tokens (or time steps) in each sequence.
            - `embed_dim`: The dimension of the embedding for each token.
      """
      batch_size, seq_length, embed_dim = embedding.size()
      q, k, v = self.query(embedding), self.key(embedding), self.value(embedding)
      q = self.split_heads(q)
      k = self.split_heads(k)
      v = self.split_heads(v)
      values = scaled_dot_product(q, k, v, self.attn_drop_rate)
      values = self.merge_heads(values, batch_size, seq_length)
      attended_embeds = self.o_proj(values)
      return attended_embeds

## LAYER NORMALIZATION LAYER

In [12]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """
         Args:
         inputs (Tensor): Input tensor to normalize.
         
         Returns:
                torch.Tensor: Normalized tensor after applying layer normalization.
                 It has the same shape as the input tensor `(batch_size, *parameters_shape)`.

        """
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y  + self.beta
        return out

## FEEDFORWARD LAYER

In [13]:
class PositionwiseFeedForward(nn.Module): 

    def __init__(self, embed_dim, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dim, 4*embed_dim)
        self.linear2 = nn.Linear(4*embed_dim, embed_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
         Args:
             x (torch.Tensor): Input tensor to the feedforward network.
                 Its shape should be `(batch_size, sequence_length, embed_dim)`.
                 `batch_size` is the number of sequences in a batch,
                 `sequence_length` is the length of each sequence,
                 and `embed_dim` is the dimensionality of the input and output embeddings.
     
         Returns:
             torch.Tensor: Output tensor of the feedforward network.
                 It has the same shape as the input tensor `(batch_size, sequence_length, embed_dim)`.
        
        """
        x = self.linear1(x)
        x = self.gelu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

## CLASSIFIER LAYER

In [14]:
class Classifier(nn.Module):
    def __init__(self, input_dim, numclasses, dropout_rate=0.1):
        super(Classifier, self).__init__()
        self.linear = nn.Linear(input_dim, numclasses) 

    def forward(self, x):
     """
        Args:
            x (torch.Tensor): Input tensor to the classifier.
                Its shape should be `()`.
                `batch_size` is the number of samples in the batch,
                and `input_dim` is the dimensionality of the input features.

        Returns:
            torch.Tensor: Output tensor representing the logits for each class.
                It has the shape `()`.
                `batch_size` is the number of samples in the batch,
                and `num_classes` is the number of classes in the classification task.
     """
     x = self.linear(x)
     return x

## ENCODER LAYER

In [15]:
class EncoderLayer(nn.Module):

    def __init__(self, embed_dim, n_heads, attn_drop_rate, layer_drop_rate):
        super(EncoderLayer, self).__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.attention = MultiHeadAttention(self.embed_dim, self.n_heads, attn_drop_rate)
        self.norm1 = LayerNormalization(parameters_shape=[self.embed_dim])
        self.dropout1 = nn.Dropout(p=layer_drop_rate)
        self.ffn = PositionwiseFeedForward(self.embed_dim,layer_drop_rate)
        self.norm2 = LayerNormalization(parameters_shape=[self.embed_dim])
        self.dropout2 = nn.Dropout(p=layer_drop_rate)

    def forward(self, x):
     """
        Args:
        x (torch.Tensor): Input tensor to the encoder layer.
            Its shape should be `(batch_size, seq_length, embed_dim)`.
            - `batch_size`: The number of samples in the batch.
            - `seq_length`: The number of tokens (or time steps) in each sequence.
            - `embed_dim`: The dimension of the embedding for each token.
            
        Returns:
            torch.Tensor: Output tensor representing the encoded representations.
                It has the same shape as the input tensor `(batch_size, seq_length, embed_dim)`.
                - `batch_size`: The number of samples in the batch.
                - `seq_length`: The number of tokens (or time steps) in each sequence.
                - `embed_dim`: The dimension of the embedding for each token.
            
     """
     residual_x = x
     x = self.attention(x)
     x = self.dropout1(x)
     x = x + residual_x
     x = self.norm1(x)
     residual_x = x
     x = self.ffn(x)
     x = self.dropout2(x)
     x = x + residual_x
     x = self.norm2(x)
     return x

## TRANSFORMER LAYER

In [16]:
class ENCTransformer(nn.Module): 
    def __init__(self, n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate):
        super().__init__()
        self.embed = nn.Embedding(vocab_size+1, embed_dim)
        self.position = PositionalEncoding(embed_dim, layer_drop_rate)
        self.net = nn.Sequential(*[
        EncoderLayer(embed_dim, n_heads, attn_drop_rate, layer_drop_rate) for _ in range(n_layers)
        ])
        self.pooler = nn.Sequential(OrderedDict([
            ('dense', nn.Linear(embed_dim, embed_dim)),
            ('activation', nn.Tanh()),
        ]))
        self.classifier = Classifier(embed_dim, num_classes)
        self.saved_sample = None

    def forward(self, batch_text):
     """
        Args:
            batch_text (torch.Tensor): Batch of input texts represented as token indices.
                Its shape should be `(batch_size, seq_length)`.
        Returns:
            torch.Tensor: Predicted logits for each class.
                It has the shape `(batch_size, num_classes)`.
                - `batch_size`: The number of samples in the batch.
                - `num_classes`: The number of classes in the classification task.
     """
     batch_text = batch_text.squeeze(1)
     embedding = self.position(self.embed(batch_text)) 
     new_embedding = self.net((embedding))
     o = self.pooler(new_embedding[:, 0])
     preds = self.classifier(o)
     return preds



## DATALOADER

In [64]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
class MyDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item['text'], truncation=True, padding='max_length', return_tensors='pt', max_length=512)
        encoding['label'] = torch.tensor(item['label'])
        return encoding


if df == "news":
    dataset = load_dataset("setfit/20_newsgroups")
elif df == "asc":
    path="/kaggle/input/dsc-dataset/dat/absa/"
    dataset={'train':[],'test':[],'val':[]}
    idx=0
    for subdir in os.listdir(path):
        if subdir!='XuSemEval':
            subdir_path = os.path.join(path, subdir+'/asc')
            for subsubdir in os.listdir(subdir_path):
                subsubdir_path=os.path.join(subdir_path, subsubdir)
                train_path = os.path.join(subsubdir_path, 'train.json')
                test_path = os.path.join(subsubdir_path, 'test.json')
                val_path = os.path.join(subsubdir_path, 'dev.json')
                paths=[train_path,test_path,val_path]
                for i in range(len(paths)):
                    with open(paths[i], 'r') as f:
                        l=dataset[list(dataset.keys())[i]]
                        data = json.load(f)
                        for entry in data.values():
                            if "sentence" in entry:
                                l.append({'text':entry["sentence"],'label':idx})
                idx+=1
        else:
            subdir_path = os.path.join(path, subdir+'/asc')
            flag=False
            for subsubdir in os.listdir(subdir_path):
                subsubdir_path=os.path.join(subdir_path, subsubdir)
                if subsubdir=='14':
                    flag=True
                for subsubsubdir in os.listdir(subsubdir_path):
                    subsubsubdir_path=os.path.join(subsubdir_path, subsubsubdir)
                    if subsubsubdir=='rest':
                        idx=14
                    train_path = os.path.join(subsubsubdir_path, 'train.json')
                    test_path = os.path.join(subsubsubdir_path, 'test.json')
                    val_path = os.path.join(subsubsubdir_path, 'dev.json')
                    paths=[train_path,test_path,val_path]
                    for i in range(len(paths)):
                        with open(paths[i], 'r') as f:
                            l=dataset[list(dataset.keys())[i]]
                            data = json.load(f)
                            for entry in data.values():
                                if flag:
                                  if "sentence" in entry:
                                            l.append({'text':entry["sentence"],'label':idx}) 
                                else:
                                    if entry is not None:
                                        for subentry in entry.values():
                                            if "sentence" in subentry:
                                                l.append({'text':subentry["sentence"],'label':idx})                
                    idx+=1
                flag=False
elif df == "dsc":
    path = "/kaggle/input/dsc-dataset/dat/dsc/"
    dataset={'train':[],'test':[],'val':[]}
    idx=0
    for subdir in os.listdir(path):
        subdir_path = os.path.join(path, subdir)
        if os.path.isdir(subdir_path):
            file_count = sum(1 for f in os.listdir(subdir_path) if f.endswith('.json'))
            if file_count > 3:
                train_path = os.path.join(subdir_path, 'train.json')
                test_path = os.path.join(subdir_path, 'test.json')
                val_path = os.path.join(subdir_path, 'dev.json')
                paths=[train_path,test_path,val_path]
                for i in range(len(paths)):
                    with open(paths[i], 'r') as f:
                        l=dataset[list(dataset.keys())[i]]
                        data = json.load(f)
                        for entry in data.values():
                            if "sentence" in entry:
                                l.append({'text':entry["sentence"],'label':idx})
                idx+=1


    
    
traindata = MyDataset(dataset=dataset['train'], tokenizer=tokenizer)
val_size = int(len(traindata) * 0.2)  
train_size = len(traindata) - val_size  
traindata, valdata = random_split(traindata, [train_size, val_size])
testdata = MyDataset(dataset=dataset['test'], tokenizer=tokenizer)
train_dataloader = DataLoader(traindata, batch_size=32, shuffle=True)
val_dataloader = DataLoader(valdata, batch_size=32)
test_dataloader = DataLoader(testdata, batch_size=32)

## CONFIG

In [60]:
embed_dim = 512
n_heads = 8
n_layers = 4
vocab_size = 30522
#vocab_size = 91015
attn_drop_rate = 0.5
layer_drop_rate = 0.2
num_classes=20
num_epochs = 40
model = ENCTransformer(n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate)
model = model.to(device)

def calculate_accuracy(outputs, labels):
    _, predicted = torch.max(outputs, dim=1)
    correct = (predicted == labels).sum().item()  
    total = labels.size(0) 
    accuracy = correct / total
    return accuracy


optimizer = AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

<torch.optim.lr_scheduler.CosineAnnealingLR at 0x7a55718030a0>

## TRAINING - VALIDATION LOOP

In [61]:
for epoch in range(num_epochs):
    wandb.log({"epoch": epoch+1})
    print(f"######## Training Epoch {epoch + 1}/{num_epochs} #########")
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    total_f1macro = 0
    no_of_batches = 0

    for batch_idx,batch in enumerate(train_dataloader,1):
        no_of_batches += 1
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['label']
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        output = model(input_ids)
        loss = criterion(output, labels)
        wandb.log({"batch_loss": loss})
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        accuracy = calculate_accuracy(output, labels)
#         output = output.detach().cpu().numpy()
#         output = np.array(output.cpu().detach().numpy())
#         lables = np.array(lables.cpu().detach().numpy())
#         lables = labels.detach().cpu().numpy()
#         f1macro = f1_score(labels, output, average='macro')
#         output = torch.from_numpy(output).to('cuda')
#         lables = torch.from_numpy(lables).to('cuda')
        total_correct += (accuracy * labels.size(0))
        total_samples += labels.size(0)
#         total_f1macro += f1macro

        wandb.log({"batch_accuracy": accuracy})
#         wandb.log({"batch_f1macro": f1macro})

        if batch_idx % 10 == 0:  
            avg_loss = total_loss / batch_idx
            print(f"Batch {batch_idx}/{len(train_dataloader)} - Avg Loss: {avg_loss:.4f}")
            wandb.log({"avg_batch_loss": avg_loss})

    avg_epoch_loss = total_loss / len(train_dataloader)
    training_accuracy = total_correct / total_samples
    training_f1macro = total_f1macro / no_of_batches
    print(f"Epoch {epoch + 1}/{num_epochs} accuracy = {training_accuracy}")
    print(f"Epoch {epoch + 1} - Avg Loss: {avg_epoch_loss:.4f}")
#     print(f"Epoch {epoch + 1} - F1 Macro: {training_f1macro}")

    wandb.log({"avg_epoch_loss": avg_epoch_loss})
    wandb.log({"epoch_training_accuracy": training_accuracy})
#     wandb.log({"epoch_training_f1macro": training_f1macro})

    print("###### Validating ######")
    model.eval()  
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
      for batch in val_dataloader:
          input_ids = batch['input_ids'].squeeze(1)
          labels = batch['label']
          input_ids = input_ids.to(device)
          labels = labels.to(device)
          outputs = model(input_ids)
          accuracy = calculate_accuracy(outputs, labels)
#           valid_f1macro = f1_score(labels, outputs, average='macro')
          total_correct += (accuracy * labels.size(0))
          total_samples += labels.size(0)
    testing_accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1} - Validation Accuracy: {testing_accuracy:.4f}")
    print(f"Epoch {epoch + 1} - Validation F1 Macro: {valid_f1macro}")
    wandb.log({"validation_accuracy": testing_accuracy})
#     wandb.log({"validation_f1macro": valid_f1macro})

    # SAVE MODEL FOE EVERY EPOCH
    # curent_state = {
    #         'epoch': epoch + 1,
    #         'model_state': model.state_dict(),
    #         'optimizer_state': optimizer.state_dict(),
    #     }
    # save_path = f"models/cross_attention_endoffset_best_model_v14.pth"
    # torch.save(curent_state, save_path)
    # print(f"Saved model state to'{save_path}'")

######## Training Epoch 1/40 #########
Batch 10/934 - Avg Loss: 2.5628


KeyboardInterrupt: 

## TESTING LOOP

In [30]:
print("###### Final Testing on Test data ######")
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['label']
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        outputs = model(input_ids)

        accuracy = calculate_accuracy(outputs, labels)
        total_correct += (accuracy * labels.size(0))
        total_samples += labels.size(0)

testing_accuracy = total_correct / total_samples
print(f"Final Testing Accuracy: {testing_accuracy:.4f}")
wandb.log({"testing accuracy": testing_accuracy})

###### Final Testing on Test data ######
Final Testing Accuracy: 0.0424
