In [1]:
!pip install transformers
!pip install evaluate
!pip install accelerate -U
!pip install torchmetrics
!pip install optuna
!pip install -U "neptune[optuna]"
!pip install shap



In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel,AutoTokenizer,AdamW
from tqdm import tqdm
import evaluate
from datasets import load_dataset
from tqdm.auto import tqdm
import numpy as np
import random
from torchmetrics.classification import BinaryAccuracy
import optuna
from google.colab import userdata
import neptune
import uuid
import neptune.integrations.optuna as npt_utils
import  hashlib
import time
import math
import shap
import pandas as pd

In [3]:
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Read the Neptune API token from the Colab secret store instead of hard-coding it.
os.environ["NEPTUNE_API_TOKEN"] = userdata.get('NEPTUNE_API_TOKEN')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0"  if torch.cuda.is_available() else "cpu")

In [4]:
# @title Define Transformer Model Name
# Hugging Face model id; used to load both the tokenizer and the encoder backbone.
bert_model_name = "cardiffnlp/twitter-roberta-base-sep2022" # @param {type:"string"}

In [5]:
# @title Define Hugging Face Dataset Name
# Hugging Face Hub dataset id from which the test split is loaded.
dataset_name = "krishan-CSE/HatEval_Relabled_with_Emotion" # @param {type:"string"}

In [6]:
# @title Define Neptune Project Name, Study & Best Trial ID
# Neptune run ids: `study_id` is the run the model checkpoint is downloaded from,
# `trial_id` is the run the hyper-parameters are read from.
study_id = "FIN-3468" # @param {type:"string"}
trial_id = "FIN-3566" # @param {type:"string"}
# Neptune project ("workspace/project") that contains both runs.
project_name="krishanchavinda.official/Fine-Tuning-DCL-Framework"

In [7]:
# Re-open the study run in read-only mode (nothing is logged back to Neptune).
run_study = neptune.init_run(with_id=study_id,project=project_name,mode='read-only')

https://app.neptune.ai/krishanchavinda.official/Fine-Tuning-DCL-Framework/e/FIN-3468


In [8]:
# Re-open the best-trial run in read-only mode to read its logged parameters.
run_trial = neptune.init_run(with_id=trial_id,project=project_name,mode='read-only')

https://app.neptune.ai/krishanchavinda.official/Fine-Tuning-DCL-Framework/e/FIN-3566


In [9]:
# Fetch the parameters logged by the trial run; later cells read the
# "BATCH_SIZE" and "DROPOUT" entries from this mapping.
best_prams=run_trial["parameters"].fetch()

In [10]:
#@title #Variables
# Seed passed to `setup_seed` for reproducibility.
SEED = 1234 # @param {type:"integer"}
# Every text is padded/truncated to exactly this many tokens by `tokenize_function`.
PADDING_MAX_LENGTH = 45 # @param {type:"integer"}

# Setting Random Seed for Reproducibility

In [11]:
def setup_seed(seed:int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run;
    # disabling it is required for the deterministic flag above to be effective.
    torch.backends.cudnn.benchmark = False

In [12]:
# Seed all random number generators before any data or model is loaded.
setup_seed(SEED)

# Loading Test Dataset

In [13]:
# Only the test file is needed here; it is exposed as the "test" split.
data_files = {"test": "test.csv"}
dataset = load_dataset(dataset_name, data_files=data_files)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Loading the Tokenizer for the Transformer Model

In [14]:
# Load the tokenizer that matches the encoder named in `bert_model_name`.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

## Define the Tokenizer Function

In [15]:
def tokenize_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs.

    Uses the tokenizer's ``__call__`` interface (the supported replacement for
    the deprecated ``batch_encode_plus``) with the same arguments.

    Args:
        examples: Batch mapping whose "text" entry holds the raw texts.

    Returns:
        The tokenizer's encoding of the batch, padded/truncated to
        ``PADDING_MAX_LENGTH`` tokens with special tokens added.
    """
    return tokenizer(
        examples["text"],
        padding='max_length',
        max_length=PADDING_MAX_LENGTH,
        add_special_tokens=True,
        truncation=True,
    )

## Tokenize the Dataset

In [16]:
# Tokenize the dataset in batches; the tokenizer outputs are added as new columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2724 [00:00<?, ? examples/s]

In [17]:
# Collapse the per-emotion columns of one example into a single list column
def merge_emotion_columns_to_list(example):
    """Build the 'emotion_vector' entry for one example.

    Args:
        example: Mapping that contains one value per emotion column.

    Returns:
        A dict with the single key 'emotion_vector' whose value lists the
        emotion scores in a fixed order (anger first, trust last).
    """
    emotion_names = (
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
        'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
    )
    emotion_vector = [example[emotion] for emotion in emotion_names]
    return {'emotion_vector': emotion_vector}

# Apply the function to each example in the dataset; this adds the
# 'emotion_vector' column next to the existing ones.
tokenized_datasets = tokenized_datasets.map(merge_emotion_columns_to_list)

Map:   0%|          | 0/2724 [00:00<?, ? examples/s]

## Remove Unwanted Columns

In [18]:
# Drop the raw text and the individual emotion columns; their information is
# already carried by the tokenizer outputs and the 'emotion_vector' column.
tokenized_datasets=tokenized_datasets.remove_columns(['text','anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'])

## Format the Columns

In [19]:
# Return the remaining columns as PyTorch tensors so they can be batched by a DataLoader.
tokenized_datasets=tokenized_datasets.with_format("torch")

# Creating DataLoaders for Test Dataset

In [20]:
# Batch the test split with the trial's batch size; shuffle=False keeps the
# original row order in the exported vectors.
test_dataloader=DataLoader(tokenized_datasets["test"], batch_size=best_prams["BATCH_SIZE"] , shuffle=False)

# Define the Dual Contrastive Learning Architecture

In [21]:
class DCLArchitecture(nn.Module):
    """Transformer encoder with a single-logit classification head.

    The encoder's pooled sentence representation is passed through dropout
    and a linear layer that produces one logit per example. In training mode
    a dropout-augmented copy of every representation is added to the batch.
    """

    def __init__(self,dropout:float,bert_model_name:str='bert-base-cased'):
        """Build the encoder and the classification head.

        Args:
            dropout: Dropout probability applied to the pooled representation.
            bert_model_name: Hugging Face model id of the encoder to load.
        """
        super(DCLArchitecture, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        # Take the hidden size from the loaded encoder instead of assuming 768,
        # so encoders of other widths get a matching classification head.
        self.dim = self.bert.config.hidden_size
        self.dense = nn.Linear(self.dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self,batch_tokenized, if_train=False):
        """Encode a tokenized batch and compute one logit per representation.

        Args:
            batch_tokenized: Mapping with 'input_ids' and 'attention_mask'.
            if_train: When True, each pooled representation is followed by a
                dropout-augmented copy of itself, doubling the batch.

        Returns:
            A tuple ``(hidden, logits)``: the representations fed to the
            linear head, reshaped to ``(-1, self.dim)`` when ``if_train`` is
            True, and the head output with dimension 1 squeezed away.
        """
        input_ids = batch_tokenized['input_ids']
        attention_mask = batch_tokenized['attention_mask']
        # Per-layer hidden states are not requested: only element 1 of the
        # encoder output is used below.
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is used as the sentence representation
        # (the pooled output for BERT/RoBERTa-style encoders).
        bert_cls_hidden_state = bert_output[1]

        if if_train:
            bert_cls_hidden_state_aug = self.dropout(bert_cls_hidden_state)
            # Concatenating along dim=1 and reshaping interleaves the rows as
            # original_0, augmented_0, original_1, augmented_1, ...
            bert_cls_hidden_state = torch.cat((bert_cls_hidden_state, bert_cls_hidden_state_aug), dim=1).reshape(-1, self.dim)
        else:
            bert_cls_hidden_state = self.dropout(bert_cls_hidden_state)

        linear_output = self.dense(bert_cls_hidden_state)
        linear_output = linear_output.squeeze(1)

        return bert_cls_hidden_state, linear_output

# Configuring the Model & Focal Loss

In [22]:
# Build the architecture with the trial's dropout value; the fine-tuned
# weights are loaded from the checkpoint in a later cell.
model = DCLArchitecture(bert_model_name=bert_model_name,dropout=best_prams["DROPOUT"])
model.to(device)

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DCLArchitecture(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

## Load the Best Trial Checkpoint

In [23]:
def load_checkpoint(run: neptune.Run,check_point_name:str):
    """Download a model checkpoint from a Neptune run and load it.

    The file stored under ``run[check_point_name]["model"]`` is downloaded and
    then read from ``model.<extension>`` in the current working directory.

    Args:
        run: Neptune run that holds the checkpoint file.
        check_point_name: Namespace inside the run that contains the "model" file.

    Returns:
        A dict with the single key "model_state_dict" holding the loaded object.
    """
    model_field = run[check_point_name]["model"]
    model_ext = model_field.fetch_extension()
    model_field.download()  # Download the checkpoint
    run.wait()
    # Load the checkpoint onto the CPU so a checkpoint saved from a GPU also
    # loads on a CPU-only runtime; `load_state_dict` copies the values onto
    # whatever device the model's parameters live on.
    # NOTE(security): `torch.load` unpickles the file; only load checkpoints
    # from a trusted source.
    state_dict = torch.load(f"model.{model_ext}", map_location="cpu")
    checkpoint = {
        "model_state_dict": state_dict,
    }
    return checkpoint

In [24]:
# Namespace inside the Neptune run under which the "model" file is stored.
check_point_name="model_checkpoints/"

In [25]:
# NOTE(review): the checkpoint is fetched from the study run (`run_study`), not
# from `run_trial` — confirm that the best trial's weights are logged there.
checkpoint=load_checkpoint(run_study,check_point_name)

In [26]:
# Copy the fine-tuned weights from the checkpoint into the model.
model.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

In [27]:
# Switch to evaluation mode so dropout layers are disabled during inference.
model.eval()

DCLArchitecture(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [28]:
# Keep only the fine-tuned encoder; the classification head is not used below.
fined_tuned_bert_model=model.bert

In [29]:
# `Module.training` is True in training mode and False after `.eval()`, so the
# flag is named for what it actually holds.
is_training_mode = fined_tuned_bert_model.training

if is_training_mode:
    print("The model is in training mode.")
else:
    print("The model is in inference mode.")

The model is in inference mode.


# Get Predictions for the Test Dataset

In [30]:
# Empty accumulator for the combined (encoder output + emotion) vectors.
predictions=torch.tensor([],device=device)

In [31]:
# Empty accumulator for the labels, kept in the same order as `predictions`.
tot_labels=torch.tensor([],device=device)

In [32]:
# Collect the per-batch results in lists created inside this cell and
# concatenate once at the end: re-running the cell then rebuilds the tensors
# from scratch instead of appending duplicates, and avoids re-copying the
# growing tensors on every iteration.
embedding_chunks = []
label_chunks = []
for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        emotion_vector = batch['emotion_vector']
        labels =batch['labels']
        with torch.no_grad():
          # Per-layer hidden states are not requested: only element 1 of the
          # encoder output is used.
          bert_output = fined_tuned_bert_model(input_ids, attention_mask=attention_mask)
          bert_cls_hidden_state = bert_output[1]
        # Append the emotion vector to the encoder representation of each example.
        combined_vector =torch.cat((bert_cls_hidden_state,emotion_vector), 1)
        label_chunks.append(labels)
        embedding_chunks.append(combined_vector)

predictions = torch.cat(embedding_chunks, dim=0)
# Cast to float32 so the exported label column keeps the float dtype that
# concatenating onto a float accumulator produces.
tot_labels = torch.cat(label_chunks, dim=0).to(torch.float32)

# Save the Combined Vector as a CSV File

In [33]:
# Convert the tensor to a NumPy array
predictions_array = predictions.cpu().numpy()

# Create one column per vector component. The width is read from the array
# (768 encoder dimensions + 11 emotion scores = 779 for the current setup)
# so the cell keeps working if the encoder or the emotion set changes.
columns = [f'col_{i}' for i in range(predictions_array.shape[1])]
data_embedding = pd.DataFrame(predictions_array, columns=columns)

In [34]:
# Convert the label tensor to a NumPy array (moved to the CPU first)
labels_array = tot_labels.cpu().numpy()

# Create a single-column DataFrame holding the labels
data_labels = pd.DataFrame({'labels': labels_array})

In [35]:

# Combine the DataFrames column-wise: the vector columns first, then 'labels'.
# Both frames use the default index, so rows are aligned by position.
combined_df = pd.concat([data_embedding , data_labels], axis=1)

In [36]:
# Replace "/" with "-" so the model id can be used inside a file name
modified_model_name = bert_model_name.replace("/", "-")

In [37]:
# Write the combined vectors and labels to a CSV file named after the model,
# without the DataFrame index.
combined_df.to_csv('combined_vector_'+modified_model_name+'.csv', index=False)