<a href="https://colab.research.google.com/github/jeet1912/ms/blob/main/ds677/assignments/DS677_Week10HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install with the %pip magic so the package lands in the environment of the running kernel.
%pip install -q transformers



In [None]:
# Download the data archive from Google Drive. The file id is passed positionally
# because the `--id` flag is deprecated in recent gdown releases.
!gdown '1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA' --output DATA.zip
# -o overwrites already-extracted files without prompting, so re-running the cell does not stall.
!unzip -o DATA.zip

Downloading...
From: https://drive.google.com/uc?id=1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA
To: /content/DATA.zip
100% 1.89M/1.89M [00:00<00:00, 199MB/s]
Archive:  DATA.zip
   creating: data/
  inflating: data/data_test.csv      
  inflating: data/data_train.csv     


# Training part

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
from math import ceil

# Set WANDB_MODE to "disabled" before any training code runs
# (presumably to keep Weights & Biases logging switched off — confirm against the wandb docs).
os.environ["WANDB_MODE"] = "disabled"

### Load dataset and model

In [None]:
# Load the training data.
df = pd.read_csv('data/data_train.csv')

# Columns concatenated (in this order) into the single text sequence fed to BERT.
TEXT_COLUMNS = ['context', 'question', 'answer0', 'answer1', 'answer2']

# Build "context [SEP] question [SEP] answer0 [SEP] answer1 [SEP] answer2".
# Missing cells are replaced by "" first: with plain `+` concatenation a single NaN
# (pandas parses cells such as "NA" or "null" as NaN by default) turns the whole row
# into NaN, and the tokenizer later fails on that row.
text_parts = df[TEXT_COLUMNS].fillna("").astype(str)
df['input_text'] = text_parts[TEXT_COLUMNS[0]].str.cat(text_parts[TEXT_COLUMNS[1:]], sep=" [SEP] ")

# Split into training and validation sets (80-20 split); fixed seed keeps the split stable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=17)

# Checkpoint to fine-tune. Other checkpoints tried in earlier runs:
# 'bert-large-uncased' and 'prajjwal1/bert-tiny'.
MODEL_NAME = 'bert-base-cased'

# Seed torch before building the model: the classification head is newly initialised
# (it is not part of the pretrained checkpoint), so without a seed every run starts
# from different head weights.
torch.manual_seed(17)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show which device was selected when the model was loaded.
print(device)

cuda


In [None]:
# Report where the model's weights actually live. `torch.device` is the class itself,
# so interpolating it only ever prints "<class 'torch.device'>".
print(f"Tensor device before any operation: {next(model.parameters()).device}")

Tensor device before any operation: <class 'torch.device'>


### Utilities

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenises the `input_text` column of a DataFrame on access.

    Args:
        dataframe: DataFrame with an `input_text` column and, unless `is_test`
            is true, a `label` column.
        tokenizer: Callable tokenizer; it is called with `max_length`, `padding`,
            `truncation` and `return_tensors="pt"` and must return a mapping with
            `input_ids` and `attention_mask` tensors of shape (1, max_len).
        max_len: Length every sample is padded/truncated to.
        is_test: When true, samples carry no `labels` entry.

    Each item is a dict of CPU tensors: `input_ids` and `attention_mask` of shape
    (max_len,), plus a scalar `labels` tensor (dtype long) for labelled data.
    Samples are deliberately left on the CPU for both labelled and unlabelled data;
    moving the collated batch to the accelerator is the job of the training loop.
    """

    def __init__(self, dataframe, tokenizer, max_len, is_test=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.is_test = is_test
        # Kept as an attribute for backward compatibility; samples are not moved to it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenise the row at positional `index` and return its tensors."""
        row = self.data.iloc[index]
        # Padding stays at 'max_length' so every sample has the same length and can be
        # stacked even by a collator that does not pad.
        inputs = self.tokenizer(
            row['input_text'],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops only the batch axis; a bare squeeze() would also collapse
        # the sequence axis when max_len == 1.
        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }
        if not self.is_test:
            item['labels'] = torch.tensor(int(row['label']), dtype=torch.long)
        return item

# Accuracy metric used during evaluation
def compute_metrics(eval_pred):
    """Turn an evaluation result into an accuracy score.

    Args:
        eval_pred: Pair of (logits, labels); the predicted class for each
            example is the arg-max over the last axis of the logits.

    Returns:
        A dict with a single key, 'accuracy'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    acc = accuracy_score(gold, predicted)
    return {'accuracy': acc}

# Data collator that pads each batch to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Token budget per example. The input is "context [SEP] question [SEP] answers" and
# truncation cuts from the right, so a small budget (the previous value was 32, chosen
# for speed) drops the question and the answer options before the model ever sees them.
# Lower this only if the full concatenation still fits.
MAX_LEN = 256

# Create training and validation datasets
train_dataset = CustomDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, max_len=MAX_LEN)

### Params

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

#total_steps = ceil((len(train_dataset) / 8) * 5)


#print(total_steps)

# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
  output_dir='./results',
  num_train_epochs=1,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  warmup_steps=0,#int(total_steps*0.1),
  weight_decay=0.13,
  logging_dir='./logs',
  # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
  eval_strategy="steps",
  eval_steps=500,  # Increased frequency
  # Checkpoints are saved on the same schedule as evaluation so that
  # load_best_model_at_end can pick the best evaluated checkpoint.
  save_strategy="steps",
  save_steps=500,
  learning_rate=3e-5,  # Experiment with this
  lr_scheduler_type='linear',
  load_best_model_at_end=True,
  # "Best" means lowest validation loss.
  metric_for_best_model='loss',
  greater_is_better=False,
  report_to="none",
  # Mixed precision needs a GPU; enable it only when CUDA is available so the
  # CPU fallback chosen at model-load time still works.
  fp16=torch.cuda.is_available()
)



In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Ensure all tensors in the model are contiguous to avoid ValueError during saving
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print("Validation accuracy:", eval_results['eval_accuracy'])

# Save the model and tokenizer side by side so the inference section can reload both
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
500,1.0775,1.105571,0.333333


Validation accuracy: 0.3333333333333333


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

# Testing part

In [None]:
# Load the saved model for inference
loaded_model = BertForSequenceClassification.from_pretrained("./saved_model")
loaded_tokenizer = BertTokenizer.from_pretrained("./saved_model")

new_df = pd.read_csv('data/data_test.csv')

# Build the input text as "context [SEP] question [SEP] answer0 [SEP] answer1 [SEP] answer2".
# Missing cells become "" first, because with plain `+` a single NaN would turn the whole
# row into NaN and break tokenisation.
text_columns = ['context', 'question', 'answer0', 'answer1', 'answer2']
text_parts = new_df[text_columns].fillna("").astype(str)
new_df['input_text'] = text_parts[text_columns[0]].str.cat(text_parts[text_columns[1:]], sep=" [SEP] ")

# Reuse the sequence length of the training dataset so inference sees inputs truncated
# and padded the same way as during fine-tuning.
predict_dataset = CustomDataset(new_df, loaded_tokenizer, max_len=train_dataset.max_len, is_test=True)

predict_trainer = Trainer(model=loaded_model)

# Make predictions
predictions = predict_trainer.predict(predict_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Every test row must receive exactly one predicted label before ids and labels are paired.
assert len(predicted_labels) == len(new_df), "prediction count does not match number of test rows"

prediction_df = pd.DataFrame({
    'id': new_df['id'],
    'label': predicted_labels
})

prediction_df.to_csv('prediction.csv', index=False)



In [None]:
# Preview the first rows of the test frame, including the generated `input_text` column.
new_df.head()

Unnamed: 0,id,context,question,answer0,answer1,answer2,label,input_text
0,0,"It 's consistent and reliable , which are two ...",How long will it take to install the stereo,All day D will cancel his birthday,None of the above choices .,Untill after D birthday .,Null,"It 's consistent and reliable , which are two ..."
1,1,Both are seasonal migrants to the local waters...,Why may the narrator not catch the fish later ...,The fish migrate .,None of the above choices .,The fish hibernate later in the year .,Null,Both are seasonal migrants to the local waters...
2,2,I ' m back in class today . I ' m actually exc...,Which of the following is true of the narrator ?,They need a summer break .,They work full time .,They realize they enjoy learning .,Null,I ' m back in class today . I ' m actually exc...
3,3,Then he went around the room again and had us ...,Why did I start crying in front of everyone ?,I wanted to know how to play the accordion .,I wanted to know why she had learned to play t...,I was very sad .,Null,Then he went around the room again and had us ...
4,4,Worst day ever yesterday ... Hayden got stung ...,Why would Hayden be so concerned about the abr...,None of the above choices .,Hayden ca n't stem the bleeding .,Hayden is allergic to bees .,Null,Worst day ever yesterday ... Hayden got stung ...


In [None]:
prediction_df['label'].value_counts() # benchmark run: distribution of predicted labels

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1559
2,328
1,235


In [None]:
prediction_df['label'].value_counts() # bert-base-cased run: validation accuracy 0.35159010600706714, improved by 0.02 (baseline of the comparison is not stated)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,1398
0,697
1,27


In [None]:
prediction_df['label'].value_counts() # bert-large-uncased run: validation accuracy 0.3339222614840989

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,1728
2,248
0,146


In [None]:
prediction_df['label'].value_counts() # bert-base-cased run: validation accuracy was not recorded in this comment

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,1025
2,661
0,436
