In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd "/content/drive/MyDrive/MCC-i cuarto semestre/Data Analytics/Project/"
!ls -al

In [None]:
!pip install "torch==2.2.2" tensorboard
!pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0" peft

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import os

#from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sklearn.metrics import f1_score, confusion_matrix, classification_report, cohen_kappa_score, balanced_accuracy_score, accuracy_score, roc_auc_score,r2_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

import warnings
warnings.filterwarnings('ignore') 

from collections import Counter

# Login to Hugging face (required only for Llama 3)

if it is desired to fine-tune Llama-3 to the Essay Scoring Dataset, then you have to log in into hugging face hub to download the pretrained model.

In [None]:
from huggingface_hub import notebook_login
notebook_login()

# Loading Essay Score Dataset & perfom NLP Data Preprocessing

Essay Scoring Dataset contains: 17,307 essays
The label consists of a hollistic score from 1 to 6

In [2]:
data_path = "data/train.csv"
df = pd.read_csv(data_path)

data_path = "data/test.csv"
df_test = pd.read_csv(data_path)

In [3]:
X = df.drop(columns=["score","essay_id"])
y = df["score"].astype(float) 
test = df_test.drop(columns=["essay_id"])


## Classes weights

Syntetic probabilistic distribution in the class label space used for compensating our unbalance dataset.
Note: This distribution is only relevant if the optimization problem is formulated as a classification problem.

In [4]:
class_weights = y.value_counts(normalize=True).sort_index().to_list()
class_weights = torch.Tensor(class_weights)
class_weights=(1 - class_weights) / (1 - class_weights).sum()
print(class_weights)

labels_arr = np.arange(class_weights.shape[0])

tensor([0.1855, 0.1454, 0.1274, 0.1546, 0.1888, 0.1982])


## NLP Preprocessing

Execute a simple preprocessing to the essays:
1.- Remove contractions
2.- Remove HTMLs and URLs
3.- Remove consecutive spaces, commas, and periods

Note: Gramatical mistakes were preserved with the objective for the model to be aware of these errors and increase its capability to make more assertive predictions.

In [5]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)


cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> It had or It would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have",
    "you're": "you are",  "you've": "you have"
}
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    #x = re.sub("'\d+", '',x)
    #x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Remove \xa0
    x = x.replace(u'\xa0',' ')
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
#     x = re.sub(r'[^\w\s.,;:""''?!]', '', x)
  #replace \'s with 's
    #print(re.findall("\\'s", x))
    #x = re.sub(r"\[\]'s", "'s", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [6]:
X["full_text"] = X.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)
test["full_text"] = test.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)


In [8]:
test.head()

Unnamed: 0,full_text
0,many people have car where they live. the thin...
1,i am a scientist at nasa that is discussing th...
2,people always wish they had the same technolog...


In [9]:
len(X)

17307

## Create Train-Validation Pandas Dataset Split Note: Huggingface trainer needs the datasets to be compatible with huggingface dataset format

Training set: ceil 85% of 17, 307 samples = 14,710

Validation set: 17,307 samples - training samples =  2,596

In [7]:

seed = 10
generator = np.random.RandomState(seed)
df_size = len(X)
train_proportion = 0.85
validation_proportion = 0.15
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size 
arr_train_idxs = generator.choice(np.arange(len(X)),size=[train_size,],replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)
validation = X[~X.index.isin(arr_train_idxs)].reset_index(drop=True)
validation_labels = y.iloc[~y.index.isin(arr_train_idxs)].reset_index(drop=True)

In [8]:
train_concat = pd.concat([train,train_labels],axis=1)
validation_concat =pd.concat([validation,validation_labels],axis=1)

In [9]:
# from pandas dataframe datasets to huggingface datasets objects...
dataset_train = Dataset.from_pandas(train_concat)
dataset_val = Dataset.from_pandas(validation_concat)
dataset_test = Dataset.from_pandas(test)

# Combine them into a single DatasetDict
dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val
})
dataset

DatasetDict({
    train: Dataset({
        features: ['full_text', 'score'],
        num_rows: 14710
    })
    val: Dataset({
        features: ['full_text', 'score'],
        num_rows: 2597
    })
})

# Fine Tuning Model for Essay Score Assesment 

## Experiment Configuration Parameters

LLM Hyperparameters:
<br>max_length => Context size : 2048
<br>d_model => Embedding dimension : 768


Optimizer Hyperparameters:
<br>Optimizer: AdamW
<br>First moment coefficient (Beta1) : 0.9
<br>Second moment coefficient (Beta2) :  0.999
<br>lr => learning rate : 6e-5
<br>weight_decay : 0.01


In [10]:
config = {
    'model': "microsoft/deberta-v3-base",
    'max_length': 2048, #Force it to be a nice number, a.k.a. divisible by a number powered of 2
    'batch_size': 4, 
    'epochs': 10,
    'lr': 6e-5,
    "weight_decay" : 0.01
}

## Importing model & tokenizer

We import the trained models from Hugging Face using the AutoModelForSequenceClassification class.
Tokens are procesed in the model and its output is the output layer's logits for the number of classes (Multi-Class Classification) or a single logit (Regression).

We utilized the tokenizer used to train DeBERTa V3. It consists of 128,100 unique tokens.

In [11]:
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

torch.set_float32_matmul_precision('high')

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    config["model"],
    #quantization_config=quantization_config,
    num_labels=1#class_weights.shape[0]
)

""" use_compile = True
if use_compile:
    model = torch.compile(model) """

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


' use_compile = True\nif use_compile:\n    model = torch.compile(model) '

In [13]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

### Freeze Model

The Word Embedding Lookup Table's parameters were freezed and we only allow for the context pooler layer and output layer to be trainable.

In [14]:
for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True
  
for param in model.pooler.parameters():
    param.requires_grad = True


model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

Load the tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(config["model"], add_prefix_space=True)

### Testing the tokenizer...

In [20]:
text = train["full_text"][1]
text

'in 1976 nasa\'s viking 1 took a photo of a landform that resembled a face. this excited the public. nasa has gone back to that place twice and taken higer defenition photos to make sure that it is not a face or some monument made by aliens. they have fond that it is a common landform, a mesa or butte. if it were more, nasa would tell the public right away. news of life on mars would excite the public, and provide nasa with funding for future explorations. the first photo of the landform was taken in 1976 by viking 1. each pixel in the photo represents 43 meaters. it is difficult to see deitalies in this photo. nasa believed that it was thier duty to make sure this was not a sign of extraterestrial life. in 1998, mars orbiter camera took a picture ten times sharper than the original. this picture showed more deatail, but the public was not satisfied. in 2001 nasa took another photo of the spot. in this photo a pixel is equal to 1.56 meters. this image is so sharp that if anything unusu

In [21]:
encoded = tokenizer.encode(text)
len(encoded)

448

In [22]:
decoded = tokenizer.decode(encoded) 
decoded

'[CLS] in 1976 nasa\'s viking 1 took a photo of a landform that resembled a face. this excited the public. nasa has gone back to that place twice and taken higer defenition photos to make sure that it is not a face or some monument made by aliens. they have fond that it is a common landform, a mesa or butte. if it were more, nasa would tell the public right away. news of life on mars would excite the public, and provide nasa with funding for future explorations. the first photo of the landform was taken in 1976 by viking 1. each pixel in the photo represents 43 meaters. it is difficult to see deitalies in this photo. nasa believed that it was thier duty to make sure this was not a sign of extraterestrial life. in 1998, mars orbiter camera took a picture ten times sharper than the original. this picture showed more deatail, but the public was not satisfied. in 2001 nasa took another photo of the spot. in this photo a pixel is equal to 1.56 meters. this image is so sharp that if anything

## Creating Model tokenized dataset from huggingface dataset object

In [16]:
def tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True, max_length=config["max_length"])

cols_to_delete = ["full_text"]

tokenized_datasets = dataset.map(tokenize_function, batched=True,remove_columns=cols_to_delete)
tokenized_datasets = tokenized_datasets.rename_column("score", "label")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/14710 [00:00<?, ? examples/s]

Map:   0%|          | 0/2597 [00:00<?, ? examples/s]

## Demo Model Forward pass & Quadratic Weighted Kappa score test

In [30]:
def pad_tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True,padding="max_length", max_length=config["max_length"])

cols_to_delete = ["full_text"]

tokenized_datasets_pad = dataset.map(pad_tokenize_function, batched=True,remove_columns=cols_to_delete)
tokenized_datasets_pad = tokenized_datasets_pad.rename_column("score", "label")
tokenized_datasets_pad.set_format("torch")

Map:   0%|          | 0/14710 [00:00<?, ? examples/s]

Map:   0%|          | 0/2597 [00:00<?, ? examples/s]

In [31]:
batch = tokenized_datasets_pad["train"][:10]

In [32]:
inputs = {"input_ids" : torch.tensor([sample.tolist() for sample in batch["input_ids"]],dtype= torch.long).to(device),
 "token_type_ids" : torch.tensor([sample.tolist() for sample in batch["token_type_ids"]],dtype= torch.long).to(device),
 "attention_mask" : torch.tensor([sample.tolist() for sample in batch["attention_mask"]],dtype= torch.long).to(device)
}

In [34]:
outputs = model(**inputs)

In [36]:
outputs["logits"]

tensor([[0.1224],
        [0.0969],
        [0.1041],
        [0.0978],
        [0.1238],
        [0.0992],
        [0.0964],
        [0.0998],
        [0.1214],
        [0.0990]], device='cuda:0', grad_fn=<CompiledFunctionBackward>)

In [37]:
logits = outputs["logits"][:,0].clip(0,class_weights.shape[0] - 1).round().detach().cpu()
logits

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [38]:
labels = batch["label"]
labels

tensor([2., 4., 3., 1., 3., 3., 4., 4., 1., 3.])

In [39]:
cohen_kappa_score(y1=logits,y2=labels,labels=labels_arr,weights="quadratic")

0.0

## Data collator

A data collator prepares batches of data for training or inference in machine learning, ensuring uniform formatting and adherence to model input requirements. This is especially crucial for variable-sized inputs like text sequences.

Functions of Data Collator

1.- Padding: Uniformly pads sequences to the length of the longest sequence using a special token, allowing simultaneous batch processing.

2.- Batching: Groups individual data points into batches for efficient processing.

3.- Handling Special Tokens: Adds necessary special tokens to sequences.

4.- Converting to Tensor: Transforms data into tensors, the required format for machine learning frameworks.

In [17]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer,max_length=config["max_length"])

## Evaluation metrics

Evaluation metrics selected from the scikitlearn library: R2 and Quadratic Weighted Kappa

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #print(f"predictions : {predictions}")
    #print(f"labels : {labels}")
    predictions = predictions[:,0]

    r2 = r2_score(labels,predictions)
    
    predictions = predictions.clip(1,class_weights.shape[0] ).round()

    #print(f"predictions : {predictions}")
    
    return {'R2' : r2 , 
            'QWK' : cohen_kappa_score(y1=predictions,y2=labels,labels=labels_arr,weights="quadratic")}



## Training Model

CustomTrainer class inherits from the Hugging Face Trainer class, overriding the compute_loss function to compute the required loss function. 

In [19]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = class_weights
            if not isinstance(self.class_weights, torch.FloatTensor):
                self.class_weights = torch.tensor(class_weights, dtype=torch.float32)
            self.class_weights = self.class_weights.to(device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')[:,0]

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            raise NotImplementedError("Class weights is deprecated in Regression problems")

        else:
            loss = F.mse_loss(logits,labels)

        return (loss, outputs) if return_outputs else loss

In [25]:
seed = 10
training_args = TrainingArguments(
    output_dir = 'tests/DeBERTaV3-No-Qlora-2', 
    logging_dir = "tests/runs/Experiment_2_regression",  
    learning_rate = config["lr"],
    per_device_train_batch_size = config["batch_size"],
    per_device_eval_batch_size = config["batch_size"],
    num_train_epochs = config["epochs"],
    weight_decay = config["weight_decay"],
    evaluation_strategy = 'epoch', 
    #eval_steps=2,
    save_strategy = 'epoch',
    #save_steps = 2,
    load_best_model_at_end = True,
    logging_strategy = "epoch",
    #logging_steps = 500,
    data_seed = seed,
    metric_for_best_model="QWK",
    save_total_limit=3
)

In [26]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    #class_weights=class_weights,
    #callbacks= [EarlyStoppingCallback(early_stopping_patience=5,early_stopping_threshold=.001),]
)

In [27]:
train_result = trainer.train("./tests/DeBERTaV3-No-Qlora/checkpoint-25746")
     

Epoch,Training Loss,Validation Loss,R2,Qwk
8,0.4532,0.413789,0.61544,0.715576
9,0.4475,0.432696,0.597869,0.710259
10,0.4462,0.410793,0.618225,0.719947


In [53]:
torch.cuda.empty_cache()