In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd "/content/drive/MyDrive/MCC-i cuarto semestre/Data Analytics/Project/"
!ls -al

In [None]:
!pip install "torch==2.2.2" tensorboard
!pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0" peft

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import os

#from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from sklearn.metrics import f1_score, confusion_matrix, classification_report, cohen_kappa_score, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

In [2]:
import warnings
warnings.filterwarnings('ignore') 

# Login to Hugging face (required only for Llama 3)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

# Loading Essay Score Dataset & perfom NLP Data Preprocessing

In [3]:
data_path = "data/train.csv"
df = pd.read_csv(data_path)

data_path = "data/test.csv"
df_test = pd.read_csv(data_path)

In [4]:
X = df.drop(columns=["score","essay_id"])
y = df["score"] - 1
test = df_test.drop(columns=["essay_id"])


## Classes weights

In [5]:
class_weights = y.value_counts(normalize=True).sort_index().to_list()
class_weights = torch.Tensor(class_weights)
class_weights=(1 - class_weights) / (1 - class_weights).sum()
print(class_weights)

labels_arr = np.arange(class_weights.shape[0])

tensor([0.1855, 0.1454, 0.1274, 0.1546, 0.1888, 0.1982])


## NLP Preprocessing

In [6]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)


cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> It had or It would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have",
    "you're": "you are",  "you've": "you have"
}
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    #x = re.sub("'\d+", '',x)
    #x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Remove \xa0
    x = x.replace(u'\xa0',' ')
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
#     x = re.sub(r'[^\w\s.,;:""''?!]', '', x)
  #replace \'s with 's
    #print(re.findall("\\'s", x))
    #x = re.sub(r"\[\]'s", "'s", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [7]:
X["full_text"] = X.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)
test["full_text"] = test.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)


In [21]:
len(X)

17307

## Create Train-Validation Pandas Dataset Split Note: Huggingface trainer needs the datasets to be compatible with huggingface dataset format

In [8]:

seed = 10
generator = np.random.RandomState(seed)
df_size = len(X)
train_proportion = 0.85
validation_proportion = 0.15
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size 
arr_train_idxs = generator.choice(np.arange(len(X)),size=[train_size,],replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)
validation = X[~X.index.isin(arr_train_idxs)].reset_index(drop=True)
validation_labels = y.iloc[~y.index.isin(arr_train_idxs)].reset_index(drop=True)

In [23]:
test.head()

Unnamed: 0,full_text
0,many people have car where they live. the thin...
1,i am a scientist at nasa that is discussing th...
2,people always wish they had the same technolog...


In [9]:
train_concat = pd.concat([train,train_labels],axis=1)
validation_concat =pd.concat([validation,validation_labels],axis=1)

In [10]:
# from pandas dataframe datasets to huggingface datasets objects...
dataset_train = Dataset.from_pandas(train_concat)
dataset_val = Dataset.from_pandas(validation_concat)
dataset_test = Dataset.from_pandas(test)

# Combine them into a single DatasetDict
dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val
})
dataset

DatasetDict({
    train: Dataset({
        features: ['full_text', 'score'],
        num_rows: 14710
    })
    val: Dataset({
        features: ['full_text', 'score'],
        num_rows: 2597
    })
})

# Fine Tuning Model for Essay Score Assesment 

## Experiment Configuration Parameters

In [11]:
config = {
    'model': "microsoft/deberta-v3-base",
    'max_length': 1900,
    'batch_size': 15, 
    'epochs': 10,
    'lr': 1e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

## Importing model & tokenizer

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    config["model"],
    #quantization_config=quantization_config,
    num_labels=class_weights.shape[0]
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

prepare_model_for_kbit_training() function to preprocess the quantized model for training.

### Freeze Model

In [14]:
for param in model.parameters():
    param.requires_grad = False

#model.resize_position_embeddings(config["max_length"])

#for param in model.pre_classifier.parameters():
#    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True
  
for param in model.pooler.parameters():
    param.requires_grad = True


model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

Load the tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(config["model"], add_prefix_space=True)

### Testing the tokenizer...

In [52]:
text = train["full_text"][1]
text

'in 1976 nasa\'s viking 1 took a photo of a landform that resembled a face. this excited the public. nasa has gone back to that place twice and taken higer defenition photos to make sure that it is not a face or some monument made by aliens. they have fond that it is a common landform, a mesa or butte. if it were more, nasa would tell the public right away. news of life on mars would excite the public, and provide nasa with funding for future explorations. the first photo of the landform was taken in 1976 by viking 1. each pixel in the photo represents 43 meaters. it is difficult to see deitalies in this photo. nasa believed that it was thier duty to make sure this was not a sign of extraterestrial life. in 1998, mars orbiter camera took a picture ten times sharper than the original. this picture showed more deatail, but the public was not satisfied. in 2001 nasa took another photo of the spot. in this photo a pixel is equal to 1.56 meters. this image is so sharp that if anything unusu

In [28]:
encoded = tokenizer.encode(text)
len(encoded)

220

In [29]:
decoded = tokenizer.decode(encoded) 
decoded

'[CLS] the author of "the challenge of exploring venus" suggests that exploring and studying venus must be done despite the dangers. they support this idea by stating how similar venus is to earth and that venus could have been like earth many years ago. they do not ignore the dangers though. the author makes a point to mention the dangers of conducting research on venus with statements like "almost 97 percent carbon dioxide," and "highly corrosive sulfuric acide in venus\'s atmosphere." they still combat this with reasons to continue though with statements about how similar it is to earth, and that it "could have supported various forms of life". venus could have supported life forms and we might be able to research them if we were able to get to them. the author brings up computers that are entirely mechanical. these computers could be build to withstand the intense conditions of venus. it is possable for us to research venus further but how many lives is that worth? our author did a

## Creating Model tokenized dataset from huggingface dataset object

In [16]:
def tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True, max_length=config["max_length"])

cols_to_delete = ["full_text"]

tokenized_datasets = dataset.map(tokenize_function, batched=True,remove_columns=cols_to_delete)
tokenized_datasets = tokenized_datasets.rename_column("score", "label")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/14710 [00:00<?, ? examples/s]

Map:   0%|          | 0/2597 [00:00<?, ? examples/s]

## Demo Model Forward pass & Quadratic Weighted Kappa score test

In [153]:
def pad_tokenize_function(examples):
    return tokenizer(examples['full_text'], truncation=True,padding="max_length", max_length=config["max_length"])

cols_to_delete = ["full_text"]

tokenized_datasets_pad = dataset.map(pad_tokenize_function, batched=True,remove_columns=cols_to_delete)
tokenized_datasets_pad = tokenized_datasets_pad.rename_column("score", "label")
tokenized_datasets_pad.set_format("torch")

Map:   0%|          | 0/14710 [00:00<?, ? examples/s]

Map:   0%|          | 0/2597 [00:00<?, ? examples/s]

In [178]:
batch = tokenized_datasets_pad["train"][:10]

In [179]:
inputs = {"input_ids" : torch.tensor([sample.tolist() for sample in batch["input_ids"]],dtype= torch.long).to(device),
 "token_type_ids" : torch.tensor([sample.tolist() for sample in batch["token_type_ids"]],dtype= torch.long).to(device),
 "attention_mask" : torch.tensor([sample.tolist() for sample in batch["attention_mask"]],dtype= torch.long).to(device)
}

In [180]:
outputs = model(**inputs)

In [181]:
outputs["logits"]

tensor([[ 0.0684,  0.1053, -0.0780, -0.1338, -0.0658,  0.0420],
        [ 0.0543,  0.0875, -0.0636, -0.1003, -0.0724,  0.0438],
        [ 0.0616,  0.0917, -0.0675, -0.1117, -0.0787,  0.0589],
        [ 0.0685,  0.0880, -0.0632, -0.0966, -0.0815,  0.0543],
        [ 0.0782,  0.1089, -0.0784, -0.1327, -0.0621,  0.0359],
        [ 0.0581,  0.0969, -0.0637, -0.1144, -0.0855,  0.0616],
        [ 0.0557,  0.0905, -0.0674, -0.1028, -0.0809,  0.0509],
        [ 0.0539,  0.0913, -0.0596, -0.1063, -0.0806,  0.0460],
        [ 0.0728,  0.0992, -0.0768, -0.1310, -0.0643,  0.0502],
        [ 0.0552,  0.0938, -0.0569, -0.1053, -0.0788,  0.0525]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [182]:
logits = torch.argmax(outputs["logits"],-1).cpu()
logits

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [183]:
labels = batch["label"]
labels

tensor([1, 3, 2, 0, 2, 2, 3, 3, 0, 2])

In [184]:
cohen_kappa_score(y1=logits,y2=labels,labels=labels_arr,weights="quadratic")

0.0

## Data collator

A data collator prepares batches of data for training or inference in machine learning, ensuring uniform formatting and adherence to model input requirements. This is especially crucial for variable-sized inputs like text sequences.

Functions of Data Collator

1.- Padding: Uniformly pads sequences to the length of the longest sequence using a special token, allowing simultaneous batch processing.

2.- Batching: Groups individual data points into batches for efficient processing.

3.- Handling Special Tokens: Adds necessary special tokens to sequences.

4.- Converting to Tensor: Transforms data into tensors, the required format for machine learning frameworks.

In [17]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer,max_length=config["max_length"])

## Evaluation metrics

In [18]:
from collections import Counter

In [17]:
x = np.random.randint(1,6,size=[10,])
Counter(x)

Counter({2: 5, 4: 4, 1: 1})

In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels),
    'QWK' : cohen_kappa_score(y1=predictions,y2=labels,labels=labels_arr,weights="quadratic")}



## Training Model

In [20]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = class_weights
            if not isinstance(self.class_weights, torch.FloatTensor):
                self.class_weights = torch.tensor(class_weights, dtype=torch.float32)
            self.class_weights = self.class_weights.to(device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [21]:
seed = 10
training_args = TrainingArguments(
    output_dir = 'tests/DeBERTaV3-No-Qlora', #<----- modificar aqui
    logging_dir = "tests/runs/Experiment_1_crossentroypy_class_weights",  #<----- modificar aqui
    learning_rate = config["lr"],
    per_device_train_batch_size = config["batch_size"],
    per_device_eval_batch_size = config["batch_size"],
    num_train_epochs = config["epochs"],
    weight_decay = 0.01,
    evaluation_strategy = 'epoch', 
    #eval_steps=2,
    save_strategy = 'epoch',
    #save_steps = 2,
    load_best_model_at_end = False,
    logging_strategy = "epoch",
    #logging_steps = 2,
    data_seed = seed,
    metric_for_best_model="accuracy"
)

In [25]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    #class_weights=class_weights,
    #callbacks= [EarlyStoppingCallback(early_stopping_patience=3,early_stopping_threshold=.001),]
)

In [26]:
train_result = trainer.train()
     

Step,Training Loss,Validation Loss,Balanced Accuracy,Accuracy,Qwk
2,1.7145,1.682766,0.365422,0.365422,0.0


KeyboardInterrupt: 

In [None]:
torch.cuda.empty_cache()