In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd "/content/drive/MyDrive/MCC-i cuarto semestre/Data Analytics/Project/"
!ls -al

In [None]:
!pip install "torch==2.2.2" tensorboard
!pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0" peft

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import os

from torch.utils.data import Dataset, DataLoader, IterableDataset
from torch.utils.tensorboard import SummaryWriter
from packaging import version

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from sklearn.metrics import f1_score, confusion_matrix, classification_report, cohen_kappa_score, balanced_accuracy_score, accuracy_score, roc_auc_score,r2_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    PretrainedConfig,
    PreTrainedModel,
    DefaultDataCollator,
    DataCollatorForLanguageModeling 
    
)

import warnings
warnings.filterwarnings('ignore') 

from collections import Counter

import nltk
import spacy
nlp = spacy.load("en_core_web_sm")

import torch.nn as nn

import time
from typing import Dict, List, NamedTuple, Optional, Tuple, Union

from transformers.deepspeed import deepspeed_init

from transformers.file_utils import (
    CONFIG_NAME,
    WEIGHTS_NAME,
    PushToHubMixin,
    is_apex_available,
    is_datasets_available,
    is_in_notebook,
    is_sagemaker_dp_enabled,
    is_sagemaker_mp_enabled,
    is_training_run_on_sagemaker,
)

if is_sagemaker_mp_enabled():
    from transformers.trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_torch_generator_available = True
    _is_native_amp_available = True
    from torch.cuda.amp import autocast

from transformers.trainer_pt_utils import (
    DistributedLengthGroupedSampler,
    DistributedSamplerWithLoop,
    DistributedTensorGatherer,
    IterableDatasetShard,
    LabelSmoother,
    LengthGroupedSampler,
    SequentialDistributedSampler,
    ShardSampler,
    distributed_broadcast_scalars,
    distributed_concat,
    find_batch_size,
    get_parameter_names,
    nested_concat,
    nested_detach,
    nested_numpify,
    nested_truncate,
    nested_xla_mesh_reduce,
    reissue_pt_warnings,
)

from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    HPSearchBackend,
    PredictionOutput,
    TrainerMemoryTracker,
    TrainOutput,
    default_compute_objective,
    denumpify_detensorize,
    get_last_checkpoint,
    number_of_arguments,
    set_seed,
    speed_metrics,
)

import math

In [None]:
nltk.download('punkt')

# Log in to Hugging Face (required only for Llama 3)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

# Loading the Essay Score Dataset & performing NLP Data Preprocessing

In [None]:
# Engineered features (textstat, linguistic and misspelling statistics).
# The `score` and `Unnamed: 0` columns of that file are discarded.
data_features = "data/features_df.csv"
df_features = pd.read_csv(data_features).drop(columns=["score", "Unnamed: 0"])
col_df_features = list(df_features.columns)

# Training essays, with the engineered features appended column-wise.
data_path = "data/train.csv"
df = pd.concat([pd.read_csv(data_path), df_features], axis=1)

# Test essays.
data_path = "data/test.csv"
df_test = pd.read_csv(data_path)



In [None]:
# Separate the regression target from the model inputs; the essay id is not a feature.
y = df["score"].astype(float)
X = df.drop(["score", "essay_id"], axis=1)
test = df_test.drop("essay_id", axis=1)


## Classes weights

In [None]:
# Relative frequency of every score, ordered by ascending score value.
score_frequencies = torch.Tensor(y.value_counts(normalize=True).sort_index().to_list())

# Rarer scores receive larger weights: take the complement of each frequency
# and normalise so that the weights sum to one.
complement = 1 - score_frequencies
class_weights = complement / complement.sum()
print(class_weights)

# One integer id per class: 0 .. n_classes - 1.
labels_arr = np.arange(class_weights.shape[0])

## NLP Preprocessing

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    is split over several lines is left untouched.
    """
    return re.sub(r'<.*?>', '', x)


# Contraction -> expansion lookup table used by the text clean-up step.
# Keys are matched case-sensitively, so the capitalised "I'..." entries only
# fire on text that has not been lower-cased.
cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is",
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> she had or she would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    # "you'll" / "you'll've" previously expanded to "you you will ..." (duplicated pronoun).
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
    "you're": "you are",  "you've": "you have"
}
# Regex alternation takes the first alternative that matches, so the longest
# keys must come first; otherwise "can't" would win over "can't've" and leave a
# dangling "'ve". Keys are escaped so they are always matched literally.
c_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in sorted(cList, key=len, reverse=True)))

def expandContractions(text):
    """Replace every contraction matched by ``c_re`` with its expansion from ``cList``."""
    return c_re.sub(lambda found: cList[found.group(0)], text)

def dataPreprocessing(x):
    """Normalise one raw essay string before tokenisation.

    Steps, in order: lower-case, strip HTML tags, drop ``@mentions`` and URLs,
    normalise whitespace, un-escape apostrophes, expand contractions and
    collapse repeated periods/commas. Digits are intentionally kept.

    Args:
        x: the raw essay text.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete URLs: remove the whole token up to the next whitespace. The
    # previous pattern (http\w+) stopped at ':' and left "://host/path" behind.
    x = re.sub(r"http\S+", '', x)
    # Remove \xa0 (non-breaking space)
    x = x.replace(u'\xa0', ' ')
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace backslash-escaped apostrophes (\') with plain ones. The previous
    # regex r"\'" matched a bare apostrophe and therefore changed nothing. This
    # runs before the contraction expansion so escaped contractions are expanded too.
    x = x.replace("\\'", "'")
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove whitespace at the beginning and end
    x = x.strip()
    return x

In [None]:
def nlp_pipeline(text):
    """Re-tokenise ``text`` with NLTK and normalise the spacing of the result."""
    # Tokenise, then glue the tokens back together with single spaces.
    text = " ".join(nltk.word_tokenize(text))
    # Replace " . " (a period with any surrounding whitespace) with ". ".
    text = re.sub(r'\s*\.\s*', '. ', text)
    # Replace two or more whitespace characters with a single space.
    return re.sub(r'\s{2,}', ' ', text)

res = X["full_text"][:10].apply(nlp_pipeline)

In [None]:
def reemplazar_apostrofes(cadena):
    """Replace every backslash-apostrophe pair in ``cadena`` with a plain apostrophe.

    The previous implementation used the regex ``r"\\'"``, which matches a bare
    apostrophe and replaces it with itself, so the string was returned unchanged.
    """
    return cadena.replace("\\'", "'")

# A raw string is needed so the demo text really contains a backslash before the
# apostrophe; in a normal string literal "\'" is just "'".
mi_cadena = r"Este es un ejemplo con\'s en la cadena."
resultado = reemplazar_apostrofes(mi_cadena)
print(resultado)  # Output: "Este es un ejemplo con's en la cadena."

In [None]:
# Clean the essay text of the training and test frames with the same routine.
X["full_text"] = X["full_text"].apply(dataPreprocessing)
test["full_text"] = test["full_text"].apply(dataPreprocessing)


In [None]:
test.head()

In [None]:
len(X)

## Create the Train-Validation Pandas Dataset Split. Note: the Hugging Face Trainer needs the datasets to be in the Hugging Face `datasets` format.

In [None]:

# Reproducible 85/15 train/validation split over row positions.
seed = 10
train_proportion = 0.85
validation_proportion = 0.15

generator = np.random.RandomState(seed)
df_size = len(X)
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size
# Sample the training rows without replacement; every other row is used for validation.
arr_train_idxs = generator.choice(np.arange(df_size), size=[train_size,], replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)

# X and y share the index of `df`, so one boolean mask selects the held-out rows of both.
held_out = ~X.index.isin(arr_train_idxs)
validation = X[held_out].reset_index(drop=True)
validation_labels = y.iloc[held_out].reset_index(drop=True)

In [None]:
# Re-attach the target column to each split so that every frame is self-contained.
train_concat = pd.concat((train, train_labels), axis="columns")
validation_concat = pd.concat((validation, validation_labels), axis="columns")

In [None]:
# from pandas dataframe datasets to huggingface datasets objects...
# Convert each pandas frame into a Hugging Face Dataset object.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_concat, validation_concat, test)
)

# Only the train and validation splits are grouped into the DatasetDict.
dataset = DatasetDict({'train': dataset_train, 'val': dataset_val})
dataset

# Fine-Tuning the Model for Essay Score Assessment

## Experiment Configuration Parameters

In [None]:
# Output locations for checkpoints and logs of this experiment.
vm_path = "tests/DeBERTaV3-No-Qlora"
log_vm_path = f"{vm_path}/log/session1"

# Single source of truth for every tunable value of the experiment.
config = dict(
    model="microsoft/deberta-v3-base",
    model_dir=vm_path,
    log_dir=log_vm_path,
    max_length=1900,
    batch_size=15,
    epochs=20,
    freeze_encoder=True,
    hidden_dim=1024,
    out_dim=1,
    # Number of engineered feature columns concatenated to the encoder output.
    eng_feats_dim=len(df_features.columns),
    # The encoder emits one logit per score class.
    backbone_out_dim=class_weights.shape[0],
    lr=1e-4,
    enable_scheduler=True,
    scheduler='CosineAnnealingWarmRestarts',
    gradient_accumulation_steps=2,
    adam_eps=1e-6,  # 1e-8 default
)

## Importing model & tokenizer

In [None]:
# Stand-alone sequence-classification model loaded from the configured checkpoint,
# with one output logit per score class (the length of `class_weights`).
base_model = AutoModelForSequenceClassification.from_pretrained(
    config["model"],
    #quantization_config=quantization_config,
    num_labels=class_weights.shape[0]
)

### Freeze Model

In [None]:
# Freeze every parameter of the model first ...
base_model.requires_grad_(False)

# ... then re-enable gradients only for the classification head and the pooler.
for trainable_part in (base_model.classifier, base_model.pooler):
    trainable_part.requires_grad_(True)


#base_model.to(device)

### Construct custom model with backbone encoder

In [None]:
# Label maps for the six classes: "0".."5" <-> "LABEL_0".."LABEL_5".
cfg = {
    "id2label": {str(idx): f"LABEL_{idx}" for idx in range(6)},
    "label2id": {f"LABEL_{idx}": idx for idx in range(6)},
}

class CustomConfig(PretrainedConfig):
    """Configuration for ``CustomModel``: an encoder plus a small head over engineered features.

    Args:
        model_name: checkpoint name/path the encoder is loaded from.
        freeze_encoder: when truthy, the model freezes the encoder except its pooler and classifier.
        backbone_out_dim: number of logits produced by the encoder.
        eng_feats_dim: number of engineered features concatenated to the encoder logits.
        hidden_dim: width of the hidden layer of the final head.
        out_dim: size of the final output.
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """
    model_type = "DeBERTaV3"
    def __init__(self,model_name = None,freeze_encoder = None,
        backbone_out_dim = None,eng_feats_dim = None,hidden_dim = None,out_dim = None,**kwargs):
        
        # Custom fields are stored before the base initialiser runs.
        self.model_name = model_name
        self.freeze_encoder = freeze_encoder
        self.eng_feats_dim = eng_feats_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.backbone_out_dim = backbone_out_dim
        super().__init__(**kwargs)

# Keyword arguments make the mapping from experiment config to model config explicit.
hf_config = CustomConfig(
    model_name=config["model"],
    freeze_encoder=config["freeze_encoder"],
    backbone_out_dim=config["backbone_out_dim"],
    eng_feats_dim=config["eng_feats_dim"],
    hidden_dim=config["hidden_dim"],
    out_dim=config["out_dim"],
)
#config.save_pretrained("./custom-debertav3")
#AutoConfig.register("deberta_custom", DebertaV2Config)


In [None]:
class CustomModel(PreTrainedModel):
    """Encoder + engineered-feature regressor.

    The encoder (a sequence-classification model) produces ``backbone_out_dim``
    logits per essay; these are concatenated with the ``eng_feats_dim``
    engineered features and fed to a two-layer MLP that outputs ``out_dim`` values.
    """
    base_model_prefix = "encoder"

    def __init__(self, config) -> None:
        # The base initialiser stores `config` on the instance.
        super().__init__(config)

        self.encoder = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name,
            #quantization_config=quantization_config,
            num_labels=self.config.backbone_out_dim,
        )

        self.freeze_backbone()

        self.final_classifier = nn.Sequential(
            nn.Linear(self.config.backbone_out_dim + self.config.eng_feats_dim, self.config.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.config.hidden_dim, self.config.out_dim),
        )

    def freeze_backbone(self):
        """Freeze the encoder except its classifier and pooler when ``config.freeze_encoder`` is truthy."""
        if not self.config.freeze_encoder:
            return

        for param in self.encoder.parameters():
            param.requires_grad = False

        for trainable_part in (self.encoder.classifier, self.encoder.pooler):
            for param in trainable_part.parameters():
                param.requires_grad = True

    def print_log(self):
        """Print every encoder parameter (debugging aid; the output is very large)."""
        for param in self.encoder.parameters():
            print(param)

    def forward(self, eng_features=None, **kwargs):
        """Run the encoder and the final head.

        Args:
            eng_features: engineered-feature tensor that is concatenated to the
                encoder logits along the last dimension. It may be passed
                positionally or by keyword.
            **kwargs: encoder inputs (e.g. ``input_ids``, ``attention_mask``).

        Returns:
            The output tensor of the final head.

        Raises:
            KeyError: if ``eng_features`` is not provided (same exception type
                as the previous ``kwargs.pop("eng_features")``).
        """
        if eng_features is None:
            raise KeyError("eng_features")

        # Only the encoder logits are used here, so targets must not reach the
        # encoder: it would compute a classification loss that is thrown away.
        # NOTE(review): callers that already strip "labels" are unaffected.
        kwargs.pop("labels", None)

        # Use the encoder as a feature extractor: one logit per class.
        outputs_encoder = self.encoder(**kwargs)
        x = outputs_encoder.get("logits")

        # Concatenate encoder logits and engineered features.
        x = torch.cat([x, eng_features], dim=-1)

        # Forward pass through the final head.
        return self.final_classifier(x)

    

In [None]:
# Instantiate the custom model from the custom config and move it to the selected device.
# NOTE(review): CustomModel.__init__ already loads the pretrained encoder itself —
# confirm that going through from_pretrained (instead of CustomModel(hf_config)) is intended.
model = CustomModel.from_pretrained(hf_config.model_name,config=hf_config)
model.to(device)

Load the tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config["model"], add_prefix_space=True)

### Testing the tokenizer...

In [None]:
text = train["full_text"][1]
text

In [None]:
encoded = tokenizer.encode(text)
len(encoded)

In [None]:
decoded = tokenizer.decode(encoded) 
decoded

## Creating Model tokenized dataset from huggingface dataset object

In [None]:
def tokenize_function(examples):
    """Tokenise a batch of essays, truncating and padding each one to ``config["max_length"]`` tokens."""
    return tokenizer(
        examples['full_text'],
        max_length=config["max_length"],
        padding="max_length",
        truncation=True,
    )

cols_to_delete = ["full_text"]

# Tokenise every split (dropping the raw text), rename the target column to
# "label" and expose the columns as torch tensors.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True, remove_columns=cols_to_delete)
    .rename_column("score", "label")
)
tokenized_datasets.set_format("torch")

## Demo Model Forward pass & Quadratic Weighted Kappa score test

In [None]:
def pad_tokenize_function(examples):
    """Tokenise ``examples['full_text']`` with truncation and fixed-length padding."""
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": config["max_length"],
    }
    return tokenizer(examples['full_text'], **tokenizer_kwargs)

cols_to_delete = ["full_text"]

# Same pipeline as the training datasets: tokenise, drop the raw text,
# rename "score" to "label", then switch the output format to torch.
padded = dataset.map(pad_tokenize_function, batched=True, remove_columns=cols_to_delete)
tokenized_datasets_pad = padded.rename_column("score", "label")
tokenized_datasets_pad.set_format("torch")

In [None]:
batch = tokenized_datasets_pad["train"][:3]

In [None]:
def input_split_fn(batch):
    """Split a batch into encoder inputs, engineered features and labels.

    :batch: dict of tensors for each feature. The target may be stored under
        "labels" or under "label" (the column name of the tokenised datasets).
    :return: ``(inputs, eng_feat_tensor, labels)`` where ``inputs`` holds the
        ``input_ids`` / ``token_type_ids`` / ``attention_mask`` tensors (long,
        on ``device``), ``eng_feat_tensor`` has one column per entry of
        ``col_df_features`` (float32, on ``device``) and ``labels`` is returned
        exactly as found in the batch.
    """
    # Batches sliced straight from the tokenised dataset use "label"; the
    # previous unconditional batch["labels"] raised KeyError for them.
    labels = batch["labels"] if "labels" in batch else batch["label"]

    def _as_long(key):
        # Rebuild the column as a (batch, seq_len) long tensor on the target device.
        return torch.tensor([sample.tolist() for sample in batch[key]], dtype=torch.long).to(device)

    # Features are gathered column by column, then transposed to (batch, n_features).
    eng_feat_tensor = torch.tensor(
        [batch[col].tolist() for col in col_df_features], dtype=torch.float32
    ).transpose(0, 1).to(device)

    inputs = {key: _as_long(key) for key in ("input_ids", "token_type_ids", "attention_mask")}

    return inputs, eng_feat_tensor, labels


In [None]:
inputs, eng_features, labels = input_split_fn(batch)

In [None]:
# Demo forward pass in inference mode. `eng_features` must be passed by keyword:
# CustomModel.forward reads it from its keyword arguments, so a positional
# argument raises TypeError.
model.eval()
with torch.no_grad():
    outputs = model(eng_features=eng_features, **inputs)

In [None]:
outputs

In [None]:
logits = outputs[:,0].clip(0,class_weights.shape[0] - 1).round().detach().cpu()
logits

In [None]:
labels = labels.detach().cpu()
labels

In [None]:
cohen_kappa_score(y1=logits,y2=labels,labels=labels_arr,weights="quadratic")

## Data collator

A data collator prepares batches of data for training or inference in machine learning, ensuring uniform formatting and adherence to model input requirements. This is especially crucial for variable-sized inputs like text sequences.

Functions of Data Collator

1.- Padding: Uniformly pads sequences to the length of the longest sequence using a special token, allowing simultaneous batch processing.

2.- Batching: Groups individual data points into batches for efficient processing.

3.- Handling Special Tokens: Adds necessary special tokens to sequences.

4.- Converting to Tensor: Transforms data into tensors, the required format for machine learning frameworks.

In [None]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer,max_length=config["max_length"])

## Evaluation metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute regression and ordinal-classification metrics for an evaluation run.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` is a 2-D
            array whose first column holds the predicted score.

    Returns:
        Dict with ``r2`` (on the raw predictions) and ``QWK``, ``acc`` and
        ``balanced acc`` (on the predictions clipped to ``1..n_classes`` and rounded).
    """
    predictions, labels = eval_pred
    predictions = predictions[:, 0]

    # R^2 is measured on the continuous output, before any rounding.
    r2 = r2_score(labels, predictions)

    n_classes = class_weights.shape[0]
    predictions = predictions.clip(1, n_classes).round().astype(int)
    labels = np.asarray(labels).round().astype(int)

    # Build the label set from the values actually present. The previous call
    # passed labels=0..n_classes-1 while predictions were clipped to
    # 1..n_classes, so every sample at n_classes was silently left out of the
    # confusion matrix. A contiguous integer range keeps the quadratic
    # distances between scores correct; classes with no samples do not change kappa.
    lowest = int(min(labels.min(), predictions.min()))
    highest = int(max(labels.max(), predictions.max()))
    score_range = np.arange(lowest, highest + 1)

    qwk = cohen_kappa_score(y1=predictions, y2=labels, labels=score_range, weights="quadratic")
    acc = accuracy_score(labels, predictions)
    balanced_acc = balanced_accuracy_score(labels, predictions)

    return {
        "r2": r2,
        "QWK": qwk,
        "acc": acc,
        "balanced acc": balanced_acc,
    }
    

    



## Training Model

In [None]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        """Create the trainer and store optional per-class weights.

        Args:
            *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
            class_weights: optional sequence/array/tensor of class weights; it
                is stored as a float32 tensor on the notebook-level ``device``,
                or as ``None`` when not given.
        """
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            # as_tensor handles lists, arrays and tensors of any dtype/device
            # without re-wrapping an existing tensor (the legacy
            # isinstance(..., torch.FloatTensor) check missed CUDA and float64 tensors).
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(device)


    def evaluate(
            self,
            eval_dataset = None,
            ignore_keys = None,
            metric_key_prefix: str = "eval",
        ): 
            """Run one evaluation pass and return its metrics.

            Builds the evaluation dataloader for ``eval_dataset``, runs either the
            legacy prediction loop or this class's ``evaluation_loop``, adds speed
            metrics, logs the result and fires the ``on_evaluate`` callbacks.

            Args:
                eval_dataset: dataset handed to ``get_eval_dataloader``.
                ignore_keys: forwarded to the evaluation loop.
                metric_key_prefix: prefix for the reported metric names.

            Returns:
                The metrics dict of the evaluation loop output.
            """
            # memory metrics - must set up as early as possible
            self._memory_tracker.start()

            eval_dataloader = self.get_eval_dataloader(eval_dataset)
            start_time  = time.time()

            # Pick the loop implementation according to the training arguments.
            eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop

            output = eval_loop(
            eval_dataloader,
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
            )

            # Throughput figures are based on the global batch size (per-process size x world size).
            total_batch_size = self.args.eval_batch_size * self.args.world_size
            output.metrics.update(
                speed_metrics(
                    metric_key_prefix,
                    start_time,
                    num_samples=output.num_samples,
                    num_steps=math.ceil(output.num_samples / total_batch_size),
                )
            )

            self.log(output.metrics)

            #if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            #    # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            #    xm.master_print(met.metrics_report())

            self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

            self._memory_tracker.stop_and_update_metrics(output.metrics)

            return output.metrics


            
    
    def evaluation_loop(self,dataloader : DataLoader,prediction_loss_only: Optional[bool] = None,
                        ignore_keys : Optional[List[str]] = None, metric_key_prefix : str = "eval"):
        """Iterate over ``dataloader``, collect losses/predictions/labels and compute metrics.

        Each batch is split with ``input_split_fn`` into encoder inputs,
        engineered features and labels, re-assembled into one dict and passed to
        ``prediction_step``.

        Args:
            dataloader: evaluation dataloader.
            prediction_loss_only: when None, falls back to ``self.args.prediction_loss_only``.
            ignore_keys: forwarded to ``prediction_step``.
            metric_key_prefix: prefix added to every metric name.

        Returns:
            ``EvalLoopOutput`` with predictions, label ids, metrics and the number of samples.
        """
        
        prediction_loss_only = (
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
        )


        # if eval is called w/o train init deepspeed here
        if self.args.deepspeed and not self.deepspeed:

            # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
            # from the checkpoint eventually
            deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None)
            self.model = deepspeed_engine.module
            self.model_wrapped = deepspeed_engine
            self.deepspeed = deepspeed_engine
            # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since
            # for example the Z3-optimizer is a must for zero3 to work even for inference - what we
            # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer
            deepspeed_engine.optimizer.optimizer = None
            deepspeed_engine.lr_scheduler = None

        
        model = self._wrap_model(self.model, training=False)

        # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while
        # ``train`` is running, halve it first and then put on device
        if not self.is_in_train and self.args.fp16_full_eval:
            model = model.half().to(self.args.device)

        # NOTE(review): this is the notebook-level batch size, not the dataloader's; the
        # per-batch loss is repeated this many times below — confirm it matches the eval batch size.
        batch_size = config["batch_size"]

        model.eval()

        self.callback_handler.eval_dataloader = dataloader

        # Do this before wrapping.
        eval_dataset = dataloader.dataset

        if self.args.past_index >= 0:
            self._past = None

        # Initialize containers
        # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
        losses_host = None
        preds_host = None
        labels_host = None
        # losses/preds/labels on CPU (final containers)
        all_losses = None
        all_preds = None
        all_labels = None
        # Will be useful when we have an iterable dataset so don't know its length.


        observed_num_examples = 0
        # Main evaluation loop
        # One dict is reused for every step; its keys are overwritten with the current batch.
        input_dic = {}
        for step, inputs in enumerate(dataloader):
            # Update the observed num examples
            observed_batch_size = find_batch_size(inputs)

            if observed_batch_size is not None:
                observed_num_examples += observed_batch_size
            # Update the observed num examples
            #split input
            inputs, eng_features, class_labels = input_split_fn(inputs)
            #inputs, class_labels = inputs.get('input_ids'),inputs.get('labels')
            input_dic.update({
                k : v for k,v in inputs.items()
            })

            input_dic.update({
                "eng_features" : eng_features
            })

            input_dic.update({
                "labels" : class_labels
            })

            # Prediction step
            loss, logits, labels = self.prediction_step(model, input_dic, prediction_loss_only, ignore_keys=ignore_keys)

            # Update containers on host
            if loss is not None:
                # The scalar batch loss is repeated `batch_size` times before gathering.
                losses = self._nested_gather(loss.repeat(batch_size))
                losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
            if logits is not None:
                #logits = self._pad_across_processes(logits)
                logits = self._nested_gather(logits)
                preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
            if labels is not None:
                #labels = self._pad_across_processes(labels)
                labels = self._nested_gather(labels)
                labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
            self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

            # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
            if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0:
                if losses_host is not None:
                    losses = nested_numpify(losses_host)
                    all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
                if preds_host is not None:
                    logits = nested_numpify(preds_host)
                    all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
                if labels_host is not None:
                    labels = nested_numpify(labels_host)
                    all_labels = (
                        labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
                    )

                # Set back to None to begin a new accumulation
                losses_host, preds_host, labels_host = None, None, None

        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop
            delattr(self, "_past")

        # Gather all remaining tensors and put them back on the CPU
        if losses_host is not None:
            losses = nested_numpify(losses_host)
            all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
        if preds_host is not None:
            logits = nested_numpify(preds_host)
            all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
        if labels_host is not None:
            labels = nested_numpify(labels_host)
            all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)

        # Number of samples
        if not isinstance(eval_dataset, IterableDataset):
            num_samples = len(eval_dataset)
        # The instance check is weird and does not actually check for the type, but whether the dataset has the right
        # methods. Therefore we need to make sure it also has the attribute.
        elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"):
            num_samples = eval_dataset.num_examples
        else:
            num_samples = observed_num_examples

        # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of
        # samplers has been rounded to a multiple of batch_size, so we truncate.
        if all_losses is not None:
            # Truncation of the losses is disabled here; predictions and labels are still truncated.
            pass
            #all_losses = all_losses[:num_samples]
        if all_preds is not None:
            all_preds = nested_truncate(all_preds, num_samples)
        if all_labels is not None:
            all_labels = nested_truncate(all_labels, num_samples)

        # Metrics!
        if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
            metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
        else:
            metrics = {}

        # To be JSON-serializable, we need to remove numpy types or zero-d tensors
        metrics = denumpify_detensorize(metrics)

        if all_losses is not None:
            metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)

            
    def prediction_step(
        self,
        model: nn.Module,
        inputs,
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on :obj:`model` using obj:`inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (:obj:`bool`):
                Whether or not to return the loss only.
            ignore_keys (:obj:`Lst[str]`, `optional`):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        """
        # Labels are considered present only if every configured label name has a non-None entry.
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
        if has_labels:
            labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with torch.no_grad():
            if is_sagemaker_mp_enabled():
                # SageMaker model-parallel path: forward only, then split loss and logits.
                raw_outputs = smp_forward_only(model, inputs)
                if has_labels:
                    if isinstance(raw_outputs, dict):
                        loss_mb = raw_outputs["loss"]
                        logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
                    else:
                        loss_mb = raw_outputs[0]
                        logits_mb = raw_outputs[1:]

                    loss = loss_mb.reduce_mean().detach().cpu()
                    logits = smp_nested_concat(logits_mb)
                else:
                    loss = None
                    if isinstance(raw_outputs, dict):
                        logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
                    else:
                        logits_mb = raw_outputs
                    logits = smp_nested_concat(logits_mb)
            else:
                if has_labels:
                    # With labels, the loss and the model outputs both come from compute_loss.
                    loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
                    loss = loss.mean().detach()
                    if isinstance(outputs, dict):
                        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                    else:
                        # Non-dict outputs are used as the logits directly.
                        logits = outputs
                else:
                    loss = None
                    # NOTE(review): `self.use_amp` is not defined anywhere in this notebook —
                    # confirm the installed Trainer still provides it.
                    if self.use_amp:
                        with autocast():
                            outputs = model(**inputs)
                    else:
                        outputs = model(**inputs)
                    if isinstance(outputs, dict):
                        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                    else:
                        logits = outputs
                    # TODO: this needs to be fixed and made cleaner later.
                    if self.args.past_index >= 0:
                        self._past = outputs[self.args.past_index - 1]

        if prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        # NOTE(review): when `logits` is a plain tensor, len() is its first dimension, so a
        # batch of one sample is unwrapped to its single row here — confirm this is intended.
        if len(logits) == 1:
            logits = logits[0]

        return (loss, logits, labels)





    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the mean-squared-error (regression) loss for one batch.

        Args:
            model: Callable invoked as ``model(**model_inputs)``. Its return value
                is handed straight to ``F.mse_loss`` as the prediction, so it is
                expected to be a tensor (NOTE(review): confirm against the
                model's ``forward``).
            inputs: Batch mapping. When it already contains ``"eng_features"`` it
                must also contain ``"labels"``. Otherwise it is passed to
                ``input_split_fn``, which returns the model inputs, the
                engineered features and the labels.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.

        Returns:
            The MSE loss tensor, or ``(loss, outputs)`` if ``return_outputs``
            is True.

        Raises:
            NotImplementedError: If ``self.class_weights`` is set; class
                weighting is not supported for regression.
            KeyError: If ``inputs`` has ``"eng_features"`` but no ``"labels"``.
        """
        # Fail fast: reject the unsupported configuration before paying for a
        # forward pass.
        if self.class_weights is not None:
            raise NotImplementedError("Class weights is deprecated in Regression problems")

        if "eng_features" not in inputs:
            # Raw batch: let the project helper separate features and labels.
            split_inputs, eng_features, labels = input_split_fn(inputs)
            model_inps = dict(split_inputs.items())
            model_inps["eng_features"] = eng_features
        else:
            # Pre-split batch: build a new mapping without the labels instead of
            # popping them, so the caller's batch is left untouched.
            labels = inputs["labels"]
            model_inps = {key: value for key, value in inputs.items() if key != "labels"}

        # Forward pass
        outputs = model(**model_inps)

        # MSE needs a floating-point target; integer scores are cast to the
        # prediction dtype.
        if not torch.is_floating_point(labels):
            labels = labels.to(outputs.dtype)

        # Guard against silent broadcasting, e.g. (B, 1) predictions vs (B,)
        # labels would otherwise be expanded to (B, B) and yield a wrong loss.
        if labels.shape != outputs.shape and labels.numel() == outputs.numel():
            labels = labels.reshape(outputs.shape)

        loss = F.mse_loss(outputs, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
# One seed value, reused below for both `seed` and `data_seed`.
seed = 10
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=config["model_dir"],
    logging_dir=config["log_dir"],
    # Optimisation hyper-parameters
    learning_rate=config["lr"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    num_train_epochs=config["epochs"],
    weight_decay=0.01,
    # Evaluate, log and checkpoint once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Reload the best checkpoint (selected by the "QWK" metric) when training ends
    load_best_model_at_end=True,
    metric_for_best_model="QWK",
    # Keep every dataset column available to the collator
    remove_unused_columns=False,
    # Reproducibility
    seed=seed,
    data_seed=seed,
)

In [None]:
# Build the trainer on the train/val splits. Early stopping is configured with a
# patience of 3 and an improvement threshold of 0.005 on the tracked metric.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.005),
    ],
)

# prediction_step looks labels up by the names in `label_names`. Register
# "labels" only if it is missing: a duplicate entry would make prediction_step
# return the labels as a 2-tuple instead of a single tensor.
if "labels" not in trainer.label_names:
    trainer.label_names.append("labels")

In [None]:
# Run the training loop with the trainer built above; the returned object is kept
# in `train_result` for later inspection.
train_result = trainer.train()
     

In [None]:
# Release unused GPU memory cached by PyTorch's allocator now that training is done.
torch.cuda.empty_cache()