In [13]:
pip install accelerate



In [14]:
pip install sentencepiece



In [15]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sentencepiece
import gc

from accelerate import Accelerator

# ----------
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [16]:
config = {
    'model': 'microsoft/deberta-v3-base',
    'dropout': 0.5,
    'max_length': 512,
    'batch_size': 8, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 10,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [17]:
access_token = "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"
tokenizer = AutoTokenizer.from_pretrained(config['model'], token= access_token)



In [18]:
# Importar los datos de drive
from google.colab import drive
from google.colab import files
import io
drive.mount('/content/drive')

train = pd.read_csv('/content/drive/MyDrive/Tesis/train_modificado.csv')
test = pd.read_csv('/content/drive/MyDrive/Tesis/test_modificado.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
class EssayDataset:
    def __init__(self, df, config, tokenizer=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.classes = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __getitem__(self,idx):
        sample = self.df['full_text'][idx]
        tokenized = tokenizer.encode_plus(sample,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }

        if self.is_test == True:
            return inputs

        label = self.df.loc[idx,self.classes].to_list()
        targets = {
            "labels": torch.tensor(label, dtype=torch.float32),
        }

        return inputs, targets

    def __len__(self):
        return len(self.df)

#Caso 1 conjunto de validación

In [20]:
# Divide los datos en conjuntos de entrenamiento y validación
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)
print('dataframe shapes:',train_df.shape, val_df.shape)

dataframe shapes: (2815, 50) (313, 50)


In [21]:
train_ds = EssayDataset(train_df, config, tokenizer=tokenizer)
val_ds = EssayDataset(val_df, config, tokenizer=tokenizer)
test_ds = EssayDataset(test, config, tokenizer=tokenizer, is_test = True)

In [22]:
test_ds[0]

{'input_ids': tensor([    1,   278,  5973,   264,   286,   266, 58585, 40955,  4389,   335,
           274,   333,   298,   286,   266, 58585, 40955,  1727, 38710,   274,
           298,  2435,  5973,   262,   355,   263,   306,   551,   267,  5699,
         90999, 10339,   584,  1757,   264,   291,  1548,   401,   335,   262,
           355,   286,   266, 58585, 40955,  4389,   306,   295, 19315, 20198,
           266,   688,   263,   264,   286,   266,  1453,  4389,   269,   379,
           539,   267,   432,   263,   327,   264,   286,   266,  5973,  4389,
           269,   539,   267,   262,  7169,  4673, 13282,   362,   584,   374,
           267,   266,   961,  1867,   297, 45147,   263,   584,   489,   286,
           266,  5973,  4389,   275,   262, 77042,   268,   270,   738,   584,
           394,   266,  2140,   361,   374,   267,   262,   961, 35521, 59722,
           297,   373,   489,   286,   266,  2330,  4389,   263,   584,   428,
           272,   269,   298,  5973,   

In [23]:
train_loader = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config['batch_size'],
                                           shuffle=True,
                                           num_workers=2,
                                           pin_memory=True
                                          )
val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=True,
                                         num_workers=2,
                                         pin_memory=True
                                        )


#Model

In [24]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [25]:
class EssayModel(nn.Module):
    def __init__(self,config,num_classes=6):
        super(EssayModel,self).__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']

        self.encoder = AutoModel.from_pretrained(self.model_name)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False

        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64)
        self.fc2 = nn.Linear(64,num_classes)


    def forward(self,inputs):
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask'])
        outputs = self.fc1(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [26]:
class Trainer:
    def __init__(self, model, loaders, config, accelerator):
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','token_type_ids','attention_mask']
        self.accelerator = accelerator

        self.optim = self._get_optim()

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.train_losses = []
        self.val_losses = []
        self.best_val_loss = float('inf')
        self.best_model_weights = None

    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model,
            self.optim,
            self.train_loader,
            self.val_loader,
            self.scheduler
        )

    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer


    def loss_fn(self, outputs, targets):
        colwise_rmse = torch.sqrt(torch.mean(torch.square(targets - outputs), dim=0))
        loss = torch.mean(colwise_rmse, dim=0)
        return loss


    def train_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))

        for idx,(inputs,targets) in enumerate(progress):
            with self.accelerator.accumulate(self.model):

                outputs = self.model(inputs)

                loss = self.loss_fn(outputs, targets['labels'])
                running_loss += loss.item()

                self.accelerator.backward(loss)

                self.optim.step()

                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                self.optim.zero_grad()

                del inputs, targets, outputs, loss


        train_loss = running_loss/len(self.train_loader)
        self.train_losses.append(train_loss)

    @torch.no_grad()
    def valid_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.val_loader, total=len(self.val_loader))

        for (inputs, targets) in progress:

            outputs = self.model(inputs)

            loss = self.loss_fn(outputs, targets['labels'])
            running_loss += loss.item()

            del inputs, targets, outputs, loss


        val_loss = running_loss/len(self.val_loader)
        self.val_losses.append(val_loss)
        if running_loss < self.best_val_loss:
            self.best_val_loss = running_loss
            self.best_model_weights = self.model.state_dict().copy()  # Guarda los pesos del modelo


    def test(self, test_loader):

        preds = []
        for (inputs) in test_loader:

            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())

        preds = torch.concat(preds)
        return preds

    def fit(self):

        self.prepare()

        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            leave = True,
            desc="Training..."
        )

        for epoch in fit_progress:

            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch)
            self.clear()

            self.model.eval()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch)
            self.clear()

            print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            print(f"train loss: {self.train_losses[-1]}")
            print(f"valid loss: {self.val_losses[-1]}\n\n")

        # Al final del entrenamiento, restablece el modelo a su mejor estado
        if self.best_model_weights is not None:
            self.model.load_state_dict(self.best_model_weights)



    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

    def predict(self, test_loader):
        """ Realiza predicciones en un conjunto de datos de prueba. """
        self.model.eval()
        predictions = []
        for batch in test_loader:
            inputs = {key: val.to(self.accelerator.device) for key, val in batch.items()}
            outputs = self.model(inputs)
            predictions.append(outputs.detach().cpu().numpy())

        return np.concatenate(predictions, axis=0)

    def save_model(self, file_path):
        """ Guarda el modelo entrenado. """
        torch.save(self.model.state_dict(), file_path)

    def load_model(self, file_path):
        """ Carga los pesos del modelo desde un archivo. """
        # Cargar los pesos guardados en el modelo
        self.model.load_state_dict(torch.load(file_path))

        # Asegúrate de llamar a .to(device) para mover el modelo al dispositivo correcto
        self.model.to(self.accelerator.device)

        print(f"Modelo cargado desde {file_path}")


#Training with HuggingFace Accelerate

In [27]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [28]:
%%time
model = EssayModel(config).to(device=accelerator.device )
trainer = Trainer(model, (train_loader, val_loader), config, accelerator)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

CPU times: user 2.49 s, sys: 1.21 s, total: 3.7 s
Wall time: 32.1 s


In [29]:
%%time
trainer.fit()

Training...:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 1 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.7590361891991713
valid loss: 0.48786726370453837




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 2 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4887453227049925
valid loss: 0.4860869340598583




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 3 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.47627955429594626
valid loss: 0.4806759923696518




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 4 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4689658864993941
valid loss: 0.4766233488917351




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 5 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4653329460627653
valid loss: 0.48293154016137124




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 6 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.47194141217253427
valid loss: 0.49199696704745294




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 7 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.46945209123871545
valid loss: 0.468280016630888




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 8 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4604199164631692
valid loss: 0.47205370366573335




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 9 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.45461423542689194
valid loss: 0.4712636634707451




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 10 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4541562015021389
valid loss: 0.47632413133978846


CPU times: user 6min 31s, sys: 6.33 s, total: 6min 37s
Wall time: 6min 32s


#Predicciones del conjunto de prueba

In [30]:
test_loader = torch.utils.data.DataLoader(test_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=False,
                                         num_workers=2,
                                         pin_memory=True
                                        )

In [31]:
predictions = trainer.predict(test_loader)

In [32]:
predictions

array([[2.7116117, 2.4776025, 2.7377305, 2.6017485, 2.3549724, 2.2300577],
       [3.3457706, 3.3796167, 3.4795702, 3.4163852, 3.649065 , 3.4628205],
       [3.8078036, 3.7223563, 3.8940644, 3.9821708, 3.9525478, 3.8748775],
       ...,
       [3.2647026, 3.3814397, 3.4991179, 3.233262 , 3.4052997, 3.3348749],
       [3.7852063, 3.6530154, 3.7854352, 3.8752232, 3.5369692, 3.6974933],
       [2.9166768, 2.5689232, 2.9430265, 2.649731 , 2.2574408, 2.7287843]],
      dtype=float32)

# Guardar el modelo

In [33]:
model_save_path = '/content/drive/MyDrive/Tesis/deberta_base_v1.pth'
trainer.save_model(model_save_path)

# Evaluación  en el conjunto de prueba (no visto antes por el modelo)

In [34]:
preds = pd.DataFrame(predictions, columns = ["cohesion_pred",	"syntax_pred",	"vocabulary_pred",	"phraseology_pred",	"grammar_pred",	"conventions_pred"])

In [36]:
preresults =pd.concat([test.loc[:,"cohesion":"conventions"].reset_index(drop=True), preds], axis=1);preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred
0,3.0,2.5,2.5,2.0,2.0,2.0,2.711612,2.477602,2.737731,2.601748,2.354972,2.230058
1,3.0,2.0,3.0,3.5,3.0,3.0,3.345771,3.379617,3.479570,3.416385,3.649065,3.462821
2,4.0,4.0,3.0,4.0,4.0,4.0,3.807804,3.722356,3.894064,3.982171,3.952548,3.874877
3,3.0,3.0,3.5,3.0,3.5,3.5,3.290562,3.329687,3.444086,3.421665,3.508954,3.466555
4,3.5,3.5,3.5,3.5,3.0,3.5,3.312456,3.173325,3.312030,3.243997,3.210511,2.976244
...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,3.261501,3.343655,3.462113,3.587848,3.461044,2.872113
779,2.5,2.5,3.0,3.0,2.5,2.5,3.069801,3.086106,3.060426,3.274357,3.319246,3.297066
780,2.0,3.0,3.0,3.0,3.0,2.5,3.264703,3.381440,3.499118,3.233262,3.405300,3.334875
781,4.0,3.5,4.0,3.5,3.5,4.0,3.785206,3.653015,3.785435,3.875223,3.536969,3.697493


In [37]:
preresults["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
preresults["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
preresults["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
preresults["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
preresults["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
preresults["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [38]:
results = pd.DataFrame()
results["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
results["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
results["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
results["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
results["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
results["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [39]:
preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred,se_cohesion,se_syntax,se_vocabulary,se_phraseology,se_grammar,se_conventions
0,3.0,2.5,2.5,2.0,2.0,2.0,2.711612,2.477602,2.737731,2.601748,2.354972,2.230058,0.083168,0.000502,0.056516,0.362101,0.126005,0.052927
1,3.0,2.0,3.0,3.5,3.0,3.0,3.345771,3.379617,3.479570,3.416385,3.649065,3.462821,0.119557,1.903342,0.229988,0.006991,0.421285,0.214203
2,4.0,4.0,3.0,4.0,4.0,4.0,3.807804,3.722356,3.894064,3.982171,3.952548,3.874877,0.036939,0.077086,0.799351,0.000318,0.002252,0.015656
3,3.0,3.0,3.5,3.0,3.5,3.5,3.290562,3.329687,3.444086,3.421665,3.508954,3.466555,0.084426,0.108694,0.003126,0.177801,0.000080,0.001119
4,3.5,3.5,3.5,3.5,3.0,3.5,3.312456,3.173325,3.312030,3.243997,3.210511,2.976244,0.035173,0.106716,0.035333,0.065538,0.044315,0.274320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,3.261501,3.343655,3.462113,3.587848,3.461044,2.872113,1.591385,0.711754,0.213549,0.345565,0.001518,0.760581
779,2.5,2.5,3.0,3.0,2.5,2.5,3.069801,3.086106,3.060426,3.274357,3.319246,3.297066,0.324673,0.343520,0.003651,0.075272,0.671164,0.635314
780,2.0,3.0,3.0,3.0,3.0,2.5,3.264703,3.381440,3.499118,3.233262,3.405300,3.334875,1.599473,0.145496,0.249119,0.054411,0.164268,0.697016
781,4.0,3.5,4.0,3.5,3.5,4.0,3.785206,3.653015,3.785435,3.875223,3.536969,3.697493,0.046136,0.023414,0.046038,0.140792,0.001367,0.091510


In [40]:
mse = results.mean();mse

se_cohesion       0.276020
se_syntax         0.249912
se_vocabulary     0.218828
se_phraseology    0.249885
se_grammar        0.236744
se_conventions    0.237024
dtype: float64

In [43]:
rmse = np.sqrt(mse);rmse

se_cohesion       0.525376
se_syntax         0.499912
se_vocabulary     0.467791
se_phraseology    0.499885
se_grammar        0.486563
se_conventions    0.486851
dtype: float64

In [44]:
rmse.mean()

0.49439620267019246

#Cargar el modelo

In [45]:
#trainer = Trainer(model, (train_loader, val_loader), config, accelerator)
model_load_path = '/content/drive/MyDrive/Tesis/deberta_base_v1.pth'

# Cargar el modelo previamente entrenado
trainer.load_model(model_load_path)

Modelo cargado desde /content/drive/MyDrive/Tesis/deberta_base_v1.pth
