In [1]:
pip install accelerate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [2]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sentencepiece
import gc

from accelerate import Accelerator

# ----------
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
config = {
    'model': 'microsoft/deberta-v3-large',
    'dropout': 0.5,
    'max_length': 512,
    'batch_size': 8, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 10,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [5]:
access_token = "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"
tokenizer = AutoTokenizer.from_pretrained(config['model'], token= access_token)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
# Importar los datos de drive
from google.colab import drive
from google.colab import files
import io
drive.mount('/content/drive')

train = pd.read_csv('/content/drive/MyDrive/Tesis/train_modificado.csv')
test = pd.read_csv('/content/drive/MyDrive/Tesis/test_modificado.csv')

Mounted at /content/drive


In [7]:
class EssayDataset:
    def __init__(self, df, config, tokenizer=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.classes = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __getitem__(self,idx):
        sample = self.df['full_text'][idx]
        tokenized = tokenizer.encode_plus(sample,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }

        if self.is_test == True:
            return inputs

        label = self.df.loc[idx,self.classes].to_list()
        targets = {
            "labels": torch.tensor(label, dtype=torch.float32),
        }

        return inputs, targets

    def __len__(self):
        return len(self.df)

#Caso 1 conjunto de validación

In [8]:
# Divide los datos en conjuntos de entrenamiento y validación
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)
print('dataframe shapes:',train_df.shape, val_df.shape)

dataframe shapes: (2815, 50) (313, 50)


In [9]:
train_ds = EssayDataset(train_df, config, tokenizer=tokenizer)
val_ds = EssayDataset(val_df, config, tokenizer=tokenizer)
test_ds = EssayDataset(test, config, tokenizer=tokenizer, is_test = True)

In [10]:
test_ds[0]

{'input_ids': tensor([    1,   278,  5973,   264,   286,   266, 58585, 40955,  4389,   335,
           274,   333,   298,   286,   266, 58585, 40955,  1727, 38710,   274,
           298,  2435,  5973,   262,   355,   263,   306,   551,   267,  5699,
         90999, 10339,   584,  1757,   264,   291,  1548,   401,   335,   262,
           355,   286,   266, 58585, 40955,  4389,   306,   295, 19315, 20198,
           266,   688,   263,   264,   286,   266,  1453,  4389,   269,   379,
           539,   267,   432,   263,   327,   264,   286,   266,  5973,  4389,
           269,   539,   267,   262,  7169,  4673, 13282,   362,   584,   374,
           267,   266,   961,  1867,   297, 45147,   263,   584,   489,   286,
           266,  5973,  4389,   275,   262, 77042,   268,   270,   738,   584,
           394,   266,  2140,   361,   374,   267,   262,   961, 35521, 59722,
           297,   373,   489,   286,   266,  2330,  4389,   263,   584,   428,
           272,   269,   298,  5973,   

In [11]:
train_loader = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config['batch_size'],
                                           shuffle=True,
                                           num_workers=2,
                                           pin_memory=True
                                          )
val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=True,
                                         num_workers=2,
                                         pin_memory=True
                                        )


#Model

In [12]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [13]:
class EssayModel(nn.Module):
    def __init__(self,config,num_classes=6):
        super(EssayModel,self).__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']

        self.encoder = AutoModel.from_pretrained(self.model_name)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False

        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64)
        self.fc2 = nn.Linear(64,num_classes)


    def forward(self,inputs):
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask'])
        outputs = self.fc1(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [14]:
class Trainer:
    def __init__(self, model, loaders, config, accelerator):
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','token_type_ids','attention_mask']
        self.accelerator = accelerator

        self.optim = self._get_optim()

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.train_losses = []
        self.val_losses = []
        self.best_val_loss = float('inf')
        self.best_model_weights = None

    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model,
            self.optim,
            self.train_loader,
            self.val_loader,
            self.scheduler
        )

    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer


    def loss_fn(self, outputs, targets):
        colwise_rmse = torch.sqrt(torch.mean(torch.square(targets - outputs), dim=0))
        loss = torch.mean(colwise_rmse, dim=0)
        return loss


    def train_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))

        for idx,(inputs,targets) in enumerate(progress):
            with self.accelerator.accumulate(self.model):

                outputs = self.model(inputs)

                loss = self.loss_fn(outputs, targets['labels'])
                running_loss += loss.item()

                self.accelerator.backward(loss)

                self.optim.step()

                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                self.optim.zero_grad()

                del inputs, targets, outputs, loss


        train_loss = running_loss/len(self.train_loader)
        self.train_losses.append(train_loss)

    @torch.no_grad()
    def valid_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.val_loader, total=len(self.val_loader))

        for (inputs, targets) in progress:

            outputs = self.model(inputs)

            loss = self.loss_fn(outputs, targets['labels'])
            running_loss += loss.item()

            del inputs, targets, outputs, loss


        val_loss = running_loss/len(self.val_loader)
        self.val_losses.append(val_loss)
        if running_loss < self.best_val_loss:
            self.best_val_loss = running_loss
            self.best_model_weights = self.model.state_dict().copy()  # Guarda los pesos del modelo


    def test(self, test_loader):

        preds = []
        for (inputs) in test_loader:

            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())

        preds = torch.concat(preds)
        return preds

    def fit(self):

        self.prepare()

        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            leave = True,
            desc="Training..."
        )

        for epoch in fit_progress:

            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch)
            self.clear()

            self.model.eval()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch)
            self.clear()

            print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            print(f"train loss: {self.train_losses[-1]}")
            print(f"valid loss: {self.val_losses[-1]}\n\n")

        # Al final del entrenamiento, restablece el modelo a su mejor estado
        if self.best_model_weights is not None:
            self.model.load_state_dict(self.best_model_weights)



    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

    def predict(self, test_loader):
        """ Realiza predicciones en un conjunto de datos de prueba. """
        self.model.eval()
        predictions = []
        for batch in test_loader:
            inputs = {key: val.to(self.accelerator.device) for key, val in batch.items()}
            outputs = self.model(inputs)
            predictions.append(outputs.detach().cpu().numpy())

        return np.concatenate(predictions, axis=0)

    def save_model(self, file_path):
        """ Guarda el modelo entrenado. """
        torch.save(self.model.state_dict(), file_path)

    def load_model(self, file_path):
        """ Carga los pesos del modelo desde un archivo. """
        # Cargar los pesos guardados en el modelo
        self.model.load_state_dict(torch.load(file_path))

        # Asegúrate de llamar a .to(device) para mover el modelo al dispositivo correcto
        self.model.to(self.accelerator.device)

        print(f"Modelo cargado desde {file_path}")


#Training with HuggingFace Accelerate

In [15]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [16]:
%%time
model = EssayModel(config).to(device=accelerator.device )
trainer = Trainer(model, (train_loader, val_loader), config, accelerator)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

CPU times: user 5.05 s, sys: 2.82 s, total: 7.87 s
Wall time: 9.92 s


In [17]:
%%time
trainer.fit()

Training...:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 1 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.7210800313306126
valid loss: 0.5016659833490849




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 2 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4729707364670255
valid loss: 0.4761117957532406




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 3 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4598547150804238
valid loss: 0.47024282440543175




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 4 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.45394532433287665
valid loss: 0.4708101458847523




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 5 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.45141798664223065
valid loss: 0.4694915361702442




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 6 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4598820624365048
valid loss: 0.4581733845174313




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 7 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.45731759452345694
valid loss: 0.47194369286298754




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 8 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.45229712467301975
valid loss: 0.462417796254158




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 9 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.44630738779563794
valid loss: 0.4616143897175789




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 10 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.44410167719152843
valid loss: 0.4638282284140587


CPU times: user 17min 12s, sys: 1min 50s, total: 19min 3s
Wall time: 18min 55s


#Predicciones del conjunto de prueba

In [18]:
test_loader = torch.utils.data.DataLoader(test_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=False,
                                         num_workers=2,
                                         pin_memory=True
                                        )

In [19]:
predictions = trainer.predict(test_loader)

In [20]:
predictions

array([[2.6325886, 2.478416 , 2.760865 , 2.5536637, 2.442975 , 2.2630107],
       [3.3444936, 3.2869647, 3.3979805, 3.4237866, 3.7357552, 3.5363696],
       [3.5798564, 3.6783319, 3.8625405, 3.9773374, 4.035672 , 3.854491 ],
       ...,
       [3.4089172, 3.3917377, 3.5964983, 3.6191666, 3.6490824, 3.5354903],
       [3.8142467, 3.6808538, 3.8850744, 3.833088 , 3.4459794, 3.6971974],
       [3.036437 , 2.648911 , 3.041634 , 2.7437549, 2.3646538, 2.769757 ]],
      dtype=float32)

# Guardar el modelo

In [21]:
model_save_path = '/content/drive/MyDrive/Tesis/deberta_base_v1.pth'
trainer.save_model(model_save_path)

# Evaluación  en el conjunto de prueba (no visto antes por el modelo)

In [22]:
preds = pd.DataFrame(predictions, columns = ["cohesion_pred",	"syntax_pred",	"vocabulary_pred",	"phraseology_pred",	"grammar_pred",	"conventions_pred"])

In [23]:
preresults =pd.concat([test.loc[:,"cohesion":"conventions"].reset_index(drop=True), preds], axis=1);preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred
0,3.0,2.5,2.5,2.0,2.0,2.0,2.632589,2.478416,2.760865,2.553664,2.442975,2.263011
1,3.0,2.0,3.0,3.5,3.0,3.0,3.344494,3.286965,3.397980,3.423787,3.735755,3.536370
2,4.0,4.0,3.0,4.0,4.0,4.0,3.579856,3.678332,3.862540,3.977337,4.035672,3.854491
3,3.0,3.0,3.5,3.0,3.5,3.5,3.297677,3.239660,3.324011,3.401819,3.505580,3.340407
4,3.5,3.5,3.5,3.5,3.0,3.5,3.370882,3.188081,3.238337,3.339417,3.248489,2.908936
...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,3.009887,3.023702,3.136825,3.277055,3.269771,2.601897
779,2.5,2.5,3.0,3.0,2.5,2.5,2.985060,3.029440,2.981027,3.163830,3.377934,3.195141
780,2.0,3.0,3.0,3.0,3.0,2.5,3.408917,3.391738,3.596498,3.619167,3.649082,3.535490
781,4.0,3.5,4.0,3.5,3.5,4.0,3.814247,3.680854,3.885074,3.833088,3.445979,3.697197


In [24]:
preresults["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
preresults["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
preresults["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
preresults["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
preresults["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
preresults["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [25]:
results = pd.DataFrame()
results["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
results["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
results["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
results["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
results["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
results["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [26]:
preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred,se_cohesion,se_syntax,se_vocabulary,se_phraseology,se_grammar,se_conventions
0,3.0,2.5,2.5,2.0,2.0,2.0,2.632589,2.478416,2.760865,2.553664,2.442975,2.263011,0.134991,0.000466,0.068051,0.306544,0.196227,0.069175
1,3.0,2.0,3.0,3.5,3.0,3.0,3.344494,3.286965,3.397980,3.423787,3.735755,3.536370,0.118676,1.656278,0.158388,0.005808,0.541336,0.287692
2,4.0,4.0,3.0,4.0,4.0,4.0,3.579856,3.678332,3.862540,3.977337,4.035672,3.854491,0.176521,0.103470,0.743976,0.000514,0.001273,0.021173
3,3.0,3.0,3.5,3.0,3.5,3.5,3.297677,3.239660,3.324011,3.401819,3.505580,3.340407,0.088611,0.057437,0.030972,0.161459,0.000031,0.025470
4,3.5,3.5,3.5,3.5,3.0,3.5,3.370882,3.188081,3.238337,3.339417,3.248489,2.908936,0.016671,0.097294,0.068467,0.025787,0.061747,0.349357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,3.009887,3.023702,3.136825,3.277055,3.269771,2.601897,1.019872,0.274264,0.018721,0.076759,0.053005,0.362281
779,2.5,2.5,3.0,3.0,2.5,2.5,2.985060,3.029440,2.981027,3.163830,3.377934,3.195141,0.235283,0.280306,0.000360,0.026840,0.770768,0.483221
780,2.0,3.0,3.0,3.0,3.0,2.5,3.408917,3.391738,3.596498,3.619167,3.649082,3.535490,1.985048,0.153458,0.355810,0.383367,0.421308,1.072240
781,4.0,3.5,4.0,3.5,3.5,4.0,3.814247,3.680854,3.885074,3.833088,3.445979,3.697197,0.034504,0.032708,0.013208,0.110948,0.002918,0.091689


In [27]:
mse = results.mean();mse

se_cohesion       0.247903
se_syntax         0.233467
se_vocabulary     0.196200
se_phraseology    0.232611
se_grammar        0.221706
se_conventions    0.218525
dtype: float64

In [28]:
rmse = np.sqrt(mse);rmse

se_cohesion       0.497899
se_syntax         0.483184
se_vocabulary     0.442945
se_phraseology    0.482298
se_grammar        0.470857
se_conventions    0.467467
dtype: float64

In [29]:
rmse.mean()

0.4741083315341739

#Cargar el modelo

In [None]:
#trainer = Trainer(model, (train_loader, val_loader), config, accelerator)
model_load_path = '/content/drive/MyDrive/Tesis/deberta_large_v1.pth'

# Cargar el modelo previamente entrenado
trainer.load_model(model_load_path)