In [1]:
pip install accelerate



In [2]:
pip install sentencepiece



In [3]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sentencepiece
import gc

from accelerate import Accelerator

# ----------
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
config = {
    'model': 'microsoft/deberta-v3-large',
    'dropout': 0.5,
    'max_length': 512,
    'batch_size': 8, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 10,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [7]:
access_token = "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"
tokenizer = AutoTokenizer.from_pretrained(config['model'], token= access_token)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [9]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
df = pd.read_csv("/content/drive/MyDrive/Tesis/train.csv")
#test0_df = pd.read_csv("/content/drive/MyDrive/Tesis/test.csv")#df.shape

In [11]:
train_ratio = 0.8
test_ratio = 0.2
val_ratio = 0.1

# Divide los datos en conjuntos de entrenamiento, prueba y validación
main_df, test_df = train_test_split(df, test_size= 1 - train_ratio, random_state=42)


In [12]:
main_df.shape

(3128, 8)

In [13]:
class EssayDataset:
    def __init__(self, df, config, tokenizer=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.classes = ['cohesion','syntax','vocabulary','phraseology','grammar','conventions']
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __getitem__(self,idx):
        sample = self.df['full_text'][idx]
        tokenized = tokenizer.encode_plus(sample,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }

        if self.is_test == True:
            return inputs

        label = self.df.loc[idx,self.classes].to_list()
        targets = {
            "labels": torch.tensor(label, dtype=torch.float32),
        }

        return inputs, targets

    def __len__(self):
        return len(self.df)

#Caso 1 conjunto de validación

In [14]:
train_df, val_df = train_test_split(main_df,test_size=val_ratio,random_state=1357,shuffle=True)
print('dataframe shapes:',train_df.shape, val_df.shape)

dataframe shapes: (2815, 8) (313, 8)


In [15]:
train_ds = EssayDataset(train_df, config, tokenizer=tokenizer)
val_ds = EssayDataset(val_df, config, tokenizer=tokenizer)
test_ds = EssayDataset(test_df, config, tokenizer=tokenizer, is_test = True)

In [16]:
test_ds[0]

{'input_ids': tensor([    1,   325,  5973,   264,   286,   266, 58585, 40955,  4389,   335,
           274,   418,   280,   297,   286,   266, 58585, 40955,  1727, 38710,
           274,   298,  2435,  5973,   262,   355,   263,   306,   551,   267,
          5699, 90999, 10339,   260,   273,  1757,   264,   291,  1548,   401,
           335,   262,   355,   286,   266, 58585, 40955,  4389,   306,   295,
         19315, 20198,   266,   688,   263,   264,   286,   266,  1453,  4389,
           269,   379,   539,   267,   432,   263,   327,   264,   286,   266,
          5973,  4389,   269,   539,   267,   262,  7169,  4673, 13282,   260,
          1244,   294,   273,   374,   267,   266,   961,  1867,   297, 45147,
           263,   273,   489,   286,   266,  5973,  4389,   275,   262, 77042,
           268,   270,   738,   261,   273,   394,   266,  2140,   361,   374,
           267,   262,   961, 35521, 59722,   297,   373,   489,   286,   266,
          2330,  4389,   261,   263,   

In [17]:
train_loader = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config['batch_size'],
                                           shuffle=True,
                                           num_workers=2,
                                           pin_memory=True
                                          )
val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=True,
                                         num_workers=2,
                                         pin_memory=True
                                        )


#Model

In [18]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [19]:
class EssayModel(nn.Module):
    def __init__(self,config,num_classes=6):
        super(EssayModel,self).__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']

        self.encoder = AutoModel.from_pretrained(self.model_name,token= access_token)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False

        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64)
        self.fc2 = nn.Linear(64,num_classes)


    def forward(self,inputs):
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask'])
        outputs = self.fc1(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [20]:
class Trainer:
    def __init__(self, model, loaders, config, accelerator):
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','token_type_ids','attention_mask']
        self.accelerator = accelerator

        self.optim = self._get_optim()

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.train_losses = []
        self.val_losses = []
        self.best_val_loss = float('inf')
        self.best_model_weights = None

    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model,
            self.optim,
            self.train_loader,
            self.val_loader,
            self.scheduler
        )

    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer


    def loss_fn(self, outputs, targets):
        colwise_rmse = torch.sqrt(torch.mean(torch.square(targets - outputs), dim=0))
        loss = torch.mean(colwise_rmse, dim=0)
        return loss


    def train_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))

        for idx,(inputs,targets) in enumerate(progress):
            with self.accelerator.accumulate(self.model):

                outputs = self.model(inputs)

                loss = self.loss_fn(outputs, targets['labels'])
                running_loss += loss.item()

                self.accelerator.backward(loss)

                self.optim.step()

                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                self.optim.zero_grad()

                del inputs, targets, outputs, loss


        train_loss = running_loss/len(self.train_loader)
        self.train_losses.append(train_loss)

    @torch.no_grad()
    def valid_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.val_loader, total=len(self.val_loader))

        for (inputs, targets) in progress:

            outputs = self.model(inputs)

            loss = self.loss_fn(outputs, targets['labels'])
            running_loss += loss.item()

            del inputs, targets, outputs, loss


        val_loss = running_loss/len(self.val_loader)
        self.val_losses.append(val_loss)
        if running_loss < self.best_val_loss:
            self.best_val_loss = running_loss
            self.best_model_weights = self.model.state_dict().copy()  # Guarda los pesos del modelo


    def test(self, test_loader):

        preds = []
        for (inputs) in test_loader:

            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())

        preds = torch.concat(preds)
        return preds

    def fit(self):

        self.prepare()

        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            leave = True,
            desc="Training..."
        )

        for epoch in fit_progress:

            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch)
            self.clear()

            self.model.eval()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch)
            self.clear()

            print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            print(f"train loss: {self.train_losses[-1]}")
            print(f"valid loss: {self.val_losses[-1]}\n\n")

        # Al final del entrenamiento, restablece el modelo a su mejor estado
        if self.best_model_weights is not None:
            self.model.load_state_dict(self.best_model_weights)



    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

    def predict(self, test_loader):
        """ Realiza predicciones en un conjunto de datos de prueba. """
        self.model.eval()
        predictions = []
        for batch in test_loader:
            inputs = {key: val.to(self.accelerator.device) for key, val in batch.items()}
            outputs = self.model(inputs)
            predictions.append(outputs.detach().cpu().numpy())

        return np.concatenate(predictions, axis=0)

    def save_model(self, file_path):
        """ Guarda el modelo entrenado. """
        torch.save(self.model.state_dict(), file_path)

    def load_model(self, file_path):
        """ Carga los pesos del modelo desde un archivo. """
        # Cargar los pesos guardados en el modelo
        self.model.load_state_dict(torch.load(file_path))

        # Asegúrate de llamar a .to(device) para mover el modelo al dispositivo correcto
        self.model.to(self.accelerator.device)

        print(f"Modelo cargado desde {file_path}")


#Training with HuggingFace Accelerate

In [21]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [22]:
torch.cuda.is_available()

True

In [23]:
!nvidia-smi

Fri Jan 19 05:17:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              47W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [24]:
%%time
model = EssayModel(config).to(device=accelerator.device )
trainer = Trainer(model, (train_loader, val_loader), config, accelerator)

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

CPU times: user 5.02 s, sys: 2.5 s, total: 7.52 s
Wall time: 11.7 s


In [25]:
%%time
trainer.fit()

Training...:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 1 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.709471162239259
valid loss: 0.46797223314642905




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 2 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4559647766026584
valid loss: 0.4632012270390987




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 3 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.44666856713593006
valid loss: 0.4505397483706474




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 4 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4389815117994493
valid loss: 0.4517032288014889




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 5 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.43621412478387356
valid loss: 0.4466869100928307




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 6 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4492329623211514
valid loss: 0.4424911439418793




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 7 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4420157552442767
valid loss: 0.46406539157032967




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 8 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.43622532596980984
valid loss: 0.4433042176067829




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 9 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.43138949149711564
valid loss: 0.4463036209344864




  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 10 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.4288049481639808
valid loss: 0.4413871198892593


CPU times: user 17min 18s, sys: 1min 49s, total: 19min 7s
Wall time: 18min 59s


#Predicciones del conjunto de prueba

In [26]:
test_loader = torch.utils.data.DataLoader(test_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=False,
                                         num_workers=2,
                                         pin_memory=True
                                        )

In [27]:
predictions = trainer.predict(test_loader)

In [28]:
predictions

array([[2.453324 , 2.380157 , 2.7400353, 2.6173553, 2.4002917, 2.224858 ],
       [3.068142 , 3.0486913, 3.2079198, 3.2479656, 3.4572613, 3.1409218],
       [3.9016776, 3.8475196, 4.054422 , 4.112753 , 4.153437 , 4.169511 ],
       ...,
       [3.4760432, 3.3815591, 3.6754284, 3.537327 , 3.625142 , 3.6150746],
       [3.995781 , 3.9099548, 3.9641156, 3.8919897, 3.5769277, 3.8430085],
       [2.9846303, 2.6429102, 3.0410733, 2.7233336, 2.403415 , 2.7276092]],
      dtype=float32)

# Evaluación  en el conjunto de prueba (no visto antes por el modelo)

In [29]:
#predictions = trainer.predict(test_loader)

In [30]:
preds = pd.DataFrame(predictions, columns = ["cohesion_pred",	"syntax_pred",	"vocabulary_pred",	"phraseology_pred",	"grammar_pred",	"conventions_pred"])

In [31]:
preresults =pd.concat([test_df.loc[:,"cohesion":"conventions"].reset_index(drop=True), preds], axis=1);preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred
0,3.0,2.5,2.5,2.0,2.0,2.0,2.453324,2.380157,2.740035,2.617355,2.400292,2.224858
1,3.0,2.0,3.0,3.5,3.0,3.0,3.068142,3.048691,3.207920,3.247966,3.457261,3.140922
2,4.0,4.0,3.0,4.0,4.0,4.0,3.901678,3.847520,4.054422,4.112753,4.153437,4.169511
3,3.0,3.0,3.5,3.0,3.5,3.5,3.253090,3.246344,3.347249,3.403274,3.461086,3.185355
4,3.5,3.5,3.5,3.5,3.0,3.5,3.161548,3.014560,3.144690,3.184979,3.128968,2.763420
...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,2.905302,2.935480,3.191362,3.183585,3.274277,2.619195
779,2.5,2.5,3.0,3.0,2.5,2.5,3.060672,3.114566,3.065203,3.167434,3.455990,3.247301
780,2.0,3.0,3.0,3.0,3.0,2.5,3.476043,3.381559,3.675428,3.537327,3.625142,3.615075
781,4.0,3.5,4.0,3.5,3.5,4.0,3.995781,3.909955,3.964116,3.891990,3.576928,3.843009


In [32]:
preresults["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
preresults["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
preresults["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
preresults["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
preresults["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
preresults["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [33]:
results = pd.DataFrame()
results["se_cohesion"]=(preresults["cohesion"]-preresults["cohesion_pred"])**2
results["se_syntax"]=(preresults["syntax"]-preresults["syntax_pred"])**2
results["se_vocabulary"]=(preresults["vocabulary"]-preresults["vocabulary_pred"])**2
results["se_phraseology"]=(preresults["phraseology"]-preresults["phraseology_pred"])**2
results["se_grammar"]=(preresults["grammar"]-preresults["grammar_pred"])**2
results["se_conventions"]=(preresults["conventions"]-preresults["conventions_pred"])**2

In [34]:
preresults

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,cohesion_pred,syntax_pred,vocabulary_pred,phraseology_pred,grammar_pred,conventions_pred,se_cohesion,se_syntax,se_vocabulary,se_phraseology,se_grammar,se_conventions
0,3.0,2.5,2.5,2.0,2.0,2.0,2.453324,2.380157,2.740035,2.617355,2.400292,2.224858,0.298855,0.014362,0.057617,0.381128,0.160233,0.050561
1,3.0,2.0,3.0,3.5,3.0,3.0,3.068142,3.048691,3.207920,3.247966,3.457261,3.140922,0.004643,1.099753,0.043231,0.063521,0.209088,0.019859
2,4.0,4.0,3.0,4.0,4.0,4.0,3.901678,3.847520,4.054422,4.112753,4.153437,4.169511,0.009667,0.023250,1.111806,0.012713,0.023543,0.028734
3,3.0,3.0,3.5,3.0,3.5,3.5,3.253090,3.246344,3.347249,3.403274,3.461086,3.185355,0.064054,0.060685,0.023333,0.162630,0.001514,0.099001
4,3.5,3.5,3.5,3.5,3.0,3.5,3.161548,3.014560,3.144690,3.184979,3.128968,2.763420,0.114550,0.235652,0.126245,0.099238,0.016633,0.542550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,2.0,2.5,3.0,3.0,3.5,2.0,2.905302,2.935480,3.191362,3.183585,3.274277,2.619195,0.819572,0.189643,0.036619,0.033704,0.050951,0.383402
779,2.5,2.5,3.0,3.0,2.5,2.5,3.060672,3.114566,3.065203,3.167434,3.455990,3.247301,0.314353,0.377691,0.004251,0.028034,0.913917,0.558459
780,2.0,3.0,3.0,3.0,3.0,2.5,3.476043,3.381559,3.675428,3.537327,3.625142,3.615075,2.178704,0.145587,0.456204,0.288720,0.390803,1.243391
781,4.0,3.5,4.0,3.5,3.5,4.0,3.995781,3.909955,3.964116,3.891990,3.576928,3.843009,0.000018,0.168063,0.001288,0.153656,0.005918,0.024646


In [35]:
mse = results.mean();mse

se_cohesion       0.237262
se_syntax         0.214710
se_vocabulary     0.197547
se_phraseology    0.221157
se_grammar        0.216224
se_conventions    0.205373
dtype: float64

In [36]:
rmse = np.sqrt(mse);rmse

se_cohesion       0.487095
se_syntax         0.463369
se_vocabulary     0.444462
se_phraseology    0.470273
se_grammar        0.464999
se_conventions    0.453181
dtype: float64

In [37]:
rmse.mean()

0.46389660733457133

# Guardar el modelo

In [38]:
model_save_path = '/content/drive/MyDrive/Tesis/deberta_large_v1.pth'
trainer.save_model(model_save_path)

#Cargar el modelo

In [39]:
#trainer = Trainer(model, (train_loader, val_loader), config, accelerator)
model_load_path = '/content/drive/MyDrive/Tesis/deberta_large_v1.pth'

# Cargar el modelo previamente entrenado
trainer.load_model(model_load_path)

Modelo cargado desde /content/drive/MyDrive/Tesis/deberta_large_v1.pth
