Fuentes: https://medium.com/nlplanet/fine-tuning-distilbert-on-senator-tweets-a6f2425ca50e

#### **Instalar Modulos**

conda install datasets=="2.20.0"

conda install transformers=="4.40.1"

conda install numpy=="1.26.4" # La última versión no funciona bien


In [1]:
# Data processing
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import copy

import time
import datetime

from sklearn.metrics import confusion_matrix, cohen_kappa_score

from datasets import Dataset,  DatasetDict

import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

# Modeling
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler

# Progress bar
from tqdm.auto import tqdm

from utils import plot_confusion_matrix, get_artifact_filename

from joblib import load, dump

# Verificamos que CUDA está funcional
torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True

**Bajamos el modelo**

In [2]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



**Armado de los Datasets**

In [3]:
# Paths
BASE_DIR = '../'
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "work/optuna_temp_artifacts")
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "work/optuna_artifacts")

# Parametros y variables
SEED = 42
TEST_SIZE = 0.2

BATCH_SIZE = 64

MODEL_NAME = '06 Bert'

MODEL_VERSION = '1.0'

In [4]:
# Cargar los datos
df = pd.read_csv(PATH_TO_TRAIN)
df = df[df['Description'].notnull()]
df['labels'] = df["AdoptionSpeed"]

# Dividir los datos usando sklearn
#train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=SEED, stratify=df.AdoptionSpeed)

study_lgb = optuna.create_study(direction='maximize',
                            storage="sqlite:///../work/db.sqlite3",  # Specify the storage URL here.
                            study_name="04 - LGB Multiclass CV",
                           load_if_exists = True)

lgb_test_dataset = load(os.path.join(PATH_TO_OPTUNA_ARTIFACTS,get_artifact_filename(study_lgb,'test')))

train_df = df[~df.PetID.isin(lgb_test_dataset.PetID)].reset_index(drop=True)
test_df = df[df.PetID.isin(lgb_test_dataset.PetID)].reset_index(drop=True)

# Convertir a Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combinar en un DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'val': test_dataset
})

# Codificar la columna de etiquetas como clases
dataset = dataset.class_encode_column('labels')

# Hacer una lista de columnas para remover antes de la tokenización
cols_to_remove = [col for col in dataset["train"].column_names if col != 'labels']
print(cols_to_remove)

[I 2025-04-24 19:53:19,385] Using an existing study with name '04 - LGB Multiclass CV' instead of creating a new one.
Stringifying the column: 100%|██████████| 11984/11984 [00:00<00:00, 449035.09 examples/s]
Casting to class labels: 100%|██████████| 11984/11984 [00:00<00:00, 413655.65 examples/s]
Stringifying the column: 100%|██████████| 2996/2996 [00:00<00:00, 374114.58 examples/s]
Casting to class labels: 100%|██████████| 2996/2996 [00:00<00:00, 304029.20 examples/s]

['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID', 'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed']





In [5]:
# Tokenize and encode the dataset
def tokenize(batch):
    from transformers import DistilBertTokenizerFast
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    tokenized_batch = tokenizer(batch["Description"], padding=True, truncation=True, max_length=512)
    return tokenized_batch

dataset_enc = dataset.map(tokenize, batched=True, remove_columns=cols_to_remove, num_proc=4)

# Set dataset format for PyTorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Check the output
print(dataset_enc["train"].column_names)
     


Map (num_proc=4): 100%|██████████| 11984/11984 [00:01<00:00, 6421.50 examples/s]
Map (num_proc=4): 100%|██████████| 2996/2996 [00:00<00:00, 4877.98 examples/s]

['labels', 'input_ids', 'attention_mask']





In [6]:
# Instantiate a data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create data loaders for to reshape data for PyTorch model
train_dataloader = DataLoader(
    dataset_enc["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    dataset_enc["val"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [7]:
test_sample_ids =[i for i in test_df.PetID] 

In [8]:
# Dynamically set number of class labels based on dataset
num_labels = dataset["train"].features['labels'].num_classes
print(f"Number of labels: {num_labels}")

# Load model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                           num_labels=num_labels)

Number of labels: 5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:

# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move model to device
model.to(device)

cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [10]:
def train_val(model, dataloaders, datasets, device, num_epochs=4, lr=0.001, trial=None):
    
    since = time.time()

    # Create the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)

    # Further define learning rate scheduler
    num_training_batches = len(train_dataloader)
    num_training_steps = num_epochs * num_training_batches
    lr_scheduler = get_scheduler(
        "linear",                   # linear decay
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )


    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_kappa =  -999

    train_losses = []
    val_losses = []

    try:
        previous_best = study.best_value
    except:
        previous_best = -999


    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        kappa_labels_true = []
        kappa_labels_predicted = []
        output_scores = []

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for batch in tqdm(dataloaders[phase]):
                batch = batch.to(device)
                #inputs = inputs.to(device)
                labels = batch.labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(**batch)
                    loss = outputs.loss

                    preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
                    preds_labels = torch.argmax(preds, dim=-1)


                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    elif phase == 'val':
                        kappa_labels_true.extend(labels.cpu().numpy().tolist())
                        kappa_labels_predicted.extend(preds_labels.cpu().numpy().tolist())
                        outputs_np = preds.cpu().numpy()
                        output_scores.extend([outputs_np[i,:] for i in range(outputs_np.shape[0])])

                # Statistics
                running_loss += loss.item() * labels.size(0)
                running_corrects += torch.sum(preds_labels == labels.data)
                
                #END OF BATCH

            epoch_loss = running_loss / len(datasets[phase])
            epoch_acc = running_corrects.double() / len(datasets[phase])
            
            if phase == 'train':
                train_losses.append(epoch_loss)
                kappa_score = np.nan
            else:
                val_losses.append(epoch_loss)
                kappa_score = cohen_kappa_score(kappa_labels_true,
                                  kappa_labels_predicted,
                                  weights = 'quadratic')
                    


            print(f'{phase.title()} Loss: {epoch_loss:.4f} Acc: {epoch_acc*100:.2f}% Kappa: {kappa_score:.3f}')

            # If this is the best Epoch so far -> Deep copy the model
            if phase == 'val' and kappa_score > best_kappa:
                best_acc = epoch_acc
                best_kappa = kappa_score
                best_model_wts = copy.deepcopy(model.state_dict())


                #Best Epoch within a trial and better than previous trials
                if trial is not None and best_kappa > previous_best:

                    #Save test dataset with predictions
                    predicted_filename = os.path.join(PATH_TO_TEMP_FILES,f'test_{trial.study.study_name}_{trial.number}.joblib')
                    predicted_df = pd.DataFrame({'PetID':test_sample_ids,
                                'pred':output_scores}).merge(test_df, on='PetID')
                    dump(predicted_df, predicted_filename)

                    #Generate and save CM 
                    cm_filename = os.path.join(PATH_TO_TEMP_FILES,f'cm_{trial.study.study_name}_{trial.number}.jpg')
                    plot_confusion_matrix(kappa_labels_true,kappa_labels_predicted).write_image(cm_filename)

            #END OF PHASE

        #END OF EPOCH

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.2f}%'.format(best_acc * 100))

    # Load best model weights
    model.load_state_dict(best_model_wts)

    # Save in optuna trial the best test dataset, cm and model weights
    if trial is not None and best_kappa > previous_best:
        upload_artifact(trial, predicted_filename, artifact_store)   

        upload_artifact(trial, cm_filename, artifact_store)

        file_name = f'{MODEL_NAME}_{MODEL_VERSION}_{trial.number}.pth'
        model_path = os.path.join(PATH_TO_TEMP_FILES, file_name)
        torch.save(model, model_path) # Podemos guardar solo los pesos si queremos: best_model.state_dict()
        upload_artifact(trial, model_path, artifact_store)

    return model,best_kappa



In [11]:

# Dynamically set number of class labels based on dataset
num_labels = dataset["train"].features['labels'].num_classes
print(f"Number of labels: {num_labels}")


Number of labels: 5


In [12]:

best_model,_ = train_val(model,
                       dataloaders={'train': train_dataloader, 
                                    'val': eval_dataloader}, 
                       datasets=dataset_enc, 
                       device=device, 
                       lr = 5e-5,
                       num_epochs=15)




Epoch 0/14
----------


100%|██████████| 188/188 [02:04<00:00,  1.51it/s]


Train Loss: 1.4527 Acc: 31.19% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.05it/s]


Val Loss: 1.4241 Acc: 34.58% Kappa: 0.128
Epoch 1/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 1.3797 Acc: 37.33% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.03it/s]


Val Loss: 1.3992 Acc: 36.68% Kappa: 0.204
Epoch 2/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 1.2412 Acc: 46.23% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 1.4458 Acc: 36.92% Kappa: 0.226
Epoch 3/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.9928 Acc: 58.66% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 1.6550 Acc: 37.15% Kappa: 0.223
Epoch 4/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.7063 Acc: 71.95% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 2.0008 Acc: 33.71% Kappa: 0.213
Epoch 5/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.4922 Acc: 81.17% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 2.2346 Acc: 35.71% Kappa: 0.215
Epoch 6/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.3659 Acc: 86.32% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 2.4345 Acc: 36.42% Kappa: 0.222
Epoch 7/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.2576 Acc: 90.45% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.01it/s]


Val Loss: 2.7661 Acc: 35.65% Kappa: 0.228
Epoch 8/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.2121 Acc: 91.91% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.01it/s]


Val Loss: 3.0521 Acc: 35.11% Kappa: 0.202
Epoch 9/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1730 Acc: 93.67% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.0954 Acc: 36.72% Kappa: 0.238
Epoch 10/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1674 Acc: 93.79% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.0650 Acc: 35.45% Kappa: 0.227
Epoch 11/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1486 Acc: 94.42% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.1498 Acc: 36.75% Kappa: 0.224
Epoch 12/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1378 Acc: 94.68% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.2532 Acc: 36.48% Kappa: 0.230
Epoch 13/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1329 Acc: 94.69% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.2980 Acc: 35.31% Kappa: 0.222
Epoch 14/14
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1217 Acc: 95.27% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.01it/s]

Val Loss: 3.3602 Acc: 36.75% Kappa: 0.215
Training complete in 34m 30s
Best val Acc: 36.72%





In [13]:
# Guardo el modelo
run_id = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f'{MODEL_NAME}_{MODEL_VERSION}_{run_id}.pth'
model_path = os.path.join(PATH_TO_TEMP_FILES, file_name)
torch.save(best_model, model_path) # Podemos guardar solo los pesos si queremos: best_model.state_dict()
print(f'Modelo guardado en {model_path}')

Modelo guardado en ../work/optuna_temp_artifacts/06 Bert_1.0_20250424_202754.pth


In [14]:
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)


def optuna_train(trial):

    epochs = trial.suggest_int('epochs', 1, 2)

    lr = trial.suggest_float('lr', 0.00001, 0.0001, log=True)

    _,best_score = train_val(model, 
                       dataloaders={'train': train_dataloader, 
                                    'val': eval_dataloader}, 
                       datasets=dataset_enc, 
                       device=device, 
                       num_epochs=epochs,
                       lr=lr,
                       trial=trial)


    return(best_score)

  artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)


In [15]:
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///../work/db.sqlite3",  # Specify the storage URL here.
                            study_name=f'{MODEL_NAME}_{MODEL_VERSION}',
                            load_if_exists = True)
study.optimize(optuna_train, n_trials=5)

[I 2025-04-24 20:27:54,822] A new study created in RDB with name: 06 Bert_1.0


Epoch 0/0
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1173 Acc: 95.36% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.2440 Acc: 35.81% Kappa: 0.222
Training complete in 2m 26s
Best val Acc: 35.81%



upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2025-04-24 20:30:21,652] Trial 0 finished with value: 0.22166862844507484 and parameters: {'epochs': 1, 'lr': 1.2889027985365552e-05}. Best is trial 0 with value: 0.22166862844507484.




Epoch 0/0
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1435 Acc: 94.66% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]
[I 2025-04-24 20:32:39,591] Trial 1 finished with value: 0.2087732368116748 and parameters: {'epochs': 1, 'lr': 4.454059529458674e-05}. Best is trial 0 with value: 0.22166862844507484.




Val Loss: 3.2626 Acc: 34.58% Kappa: 0.209
Training complete in 2m 18s
Best val Acc: 34.58%
Epoch 0/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.0972 Acc: 96.24% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.4492 Acc: 36.25% Kappa: 0.243
Epoch 1/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.0834 Acc: 96.47% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]

upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.


upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.



Val Loss: 3.5039 Acc: 36.25% Kappa: 0.246
Training complete in 4m 37s
Best val Acc: 36.25%



upload_artifact is experimental (supported from v3.3.0). The interface can change in the future.

[I 2025-04-24 20:37:16,944] Trial 2 finished with value: 0.24564131186114824 and parameters: {'epochs': 2, 'lr': 1.133490942520377e-05}. Best is trial 2 with value: 0.24564131186114824.




Epoch 0/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.0908 Acc: 96.41% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.4675 Acc: 35.95% Kappa: 0.219
Epoch 1/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.0810 Acc: 96.63% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]
[I 2025-04-24 20:41:53,353] Trial 3 finished with value: 0.2217508660430798 and parameters: {'epochs': 2, 'lr': 1.8825720477791556e-05}. Best is trial 2 with value: 0.24564131186114824.




Val Loss: 3.6092 Acc: 36.58% Kappa: 0.222
Training complete in 4m 36s
Best val Acc: 36.58%
Epoch 0/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.1939 Acc: 92.65% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]


Val Loss: 3.2824 Acc: 34.75% Kappa: 0.198
Epoch 1/1
----------


100%|██████████| 188/188 [02:06<00:00,  1.49it/s]


Train Loss: 0.2109 Acc: 91.97% Kappa: nan


100%|██████████| 47/47 [00:11<00:00,  4.02it/s]
[I 2025-04-24 20:46:29,646] Trial 4 finished with value: 0.20642553260851848 and parameters: {'epochs': 2, 'lr': 9.47255212993548e-05}. Best is trial 2 with value: 0.24564131186114824.


Val Loss: 2.9462 Acc: 34.85% Kappa: 0.206
Training complete in 4m 36s
Best val Acc: 34.85%
