In [None]:
!pip install datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-an

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DebertaV2ForSequenceClassification
from sklearn.metrics import classification_report
import torch
import numpy as np
from google.colab import drive


In [None]:
!pip install accelerate>=0.21.0




---



---



---



In [None]:
# Pretrained checkpoint from which both the tokenizer and the classifier are loaded.
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# NOTE(review): truncation/max_length are passed again when the texts are
# tokenized; confirm whether they have any effect at load time.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    truncation=True,
    max_length=512,
)

# Sequence classifier with a 3-class head. `ignore_mismatched_sizes=True`
# allows loading to proceed if the checkpoint's head size differs from `num_labels`.
model = DebertaV2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def load_labeled_split(csv_path, nrows=None):
    """Load one CSV split and return the rows usable for 3-class training.

    Reads the first four columns of ``csv_path``, drops rows where 'text' or
    'label' is missing, keeps only rows labelled 1, 2 or 3, and shifts those
    labels to the 0..2 range.

    Args:
        csv_path: Path of the CSV file to read.
        nrows: Maximum number of data rows to read; ``None`` reads the whole file.

    Returns:
        A new DataFrame (original index preserved) whose 'label' column holds
        integers in {0, 1, 2}.
    """
    frame = pd.read_csv(csv_path, nrows=nrows, usecols=[0, 1, 2, 3], engine='python')
    frame = frame.dropna(subset=['text', 'label'])
    # Explicit copy so the label rewrite below acts on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    frame = frame.loc[frame['label'].isin([1, 2, 3])].copy()
    # Cast to int before shifting: a column that contained NaN is read as float,
    # and class ids must be integers.
    frame['label'] = frame['label'].astype(int) - 1
    return frame


# Training split (at most 1800 rows) and validation split.
train_data = load_labeled_split('SMM4H_2024_Task3_Training_1800.csv', nrows=1800)
val_data = load_labeled_split('SMM4H_2024_Task3_Validation_600_codalab.csv')


print(train_data)
print(val_data)


           id                                keyword  \
1131  d3moq94                                   walk   
1132  d51rpnb                          outside, bike   
1133  d5bzg04                                   walk   
1134  d5ttkj7                                    run   
1135  d6se5to              running, run, horse, walk   
...       ...                                    ...   
1795   gqzye9                     pool, beach,  pool   
1796   env299                     outside , outdoors   
1797  e9bnr1s                                Jogging   
1798   qrmhbe                  walk, swimming,  pool   
1799   mxbsm8  roller blade, outside , roller blades   

                                                   text  label  
1131   Do you feel like the texts that you send back...      0  
1132   I'm gonna do the Pokemon thing to get myself ...      0  
1133   Something that work for me is to expose mysel...      0  
1134  Absolutely! Please encourage your son to do so...      0  
11

In [None]:
# ---- Training split -------------------------------------------------------
# Pull texts and labels out of the frame as plain Python lists.
train_texts = train_data['text'].tolist()
train_labels = train_data['label'].tolist()

# Tokenize, truncating to 512 tokens and padding to the longest sequence of the split.
train_encodings = tokenizer(train_texts, truncation=True, max_length=512, padding=True)

# Copy the encodings and attach the labels under the 'labels' key.
train_dataset_dict = train_encodings.copy()
train_dataset_dict['labels'] = train_labels

# Build the Dataset and drop the 'token_type_ids' column.
train_dataset = Dataset.from_dict(train_dataset_dict).remove_columns(['token_type_ids'])

# ---- Validation split -----------------------------------------------------
val_texts = val_data['text'].tolist()
val_labels = val_data['label'].tolist()

val_encodings = tokenizer(val_texts, truncation=True, max_length=512, padding=True)

val_dataset_dict = val_encodings.copy()
val_dataset_dict['labels'] = val_labels

val_dataset = Dataset.from_dict(val_dataset_dict).remove_columns(['token_type_ids'])

In [None]:
# Metric function handed to the Trainer.
def compute_metrics(eval_pred):
    """Compute evaluation metrics and print a per-class classification report.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the classes along axis 1, or a tuple whose
            first element is that array; ``labels`` holds the true class ids.

    Returns:
        dict with "accuracy" (fraction of correct predictions) and "macro_f1"
        (unweighted mean of the per-class F1 scores).
    """
    predictions, labels = eval_pred
    # Some models return several outputs; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = predictions.argmax(axis=1)
    accuracy = (predicted_classes == labels).mean()

    # zero_division=0 reports 0 (without a warning) for a class that is never predicted.
    report = classification_report(labels, predicted_classes, zero_division=0)
    print(report)

    report_dict = classification_report(
        labels, predicted_classes, zero_division=0, output_dict=True
    )
    return {
        "accuracy": accuracy,
        "macro_f1": report_dict["macro avg"]["f1-score"],
    }

In [None]:

# Training configuration.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',   # evaluate at the end of every epoch
    save_strategy='epoch',         # checkpoint on the same schedule as evaluation
    logging_strategy='epoch',      # log the training loss every epoch as well
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,   # reload the best checkpoint when training ends
    metric_for_best_model='accuracy',
)


# Trainer wiring the model, the datasets, the tokenizer and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; as the last expression of the cell, the returned
# training summary is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.944475,0.659193
2,No log,0.91282,0.636771
3,No log,0.973155,0.641256
4,No log,1.120553,0.641256
5,No log,1.079161,0.632287


              precision    recall  f1-score   support

           0       0.56      0.35      0.43        54
           1       0.68      0.89      0.77       131
           2       0.67      0.32      0.43        38

    accuracy                           0.66       223
   macro avg       0.63      0.52      0.54       223
weighted avg       0.65      0.66      0.63       223

              precision    recall  f1-score   support

           0       0.54      0.37      0.44        54
           1       0.68      0.82      0.74       131
           2       0.54      0.37      0.44        38

    accuracy                           0.64       223
   macro avg       0.58      0.52      0.54       223
weighted avg       0.62      0.64      0.62       223

              precision    recall  f1-score   support

           0       0.51      0.50      0.50        54
           1       0.74      0.73      0.73       131
           2       0.51      0.55      0.53        38

    accuracy        

TrainOutput(global_step=420, training_loss=0.4902493431454613, metrics={'train_runtime': 590.6801, 'train_samples_per_second': 5.663, 'train_steps_per_second': 0.711, 'total_flos': 880130165990400.0, 'train_loss': 0.4902493431454613, 'epoch': 5.0})

In [None]:
import shutil

# Mount Google Drive.
drive.mount('/content/drive')

# Folder to back up, and its destination on Drive.
carpeta_colab = '/content/results'  # change to the folder to copy from Colab
carpeta_drive = '/content/drive/MyDrive/FINE_ZSL_SENTIMENT_slideod'  # change to the destination folder on Drive

# Copy the Colab folder to Drive. `dirs_exist_ok=True` lets the cell be re-run
# when the destination already exists instead of raising FileExistsError.
shutil.copytree(carpeta_colab, carpeta_drive, dirs_exist_ok=True)


Mounted at /content/drive


'/content/drive/MyDrive/FINE_ZSL_SENTIMENT_slideod'