In [None]:
!pip install datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DebertaV2ForSequenceClassification
from sklearn.metrics import classification_report
import torch
from sklearn.model_selection import train_test_split
import spacy
from google.colab import drive
!python -m spacy download es_core_news_sm


Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install accelerate>=0.21.0


In [None]:
# Función para obtener los enunciados con las keywords
def extract_keyword_sentences(row):
    keywords = set(row['keyword'].split(', '))  # Convertir las keywords en un conjunto para eliminar duplicados
    text = row['text']
    doc = nlp(text)
    found_sentences = set()  # Conjunto para almacenar las oraciones encontradas
    for sentence in doc.sents:
        # Verificar si la oración contiene al menos una de las keywords
        if any(keyword.lower() in sentence.text.lower() for keyword in keywords):
            found_sentences.add(sentence.text.strip())  # Agregar la oración al conjunto de oraciones encontradas
    return '. '.join(found_sentences) if found_sentences else None  # Unir las oraciones encontradas en un solo texto

In [None]:
# Cargar el modelo de spaCy para tokenizar oraciones
nlp = spacy.load("es_core_news_sm")



---



---



---



In [None]:
# Definir el modelo y el tokenizador
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

# Cargar los datos de entrenamiento
data = pd.read_csv('SMM4H_2024_Task3_Training_1800.csv', nrows=1800, usecols=[0, 1, 2, 3], engine='python')
data = data.dropna(subset=['text', 'label'])  # Eliminar filas con valores faltantes en 'text' y 'label'

# Preprocesar los datos
data['text'] = data.apply(lambda row: extract_keyword_sentences(row), axis=1)
data['label'] = data['label'].apply(lambda x: 1 if x != 0 else 0)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Dividir los datos en entrenamiento y validación
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

train_texts = train_data['text'].tolist()
val_texts = val_data['text'].tolist()
train_labels = train_data['label'].tolist()
val_labels = val_data['label'].tolist()

# Tokenizar los datos
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Crear un nuevo diccionario con las entradas codificadas y las etiquetas
train_dataset_dict = train_encodings.copy()
train_dataset_dict['labels'] = train_labels

val_dataset_dict = val_encodings.copy()
val_dataset_dict['labels'] = val_labels

# Crear un nuevo conjunto de datos a partir del diccionario
train_dataset = Dataset.from_dict(train_dataset_dict)
val_dataset = Dataset.from_dict(val_dataset_dict)

# Remover la columna 'token_type_ids' si no es necesaria
train_dataset = train_dataset.remove_columns(['token_type_ids'])
val_dataset = val_dataset.remove_columns(['token_type_ids'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Función para calcular las métricas
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    accuracy = (predictions == labels).mean()
    report = classification_report(labels, predictions)
    print(report)
    return {"accuracy": accuracy}

In [None]:

# Definir los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,  # Reducir la tasa de aprendizaje
    per_device_train_batch_size=8,  # Reducir el tamaño del lote de entrenamiento
    per_device_eval_batch_size=8,  # Reducir el tamaño del lote de evaluación
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)


# Definir el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Entrenar el modelo
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.400377,0.825
2,No log,0.429457,0.811111
3,0.395000,0.487155,0.844444
4,0.395000,0.568302,0.836111


              precision    recall  f1-score   support

           0       0.88      0.84      0.86       226
           1       0.75      0.80      0.77       134

    accuracy                           0.82       360
   macro avg       0.81      0.82      0.82       360
weighted avg       0.83      0.82      0.83       360



Checkpoint destination directory ./results/checkpoint-180 already exists and is non-empty. Saving will proceed but saved results may be invalid.


              precision    recall  f1-score   support

           0       0.93      0.75      0.83       226
           1       0.69      0.91      0.78       134

    accuracy                           0.81       360
   macro avg       0.81      0.83      0.81       360
weighted avg       0.84      0.81      0.81       360



Checkpoint destination directory ./results/checkpoint-360 already exists and is non-empty. Saving will proceed but saved results may be invalid.


              precision    recall  f1-score   support

           0       0.89      0.85      0.87       226
           1       0.77      0.83      0.80       134

    accuracy                           0.84       360
   macro avg       0.83      0.84      0.84       360
weighted avg       0.85      0.84      0.85       360

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       226
           1       0.74      0.86      0.80       134

    accuracy                           0.84       360
   macro avg       0.82      0.84      0.83       360
weighted avg       0.85      0.84      0.84       360



TrainOutput(global_step=720, training_loss=0.32572136455112033, metrics={'train_runtime': 991.5974, 'train_samples_per_second': 5.809, 'train_steps_per_second': 0.726, 'total_flos': 1684269848010240.0, 'train_loss': 0.32572136455112033, 'epoch': 4.0})

In [None]:
import shutil

# Monta Google Drive
drive.mount('/content/drive')

# Define la ruta de la carpeta que deseas guardar en Google Drive
carpeta_colab = '/content/results'  # Cambia esto por la ruta de tu carpeta en Colab
carpeta_drive = '/content/drive/MyDrive/FINE_3'  # Cambia esto por la ruta donde deseas guardar la carpeta en Drive

# Copia la carpeta de Colab a Drive
shutil.copytree(carpeta_colab, carpeta_drive)


Mounted at /content/drive


'/content/drive/MyDrive/FINE_3'



---



---



### **PRUEBA DE MODELO REEENTRENADO CON DATOS DE ENTRENAMIENTO CODALAB**

In [None]:
# Cargar el modelo reentrenado y el tokenizador
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForSequenceClassification.from_pretrained('/content/drive/MyDrive/FINE_3/checkpoint-720', num_labels=2)

# Cargar los nuevos datos de validación
new_data = pd.read_csv('SMM4H_2024_Task3_Training_1800.csv', nrows=1800, usecols=[0, 1, 2, 3], engine='python')


In [None]:
# Preprocesar los datos del nuevo archivo
new_data['text'] = new_data.apply(lambda row: extract_keyword_sentences(row), axis=1)
new_data['label'] = new_data['label'].apply(lambda x: 1 if x != 0 else 0)


In [None]:
# Dividir los datos en textos y etiquetas
texts = new_data['text'].tolist()
labels = new_data['label'].tolist()

# Tokenizar los textos
encodings = tokenizer(texts, truncation=True, padding=True)

# Crear un nuevo diccionario con las entradas codificadas y las etiquetas
dataset_dict = encodings.copy()
dataset_dict['labels'] = labels

# Crear un nuevo conjunto de datos a partir del diccionario
dataset = Dataset.from_dict(dataset_dict)

# Remover la columna 'token_type_ids' si no es necesaria
dataset = dataset.remove_columns(['token_type_ids'])

# Función para calcular las métricas
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    accuracy = (predictions == labels).mean()
    report = classification_report(labels, predictions)
    print(report)
    return {"accuracy": accuracy}

# Evaluar el modelo con los nuevos datos
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

results = trainer.predict(dataset)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1131
           1       0.89      0.95      0.92       669

    accuracy                           0.94      1800
   macro avg       0.93      0.94      0.94      1800
weighted avg       0.94      0.94      0.94      1800





---



---

### **PRUEBA DE MODELO REEENTRENADO CON DATOS DE VALIDACION CODALAB**

In [None]:
# Cargar el modelo reentrenado y el tokenizador
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForSequenceClassification.from_pretrained('/content/drive/MyDrive/FINE_3/checkpoint-720', num_labels=2)

# Cargar los nuevos datos de validación
new_data = pd.read_csv('SMM4H_2024_Task3_Validation_600_codalab.csv', usecols=[0, 1, 2, 3], engine='python')
print(new_data.info)

<bound method DataFrame.info of           id              keyword  \
0    fc6l72u               cruise   
1     8eijpy  runners, run , run    
2    didnpbe       running , golf   
3    dxe6gbb                  sea   
4    f5jqgbz                waves   
..       ...                  ...   
595  edvs552                 walk   
596   ee31pf              outside   
597  eei3pz3        outside, walk   
598  eek8bpk              outside   
599  eeljxq0              outside   

                                                  text  Classification  
0    (1) I had SA but managed to practically eradic...               0  
1     Anyone looking for a friend? Real friend, beh...               0  
2     Breathe, darling. The purpose of dating is to...               0  
3     I've been overweight since 2nd grade and it a...               0  
4     I need this terribly. I’ve been in negative w...               0  
..                                                 ...             ...  
595   The th

In [None]:
# Preprocesar los datos del nuevo archivo
new_data['text'] = new_data.apply(lambda row: extract_keyword_sentences(row), axis=1)
new_data['Classification'] = new_data['Classification'].apply(lambda x: 1 if x != 0 else 0)


In [None]:
print(new_data)

          id              keyword  \
0    fc6l72u               cruise   
1     8eijpy  runners, run , run    
2    didnpbe       running , golf   
3    dxe6gbb                  sea   
4    f5jqgbz                waves   
..       ...                  ...   
595  edvs552                 walk   
596   ee31pf              outside   
597  eei3pz3        outside, walk   
598  eek8bpk              outside   
599  eeljxq0              outside   

                                                  text  Classification  
0    Yesterday I went on a party cruise alone and m...               0  
1    I shouldn't have to say it on this sub, but I'...               0  
2    If you want a long-running relationship, it wi...               0  
3    I kind of accidentally psyched myself out of t...               0  
4                  I’ve been in negative waves lately.               0  
..                                                 ...             ...  
595  I have a job right now which I love so 

In [None]:
# Dividir los datos en textos y etiquetas
texts = new_data['text'].tolist()
texts = [str(text) for text in texts]
labels = new_data['Classification'].tolist()


In [None]:
# Tokenizar los textos
encodings = tokenizer(texts, truncation=True, padding=True)

# Crear un nuevo diccionario con las entradas codificadas y las etiquetas
dataset_dict = encodings.copy()
dataset_dict['labels'] = labels

# Crear un nuevo conjunto de datos a partir del diccionario
dataset = Dataset.from_dict(dataset_dict)

# Remover la columna 'token_type_ids' si no es necesaria
dataset = dataset.remove_columns(['token_type_ids'])

# Función para calcular las métricas
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    accuracy = (predictions == labels).mean()
    report = classification_report(labels, predictions)
    print(report)
    return {"accuracy": accuracy}

# Evaluar el modelo con los nuevos datos
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

results = trainer.predict(dataset)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


              precision    recall  f1-score   support

           0       0.88      0.82      0.84       377
           1       0.72      0.80      0.76       223

    accuracy                           0.81       600
   macro avg       0.80      0.81      0.80       600
weighted avg       0.82      0.81      0.81       600





---



---

### **PRUEBA DE MODELO REEENTRENADO CON DATOS DE TEST CODALAB**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cargar el modelo de spaCy para tokenizar oraciones
nlp = spacy.load("es_core_news_sm")

keywords = ['run', 'outside', 'swim', 'climb', 'walk', 'running', 'hike', 'park', 'horse',
            'nature', 'camp', 'soccer', 'sun', 'hiking', 'swimming', 'pool', 'forest',
            'riding', 'sea', 'runs', 'grass', 'stream', 'camps', 'tree', 'beach', 'jogging',
            'garden', 'mountains', 'pond', 'bike', 'cruise', 'waterfall', 'bikes',
            'baseball', 'basketball', 'biking', 'mountain', 'skating', 'parks', 'tennis',
            'jog', 'trees', 'boat', 'bench', 'cabin', 'waves', 'parkour', 'golf', 'streams',
            'skate', 'outdoors', 'coast', 'gardening', 'jogs', 'hikes', 'lawn', 'hill',
            'horses', 'lake', 'outside', 'backyard', 'Fresh air', 'Hiking', 'runner',
            'backpacking', 'climb', 'nature', 'run', 'Garden', 'Go for a walk', 'walk',
            'Outdoor', 'park', 'Parks', 'Jogging', 'kayak', 'camping', 'snowboard',
            'Go for a run', 'bicycle', 'Open Space', 'sea', 'pool', 'roller blade']

# Función para obtener los enunciados con las keywords o devolver el texto original
def extract_keyword_sentences(row):
    text = row['text']
    doc = nlp(text)
    found_sentences = set()  # Conjunto para almacenar las oraciones encontradas
    for sentence in doc.sents:
        # Verificar si la oración contiene al menos una de las keywords
        if any(keyword.lower() in sentence.text.lower() for keyword in keywords):
            found_sentences.add(sentence.text.strip())  # Agregar la oración al conjunto de oraciones encontradas
    if found_sentences:
        return '. '.join(found_sentences)
    else:
        return text

# Leer el archivo de datos
data = pd.read_csv('SMM4H2024_Task3_testposts_decoy.csv', nrows=1200, engine='python')

# Aplicar la función a cada fila del DataFrame
data['keyword_sentences'] = data.apply(extract_keyword_sentences, axis=1)

In [None]:
# Cargar el modelo reentrenado y el tokenizador
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForSequenceClassification.from_pretrained('/content/drive/MyDrive/FINE_3/checkpoint-720', num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [None]:
print(data)


           id                                               text  \
0      inmdrd  Venting about friend problems. Okay, so years ...   
1      rav4bw  I can’t live with the mind I was given.. Every...   
2     e03jwjy  The best guy at this summer job I quit was an ...   
3      khd8q1  I think ive had SAD all my life and idk how to...   
4      q9uxky   Part of me wants to live in a cottage somewhe...   
...       ...                                                ...   
1195  dt18716   Just do it. Sports don’t require a lot of soc...   
1196   nilic6  I (23M) feel like going outside but i don't kn...   
1197   cz24q1   hanging out with stoners how can you have fun...   
1198  e9xd0uy   I love memes, but this sub is not the place f...   
1199  fusqwja   Definitely start by getting out the house. Go...   

                                      keyword_sentences  
0     Late into high school we also got offered the ...  
1     Staying in my dark room all day Listening to d...  
2     I'm

In [None]:
# Función para obtener las predicciones e imprimir el texto y la clase
def obtener_predicciones(text):
    # Tokenizar el texto
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Pasar los tokens al modelo
    outputs = model(**inputs)
    # Obtener la predicción
    logits = outputs.logits
    predicted_class = logits.argmax().item()
    # Imprimir el texto y la clase asignada
    print(f"Texto: {text}")
    print(f"Clase asignada: {predicted_class}")
    return predicted_class


In [None]:
# Aplicar la función a cada texto en los datos de prueba
predictions = []
for text in data['keyword_sentences']:
    predictions.append(obtener_predicciones(text))

# Crear un DataFrame con las predicciones
results_df = pd.DataFrame({'id': data['id'], 'text': data['text'], 'label': predictions})

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Texto: Late into high school we also got offered the options to go for a trip to the Czech Republic, after that a sailing camp, etc.. A little while after that they still invited me to go camping with them but I refused that as well.. Okay, so years ago back in high school the first year or so I made a few friends quite quickly, but I never really spent time with them outside of school at the time, it's like it didn't even cross my mind, and when they offered me the choice I was like "well, I can just see them at school can't I?".
Clase asignada: 1
Texto: Staying in my dark room all day Listening to depressing music like Linkin Park, trying to find someone who understands my pain, and the monster inside me.. Every time I walk outside I feel like I have 500 sniper rifles pointed at me.
Clase asignada: 1
Texto: I'm not a loser for sucking at asking people for their credit card details in the middle of their beach holiday, but I did learn I was better than I thought at animatedly telling 

In [None]:
print(results_df)

           id                                               text  label
0      inmdrd  Venting about friend problems. Okay, so years ...      1
1      rav4bw  I can’t live with the mind I was given.. Every...      1
2     e03jwjy  The best guy at this summer job I quit was an ...      0
3      khd8q1  I think ive had SAD all my life and idk how to...      1
4      q9uxky   Part of me wants to live in a cottage somewhe...      1
...       ...                                                ...    ...
1195  dt18716   Just do it. Sports don’t require a lot of soc...      1
1196   nilic6  I (23M) feel like going outside but i don't kn...      1
1197   cz24q1   hanging out with stoners how can you have fun...      1
1198  e9xd0uy   I love memes, but this sub is not the place f...      0
1199  fusqwja   Definitely start by getting out the house. Go...      1

[1200 rows x 3 columns]


In [None]:
# Contar las clases en la columna 'label'
class_counts = results_df['label'].value_counts()

# Mostrar los conteos de clases
print(class_counts)


label
1    871
0    329
Name: count, dtype: int64


In [None]:
# Guardar el DataFrame en un archivo CSV
results_df.to_csv('resultados_clasificacion_outdoor_notoutdoor_test.csv', index=False)