## Add context to models:

In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, DistilBertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel, XLMRobertaModel, AutoTokenizer
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
import re
import string
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pysentimiento.preprocessing import preprocess_tweet

from datasets import Dataset
from datasets import DatasetDict
from transformers import Trainer, TrainingArguments
from transformers import TrainerCallback, TrainerState, TrainerControl
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
from datetime import datetime
import os
import uuid
from pathlib import Path
from openpyxl import load_workbook

## Load Data

In [2]:
def load_data(data):
    # Leer el archivo de datos
    df = pd.read_csv(data)
    return df

## Process Data

In [3]:
def preprocess_function(df, context, tweet_original):

    # Preprocesado de datos
    df['tweet_respuesta'] = df['full_text'].apply(lambda x: preprocess_tweet(x, lang="es"))

    # Añadir columnas para diferentes contextos:
    df['Sin_contexto'] = df['full_text']
    df['Tweet_context'] = tweet_original + " [SEP] " + df['tweet_respuesta']
    df['Full_context'] = tweet_original + " [SEP] " + df['tweet_respuesta'] + " [SEP] " + context

    return df

In [4]:
def filter_by_type(df, label_column):

    if label_column == "Análisis General":
        # Define the specific labels to keep
        #etiquetas = ["Comentario Positivo", "Comentario Negativo", "Comentario Neutro"]
        etiquetas = ["Comentario Positivo", "Comentario Negativo"]
        
        df['Análisis General'] = df['Análisis General'].where(df['Análisis General'].isin(etiquetas))


        # Remove NAs
        df = df.dropna(subset=['Análisis General'])
        

        # Factorize the 'Análisis General' column
        labels, labels_names = pd.factorize(df['Análisis General'])

        # 'labels' now contains the numeric representation of your original labels
        # 'label_names' contains the unique values from your original column in the order they were encoded

        # Replace the original column with the numeric labels
        df['Análisis General'] = labels

        # If you want to keep a record of the mapping from the original labels to the numeric labels
        label_mapping = dict(zip(labels_names, range(len(labels_names))))
        #print("Label Mapping:", label_mapping)

    if label_column == "Contenido Negativo":

        # Filtrar el DataFrame para seleccionar solo los "Comentario Negativo"
        df = df.loc[df['Análisis General'] == 'Comentario Negativo']

        # Define the specific labels to keep
        etiquetas = ["Desprestigiar Víctima", "Desprestigiar Acto", "Insultos", "Desprestigiar Deportista Autora"]
        df['Contenido Negativo'] = df['Contenido Negativo'].where(df['Contenido Negativo'].isin(etiquetas))

        # Remove NAs
        df = df.dropna(subset=['Contenido Negativo'])
        

        # Factorize the 'Análisis General' column
        labels, labels_names = pd.factorize(df['Contenido Negativo'])

        # 'labels' now contains the numeric representation of your original labels
        # 'label_names' contains the unique values from your original column in the order they were encoded

        # Replace the original column with the numeric labels
        df['Contenido Negativo'] = labels

        # If you want to keep a record of the mapping from the original labels to the numeric labels
        label_mapping = dict(zip(labels_names, range(len(labels_names))))
        #print("Label Mapping:", label_mapping)


    if label_column == "Insultos":

        # Filtrar el DataFrame para seleccionar solo los "Comentario Negativo"
        df = df.loc[df['Análisis General'] == 'Comentario Negativo']

        # Define the specific labels to keep
        etiquetas = ["Deseo de Dañar", "Genéricos", "Sexistas/misóginos", ""]

        # Replace labels that are not in the list with "Genéricos"
        df['Insultos'] = df['Insultos'].where(df['Insultos'].isin(etiquetas), other="Genéricos")

        # Remove NAs
        df = df.dropna(subset=['Insultos'])
        

        # Factorize the 'Insultos' column
        labels, labels_names = pd.factorize(df['Insultos'])

        # 'labels' now contains the numeric representation of your original labels
        # 'label_names' contains the unique values from your original column in the order they were encoded

        # Replace the original column with the numeric labels
        df['Insultos'] = labels

        # If you want to keep a record of the mapping from the original labels to the numeric labels
        label_mapping = dict(zip(labels_names, range(len(labels_names))))
        #print("Label Mapping:", label_mapping)

    num_labels = len(etiquetas)



    return df, labels_names, num_labels

## Tokenizer

In [5]:
def load_tokenizer(name):
    
    if name == "dccuchile/bert-base-spanish-wwm-cased":
        tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    if name == "PlanTL-GOB-ES/roberta-large-bne":
        tokenizer = RobertaTokenizer.from_pretrained('PlanTL-GOB-ES/roberta-large-bne') 

    if name == "bert-base-multilingual-cased":
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    if name == "FacebookAI/xlm-roberta-base":
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

    if name == "pysentimiento/robertuito-base-cased":
        tokenizer = AutoTokenizer.from_pretrained('pysentimiento/robertuito-base-cased')

    return tokenizer

In [6]:
# Preparar inputs tokenizados para cada contexto
def prepare_inputs(df, text_column, label_column, tokenizer):
    
    # Tokenización de los textos
    tokenized_data = tokenizer(df[text_column].tolist(), padding=PADDING, truncation=TRUNCATION, max_length=MAX_LENGTH, return_tensors='pt')
    
    # Factorizar las etiquetas si son categóricas
    labels, _ = pd.factorize(df[label_column])
    
    # Convertir las etiquetas a un tensor
    labels = torch.tensor(labels)
    
    # Retorna un diccionario con los inputs tokenizados y las etiquetas
    return {**tokenized_data, 'labels': labels}


## Prepare Dataset

In [7]:
def create_hf_dataset(tokenized_inputs):
    return Dataset.from_dict(tokenized_inputs)

In [8]:
# Función para dividir un dataset en train, validation y test
def split_dataset(dataset, test_size=0.1, val_size=0.3):
    # Dividir primero en train+val y test
    train_val_dataset, test_dataset = dataset.train_test_split(test_size=test_size).values()

    # Ahora dividir train+val en train y val
    train_dataset, val_dataset = train_val_dataset.train_test_split(test_size=val_size / (1 - test_size)).values()

    return DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })


## Model

In [9]:
def load_model(name, num_labels):
    if name == "dccuchile/bert-base-spanish-wwm-cased":
        model = BertForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    if name == "PlanTL-GOB-ES/roberta-large-bne":
        model = RobertaModel.from_pretrained(name, num_labels=num_labels) 

    if name == "bert-base-multilingual-cased":
        model = BertModel.from_pretrained(name, num_labels=num_labels)

    if name == "FacebookAI/xlm-roberta-base":
        model = XLMRobertaModel.from_pretrained(name, num_labels=num_labels)

    if name == "pysentimiento/robertuito-base-cased":
        model = AutoTokenizer.from_pretrained(name, num_labels=num_labels)

    return model

## Training

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Obtener reporte completo
    report = classification_report(labels, predictions, output_dict=True)
    
    # Obtener la matriz de confusión
    conf_matrix = confusion_matrix(labels, predictions)
    
    # Extraer métricas para cada clase y globales
    metrics = {
        'accuracy': report['accuracy'],
        'weighted_f1': report['weighted avg']['f1-score'],
        'weighted_precision': report['weighted avg']['precision'],
        'weighted_recall': report['weighted avg']['recall'],
        # La matriz de confusión no se incluye normalmente como una métrica devuelta porque no es un escalar
        'confusion_matrix': conf_matrix.tolist()  # Convertir a lista para asegurarse de que es serializable si es necesario
    }
    
    # Añadir métricas específicas por clase si se requiere
    for label, scores in report.items():
        if label not in ["accuracy", "macro avg", "weighted avg"]:
            metrics[f'{label}_precision'] = scores['precision']
            metrics[f'{label}_recall'] = scores['recall']
            metrics[f'{label}_f1'] = scores['f1-score']
            metrics[f'{label}_support'] = scores['support']
    
    return metrics

In [11]:
def load_training_args(context_type, num_train_epochs=3, per_device_train_batch_size=16, per_device_eval_batch_size=16, warmup_steps=500, weight_decay=0.01, logging_steps=10):
    training_args = TrainingArguments(
        output_dir=f'./results/{context_type}',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir=f'./logs/{context_type}',
        logging_steps=logging_steps,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    return training_args

In [12]:
class SaveResultsCallback(TrainerCallback):
    def __init__(self, excel_path, training_args):
        self.excel_path = excel_path
        self.training_args = vars(training_args)  # Convertir a diccionario si es necesario
        self.rows = []
        self.initialized = False  # Para asegurarse de inicializar el archivo una vez

    def on_train_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        # Esta función se llama al comienzo del entrenamiento
        if state.is_local_process_zero:
            if not self.initialized:
                # Inicializa el archivo Excel solo una vez
                self.init_excel()
                self.initialized = True

    def init_excel(self):
        # Inicializa el archivo Excel creando el archivo si no existe con encabezados
        if not Path(self.excel_path).exists():
            df_header = pd.DataFrame(columns=["epoch", *self.training_args.keys()])
            df_header.to_excel(self.excel_path, index=False)

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics, **kwargs):
        # Esta función se llama después de cada evaluación
        if state.is_local_process_zero:
            self.rows.append({**metrics, "epoch": state.epoch})

    def save_to_excel(self):
        # Guardar todas las métricas y parámetros acumulados en el archivo Excel
        df = pd.DataFrame(self.rows)
        with pd.ExcelWriter(self.excel_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
            df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
            print(f"Data logged to {self.excel_path}")

In [13]:
def log_data(excel_path, run_id, eval_results, training_args, additional_info):
    # Cargar o crear un DataFrame para registrar los datos
    try:
        df = pd.read_excel(excel_path, engine='openpyxl')
    except FileNotFoundError:
        # Si el archivo no existe, inicializar un nuevo DataFrame
        df = pd.DataFrame()

    # Convertir el objeto de argumentos de entrenamiento a diccionario si es necesario
    if not isinstance(training_args, dict):
        training_args = vars(training_args)
    
    # Convertir los resultados de la evaluación a diccionario si es necesario
    if not isinstance(eval_results, dict):
        eval_results = vars(eval_results)

    # Construir la fila de datos a agregar
    data = {
        **{'Run_ID': run_id},
        **training_args,
        **eval_results,
        **additional_info
    }
    
    # Asegurarse de que todos los valores sean serializables, convertir a string si es necesario
    for key, value in data.items():
        if not isinstance(value, (int, float, str)):
            data[key] = str(value)

    # Agregar la nueva fila al DataFrame
    df = df._append(data, ignore_index=True)
    
    # Guardar el DataFrame actualizado en el archivo Excel
    df.to_excel(excel_path, index=False, engine='openpyxl')

# Pipeline

In [14]:
# Data
DATA = "../data/BBDD_SeAcabo.csv"

# Target Column
LABEL_COLUMN = 'Insultos' # ["Análisis General", "Contenido Negativo", "Insultos"]

# Types Dataset
TYPES_DATASET = ["Sin_contexto", "Tweet_context", "Full_context"]

# Tweet original Alexia Putellas
TWEET_ORIGINAL = "Esto es inaceptable. Se acabó. Contigo compañera @Jennihermoso"

# Contexto
CONTEXT = """
                En agosto de 2023, tras la victoria de la Selección femenina de fútbol de España en la Copa Mundial Femenina de Fútbol de 2023, durante la celebración en la entrega de las medallas y tras abrazar efusivamente a varias jugadoras, Luis Rubiales besó en los labios a la centrocampista Jennifer Hermoso mientras sujetaba su cabeza con las manos. Hermoso lo denunció ante la Fiscalía por acoso sexual, coacciones y agresión sexual. La Fiscalía presentó una demanda contra Rubiales ante la Audiencia Nacional en Madrid
            """

# Model
MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased' #["dccuchile/bert-base-spanish-wwm-case", "PlanTL-GOB-ES/roberta-large-bne", "bert-base-multilingual-cased", "FacebookAI/xlm-roberta-base", "pysentimiento/robertuito-base-cased"]

# Hyperparameters
PADDING = True
TRUNCATION = True
MAX_LENGTH = 512
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 16
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 10

# EXCEL
EXCEL_PATH = 'DL'


models = [MODEL_NAME]
for model in models:

    # Load data
    df = load_data(data=DATA)

    # Preprocess data
    df = preprocess_function(df, context=CONTEXT, tweet_original=TWEET_ORIGINAL)

    # Labels
    df, labels_names, num_labels = filter_by_type(df, LABEL_COLUMN)
    print(num_labels)

    # Tokenizer
    tokenizer = load_tokenizer(name=MODEL_NAME)

    for type_d in TYPES_DATASET:
        print(f"{type_d=}")
        # Prepare input
        inputs_dataset = prepare_inputs(df, text_column=type_d, label_column=LABEL_COLUMN, tokenizer=tokenizer)

        # Create datasets
        dataset = create_hf_dataset(inputs_dataset)

        # Split dataset
        dataset = split_dataset(dataset, test_size=0.2, val_size=0.1)
        
        # Load model
        model = load_model(name=MODEL_NAME, num_labels=num_labels)

        # Train
        training_args = load_training_args(context_type=type_d, num_train_epochs=NUM_TRAIN_EPOCHS, per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE, per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE, warmup_steps=WARMUP_STEPS, weight_decay=WEIGHT_DECAY, logging_steps=LOGGING_STEPS)

        save_results_callback = SaveResultsCallback(f"{EXCEL_PATH}_Training_{LABEL_COLUMN}.xlsx", training_args)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            compute_metrics=compute_metrics,
            callbacks=[save_results_callback]
        )
        trainer.train()

        # Eval
        eval_results = trainer.evaluate(dataset['test'])
        results_df = pd.DataFrame([eval_results])


        # Información adicional que quieres registrar
        additional_info = {
            'Model_Description': MODEL_NAME,
            'Data_File': DATA,
            'Type': type_d
        }

        # Llamar a log_data para guardar los resultados y la configuración
        log_data(f"{EXCEL_PATH}_{LABEL_COLUMN}.xlsx", str(uuid.uuid4()) , eval_results, training_args, additional_info)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Insultos'] = df['Insultos'].where(df['Insultos'].isin(etiquetas), other="Genéricos")


4
type_d='Sin_contexto'


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'loss': 1.3285, 'grad_norm': 5.798182964324951, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.45}
{'loss': 1.2531, 'grad_norm': 6.038410186767578, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.91}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0631638765335083, 'eval_accuracy': 0.8181818181818182, 'eval_weighted_f1': 0.7363636363636363, 'eval_weighted_precision': 0.6694214876033059, 'eval_weighted_recall': 0.8181818181818182, 'eval_confusion_matrix': [[36, 0, 0], [4, 0, 0], [4, 0, 0]], 'eval_0_precision': 0.8181818181818182, 'eval_0_recall': 1.0, 'eval_0_f1': 0.9, 'eval_0_support': 36.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 4.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 4.0, 'eval_runtime': 6.0225, 'eval_samples_per_second': 7.306, 'eval_steps_per_second': 0.498, 'epoch': 1.0}
{'loss': 1.0778, 'grad_norm': 7.290902137756348, 'learning_rate': 3e-06, 'epoch': 1.36}
{'loss': 0.9426, 'grad_norm': 5.3352742195129395, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.82}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.677189290523529, 'eval_accuracy': 0.8181818181818182, 'eval_weighted_f1': 0.7363636363636363, 'eval_weighted_precision': 0.6694214876033059, 'eval_weighted_recall': 0.8181818181818182, 'eval_confusion_matrix': [[36, 0, 0], [4, 0, 0], [4, 0, 0]], 'eval_0_precision': 0.8181818181818182, 'eval_0_recall': 1.0, 'eval_0_f1': 0.9, 'eval_0_support': 36.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 4.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 4.0, 'eval_runtime': 5.2308, 'eval_samples_per_second': 8.412, 'eval_steps_per_second': 0.574, 'epoch': 2.0}
{'loss': 0.9157, 'grad_norm': 4.083681583404541, 'learning_rate': 5e-06, 'epoch': 2.27}
{'loss': 0.7642, 'grad_norm': 3.964444637298584, 'learning_rate': 6e-06, 'epoch': 2.73}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6270166635513306, 'eval_accuracy': 0.8181818181818182, 'eval_weighted_f1': 0.7363636363636363, 'eval_weighted_precision': 0.6694214876033059, 'eval_weighted_recall': 0.8181818181818182, 'eval_confusion_matrix': [[36, 0, 0], [4, 0, 0], [4, 0, 0]], 'eval_0_precision': 0.8181818181818182, 'eval_0_recall': 1.0, 'eval_0_f1': 0.9, 'eval_0_support': 36.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 4.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 4.0, 'eval_runtime': 5.3073, 'eval_samples_per_second': 8.29, 'eval_steps_per_second': 0.565, 'epoch': 3.0}
{'train_runtime': 403.5704, 'train_samples_per_second': 2.579, 'train_steps_per_second': 0.164, 'train_loss': 1.0292651870033958, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7365050911903381, 'eval_accuracy': 0.75, 'eval_weighted_f1': 0.6428571428571428, 'eval_weighted_precision': 0.5625, 'eval_weighted_recall': 0.75, 'eval_confusion_matrix': [[33, 0, 0], [5, 0, 0], [6, 0, 0]], 'eval_0_precision': 0.75, 'eval_0_recall': 1.0, 'eval_0_f1': 0.8571428571428571, 'eval_0_support': 33.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 5.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 6.0, 'eval_runtime': 5.2111, 'eval_samples_per_second': 8.444, 'eval_steps_per_second': 0.576, 'epoch': 3.0}
type_d='Tweet_context'


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'loss': 1.3127, 'grad_norm': 7.365809917449951, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.45}
{'loss': 1.234, 'grad_norm': 5.032135486602783, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.91}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.18295156955719, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 5.7226, 'eval_samples_per_second': 7.689, 'eval_steps_per_second': 0.524, 'epoch': 1.0}
{'loss': 1.0713, 'grad_norm': 3.812204122543335, 'learning_rate': 3e-06, 'epoch': 1.36}
{'loss': 0.8683, 'grad_norm': 3.422548770904541, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.82}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0576763153076172, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 5.6965, 'eval_samples_per_second': 7.724, 'eval_steps_per_second': 0.527, 'epoch': 2.0}
{'loss': 0.6941, 'grad_norm': 4.145359039306641, 'learning_rate': 5e-06, 'epoch': 2.27}
{'loss': 0.9258, 'grad_norm': 7.036401271820068, 'learning_rate': 6e-06, 'epoch': 2.73}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.9483511447906494, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 7.9325, 'eval_samples_per_second': 5.547, 'eval_steps_per_second': 0.378, 'epoch': 3.0}
{'train_runtime': 475.1041, 'train_samples_per_second': 2.191, 'train_steps_per_second': 0.139, 'train_loss': 1.0027097282987651, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6376959681510925, 'eval_accuracy': 0.8181818181818182, 'eval_weighted_f1': 0.7363636363636363, 'eval_weighted_precision': 0.6694214876033059, 'eval_weighted_recall': 0.8181818181818182, 'eval_confusion_matrix': [[36, 0, 0], [2, 0, 0], [6, 0, 0]], 'eval_0_precision': 0.8181818181818182, 'eval_0_recall': 1.0, 'eval_0_f1': 0.9, 'eval_0_support': 36.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 2.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 6.0, 'eval_runtime': 8.307, 'eval_samples_per_second': 5.297, 'eval_steps_per_second': 0.361, 'epoch': 3.0}
type_d='Full_context'


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'loss': 1.3026, 'grad_norm': 8.927135467529297, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.45}
{'loss': 1.1882, 'grad_norm': 5.688700199127197, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.91}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1201770305633545, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 10.4988, 'eval_samples_per_second': 4.191, 'eval_steps_per_second': 0.286, 'epoch': 1.0}
{'loss': 1.0014, 'grad_norm': 4.197778224945068, 'learning_rate': 3e-06, 'epoch': 1.36}
{'loss': 0.8071, 'grad_norm': 4.340452194213867, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.82}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0657492876052856, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 11.1375, 'eval_samples_per_second': 3.951, 'eval_steps_per_second': 0.269, 'epoch': 2.0}
{'loss': 0.6774, 'grad_norm': 3.8984687328338623, 'learning_rate': 5e-06, 'epoch': 2.27}
{'loss': 0.9218, 'grad_norm': 4.9572601318359375, 'learning_rate': 6e-06, 'epoch': 2.73}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.935492753982544, 'eval_accuracy': 0.6136363636363636, 'eval_weighted_f1': 0.4667093469910371, 'eval_weighted_precision': 0.37654958677685957, 'eval_weighted_recall': 0.6136363636363636, 'eval_confusion_matrix': [[27, 0, 0], [7, 0, 0], [10, 0, 0]], 'eval_0_precision': 0.6136363636363636, 'eval_0_recall': 1.0, 'eval_0_f1': 0.7605633802816901, 'eval_0_support': 27.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 7.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 10.0, 'eval_runtime': 10.6202, 'eval_samples_per_second': 4.143, 'eval_steps_per_second': 0.282, 'epoch': 3.0}
{'train_runtime': 789.3887, 'train_samples_per_second': 1.319, 'train_steps_per_second': 0.084, 'train_loss': 0.971211881348581, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6613195538520813, 'eval_accuracy': 0.8181818181818182, 'eval_weighted_f1': 0.7363636363636363, 'eval_weighted_precision': 0.6694214876033059, 'eval_weighted_recall': 0.8181818181818182, 'eval_confusion_matrix': [[36, 0, 0], [2, 0, 0], [6, 0, 0]], 'eval_0_precision': 0.8181818181818182, 'eval_0_recall': 1.0, 'eval_0_f1': 0.9, 'eval_0_support': 36.0, 'eval_1_precision': 0.0, 'eval_1_recall': 0.0, 'eval_1_f1': 0.0, 'eval_1_support': 2.0, 'eval_2_precision': 0.0, 'eval_2_recall': 0.0, 'eval_2_f1': 0.0, 'eval_2_support': 6.0, 'eval_runtime': 10.4359, 'eval_samples_per_second': 4.216, 'eval_steps_per_second': 0.287, 'epoch': 3.0}


## Visualize

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Leer los datos de Excel
df = pd.read_excel("path_to_excel.xlsx")

# Asegurarse de que las épocas sean numéricas (excluye la fila de parámetros si está presente)
df = df[df['epoch'] != 'Params']
df['epoch'] = pd.to_numeric(df['epoch'])


In [None]:
# Graficar la precisión
plt.figure(figsize=(10, 5))
plt.plot(df['epoch'], df['accuracy'], label='Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Model Accuracy per Epoch')
plt.legend()
plt.show()

# Graficar el F1 score
plt.figure(figsize=(10, 5))
plt.plot(df['epoch'], df['f1_score'], label='F1 Score', color='orange')
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('Model F1 Score per Epoch')
plt.legend()
plt.show()


In [None]:
# Graficar múltiples métricas en el mismo gráfico
plt.figure(figsize=(10, 5))
plt.plot(df['epoch'], df['accuracy'], label='Accuracy')
plt.plot(df['epoch'], df['f1_score'], label='F1 Score')
plt.plot(df['epoch'], df['weighted_precision'], label='Weighted Precision')
plt.plot(df['epoch'], df['weighted_recall'], label='Weighted Recall')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.title('Model Performance Metrics per Epoch')
plt.legend()
plt.show()


In [None]:
# Graficar el soporte para cada clase por época
classes = [col for col in df.columns if col.endswith('_support')]
for cls in classes:
    plt.figure(figsize=(10, 5))
    plt.bar(df['epoch'], df[cls], label=cls)
    plt.xlabel('Epochs')
    plt.ylabel('Support')
    plt.title(f'Support for {cls} per Epoch')
    plt.legend()
    plt.show()


### Sin Contexto:

In [None]:
trainer = Trainer(
    model=model,  # Asumiendo que 'model' es tu modelo de transformers ya inicializado
    args=training_args,
    train_dataset=split_datasets_sin_contexto['train'],
    eval_dataset=split_datasets_sin_contexto['validation'],
    compute_metrics=compute_metrics  # Asumiendo que tienes una función para calcular métricas
)
trainer.train()

In [None]:
# Obtener resultados de evaluación
results = trainer.evaluate(split_datasets_sin_contexto['test'])

print(results)

### Con Tweet Original:

In [None]:
trainer = Trainer(
    model=model,  # Asumiendo que 'model' es tu modelo de transformers ya inicializado
    args=training_args,
    train_dataset=split_datasets_tweet_context['train'],
    eval_dataset=split_datasets_tweet_context['validation'],
    compute_metrics=compute_metrics  # Asumiendo que tienes una función para calcular métricas
)
trainer.train()

In [None]:
# Obtener resultados de evaluación
results = trainer.evaluate(split_datasets_tweet_context['test'])

print(results)

### Con Tweet Original y Contexto:

In [None]:
trainer = Trainer(
    model=model,  # Asumiendo que 'model' es tu modelo de transformers ya inicializado
    args=training_args,
    train_dataset=split_datasets_full_context['train'],
    eval_dataset=split_datasets_full_context['validation'],
    compute_metrics=compute_metrics  # Asumiendo que tienes una función para calcular métricas
)
trainer.train()

In [None]:
# Obtener resultados de evaluación
results = trainer.evaluate(split_datasets_full_context['test'])

print(results)

## Probar Modelo

- Cargar el Modelo y el Tokenizer

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Ruta al checkpoint
checkpoint_path = "./results/checkpoint-1000"

# Cargar el tokenizer
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Cargar el modelo
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)


- Preparar el Texto para la Predicción

In [None]:
def prepare_prediction_input(text, tokenizer):
    # Preprocesa el texto como lo hiciste antes de entrenar (por ejemplo, limpieza básica, truncar, etc.)
    inputs = tokenizer(text, return_tensors="pt", max_length=MAX_LENGTH, truncation=TRUNCATION, padding=PADDING)
    return inputs


- Realizar la Predicción

In [None]:
def predict(text, tokenizer, model):
    # Preparar el texto para el modelo
    model_inputs = prepare_prediction_input(text, tokenizer)
    
    # Mover el modelo a CPU o GPU según esté configurado
    model.eval()  # Poner el modelo en modo de evaluación
    with torch.no_grad():  # No calcular gradientes
        outputs = model(**model_inputs)
    
    # Obtener logits
    logits = outputs.logits
    
    # Convertir los logits a probabilidades (opcional)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Obtener la clase predicha
    predicted_class_index = probabilities.argmax().item()
    
    return predicted_class_index, probabilities.numpy()

# Ejemplo de uso
tweet = "@Jennihermoso, ánimo campeona"
text = tweet_original + " [SEP] " + tweet + " [SEP] " + contexto
print(text)
predicted_class, probabilities = predict(text, tokenizer, model)
print(f"Clase predicha: {predicted_class}")
print(f"Probabilidades: {probabilities}")
