## Add context to models:

In [37]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, DistilBertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel, XLMRobertaModel, AutoTokenizer, AutoModelForSequenceClassification, XLMRobertaForSequenceClassification, RobertaForSequenceClassification
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
import re
import string
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pysentimiento.preprocessing import preprocess_tweet

from datasets import Dataset
from datasets import DatasetDict
from transformers import Trainer, TrainingArguments
from transformers import TrainerCallback, TrainerState, TrainerControl
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
from datetime import datetime
import os
import uuid
from pathlib import Path
from sklearn.model_selection import train_test_split
from collections import Counter
import time
import ast

## Load Data

In [38]:
def convert_embedding_to_list(list_embedding):
    """Parse the textual form of an embedding into a Python object.

    Args:
        list_embedding: String holding a Python literal, e.g. ``"[0.1, 0.2]"``.

    Returns:
        The value produced by ``ast.literal_eval`` (a list for a list literal).
    """
    return ast.literal_eval(list_embedding)

In [39]:
def load_data(dataset_name, embedding_name,
              embeddings_path='C:/Users/jorge/Desktop/UNI/4-CUARTO/4-2-TFG/CODE/Gender-Bias/OpenAI/seacabo_embeddings.csv'):
    """Load the dataset, optionally the variant carrying precomputed OpenAI embeddings.

    Args:
        dataset_name: Path of the plain CSV dataset. Used whenever
            ``embedding_name`` is not ``"text-embedding-3-large"``.
        embedding_name: Name of the embedding model. When it equals
            ``"text-embedding-3-large"`` the embeddings CSV is loaded instead.
        embeddings_path: Path of the CSV with an ``embeddings`` column whose
            cells are Python list literals. Defaults to the location that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        A ``pandas.DataFrame``. For the embeddings variant the ``embeddings``
        column holds parsed Python objects instead of strings.
    """
    if embedding_name == "text-embedding-3-large":
        df = pd.read_csv(embeddings_path)
        # Each cell is stored as text; turn it back into a Python list.
        df['embeddings'] = df['embeddings'].apply(ast.literal_eval)
        return df

    return pd.read_csv(dataset_name)

## Process Data

In [40]:
def filter_by_lang(df, langs=None):
    """Keep only the rows whose ``lang`` value is in the accepted list.

    Args:
        df: DataFrame with a ``lang`` column.
        langs: Iterable of language codes to keep. When ``None`` the original
            fixed list is used: es, cy, ht, in, lt, qam, tl, und.

    Returns:
        A new, independent DataFrame with the matching rows. Returning a copy
        lets later steps add or overwrite columns without writing into a
        slice of the caller's frame.
    """
    if langs is None:
        langs = ['es', 'cy', 'ht', 'in', 'lt', 'qam', 'tl', 'und']

    return df[df['lang'].isin(langs)].copy()

In [41]:
def load_lexicon(file_path):
    """Read a one-term-per-line lexicon file into a set.

    Args:
        file_path: Path of a UTF-8 text file.

    Returns:
        Set with every non-blank line, stripped of surrounding whitespace and
        lower-cased.
    """
    entries = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            term = raw_line.strip()
            if term:
                entries.add(term.lower())
    return entries

def add_special_tokens(df, NEW_TOKENS, lexicon_dir="../Lexicons"):
    """Replace lexicon words in ``full_text`` with the special tokens.

    The text is split on whitespace and each word is compared, lower-cased,
    against the lexicons. Priority is insult/misogyny, then victim, then
    aggressor; words found in no lexicon are kept unchanged.

    Args:
        df: DataFrame with a ``full_text`` column of strings.
        NEW_TOKENS: Sequence whose items 0, 1 and 2 are the replacement tokens
            for insults, victim terms and aggressor terms respectively.
        lexicon_dir: Directory holding the four lexicon files. Defaults to the
            location that was previously hard-coded.

    Returns:
        A copy of ``df`` with ``full_text`` rewritten. The input frame is left
        untouched, so re-running the cell does not compound replacements on
        the caller's data.
    """
    misogyny_terms = load_lexicon(f"{lexicon_dir}/lexicons_train_misogyny_lexicon.txt")
    insult_terms = load_lexicon(f"{lexicon_dir}/lexicons_train_insults_lexicon.txt")
    victim_terms = load_lexicon(f"{lexicon_dir}/lexicons_victim_seacabo.txt")
    aggressor_terms = load_lexicon(f"{lexicon_dir}/lexicons_aggressor_seacabo.txt")

    # Both lexicons map to the same token, so a single membership test is enough.
    offensive_terms = insult_terms | misogyny_terms

    def replace_words_with_tokens(text):
        """Return ``text`` with every lexicon word swapped for its token."""
        replaced = []
        for word in text.split():
            key = word.lower()
            if key in offensive_terms:
                replaced.append(NEW_TOKENS[0])  # [INSULT]
            elif key in victim_terms:
                replaced.append(NEW_TOKENS[1])  # [VICTIM]
            elif key in aggressor_terms:
                replaced.append(NEW_TOKENS[2])  # [AGGRESSOR]
            else:
                replaced.append(word)
        return ' '.join(replaced)

    df = df.copy()
    df['full_text'] = df['full_text'].apply(replace_words_with_tokens)
    return df


In [42]:
def preprocess_function(df, context, tweet_original):
    """Add the cleaned reply and the three context variants as columns.

    Columns written on ``df`` (in this order): ``tweet_respuesta``,
    ``Sin_contexto``, ``Full_context`` and ``Tweet_context``.

    Args:
        df: DataFrame with a ``full_text`` column.
        context: Text appended after the reply in ``Full_context``.
        tweet_original: Text placed before the reply in both context columns.

    Returns:
        The same DataFrame object, with the new columns.
    """
    separator = " [SEP] "

    # Clean every reply with the Spanish tweet preprocessor.
    df['tweet_respuesta'] = df['full_text'].apply(lambda raw: preprocess_tweet(raw, lang="es"))

    # Reply only.
    df['Sin_contexto'] = df['tweet_respuesta']

    # Original tweet followed by the reply; reused by both context columns.
    original_plus_reply = tweet_original + separator + df['tweet_respuesta']
    df['Full_context'] = original_plus_reply + separator + context
    df['Tweet_context'] = original_plus_reply

    return df

In [43]:
def _encode_label_column(df, column, allowed, fill_value=None):
    """Restrict ``column`` to ``allowed`` labels and replace it with integer codes.

    Values outside ``allowed`` become ``fill_value`` when one is given;
    otherwise they become missing and their rows are dropped. Prints the
    label-to-code mapping.

    Returns:
        ``(encoded_df, label_names)`` where ``encoded_df`` is a new frame and
        ``label_names`` lists the labels in code order.
    """
    df = df.copy()
    is_allowed = df[column].isin(allowed)
    if fill_value is None:
        df[column] = df[column].where(is_allowed)
    else:
        df[column] = df[column].where(is_allowed, other=fill_value)

    # Copy again so the code assignment below never targets a slice.
    df = df.dropna(subset=[column]).copy()

    # Codes follow order of first appearance in the column.
    codes, label_names = pd.factorize(df[column])
    df[column] = codes

    label_mapping = dict(zip(label_names, range(len(label_names))))
    print("Label Mapping:", label_mapping)
    return df, label_names


def filter_by_type(df, label_column):
    """Select the rows relevant to ``label_column`` and encode it numerically.

    Supported targets:
        * ``"Análisis General"``: keeps positive and negative comments.
        * ``"Contenido Negativo"``: negative comments only, four content labels.
        * ``"Insultos"``: negative comments with a non-empty insult label;
          labels outside the known three are mapped to ``"Genéricos"``.

    Args:
        df: Annotated DataFrame. It is not modified.
        label_column: One of the three supported target names.

    Returns:
        ``(df, labels_names, num_labels)``: the filtered frame with the target
        column replaced by integer codes, the label names in code order, and
        the number of labels configured for the target.

    Raises:
        ValueError: If ``label_column`` is not a supported target.
    """
    negative_comment = 'Comentario Negativo'
    fill_value = None

    if label_column == "Análisis General":
        etiquetas = ["Comentario Positivo", "Comentario Negativo"]

    elif label_column == "Contenido Negativo":
        df = df.loc[df['Análisis General'] == negative_comment]
        etiquetas = ["Desprestigiar Víctima", "Desprestigiar Acto", "Insultos", "Desprestigiar Deportista Autora"]

    elif label_column == "Insultos":
        df = df.loc[df['Análisis General'] == negative_comment]
        # Keep only rows that actually carry an insult annotation.
        df = df.loc[df['Insultos'].notna() & (df['Insultos'].str.strip() != '')]
        etiquetas = ["Deseo de Dañar", "Genéricos", "Sexistas/misóginos"]
        fill_value = "Genéricos"

    else:
        raise ValueError(
            f"Unsupported label_column {label_column!r}; expected one of "
            "'Análisis General', 'Contenido Negativo', 'Insultos'"
        )

    df, labels_names = _encode_label_column(df, label_column, etiquetas, fill_value)

    # Size of the configured label set, which may exceed the labels present in the data.
    num_labels = len(etiquetas)

    return df, labels_names, num_labels

## Tokenizer

In [44]:
def load_tokenizer(name):
    """Load the tokenizer that matches a supported pretrained model name.

    Args:
        name: Model identifier. Supported values are
            ``dccuchile/bert-base-spanish-wwm-cased``,
            ``PlanTL-GOB-ES/roberta-large-bne``,
            ``bert-base-multilingual-cased``,
            ``FacebookAI/xlm-roberta-base`` and
            ``pysentimiento/robertuito-base-cased``.

    Returns:
        The tokenizer returned by the corresponding ``from_pretrained`` call.

    Raises:
        ValueError: If ``name`` is not one of the supported identifiers.
    """
    if name in ("dccuchile/bert-base-spanish-wwm-cased", "bert-base-multilingual-cased"):
        return BertTokenizer.from_pretrained(name)

    if name == "PlanTL-GOB-ES/roberta-large-bne":
        return RobertaTokenizer.from_pretrained(name)

    if name in ("FacebookAI/xlm-roberta-base", "pysentimiento/robertuito-base-cased"):
        return AutoTokenizer.from_pretrained(name)

    # Fail with a clear message; an unbound local variable error says nothing about the cause.
    raise ValueError(f"Unsupported tokenizer name: {name!r}")

In [45]:
# Build the tokenized inputs for one context column
def prepare_inputs(df, text_column, label_column, tokenizer, padding=None, truncation=None, max_length=None):
    """Tokenize a text column and attach integer labels.

    Args:
        df: DataFrame holding the texts and labels.
        text_column: Name of the column with the texts to tokenize.
        label_column: Name of the label column. It is factorized, so
            categorical values are accepted.
        tokenizer: Callable tokenizer; it receives the list of texts plus the
            ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors='pt'`` keyword arguments.
        padding: Padding setting. ``None`` falls back to the notebook-level
            ``PADDING`` constant.
        truncation: Truncation setting. ``None`` falls back to ``TRUNCATION``.
        max_length: Maximum sequence length. ``None`` falls back to
            ``MAX_LENGTH``.

    Returns:
        Dictionary with every entry produced by the tokenizer plus a
        ``labels`` tensor.
    """
    # Fall back to the notebook constants only when no override is given.
    padding = PADDING if padding is None else padding
    truncation = TRUNCATION if truncation is None else truncation
    max_length = MAX_LENGTH if max_length is None else max_length

    tokenized_data = tokenizer(
        df[text_column].tolist(),
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        return_tensors='pt',
    )

    # Codes follow order of first appearance in the label column.
    label_codes, _ = pd.factorize(df[label_column])

    return {**tokenized_data, 'labels': torch.tensor(label_codes)}


## Prepare Dataset

In [46]:
def create_hf_dataset(tokenized_inputs):
    """Wrap a column-name to values mapping in a ``Dataset``.

    Args:
        tokenized_inputs: Dictionary passed to ``Dataset.from_dict``.

    Returns:
        The resulting ``Dataset``.
    """
    hf_dataset = Dataset.from_dict(tokenized_inputs)
    return hf_dataset

In [47]:
# Split a dataset into train, validation and test partitions
def split_dataset(dataset, test_size=0.1, val_size=0.3, seed=None):
    """Randomly split ``dataset`` into train, validation and test sets.

    Args:
        dataset: Object exposing ``train_test_split`` (a ``Dataset``).
        test_size: Fraction of the whole dataset reserved for testing.
        val_size: Fraction of the whole dataset reserved for validation.
        seed: Seed forwarded to both splits. ``None`` keeps the previous
            unseeded behaviour; pass an integer for reproducible splits.

    Returns:
        ``DatasetDict`` with ``train``, ``validation`` and ``test`` entries.

    Raises:
        ValueError: If the sizes are not fractions in (0, 1) or leave no
            training data.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            f"test_size ({test_size}) and val_size ({val_size}) must be fractions in (0, 1) whose sum is below 1"
        )

    # First carve out the test set.
    holdout_split = dataset.train_test_split(test_size=test_size, seed=seed)

    # val_size is relative to the whole dataset, so rescale it to the remainder.
    inner_split = holdout_split['train'].train_test_split(test_size=val_size / (1 - test_size), seed=seed)

    return DatasetDict({
        'train': inner_split['train'],
        'validation': inner_split['test'],
        'test': holdout_split['test']
    })


In [48]:
def stratified_train_test_val_split(dataset, test_size, val_size, seed=0):
    """Split ``dataset`` into stratified train, validation and test sets.

    Stratification uses the ``labels`` column, so each partition keeps
    roughly the class proportions of the full dataset.

    Args:
        dataset: ``Dataset`` with a ``labels`` column.
        test_size: Fraction of the whole dataset reserved for testing.
        val_size: Fraction of the whole dataset reserved for validation.
        seed: Random state used for both splits.

    Returns:
        ``(dataset_dict, class_counts)`` where ``dataset_dict`` is a
        ``DatasetDict`` with ``train``, ``validation`` and ``test`` entries
        and ``class_counts`` maps each partition name to a ``Counter`` of its
        labels.

    Raises:
        ValueError: If the sizes are not fractions in (0, 1) or leave no
            training data.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            f"test_size ({test_size}) and val_size ({val_size}) must be fractions in (0, 1) whose sum is below 1"
        )

    # scikit-learn works on pandas objects.
    df = dataset.to_pandas()

    # Carve out the test set first.
    train_val_df, test_df = train_test_split(df, test_size=test_size, stratify=df['labels'], random_state=seed)

    # val_size is relative to the whole dataset, so rescale it to the remainder.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size / (1.0 - test_size),
        stratify=train_val_df['labels'],
        random_state=seed,
    )

    # preserve_index=False keeps the shuffled pandas index out of the datasets;
    # otherwise it is stored as an extra '__index_level_0__' column.
    partitions = DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'validation': Dataset.from_pandas(val_df, preserve_index=False),
        'test': Dataset.from_pandas(test_df, preserve_index=False)
    })

    # Number of examples per class in each partition.
    class_counts = {
        'train': Counter(train_df['labels']),
        'validation': Counter(val_df['labels']),
        'test': Counter(test_df['labels'])
    }

    return partitions, class_counts


## Model

In [49]:
def load_model(name, num_labels):
    """Load a sequence-classification model for a supported pretrained name.

    Args:
        name: Model identifier. Supported values are
            ``dccuchile/bert-base-spanish-wwm-cased``,
            ``PlanTL-GOB-ES/roberta-large-bne``,
            ``bert-base-multilingual-cased``,
            ``FacebookAI/xlm-roberta-base`` and
            ``pysentimiento/robertuito-base-cased``.
        num_labels: Number of output classes for the classification head.

    Returns:
        The model returned by the corresponding ``from_pretrained`` call.

    Raises:
        ValueError: If ``name`` is not one of the supported identifiers.
    """
    if name in ("dccuchile/bert-base-spanish-wwm-cased", "bert-base-multilingual-cased"):
        return BertForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    if name == "PlanTL-GOB-ES/roberta-large-bne":
        return RobertaForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    if name == "FacebookAI/xlm-roberta-base":
        return XLMRobertaForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    if name == "pysentimiento/robertuito-base-cased":
        return AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    # Fail with a clear message; an unbound local variable error says nothing about the cause.
    raise ValueError(f"Unsupported model name: {name!r}")

## Training

In [50]:
def compute_metrics(eval_pred):
    """Compute global and per-class classification metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``; predictions are the argmax of
            ``logits`` over the last axis.

    Returns:
        Dictionary with ``accuracy``, the weighted F1/precision/recall, the
        confusion matrix as nested lists, and for every class in the report
        the entries ``<label>_precision``, ``<label>_recall``, ``<label>_f1``
        and ``<label>_support``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    report = classification_report(labels, predictions, output_dict=True)
    weighted = report['weighted avg']

    metrics = {
        'accuracy': report['accuracy'],
        'weighted_f1': weighted['f1-score'],
        'weighted_precision': weighted['precision'],
        'weighted_recall': weighted['recall'],
        # Not a scalar; stored as nested lists so it can be serialized.
        'confusion_matrix': confusion_matrix(labels, predictions).tolist()
    }

    # Every report entry that is not an aggregate corresponds to one class.
    aggregate_entries = ("accuracy", "macro avg", "weighted avg")
    for label, scores in report.items():
        if label in aggregate_entries:
            continue
        metrics.update({
            f'{label}_precision': scores['precision'],
            f'{label}_recall': scores['recall'],
            f'{label}_f1': scores['f1-score'],
            f'{label}_support': scores['support'],
        })

    return metrics

In [51]:
def load_training_args(context_type, learning_rate_scheduler_type="linear", num_train_epochs=3, per_device_train_batch_size=16, per_device_eval_batch_size=16, warmup_steps=500, weight_decay=0.01, logging_steps=10):
    """Build the ``TrainingArguments`` for one dataset/context type.

    Evaluation runs once per epoch and no checkpoints are saved.

    Args:
        context_type: Name used for the ``./results/<context_type>`` output
            directory and the ``./logs/<context_type>`` logging directory.
        learning_rate_scheduler_type: Scheduler name forwarded as
            ``lr_scheduler_type`` (for example ``"linear"`` or ``"cosine"``).
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Training batch size per device.
        per_device_eval_batch_size: Evaluation batch size per device.
        warmup_steps: Number of warm-up steps.
        weight_decay: Weight decay coefficient.
        logging_steps: Interval, in steps, between log entries.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    training_args = TrainingArguments(
        output_dir=f'./results/{context_type}',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        # Previously accepted but never forwarded, so the scheduler choice had no effect.
        lr_scheduler_type=learning_rate_scheduler_type,
        logging_dir=f'./logs/{context_type}',
        logging_steps=logging_steps,
        evaluation_strategy="epoch",
        save_strategy="no",  # Do not save models
        save_total_limit=0,  # Do not keep checkpoints
        # compute_metrics reports 'weighted_f1'; the Trainer prefixes it with 'eval_'.
        metric_for_best_model="eval_weighted_f1",
        load_best_model_at_end=False,  # Do not reload the best model when training ends
    )

    return training_args

In [52]:
class SaveResultsCallback(TrainerCallback):
    """Trainer callback that collects evaluation metrics and appends them to an Excel sheet.

    One row is collected per evaluation. The collected rows are written to
    the sheet named after ``type_d`` when training ends.
    """

    def __init__(self, excel_path, training_args, run_id, additional_info, type_d, num_labels):
        """Store the run description that is repeated on every logged row.

        Args:
            excel_path: Path of the Excel workbook to create or extend.
            training_args: Object whose attribute dictionary is logged.
            run_id: Identifier written in the ``run_id`` column.
            additional_info: Extra key/value pairs added to every row.
            type_d: Dataset/context type; also used as the sheet name.
            num_labels: Number of classes, used to build the per-class headers.
        """
        self.excel_path = excel_path
        self.training_args = vars(training_args)  # Attribute dictionary of the training arguments object
        self.run_id = run_id
        self.additional_info = additional_info  # Assumed to be a dictionary
        self.type_d = type_d  # Type of dataset or context
        self.rows = []  # One dictionary per evaluation, filled by on_evaluate
        self.initialized = False  # Becomes True after the first on_train_begin on the local main process
        self.num_labels = num_labels

    def on_train_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Create the workbook once, only on the local main process."""
        if state.is_local_process_zero:
            if not self.initialized:
                self.init_excel()
                self.initialized = True

    def init_excel(self):
        """Create the workbook with a header-only sheet when the file does not exist yet."""
        # Initialize the Excel file with proper headers if it does not exist
        if not Path(self.excel_path).exists():
            with pd.ExcelWriter(self.excel_path, engine='openpyxl') as writer:
                # Build the metric headers dynamically from the number of classes
                metric_headers = ['eval_loss', 'eval_accuracy', 'eval_weighted_f1', 'eval_weighted_precision', 'eval_weighted_recall']
                metric_headers.append('eval_confusion_matrix')
                # Add the metric headers of every class
                for i in range(self.num_labels):
                    metric_headers.extend([
                        f'eval_{i}_precision',
                        f'eval_{i}_recall',
                        f'eval_{i}_f1',
                        f'eval_{i}_support'
                    ])
                
                metric_headers.append('eval_runtime')
                metric_headers.append('eval_samples_per_second')
                metric_headers.append('eval_steps_per_second')

                # Full header row written when the Excel file is created
                headers = ["run_id", "type_d", *self.additional_info.keys(), "epoch", *metric_headers, *self.training_args.keys()]
                df_header = pd.DataFrame(columns=headers)
                df_header.to_excel(writer, sheet_name=self.type_d, index=False)

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics, **kwargs):
        """Collect one row with the metrics of the evaluation that just finished."""
        if state.is_local_process_zero:
            # Prepare data dictionary for the current epoch
            data = {"run_id": self.run_id, "type_d": self.type_d, **self.additional_info, "epoch": state.epoch, **metrics, **self.training_args}
            self.rows.append(data)

    def on_train_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Write the collected rows when training ends (local main process only)."""
        # NOTE(review): this is the only place that writes to disk, so rows collected
        # by evaluations that run after training ends are not saved here - confirm intended.
        if state.is_local_process_zero:
            self.save_to_excel()

    def save_to_excel(self):
        """Append the collected rows below the existing content of the ``type_d`` sheet."""
        # Save collected data to Excel, appending to the existing sheet or creating it if not exists
        with pd.ExcelWriter(self.excel_path, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
            df = pd.DataFrame(self.rows)
            # Start after the last used row; 0 means the sheet is new and needs its own header
            start_row = writer.sheets[self.type_d].max_row if self.type_d in writer.book.sheetnames else 0
            # NOTE(review): when the sheet exists the rows are written without a header, in this
            # DataFrame's column order - presumably matching the header from init_excel; verify.
            df.to_excel(writer, sheet_name=self.type_d, index=False, header=not bool(start_row), startrow=start_row)
            print(f"Data logged to {self.excel_path} in sheet {self.type_d}")

In [53]:
def log_data(excel_path, run_id, eval_results, training_args, additional_info, type_d):
    """Append one row describing a finished run to an Excel workbook.

    The row combines the run id, the training arguments, the evaluation
    results and the additional information. It is appended to the sheet named
    after ``type_d``; the workbook is created when missing.

    Args:
        excel_path: Path of the Excel workbook.
        run_id: Identifier stored in the ``Run_ID`` column.
        eval_results: Dictionary of evaluation results, or an object whose
            attribute dictionary is used.
        training_args: Dictionary of training arguments, or an object whose
            attribute dictionary is used.
        additional_info: Dictionary with extra columns for the row.
        type_d: Dataset/context type, used as the sheet name.
    """
    sheet_name = f"{type_d}"

    # Append mode needs an existing workbook, so create one with a placeholder sheet.
    if not Path(excel_path).exists():
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            pd.DataFrame().to_excel(writer, index=False, sheet_name='Sheet1')

    # Close the workbook handle before reopening the file for writing.
    with pd.ExcelFile(excel_path) as workbook:
        if sheet_name in workbook.sheet_names:
            df = pd.read_excel(workbook, sheet_name=sheet_name)
        else:
            df = pd.DataFrame()

    training_args_dict = training_args if isinstance(training_args, dict) else vars(training_args)
    eval_results_dict = eval_results if isinstance(eval_results, dict) else vars(eval_results)

    # Later sources overwrite earlier ones when keys collide.
    data = {
        'Run_ID': run_id,
        **training_args_dict,
        **eval_results_dict,
        **additional_info
    }

    # Anything that is not an int, float or string is stored as text.
    data = {
        key: value if isinstance(value, (int, float, str)) else str(value)
        for key, value in data.items()
    }

    # Public pandas API instead of the private DataFrame._append.
    new_row = pd.DataFrame([data])
    df = new_row if df.columns.empty else pd.concat([df, new_row], ignore_index=True)

    with pd.ExcelWriter(excel_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Data logged to {excel_path} in sheet {sheet_name}")

# Pipeline

In [57]:
# Path of the annotated dataset (CSV)
DATA = "../data/BBDD_SeAcabo.csv"

# Target column to classify
LABEL_COLUMN = "Insultos" # ["Análisis General", "Contenido Negativo", "Insultos"]

# Special tokens that replace lexicon words: insult, victim, aggressor (in this order)
NEW_TOKENS = ["[INSULT]", "[VICTIM]", "[AGGRESSOR]"]

# Text columns (context variants) to train on
TYPES_DATASET = ["Sin_contexto"] # ["Sin_contexto", "Tweet_context", "Full_context"]

# Original tweet by Alexia Putellas that the replies respond to
TWEET_ORIGINAL = "Esto es inaceptable. Se acabó. Contigo compañera @Jennihermoso"

# Background text appended in the "Full_context" variant
# (the literal keeps its leading/trailing newlines and indentation)
CONTEXT =   """
            En agosto de 2023, tras la victoria de la Selección femenina de fútbol de España en la Copa Mundial Femenina de Fútbol de 2023, durante la celebración en la entrega de las medallas y tras abrazar efusivamente a varias jugadoras, Luis Rubiales besó en los labios a la centrocampista Jennifer Hermoso mientras sujetaba su cabeza con las manos. Hermoso lo denunció ante la Fiscalía por acoso sexual, coacciones y agresión sexual. La Fiscalía presentó una demanda contra Rubiales ante la Audiencia Nacional en Madrid
            """

# Pretrained model to fine-tune
MODEL_NAME = "PlanTL-GOB-ES/roberta-large-bne" #["dccuchile/bert-base-spanish-wwm-cased", "PlanTL-GOB-ES/roberta-large-bne", "bert-base-multilingual-cased", "FacebookAI/xlm-roberta-base", "pysentimiento/robertuito-base-cased"]

# Hyperparameters (tokenization and training)
PADDING = True
TRUNCATION = True
MAX_LENGTH = 512
NUM_TRAIN_EPOCHS = 10
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 8
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 10
LEARNING_RATE_SCHEDULER_TYPE = "cosine" # "linear", "cosine"

# Prefix of the Excel result files
EXCEL_PATH = 'DL2'


models = [MODEL_NAME]
# The loop variable is a model *name*. It was previously called `model`, was never
# used, and was then overwritten by the loaded model object inside the loop.
for model_name in models:

    # Load data
    df = load_data(dataset_name=DATA, embedding_name=None)

    # Filter by lang
    df = filter_by_lang(df)

    # Add special tokens
    df = add_special_tokens(df, NEW_TOKENS)

    # Preprocess data
    df = preprocess_function(df, context=CONTEXT, tweet_original=TWEET_ORIGINAL)

    # Labels
    df, labels_names, num_labels = filter_by_type(df, LABEL_COLUMN)
    print(num_labels)

    # Tokenizer, extended with the special tokens inserted into the texts
    tokenizer = load_tokenizer(name=model_name)
    tokenizer.add_tokens(NEW_TOKENS)

    for type_d in TYPES_DATASET:
        print("-"*10)
        print(f"{type_d=}")
        print("-"*10)

        start_time = time.time()
        timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

        # Prepare input
        inputs_dataset = prepare_inputs(df, text_column=type_d, label_column=LABEL_COLUMN, tokenizer=tokenizer)

        # Create datasets
        dataset = create_hf_dataset(inputs_dataset)

        # Split dataset
        dataset, class_counts = stratified_train_test_val_split(dataset, test_size=0.2, val_size=0.1)

        # Additional information to log with the run
        run_id = str(uuid.uuid4())
        additional_info = {
            'Model_Description': model_name,
            'Data_File': DATA,
            'Type': type_d,
            'Class_Counts': class_counts,
        }

        # Load model; the embedding matrix must grow to cover the added tokens
        model = load_model(name=model_name, num_labels=num_labels)
        model.resize_token_embeddings(len(tokenizer))

        # Train
        training_args = load_training_args(
            context_type=type_d,
            # The configured scheduler was previously never handed to the training arguments.
            learning_rate_scheduler_type=LEARNING_RATE_SCHEDULER_TYPE,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
            warmup_steps=WARMUP_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_steps=LOGGING_STEPS,
        )

        save_results_callback = SaveResultsCallback(f"{EXCEL_PATH}_Training_{LABEL_COLUMN}.xlsx", training_args, run_id, additional_info, type_d, num_labels)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            compute_metrics=compute_metrics,
            callbacks=[save_results_callback]
        )
        trainer.train()

        # Eval on the held-out test partition
        eval_results = trainer.evaluate(dataset['test'])
        results_df = pd.DataFrame([eval_results])

        # Log the results and the configuration of this run
        end_time = time.time()
        exec_time = end_time - start_time
        additional_info['Total Time'] = exec_time
        additional_info['Timestamp'] = timestamp
        log_data(f"{EXCEL_PATH}_{LABEL_COLUMN}.xlsx", run_id, eval_results, training_args, additional_info, type_d)




Label Mapping: {'Sexistas/misóginos': 0, 'Genéricos': 1, 'Deseo de Dañar': 2}
3
----------
type_d='Sin_contexto'
----------


KeyboardInterrupt: 

# Pipeline with OpenAI Embeddings

In [None]:
import time
from datetime import datetime
import uuid
import pandas as pd
import numpy as np
from transformers import Trainer
from datasets import Dataset, DatasetDict, ClassLabel


def load_embeddings(embeddings_path):
    """Load a CSV of embeddings and parse the ``embeddings`` column into arrays.

    Each cell is expected to hold a bracketed list of numbers. Both
    comma-separated (``"[0.1, 0.2]"``) and whitespace-separated
    (``"[0.1 0.2]"``) layouts are accepted, so the same file can be read here
    and by code that parses the cells as Python list literals.

    Args:
        embeddings_path: Path of the CSV file.

    Returns:
        DataFrame whose ``embeddings`` column holds 1-D float NumPy arrays.
    """
    def parse_embedding(cell):
        """Turn one bracketed text cell into a float array."""
        # Treat commas as whitespace so both layouts split the same way.
        values = cell.strip('[]').replace(',', ' ').split()
        return np.array(values, dtype=float)

    embeddings_df = pd.read_csv(embeddings_path)
    embeddings_df['embeddings'] = embeddings_df['embeddings'].apply(parse_embedding)
    return embeddings_df


def prepare_inputs_openai(df, label_column):
    """Build the dataset dictionary from precomputed embeddings.

    No tokenization happens here: the ``embeddings`` column is used as is.

    Args:
        df: DataFrame with an ``embeddings`` column and the label column.
        label_column: Name of the column holding the labels.

    Returns:
        Dictionary with ``input_ids`` (the embeddings as a list) and
        ``labels`` (the label values as a list).
    """
    embedding_rows = df['embeddings'].tolist()
    label_values = df[label_column].tolist()
    return {'input_ids': embedding_rows, 'labels': label_values}

# Path to your embeddings CSV file
# NOTE(review): absolute, machine-specific path - consider a path relative to the project.
EMBEDDINGS_PATH = 'C:/Users/jorge/Desktop/UNI/4-CUARTO/4-2-TFG/CODE/Gender-Bias/OpenAI/seacabo_embeddings.csv'
# Embedding name
EMBEDDING_NAME = "text-embedding-3-large"

# Load the embeddings
embeddings_df = load_embeddings(EMBEDDINGS_PATH)

# Configuration parameters
DATA = "../data/BBDD_SeAcabo.csv"  # Path of the annotated dataset (CSV)
LABEL_COLUMN = 'Análisis General'  # Target column to classify
NEW_TOKENS = ["[INSULT]", "[VICTIM]", "[AGGRESSOR]"]  # Special tokens: insult, victim, aggressor
TYPES_DATASET = ["Sin_contexto", "Tweet_context", "Full_context"]  # Context variants to iterate over
TWEET_ORIGINAL = "Esto es inaceptable. Se acabó. Contigo compañera @Jennihermoso"  # Original tweet the replies respond to
# Background text appended in the "Full_context" variant
# (the literal keeps its leading/trailing newlines and indentation)
CONTEXT = """
            En agosto de 2023, tras la victoria de la Selección femenina de fútbol de España en la Copa Mundial Femenina de Fútbol de 2023, durante la celebración en la entrega de las medallas y tras abrazar efusivamente a varias jugadoras, Luis Rubiales besó en los labios a la centrocampista Jennifer Hermoso mientras sujetaba su cabeza con las manos. Hermoso lo denunció ante la Fiscalía por acoso sexual, coacciones y agresión sexual. La Fiscalía presentó una demanda contra Rubiales ante la Audiencia Nacional en Madrid
            """
# Pretrained model to fine-tune
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased" #["dccuchile/bert-base-spanish-wwm-cased", "PlanTL-GOB-ES/roberta-large-bne", "bert-base-multilingual-cased", "FacebookAI/xlm-roberta-base", "pysentimiento/robertuito-base-cased"]

# Hyperparameters (tokenization and training)
PADDING = True
TRUNCATION = True
MAX_LENGTH = 512
NUM_TRAIN_EPOCHS = 10
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 8
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 10
LEARNING_RATE_SCHEDULER_TYPE = "cosine" # "linear", "cosine"

# Prefix of the Excel result files
EXCEL_PATH = 'DL2'

# Seed shared by both splits so the partitions are reproducible between runs
SPLIT_SEED = 0

models = [MODEL_NAME]
# The loop variable is a model *name*. It was previously called `model`, was never
# used, and was then overwritten by the loaded model object inside the loop.
for model_name in models:
    # Load data
    df = load_data(dataset_name=DATA, embedding_name=EMBEDDING_NAME)

    # Filter by lang
    df = filter_by_lang(df)

    # Add special tokens
    df = add_special_tokens(df, NEW_TOKENS)

    # Preprocess data
    df = preprocess_function(df, context=CONTEXT, tweet_original=TWEET_ORIGINAL)

    # Labels
    df, labels_names, num_labels = filter_by_type(df, LABEL_COLUMN)
    print(f"{num_labels=}")

    for type_d in TYPES_DATASET:
        print("-"*10)
        print(f"{type_d=}")
        print("-"*10)

        # Time each dataset type on its own so 'Total Time' is not cumulative.
        start_time = time.time()
        timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

        # Prepare input from the precomputed embeddings (no tokenization).
        # NOTE(review): the inputs do not depend on type_d, so every iteration trains on
        # the same data; the embeddings are also passed as 'input_ids' to the
        # sequence-classification model from load_model - confirm both are intended.
        inputs_dataset = prepare_inputs_openai(df, label_column=LABEL_COLUMN)

        # Create datasets
        dataset = Dataset.from_dict(inputs_dataset)
        dataset = dataset.class_encode_column("labels")

        # Split dataset into train (70%), validation (15%) and test (15%) sets
        train_val_split = dataset.train_test_split(test_size=0.3, stratify_by_column='labels', seed=SPLIT_SEED)
        val_test_split = train_val_split['test'].train_test_split(test_size=0.5, stratify_by_column='labels', seed=SPLIT_SEED)

        train_dataset = train_val_split['train']
        val_dataset = val_test_split['train']
        test_dataset = val_test_split['test']

        # Create DatasetDict
        datasets = DatasetDict({
            'train': train_dataset,
            'validation': val_dataset,
            'test': test_dataset
        })

        # Additional information to log with the run
        run_id = str(uuid.uuid4())
        additional_info = {
            'Model_Description': model_name,
            'Data_File': DATA,
            'Type': type_d,
            # Examples per class in each partition (this previously stored the label names).
            'Class_Counts': {
                'train': Counter(train_dataset['labels']),
                'validation': Counter(val_dataset['labels']),
                'test': Counter(test_dataset['labels']),
            }
        }

        # Load model
        model = load_model(name=model_name, num_labels=num_labels)

        # Define training arguments
        training_args = load_training_args(
            context_type=type_d,
            # The configured scheduler was previously never handed to the training arguments.
            learning_rate_scheduler_type=LEARNING_RATE_SCHEDULER_TYPE,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
            warmup_steps=WARMUP_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_steps=LOGGING_STEPS
        )

        save_results_callback = SaveResultsCallback(f"{EXCEL_PATH}_Training_{LABEL_COLUMN}.xlsx", training_args, run_id, additional_info, type_d, num_labels)

        # Define Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=datasets['train'],
            eval_dataset=datasets['validation'],
            compute_metrics=compute_metrics,
            callbacks=[save_results_callback]
        )

        # Train
        trainer.train()

        # Evaluate on the held-out test partition
        eval_results = trainer.evaluate(eval_dataset=datasets['test'])
        results_df = pd.DataFrame([eval_results])

        # Log data
        end_time = time.time()
        exec_time = end_time - start_time
        additional_info['Total Time'] = exec_time
        additional_info['Timestamp'] = timestamp
        log_data(f"{EXCEL_PATH}_{LABEL_COLUMN}.xlsx", run_id, eval_results, training_args, additional_info, type_d)


# Probar Modelo

- Cargar el Modelo y el Tokenizer

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Path to the checkpoint directory
# NOTE(review): load_training_args sets save_strategy="no", so confirm this checkpoint exists.
checkpoint_path = "./results/checkpoint-1000"

# Load the tokenizer
# NOTE(review): the training pipeline calls tokenizer.add_tokens(NEW_TOKENS) and resizes the
# model embeddings; this tokenizer is loaded without those tokens, and its class is fixed
# to the Spanish BERT model regardless of MODEL_NAME - confirm it matches the checkpoint.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Load the model from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)


- Preparar el Texto para la Predicción

In [None]:
def prepare_prediction_input(text, tokenizer):
    """Tokenize ``text`` for inference with the notebook's tokenization settings.

    Args:
        text: Text to classify.
        tokenizer: Callable tokenizer.

    Returns:
        Whatever the tokenizer returns when asked for PyTorch tensors, using
        the ``MAX_LENGTH``, ``TRUNCATION`` and ``PADDING`` constants.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "max_length": MAX_LENGTH,
        "truncation": TRUNCATION,
        "padding": PADDING,
    }
    return tokenizer(text, **tokenizer_options)


- Realizar la Predicción

In [None]:
def predict(text, tokenizer, model):
    """Classify one text and return the predicted class with its probabilities.

    Args:
        text: Text to classify.
        tokenizer: Tokenizer passed to ``prepare_prediction_input``.
        model: Sequence-classification model whose output exposes ``logits``.

    Returns:
        ``(predicted_class_index, probabilities)`` where ``probabilities`` is a
        NumPy array with the softmax of the logits.
    """
    model_inputs = prepare_prediction_input(text, tokenizer)

    # Run the inputs on the model's own device; a model placed on the GPU
    # cannot consume tensors that stayed on the CPU.
    device = next(model.parameters()).device
    model_inputs = {key: value.to(device) for key, value in model_inputs.items()}

    model.eval()  # Evaluation mode
    with torch.no_grad():  # No gradients are needed for inference
        outputs = model(**model_inputs)

    logits = outputs.logits

    # Convert the logits into probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Index of the most probable class (a single input text is assumed)
    predicted_class_index = probabilities.argmax().item()

    # .cpu() is required before .numpy() when the tensor is on the GPU
    return predicted_class_index, probabilities.cpu().numpy()

# Example usage
tweet = "@Jennihermoso, ánimo campeona"
# Use the notebook constants; `tweet_original` and `contexto` were never defined.
text = TWEET_ORIGINAL + " [SEP] " + tweet + " [SEP] " + CONTEXT
print(text)
predicted_class, probabilities = predict(text, tokenizer, model)
print(f"Clase predicha: {predicted_class}")
print(f"Probabilidades: {probabilities}")
