# Finetune Embeddings

### Libraries

In [1]:
from utils import *
from load_data import *
from process_data import *
from create_embeddings import *
from split_data import *
from create_model import *
from evaluate_model import *
from run_to_excel import *

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from transformers import BertTokenizer, DistilBertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel, XLMRobertaModel, AutoTokenizer

### Load Data

In [3]:
data = "data/BBDD_SeAcabo.csv" # "data/BBDD_SeAcabo.csv" "AMI_IBEREVAL2018/es_AMI_TrainingSet_NEW.csv"
df = load_data(data)

### Load Tokenizer and Model

Select Name of the Embedding:

In [4]:
# Select Name of the Embedding:
options = ["roberta", "beto", "bert-multi", "xlm-roberta-base"]

# Crear el dropdown widget
embedding_name = widgets.Dropdown(
    options=options,
    value=options[0],  # valor inicial seleccionado
    description='Embedding:',
    disabled=False,
)

# Mostrar el dropdown
display(embedding_name)

Dropdown(description='Embedding:', options=('roberta', 'beto', 'bert-multi', 'xlm-roberta-base'), value='rober…

In [5]:
if embedding_name.value == "roberta":
    tokenizer = RobertaTokenizer.from_pretrained('PlanTL-GOB-ES/roberta-large-bne') 
    model = RobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-large-bne") 

elif embedding_name.value == "beto":
    tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased') #beto
    model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-cased") #beto

elif embedding_name.value == "bert-multi":
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')

elif embedding_name.value == "xlm-roberta-base":
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")

# Asegurarse de que el modelo se ejecute en la GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50262, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

### Process Data

In [6]:
type_id = "insultos" # ["analisis_general", "contenido_negativo", "insultos"]
balance = "None" # ["downsampling", "upsampling", "smote", "adasyn", "None"]
df, labels_names = process_data(df, type_id, balance)


Soporte de etiquetas con nombres originales:
Genéricos: 315
Sexistas/misóginos: 53
Deseo de Dañar: 67


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Insultos'] = df['Insultos'].where(df['Insultos'].isin(etiquetas), other="Genéricos")


In [7]:
texts = df['full_text_processed'].tolist()
labels = df['label'].values

# Tokenizar todos los textos
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, # Agregar '[CLS]' y '[SEP]'
                        max_length = 64,           # Longitud máxima para padear/truncar
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construir las attention masks
                        return_tensors = 'pt',     # Retornar tensores pytorch
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convertir las listas en tensores
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Crear el DataLoader
batch_size = 32
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Finetuning

In [8]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Asumiendo que 'dataloader' es tu DataLoader de entrenamiento
# Aquí se realiza el entrenamiento por épocas, este es un ejemplo muy básico
model.train()
for epoch_i in range(0, 4):  # Para fine-tuning, unas pocas épocas (~2-4) suelen ser suficientes
    for step, batch in enumerate(dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0]
        loss.backward()

        optimizer.step()

# Guardar el modelo fine-tuneado
model.save_pretrained(f"new_embeddings/model_{type_id}_{embedding_name.value}")
tokenizer.save_pretrained(f"new_embeddings/embedding_{type_id}_{embedding_name.value}")



TypeError: RobertaModel.forward() got an unexpected keyword argument 'labels'