In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the public training split, skipping malformed CSV rows, and discard the
# metadata columns that the rest of this notebook does not use.
df_train = pd.read_csv(
    "politicES_phase_2_train_public.csv",
    on_bad_lines='skip',
).drop(columns=['label', 'ideology_binary', 'gender', 'profession'])
# For quick experiments on limited compute, subsample here (e.g. df_train[:3000]).

Si estamos en Google Colab:

In [None]:
!pip install datasets
!pip install accelerate



##  Clasificador

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, generation, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import tensorflow as tf
from datasets import Dataset
import tqdm as notebook_tqdm
from huggingface_hub import notebook_login, Repository

# Hub checkpoint used as the starting point for fine-tuning.
model_name = 'AIDA-UPM/BERTuit-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence classifier with a 4-way output; from_tf=True makes the loader read
# the TensorFlow weights of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_tf=True,
    num_labels=4,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the ideology classes as integer ids, overwriting the column in place.
# NOTE: this cell is not safe to re-run on its own; a second run would refit
# the encoder on the already-encoded ids and lose the original class values
# stored in label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['ideology_multiclass'])
df_train['ideology_multiclass'] = label_encoder.transform(df_train['ideology_multiclass'])

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset so it can be tokenized
# and handed to the model.
huggingface_dataset = Dataset.from_pandas(df=df_train)
huggingface_dataset

Dataset({
    features: ['ideology_multiclass', 'tweet'],
    num_rows: 180000
})

In [None]:
# Length, in tokens, of the longest tweet; used to decide whether truncation
# is needed. All tweets are tokenized in one batched call instead of one
# tokenizer call per tweet, and the ids are counted directly rather than
# materializing the token strings.
tweet_token_ids = tokenizer(list(huggingface_dataset['tweet']))['input_ids']
# default=0 keeps the cell from raising on an empty dataset.
MAX_LENGTH = max((len(ids) for ids in tweet_token_ids), default=0)
print(MAX_LENGTH)

Como la longitud máxima (en tokens) es menor de 512, no hace falta truncar.

In [None]:
def tokenize_function(example):
    """Tokenize the ``'tweet'`` field of a dataset example or batch.

    Args:
        example: Mapping with a ``'tweet'`` entry; that entry is passed
            unchanged to the module-level ``tokenizer``.

    Returns:
        The tokenizer's output for the given text. No padding or truncation
        arguments are passed.
    """
    tweets = example['tweet']
    return tokenizer(tweets)

In [None]:
# Tokenize the whole dataset; batched=True passes groups of rows to
# tokenize_function instead of one row at a time.
tokenized_dataset = huggingface_dataset.map(function=tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['ideology_multiclass', 'tweet', 'input_ids', 'attention_mask'],
    num_rows: 180000
})

In [None]:
# Keep only the columns the model needs: rename the target column to "labels"
# and drop the raw tweet text.
tokenized_dataset = (
    tokenized_dataset
    .rename_column("ideology_multiclass", "labels")
    .remove_columns("tweet")
)
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 180000
})

In [None]:

from datasets import DatasetDict

# Hold out 20% of the data for validation; this cell does not create a
# separate test split. The fixed seed (same value as the one passed to
# TrainingArguments) makes the shuffle reproducible across kernel restarts.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Group the subsets in a DatasetDict so they can be accessed by name.
final_datasets = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
})

# Data collator for dynamic padding: batches are padded when they are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

def compute_metrics(p):
    """Compute the metrics reported at each evaluation during training.

    Args:
        p: Pair ``(predictions, labels)``. The predicted class of each row is
            the argmax of ``predictions`` along the last axis.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are computed with ``average='weighted'``.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=-1)
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
    }



# Training configuration: evaluate, checkpoint and log once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    seed=42,
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# NOTE(review): with 2 epochs and one evaluation per epoch, a patience of 3
# evaluations cannot be exhausted, so this callback never stops training early
# under the current settings.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

# Trainer wiring the model, the data splits, the collator and the metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_datasets['train'],
    eval_dataset=final_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the model and keep the trainer's log records for plotting.
trainer.train()
training_stats = trainer.state.log_history

# Records with a 'loss' key hold the training loss; records with an
# 'eval_loss' key hold the validation loss.
training_loss = [e['loss'] for e in training_stats if 'loss' in e]
validation_loss = [e['eval_loss'] for e in training_stats if 'eval_loss' in e]

# Logging and evaluation both run once per epoch, so the i-th value of each
# list belongs to epoch i + 1; plot against 1-based epoch numbers so the
# "Epochs" axis matches the epoch column of the training log.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(training_loss) + 1), training_loss, label='Training loss')
ax.plot(range(1, len(validation_loss) + 1), validation_loss, label='Validation loss')
ax.set_xticks(range(1, max(len(training_loss), len(validation_loss)) + 1))
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# Final metrics on the validation split (the only held-out split defined in
# final_datasets).
test_results = trainer.evaluate(eval_dataset=final_datasets['validation'])
print("Resultados en el conjunto de validacion:", test_results)


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0338,0.920942,0.607667,0.5983,0.615455,0.607667
2,0.7909,0.902419,0.633,0.631442,0.631889,0.633


In [None]:
# Log in to the Hugging Face Hub from the notebook before the push_to_hub
# uploads in the following cells.
notebook_login()

In [None]:
# Save the fine-tuned model locally under "ideology_ft" and upload it to the Hub.
model.save_pretrained(save_directory="ideology_ft", push_to_hub=True)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# Save the tokenizer files next to the model and upload them to the Hub as well.
tokenizer.save_pretrained(save_directory="ideology_ft", push_to_hub=True)

('ideology_ft/tokenizer_config.json',
 'ideology_ft/special_tokens_map.json',
 'ideology_ft/vocab.json',
 'ideology_ft/merges.txt',
 'ideology_ft/added_tokens.json',
 'ideology_ft/tokenizer.json')