In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import torch
import tensorflow as tf
from datasets import Dataset
import tqdm as notebook_tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the PoliticES phase-2 test CSV (malformed rows are skipped by the
# parser) and keep only the first 4000 rows for evaluation.
df_test = pd.read_csv(
    "dataset ideology/politicES_phase_2_test_codalab.csv",
    on_bad_lines='skip',
).head(4000)

In [11]:
# Discard the 'label' column; the cells visible below take their targets from
# 'ideology_multiclass' instead.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df_test = df_test.drop(columns='label', errors='ignore')
df_test

Unnamed: 0,gender,profession,ideology_binary,ideology_multiclass,tweet
0,male,journalist,right,moderate_right,"Ayer, en Guadalajara: feministas arrojaban áci..."
1,male,journalist,right,moderate_right,"“Chile Vamos”, o sea, la centro-derechita coba..."
2,male,journalist,right,moderate_right,"Se está tratando de silenciar que en Bolivia, ..."
3,male,journalist,right,moderate_right,"@user Lucía, lo siento pero no es así. Hebe ja..."
4,male,journalist,right,moderate_right,JxC no dio quórum para eliminar el impuesto a ...
...,...,...,...,...,...
3995,male,politician,left,left,La diversidad y la pluralidad es una realidad ...
3996,male,politician,left,left,El doble rasero con el que se tratan los confl...
3997,male,politician,left,left,Esperando la condena de @user a los insultos q...
3998,male,politician,left,left,Un minuto. Un minuto esperando dentro de un Ro...


In [12]:

# Hub identifier of the fine-tuned ideology classifier; the tokenizer and the
# model weights are both resolved from this same checkpoint.
checkpoint_name = 'juan-glez29/ideology_ft'

# Tokenizer first, then the sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

tokenizer.json: 100%|██████████| 1.28M/1.28M [00:00<00:00, 5.40MB/s]
model.safetensors: 100%|██████████| 436M/436M [00:35<00:00, 12.4MB/s] 


In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace the ideology class names with integer ids, overwriting the column.
# The encoder is fitted on this very column, so ids follow the sorted order of
# the class names that are present in df_test.
# NOTE(review): these ids must line up with the label ids the checkpoint was
# fine-tuned with — confirm against model.config.id2label.
label_encoder = LabelEncoder()
encoded_ideology = label_encoder.fit_transform(df_test['ideology_multiclass'])
df_test['ideology_multiclass'] = encoded_ideology

In [14]:
# Upper bound on tokens per tweet. The loaded tokenizer reports no predefined
# maximum length (see the "Default to no truncation" warning it emits), so
# truncation=True alone does nothing; an explicit limit is required.
# NOTE(review): 512 is the usual limit for RoBERTa-style checkpoints — confirm
# against model.config.max_position_embeddings.
MAX_SEQUENCE_LENGTH = 512


def tokenize_function(example, max_length=MAX_SEQUENCE_LENGTH):
    """Tokenize the 'tweet' entry of a dataset example or batch.

    Args:
        example: Mapping with a 'tweet' key (a string, or a list of strings
            when used with ``Dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated to this length.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given text(s).
    """
    return tokenizer(example['tweet'], truncation=True, max_length=max_length)

# Wrap the pandas DataFrame in a Hugging Face Dataset so it can be fed to the model.
huggingface_dataset = Dataset.from_pandas(df=df_test)
huggingface_dataset

Dataset({
    features: ['gender', 'profession', 'ideology_binary', 'ideology_multiclass', 'tweet'],
    num_rows: 4000
})

In [15]:
# Tokenize every row in batches, then discard the columns that are not fed to
# the model (raw text and the unused metadata fields).
unused_columns = ["tweet", "gender", "profession", "ideology_binary"]
tokenized_dataset = (
    huggingface_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(unused_columns)
)

# Expose the encoded ideology class under the name "labels".
test_dataset = tokenized_dataset.rename_column("ideology_multiclass", "labels")

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 4000/4000 [00:00<00:00, 14835.80 examples/s]


In [16]:
# Display the evaluation dataset's summary (its columns and row count).
test_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 4000
})

In [17]:
# Build a Trainer used only for inference; the collator pads each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run the model over the whole evaluation set.
predicted_results = test_trainer.predict(test_dataset)

# Predicted class = index of the largest score along the last axis.
y_pred = np.argmax(predicted_results.predictions, axis=-1)

# Compare predictions against the encoded ground-truth labels.
labels = test_dataset["labels"]
print(f"Accuracy:\t{accuracy_score(y_true=labels, y_pred=y_pred)}")

# 'weighted' averages the per-class scores by class support, so class
# imbalance is reflected in the aggregate numbers.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=labels,
    y_pred=y_pred,
    average='weighted',
)
print(
    f"Precision:\t{precision}",
    f"Recall:\t\t{recall}",
    f"F1-Score:\t{f1}",
    sep="\n",
)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 500/500 [02:07<00:00,  3.91it/s]

Accuracy:	0.47925
Precision:	0.5026765356903358
Recall:		0.47925
F1-Score:	0.476947635928816





In [18]:
# # Tokenize text inputs
# tokenized_inputs = tokenizer(df_test['tweet'][100], return_tensors="pt")

# # Pass tokenized inputs through the Transformer model
# outputs = model(**tokenized_inputs)

# # Get predictions
# labels = ['izq', 'izq_mod', 'dcha_mod', 'dcha']
# prediction = torch.argmax(outputs.logits)

# print(outputs)

# print(f"The prediction is {labels[prediction]}")