In [1]:
#EDA

In [2]:
import torch

In [3]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [4]:
import datasets
import evaluate

In [5]:
import numpy as np
from datasets import load_dataset

In [6]:
import ipywidgets as widgets

In [7]:
#Helper Functions

In [8]:
# label to name
def label2name(x):
    if x == 0:
        return "Negative"
    if x == 1:
        return "Neutral"
    if x == 2:
        return "Positive"

In [9]:

f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")

def compute_metrics (eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    
    results = {}
    results.update(f1_metric.compute(predictions=preds, references = labels, average="macro"))
    results.update(recall_metric.compute(predictions=preds, references = labels, average="macro"))
    return results

In [10]:
#Read Data


In [11]:
data_files = {"train": "data/train.csv", "validation": "data/val.csv", "test": "data/test.csv"}
ds = load_dataset("csv", data_files=data_files)

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 877
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 109
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 98
    })
})

In [13]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [14]:
ds["test"]["label"][:10]

[2, 1, 1, 0, 1, 1, 0, 0, 0, 1]

In [15]:
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,"Yo soy Misterio, saco mi fierro y te mato",0,Negative,"Yo soy Misterio, saco mi fierro y te mato",9,10,41,41
1,Ramón cogió alegre y angustiadamente la carabina,1,Neutral,Ramón cogió alegre y angustiadamente la carabina,7,11,48,48
2,"pa qué vamos a buscar problemas, tranquilo",1,Neutral,"pa qué vamos a buscar problemas, tranquilo",7,7,43,43
3,Ahorita vengo,1,Neutral,Ahorita vengo,2,2,13,13
4,"Anda a buscar a tu esposa, canalla",1,Neutral,"Anda a buscar a tu esposa, canalla",7,9,34,34


In [16]:
# encode label and mapping label name
#df["label"] = df["label"].apply(lambda x: label_encode(x))
df["label_name"] = df["label"].apply(lambda x: label2name(x))

In [17]:
# clean text, lowercase and remove punk
#df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [18]:
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,"Yo soy Misterio, saco mi fierro y te mato",0,Negative,"Yo soy Misterio, saco mi fierro y te mato",9,10,41,41
1,Ramón cogió alegre y angustiadamente la carabina,1,Neutral,Ramón cogió alegre y angustiadamente la carabina,7,11,48,48
2,"pa qué vamos a buscar problemas, tranquilo",1,Neutral,"pa qué vamos a buscar problemas, tranquilo",7,7,43,43
3,Ahorita vengo,1,Neutral,Ahorita vengo,2,2,13,13
4,"Anda a buscar a tu esposa, canalla",1,Neutral,"Anda a buscar a tu esposa, canalla",7,9,34,34


In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "pysentimiento/robertuito-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# columns = ['input_ids', 'attention_mask', 'label']
# ds.set_format(type='torch', columns=columns)
# ds

In [21]:
from pysentimiento.preprocessing import preprocess_tweet
preprocessed_ds = ds.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

In [22]:
tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(
        batch["text"], padding=True, truncation=True
        ),
    batched=True, batch_size=32
)

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

In [23]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 877
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 109
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 98
    })
})

In [24]:
tokenized_ds['train']['input_ids'][0]

[0, 560, 977, 10692, 16, 5713, 507, 27084, 445, 516, 7521, 2, 1, 1, 1, 1]

In [25]:
tokenized_ds['train']['attention_mask'][0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

In [26]:
tokenized_ds['train']['text'][0]

'Yo soy Misterio, saco mi fierro y te mato'

In [27]:
#!pip install ipdb

In [28]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [29]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="test_trainer",
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [30]:
trainer.train()

{'eval_loss': 0.843215823173523, 'eval_f1': 0.4620462046204621, 'eval_recall': 0.5032467532467533, 'eval_runtime': 0.1924, 'eval_samples_per_second': 566.48, 'eval_steps_per_second': 72.759, 'epoch': 1.0}
{'eval_loss': 0.7259678244590759, 'eval_f1': 0.619645732689211, 'eval_recall': 0.6114718614718614, 'eval_runtime': 0.1888, 'eval_samples_per_second': 577.186, 'eval_steps_per_second': 74.134, 'epoch': 2.0}
{'eval_loss': 0.7197251915931702, 'eval_f1': 0.6196050326485109, 'eval_recall': 0.6114718614718614, 'eval_runtime': 0.2067, 'eval_samples_per_second': 527.24, 'eval_steps_per_second': 67.719, 'epoch': 3.0}
{'eval_loss': 0.7389683127403259, 'eval_f1': 0.6269171239622907, 'eval_recall': 0.6190476190476191, 'eval_runtime': 0.1897, 'eval_samples_per_second': 574.573, 'eval_steps_per_second': 73.798, 'epoch': 4.0}
{'eval_loss': 0.7592655420303345, 'eval_f1': 0.6323719632657702, 'eval_recall': 0.6250773036487322, 'eval_runtime': 0.203, 'eval_samples_per_second': 536.934, 'eval_steps_per_s

TrainOutput(global_step=140, training_loss=0.4893694741385324, metrics={'train_runtime': 23.0427, 'train_samples_per_second': 190.299, 'train_steps_per_second': 6.076, 'train_loss': 0.4893694741385324, 'epoch': 5.0})

In [31]:
#Test on validation set

In [32]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.9506278038024902, 'eval_f1': 0.6517857142857143, 'eval_recall': 0.6544372294372294, 'eval_runtime': 0.1587, 'eval_samples_per_second': 617.519, 'eval_steps_per_second': 81.916, 'epoch': 5.0}


{'eval_loss': 0.9506278038024902,
 'eval_f1': 0.6517857142857143,
 'eval_recall': 0.6544372294372294,
 'eval_runtime': 0.1587,
 'eval_samples_per_second': 617.519,
 'eval_steps_per_second': 81.916,
 'epoch': 5.0}

In [33]:
ds['test']['text']

['CARADURA:  ¡Qué bacán!',
 'Flora salió del Davory haciendo sonar sus tacos',
 'Vamos al toque',
 'llora conchatumadre ¿Ves? Ya me muero carajo',
 '¿Y tú tienes hembrita? Sí, estoy medio amarrado',
 'Sus palabras y el tono que empleaba eran fogosos',
 'No le hagas caso a la bestia de tu papá',
 '¿Qué te pasa oye? ¿Por qué estás así todo achorado?',
 '\x97No, eso se fue al carajo',
 'Yo he llegado a meterme ocho polvos en una noche',
 '\x97Apúrate, gordo rosquete',
 'Nada de maricón',
 'Joaquincillo, ¿qué andas haciendo? Tranquilo',
 'si todas esas huevadas vienen',
 'Ya no eras como ellos, Zavalita, ya eras un cholo',
 'pero falta su gaseosa pa\x92 que resbale',
 '\x97Ay, pobre, qué piña',
 'Tipo desleal, el Serrano dijo él, asintiendo',
 'Vas a ver que te voy a cachar rico',
 'Vamos al fresco dijo, y abrió una puerta corrediza',
 'CARADURA:  No voy a morir como un cabro',
 'Me volví a quemar, maldita sea',
 'Eres un soplón',
 'llámame, éste es mi causa El Burro, le dicen así porque h

In [34]:
#model.load_state_dict(torch.load(f'./_BERT_epoch_3.model', map_location=torch.device('cpu')))

In [35]:
#Error Analisys

In [36]:
val_df = ds['validation'].to_pandas()
#val_df

In [37]:
from tqdm.notebook import tqdm

In [38]:
# step by step predictions on dataframe
# We do this to view predictions in the pandas dataframe and easily filter them and perform error analysis.
pred_final = []

# for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
#     predictions = []

#     text = row["text"]
#     encoded_data_test_single = tokenizer.batch_encode_plus([text], 
#     add_special_tokens=config.add_special_tokens, 
#     return_attention_mask=config.return_attention_mask, 
#     pad_to_max_length=config.pad_to_max_length, 
#     max_length=config.seq_length,
#     return_tensors=config.return_tensors
#     )
#     input_ids_test = encoded_data_test_single['input_ids']
#     attention_masks_test = encoded_data_test_single['attention_mask']

    
#     inputs = {'input_ids':      input_ids_test.to(device),
#               'attention_mask':attention_masks_test.to(device),
#              }

#     with torch.no_grad():        
#         outputs = model(**inputs)
    
#     logits = outputs[0]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)
#     predictions = np.concatenate(predictions, axis=0)
#     pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

In [39]:
df.sample(n = 30)

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
710,Yo te esperaré a la una en la puerta del Colegio,1,Neutral,Yo te esperaré a la una en la puerta del Colegio,11,12,48,48
734,"Dos cholos ebrios, gritaron: -Atajen ese globo",0,Negative,"Dos cholos ebrios, gritaron: -Atajen ese globo",7,14,46,46
244,Los dos chicos le parecieron muy guapos,2,Positive,Los dos chicos le parecieron muy guapos,7,8,39,39
231,y dices Ya qué chucha,0,Negative,y dices Ya qué chucha,5,5,22,22
291,El perro oteaba inútilmente incitado por Valencio,1,Neutral,El perro oteaba inútilmente incitado por Valencio,7,12,49,49
319,¿Tú sabes quién soy yo? EL BURRO: Oe suéltalo,0,Negative,¿Tú sabes quién soy yo? EL BURRO: Oe suéltalo,9,14,47,47
355,Todos los cholos son iguales dijo Luis Felipe,1,Neutral,Todos los cholos son iguales dijo Luis Felipe,8,9,45,45
810,El Viejo hablaba,1,Neutral,El Viejo hablaba,3,3,16,16
281,tremenda fiesta se armó en mi depa,2,Positive,tremenda fiesta se armó en mi depa,7,8,34,34
39,"cuidado chibolo, tienes que aprender a respetar",0,Negative,"cuidado chibolo, tienes que aprender a respetar",7,9,47,47


In [40]:
#Inference

In [41]:
#Evaluate the Model Qualitatively (Human Evaluation)

In [42]:
#Evaluate the Model Quantitatively (with F1 Metric)

In [43]:
texts = ds['test'][0:10]['text']
human_baseline_labels = ds['test'][0:10]['label']

In [44]:
texts

['CARADURA:  ¡Qué bacán!',
 'Flora salió del Davory haciendo sonar sus tacos',
 'Vamos al toque',
 'llora conchatumadre ¿Ves? Ya me muero carajo',
 '¿Y tú tienes hembrita? Sí, estoy medio amarrado',
 'Sus palabras y el tono que empleaba eran fogosos',
 'No le hagas caso a la bestia de tu papá',
 '¿Qué te pasa oye? ¿Por qué estás así todo achorado?',
 '\x97No, eso se fue al carajo',
 'Yo he llegado a meterme ocho polvos en una noche']

In [45]:
human_baseline_labels

[2, 1, 1, 0, 1, 1, 0, 0, 0, 1]

In [46]:
#original_model_summaries = []
model_classifications = []

In [47]:
for _, text in enumerate(texts):
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to('cuda')
    logits = model(input_ids).logits
    probabilities = logits.softmax(dim=-1).tolist()[0]
    model_classifications.append(np.argmax(probabilities).flatten()[0])

In [48]:
model_classifications

[2, 1, 2, 0, 1, 0, 0, 0, 0, 1]