In [1]:
#EDA

In [2]:
import torch

In [3]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [4]:
import datasets
import evaluate

In [5]:
import numpy as np
from datasets import load_dataset

In [6]:
import ipywidgets as widgets

In [7]:
#Helper Functions

In [8]:
# label to name
def label2name(x):
    """Translate an integer sentiment label into its display name.

    0 -> "Negative", 1 -> "Neutral", 2 -> "Positive". Any value that does not
    compare equal to one of those codes yields None.
    """
    # The position in this tuple is the label code.
    names = ("Negative", "Neutral", "Positive")
    for code, name in enumerate(names):
        if x == code:
            return name
    return None

In [9]:

# Load the F1 and recall metrics once at cell scope; compute_metrics reads
# these two module-level objects on every call.
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and recall for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair. The predicted class for each row
            is the argmax of the logits over the last axis.

    Returns:
        A single dict that merges the result dicts returned by the F1 metric
        and the recall metric (F1 entries first, then recall).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Both metrics are called with exactly the same arguments.
    shared_kwargs = dict(predictions=predicted, references=labels, average="macro")
    return {
        **f1_metric.compute(**shared_kwargs),
        **recall_metric.compute(**shared_kwargs),
    }

In [10]:
#Read Data


In [11]:
# One CSV file per split; load them together into a single DatasetDict.
data_files = dict(
    train="data/train.csv",
    validation="data/val.csv",
    test="data/test.csv",
)
ds = load_dataset("csv", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 547
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 68
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 61
    })
})

In [13]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [14]:
ds["test"]["label"][:10]

[1, 0, 0, 0, 1, 0, 2, 2, 0, 0]

In [15]:
# Convert the training split to a pandas DataFrame for inspection and show
# its first rows.
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,"Ambos se pusieron de pie, roncando de cólera",0,Negative,"Ambos se pusieron de pie, roncando de cólera",8,10,44,44
1,El pobre negro,0,Negative,El pobre negro,3,3,14,14
2,No era un profesional ni mucho menos,0,Negative,No era un profesional ni mucho menos,7,7,36,36
3,Tanta finta,0,Negative,Tanta finta,2,3,11,11
4,además los chibolos de la barra siempre pican ...,0,Negative,además los chibolos de la barra siempre pican ...,11,16,61,61


In [16]:
# encode label and mapping label name
#df["label"] = df["label"].apply(lambda x: label_encode(x))
# Recompute the readable class name of every row from its integer label,
# overwriting the label_name column that came from the CSV.
df["label_name"] = df["label"].apply(label2name)

In [17]:
# clean text, lowercase and remove punctuation
#df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [18]:
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,"Ambos se pusieron de pie, roncando de cólera",0,Negative,"Ambos se pusieron de pie, roncando de cólera",8,10,44,44
1,El pobre negro,0,Negative,El pobre negro,3,3,14,14
2,No era un profesional ni mucho menos,0,Negative,No era un profesional ni mucho menos,7,7,36,36
3,Tanta finta,0,Negative,Tanta finta,2,3,11,11
4,además los chibolos de la barra siempre pican ...,0,Negative,además los chibolos de la barra siempre pican ...,11,16,61,61


In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint that both the model and the tokenizer are loaded from.
model_name = "pysentimiento/robertuito-base-uncased"

# num_labels=3 sizes the classification head for the three classes handled by
# label2name (0, 1, 2).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Cap the tokenizer's maximum sequence length at 128 tokens.
tokenizer.model_max_length = 128

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# columns = ['input_ids', 'attention_mask', 'label']
# ds.set_format(type='torch', columns=columns)
# ds

In [21]:
from pysentimiento.preprocessing import preprocess_tweet
# Run pysentimiento's tweet preprocessing (lang="es") over the "text" field of
# every example in every split, replacing the original text.
preprocessed_ds = ds.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [22]:
# Tokenize the preprocessed text of every split, 32 examples per map batch,
# with padding and truncation enabled.
# NOTE(review): the Trainer is also given a DataCollatorWithPadding, so the
# padding added here looks redundant — confirm before removing it.
tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(
        batch["text"], padding=True, truncation=True
        ),
    batched=True, batch_size=32
)

Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [23]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 547
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 61
    })
})

In [24]:
tokenized_ds['train']['input_ids'][0]

[0,
 5587,
 475,
 7117,
 413,
 25565,
 3812,
 1767,
 413,
 1083,
 7888,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [25]:
tokenized_ds['train']['attention_mask'][0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [26]:
tokenized_ds['train']['text'][0]

'Ambos se pusieron de pie, roncando de cólera'

In [27]:
#!pip install ipdb

In [28]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [29]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Fine-tuning configuration: 5 epochs, 32 training examples per device per
# step, evaluation once per epoch, outputs written under "test_trainer".
# NOTE(review): no best-checkpoint selection is configured here, so the model
# used after training is presumably the last-epoch one — confirm this is
# intended.
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="test_trainer",
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

# Train on the tokenized train split and evaluate on the validation split,
# reporting the values produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [30]:
trainer.train()

{'eval_loss': 0.8591828346252441, 'eval_f1': 0.3442244224422442, 'eval_recall': 0.39215686274509803, 'eval_runtime': 0.1847, 'eval_samples_per_second': 368.128, 'eval_steps_per_second': 48.723, 'epoch': 1.0}
{'eval_loss': 0.7433059215545654, 'eval_f1': 0.6894191080237592, 'eval_recall': 0.6568627450980392, 'eval_runtime': 0.1836, 'eval_samples_per_second': 370.426, 'eval_steps_per_second': 49.027, 'epoch': 2.0}
{'eval_loss': 0.705886721611023, 'eval_f1': 0.7184063890891199, 'eval_recall': 0.6996744643803469, 'eval_runtime': 0.1799, 'eval_samples_per_second': 377.983, 'eval_steps_per_second': 50.027, 'epoch': 3.0}
{'eval_loss': 0.7159866094589233, 'eval_f1': 0.7043059319482085, 'eval_recall': 0.6906654553713377, 'eval_runtime': 0.1791, 'eval_samples_per_second': 379.577, 'eval_steps_per_second': 50.238, 'epoch': 4.0}
{'eval_loss': 0.7234480977058411, 'eval_f1': 0.6635666022639203, 'eval_recall': 0.6620486032250739, 'eval_runtime': 0.1977, 'eval_samples_per_second': 343.948, 'eval_steps_

TrainOutput(global_step=90, training_loss=0.5162400987413195, metrics={'train_runtime': 23.4949, 'train_samples_per_second': 116.408, 'train_steps_per_second': 3.831, 'train_loss': 0.5162400987413195, 'epoch': 5.0})

In [31]:
#Evaluate on the test set

In [32]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.7572020292282104, 'eval_f1': 0.6369281045751635, 'eval_recall': 0.6299922299922299, 'eval_runtime': 0.1311, 'eval_samples_per_second': 465.397, 'eval_steps_per_second': 61.036, 'epoch': 5.0}


{'eval_loss': 0.7572020292282104,
 'eval_f1': 0.6369281045751635,
 'eval_recall': 0.6299922299922299,
 'eval_runtime': 0.1311,
 'eval_samples_per_second': 465.397,
 'eval_steps_per_second': 61.036,
 'epoch': 5.0}

In [33]:
ds['test']['text']

['Pero él sabe que ya no tiene la menor chance',
 'No me toques, caracho',
 'Ahora trae una bolsa, carajo',
 'Me muero, Chacal',
 'Con los años vas aprendiendo a tirar cache',
 '¿Sabes lo que eres? Un soplón',
 'bien chévere',
 'soy recontra taco pero soy bien rica',
 'La gente me alucinaba pastrulazo y yo nada',
 '\x97Cinco pobres diablos, para remate uno de ellos viejo y con soroche',
 'Hoy sales de la categoría pajeros y entras a la categoría cacheritos',
 'Y todos los demás son campanas',
 'y les podría caer un dinerito',
 '\x97Que se achicharren estas calatas sinvergüenzas',
 'Que se vayan al carajo',
 'Raúl era moreno, decía lisuras y escupía a cada rato',
 'No seas fregado',
 'Pégame cabrón que cuando te agarre afuera vas a llorar',
 'llego al edificio con olor a mondonguito y toco el timbre y pienso ojalá que no esté la cara de perro y escucho gabriel, un grito así bien recio y achorado',
 'eres mi causa',
 '\x97¿Qué es eso? \x97Un club de cucufatos pitucos',
 'tanta huevada',


In [34]:
#model.load_state_dict(torch.load(f'./_BERT_epoch_3.model', map_location=torch.device('cpu')))

In [35]:
#Error Analysis

In [36]:
# Validation split as a pandas DataFrame, kept for manual error analysis.
val_df = ds['validation'].to_pandas()
#val_df

In [37]:
from tqdm.notebook import tqdm

In [38]:
# step by step predictions on dataframe
# We do this to view predictions in the pandas dataframe and easily filter them and perform error analysis.
pred_final = []

# for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
#     predictions = []

#     text = row["text"]
#     encoded_data_test_single = tokenizer.batch_encode_plus([text], 
#     add_special_tokens=config.add_special_tokens, 
#     return_attention_mask=config.return_attention_mask, 
#     pad_to_max_length=config.pad_to_max_length, 
#     max_length=config.seq_length,
#     return_tensors=config.return_tensors
#     )
#     input_ids_test = encoded_data_test_single['input_ids']
#     attention_masks_test = encoded_data_test_single['attention_mask']

    
#     inputs = {'input_ids':      input_ids_test.to(device),
#               'attention_mask':attention_masks_test.to(device),
#              }

#     with torch.no_grad():        
#         outputs = model(**inputs)
    
#     logits = outputs[0]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)
#     predictions = np.concatenate(predictions, axis=0)
#     pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

In [39]:
df.sample(n = 30)

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
176,Él la cogió de la mano,2,Positive,Él la cogió de la mano,6,7,22,22
77,"Qué tal raza, papá",0,Negative,"Qué tal raza, papá",4,5,18,18
361,-Te ha dejado plantado,0,Negative,-Te ha dejado plantado,4,5,22,22
90,cualquier huevada pasas la voz y la picamos,0,Negative,cualquier huevada pasas la voz y la picamos,8,10,43,43
368,Dame pero caleta,1,Neutral,Dame pero caleta,3,4,16,16
210,Después de hacerle la cagada a los gorilas,0,Negative,Después de hacerle la cagada a los gorilas,8,9,42,42
177,No es mi hembrita,1,Neutral,No es mi hembrita,4,6,17,17
250,Ahorita regreso,1,Neutral,Ahorita regreso,2,2,15,15
518,Estás fregado,0,Negative,Estás fregado,2,3,13,13
199,ha sido una buena nota,2,Positive,ha sido una buena nota,5,5,22,22


In [40]:
#Inference

In [41]:
#Evaluate the Model Qualitatively (Human Evaluation)

In [42]:
#Evaluate the Model Quantitatively (with F1 Metric)

In [43]:
# First ten test examples: the raw texts and their gold labels.
texts, human_baseline_labels = (
    ds['test'][0:10]['text'],
    ds['test'][0:10]['label'],
)

In [44]:
texts

['Pero él sabe que ya no tiene la menor chance',
 'No me toques, caracho',
 'Ahora trae una bolsa, carajo',
 'Me muero, Chacal',
 'Con los años vas aprendiendo a tirar cache',
 '¿Sabes lo que eres? Un soplón',
 'bien chévere',
 'soy recontra taco pero soy bien rica',
 'La gente me alucinaba pastrulazo y yo nada',
 '\x97Cinco pobres diablos, para remate uno de ellos viejo y con soroche']

In [45]:
human_baseline_labels

[1, 0, 0, 0, 1, 0, 2, 2, 0, 0]

In [46]:
#original_model_summaries = []
model_classifications = []

In [47]:
# Classify the sample texts with the fine-tuned model.
#
# Why this cell is written the way it is:
# - The prediction list is rebuilt here rather than appended to, so re-running
#   the cell cannot accumulate duplicate predictions.
# - Texts go through the same preprocess_tweet(lang="es") step and the same
#   truncation that were applied when the training splits were prepared, so
#   inference sees inputs in the format the model was trained on.
# - Tensors are moved to whatever device the model lives on instead of a
#   hard-coded 'cuda', so the cell also runs on CPU-only machines.
# - eval() and no_grad() switch off dropout and autograd bookkeeping.
# - The argmax of the logits equals the argmax of the softmax probabilities,
#   so no softmax is needed to pick the class.
model.eval()
encoded = tokenizer(
    [preprocess_tweet(text, lang="es") for text in texts],
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(model.device)
with torch.no_grad():
    logits = model(**encoded).logits
# One plain Python int (class id) per input text, in input order.
model_classifications = logits.argmax(dim=-1).tolist()

In [48]:
model_classifications

[1, 0, 0, 1, 2, 0, 2, 0, 1, 0]