In [1]:
#EDA

In [2]:
import torch

In [3]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [4]:
import datasets
import evaluate

In [5]:
import numpy as np
from datasets import load_dataset

In [6]:
import ipywidgets as widgets

In [7]:
#Helper Functions

In [8]:
# label to name
def label2name(x):
    """Translate a numeric sentiment label into its display name.

    0 -> "Negative", 1 -> "Neutral", 2 -> "Positive".
    Any other value matches nothing and yields None.
    """
    for code, name in ((0, "Negative"), (1, "Neutral"), (2, "Positive")):
        if x == code:
            return name
    return None

In [9]:

# Metric objects are loaded once here and reused by every evaluation pass.
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Score one evaluation pass (this function is handed to the Trainer).

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        A dict merging the outputs of the F1 and recall metrics, both
        computed with ``average="macro"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    shared_kwargs = dict(predictions=predicted_classes, references=labels, average="macro")

    f1_scores = f1_metric.compute(**shared_kwargs)
    recall_scores = recall_metric.compute(**shared_kwargs)
    return {**f1_scores, **recall_scores}

In [10]:
#Read Data


In [11]:
# One CSV file per split, keyed by the split name used throughout the notebook.
data_files = {
    "train": "data/train.csv",
    "validation": "data/val.csv",
    "test": "data/test.csv",
}
ds = load_dataset("csv", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 573
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 64
    })
})

In [13]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [14]:
ds["test"]["label"][:10]

[0, 0, 1, 1, 0, 0, 1, 0, 1, 0]

In [15]:
# Pandas copy of the training split, for inspecting rows as a dataframe.
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,Un día me cae un piedrón y la cagada,0,Negative,Un día me cae un piedrón y la cagada,9,11,36,36
1,si no tienes ganas ahorita te lo comes después,1,Neutral,si no tienes ganas ahorita te lo comes después,9,9,46,46
2,"Estabas en piyama, no encontrabas el calzoncil...",1,Neutral,"Estabas en piyama, no encontrabas el calzoncil...",27,38,154,154
3,tanta huevada,0,Negative,tanta huevada,2,3,13,13
4,"Pero si los numerarios se enteran, la cagada",0,Negative,"Pero si los numerarios se enteran, la cagada",8,10,45,45


In [16]:
# Derive the human-readable label name from the numeric label.
# The earlier label-encoding step is kept below, disabled
# (`label_encode` is not defined in the visible cells).
#df["label"] = df["label"].apply(lambda x: label_encode(x))
df["label_name"] = df["label"].apply(label2name)

In [17]:
# clean text, lowercase and remove punctuation
#df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [18]:
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,Un día me cae un piedrón y la cagada,0,Negative,Un día me cae un piedrón y la cagada,9,11,36,36
1,si no tienes ganas ahorita te lo comes después,1,Neutral,si no tienes ganas ahorita te lo comes después,9,9,46,46
2,"Estabas en piyama, no encontrabas el calzoncil...",1,Neutral,"Estabas en piyama, no encontrabas el calzoncil...",27,38,154,154
3,tanta huevada,0,Negative,tanta huevada,2,3,13,13
4,"Pero si los numerarios se enteran, la cagada",0,Negative,"Pero si los numerarios se enteran, la cagada",8,10,45,45


In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

model_name = "pysentimiento/robertuito-base-uncased"

# The classification head is newly (randomly) initialised on top of the
# pretrained encoder, so seed the RNGs *before* loading the model. Otherwise
# every kernel restart starts fine-tuning from different head weights and the
# reported metrics cannot be reproduced.
# 42 matches the default `seed` of TrainingArguments.
set_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # Negative / Neutral / Positive
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Cap inputs at 128 tokens; `truncation=True` falls back to this limit.
tokenizer.model_max_length = 128

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# columns = ['input_ids', 'attention_mask', 'label']
# ds.set_format(type='torch', columns=columns)
# ds

In [21]:
from pysentimiento.preprocessing import preprocess_tweet


def _normalize_example(ex):
    """Return the replacement ``text`` value for one row after Spanish tweet preprocessing."""
    return {"text": preprocess_tweet(ex["text"], lang="es")}


# Row-by-row map: only the "text" column is overwritten.
preprocessed_ds = ds.map(_normalize_example)

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

In [22]:
def _tokenize_batch(batch):
    """Tokenize the ``text`` column of one batch with padding and truncation enabled."""
    return tokenizer(batch["text"], padding=True, truncation=True)


# Encode every split in chunks of 32 rows.
tokenized_ds = preprocessed_ds.map(_tokenize_batch, batched=True, batch_size=32)

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

In [23]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 573
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 64
    })
})

In [24]:
tokenized_ds['train']['input_ids'][0]

[0,
 471,
 783,
 474,
 3816,
 471,
 4822,
 985,
 478,
 445,
 446,
 9473,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [25]:
tokenized_ds['train']['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [26]:
tokenized_ds['train']['text'][0]

'Un día me cae un piedrón y la cagada'

In [27]:
#!pip install ipdb

In [28]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [29]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration: five epochs, with an evaluation pass after each one.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    do_eval=True,
    evaluation_strategy="epoch",
)

# Collator that pads each batch using the tokenizer's padding settings.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

In [30]:
trainer.train()

{'eval_loss': 0.8980010747909546, 'eval_f1': 0.41108765345218545, 'eval_recall': 0.4360902255639098, 'eval_runtime': 0.1544, 'eval_samples_per_second': 459.968, 'eval_steps_per_second': 58.306, 'epoch': 1.0}
{'eval_loss': 0.7632585167884827, 'eval_f1': 0.5654320987654321, 'eval_recall': 0.5463659147869674, 'eval_runtime': 0.1369, 'eval_samples_per_second': 518.593, 'eval_steps_per_second': 65.737, 'epoch': 2.0}
{'eval_loss': 0.6610168218612671, 'eval_f1': 0.6927934810287751, 'eval_recall': 0.6666666666666666, 'eval_runtime': 0.1403, 'eval_samples_per_second': 505.976, 'eval_steps_per_second': 64.138, 'epoch': 3.0}
{'eval_loss': 0.658694863319397, 'eval_f1': 0.680865800865801, 'eval_recall': 0.6641604010025063, 'eval_runtime': 0.1367, 'eval_samples_per_second': 519.363, 'eval_steps_per_second': 65.835, 'epoch': 4.0}
{'eval_loss': 0.6563352346420288, 'eval_f1': 0.6857594820292094, 'eval_recall': 0.6729323308270677, 'eval_runtime': 0.1344, 'eval_samples_per_second': 528.44, 'eval_steps_pe

TrainOutput(global_step=90, training_loss=0.5623340606689453, metrics={'train_runtime': 25.0078, 'train_samples_per_second': 114.564, 'train_steps_per_second': 3.599, 'train_loss': 0.5623340606689453, 'epoch': 5.0})

In [31]:
#Evaluate on the held-out test set

In [32]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.6635341644287109, 'eval_f1': 0.753154160696187, 'eval_recall': 0.746606334841629, 'eval_runtime': 0.1506, 'eval_samples_per_second': 424.996, 'eval_steps_per_second': 53.124, 'epoch': 5.0}


{'eval_loss': 0.6635341644287109,
 'eval_f1': 0.753154160696187,
 'eval_recall': 0.746606334841629,
 'eval_runtime': 0.1506,
 'eval_samples_per_second': 424.996,
 'eval_steps_per_second': 53.124,
 'epoch': 5.0}

In [33]:
ds['test']['text']

['\x97¿Qué te pasa oye? ¿Por qué estás así todo achorado?',
 'No seas maricón',
 'es un chibolo',
 'No me chupo con mayores',
 'Un soplón de porquería',
 'Eres un enfermo, conchatumadre',
 'El próximo sábado hay una pichanga',
 'Fui un huevón',
 'El día que ellos vengan, tú zafas al toque',
 'cuando venga lo voy a cuadrar',
 'Un día me cae un piedrón y la cagada',
 'Es una huachafita con el pelo pintado',
 'Y quizá te manden a la cárcel',
 'Mis viejos me han hecho la vida imposible',
 'Cuando oí los pitazos, los balazos y los carajos salí corriendo hacia ellos, pero me di cuenta que estaban ensartados: en la esquina había tres patrulleros',
 'Cholo miserable, cobarde',
 'el periodismo ha hecho un chongo con el caso del huevón ese',
 'Estaba templado de sus tetas',
 'Ya le dije enantes',
 'Quédate con esa plata y déjame ir',
 'Primero me sales con que eres maricón',
 'le digo pe\x92, le voy a tener miedo a ese huevón',
 'Ahora que manejas plata ya no quieres estudiar',
 'Ahorita vengo',

In [34]:
#model.load_state_dict(torch.load(f'./_BERT_epoch_3.model', map_location=torch.device('cpu')))

In [35]:
#Error Analysis

In [36]:
# Pandas copy of the validation split.
val_df = ds['validation'].to_pandas()
#val_df

In [37]:
from tqdm.notebook import tqdm

In [38]:
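# Step-by-step predictions on the validation dataframe.
# We do this to view predictions in the pandas dataframe, so they can be easily filtered for error analysis.
# NOTE: the prediction loop below is currently commented out, so `pred_final` stays empty.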
pred_final = []

# for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
#     predictions = []

#     text = row["text"]
#     encoded_data_test_single = tokenizer.batch_encode_plus([text], 
#     add_special_tokens=config.add_special_tokens, 
#     return_attention_mask=config.return_attention_mask, 
#     pad_to_max_length=config.pad_to_max_length, 
#     max_length=config.seq_length,
#     return_tensors=config.return_tensors
#     )
#     input_ids_test = encoded_data_test_single['input_ids']
#     attention_masks_test = encoded_data_test_single['attention_mask']

    
#     inputs = {'input_ids':      input_ids_test.to(device),
#               'attention_mask':attention_masks_test.to(device),
#              }

#     with torch.no_grad():        
#         outputs = model(**inputs)
    
#     logits = outputs[0]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)
#     predictions = np.concatenate(predictions, axis=0)
#     pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

In [39]:
# Fixed random_state so the same 30 rows are shown on every run.
df.sample(n = 30, random_state = 42)

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
513,"Qué tal concha, carajo",0,Negative,"Qué tal concha, carajo",4,5,22,22
70,Sus viejos son una ladilla,0,Negative,Sus viejos son una ladilla,5,5,26,26
131,405 -Soy muy viejo,0,Negative,405 -Soy muy viejo,4,6,18,18
405,"Mi taita, como les contaba, murió",0,Negative,"Mi taita, como les contaba, murió",6,8,33,33
544,Yo no voy a decir nada y por la chamba,1,Neutral,Yo no voy a decir nada y por la chamba,10,10,38,38
321,Y conocí al Viejo en una ocasión inolvidable,2,Positive,Y conocí al Viejo en una ocasión inolvidable,8,8,44,44
188,Te caería bien,2,Positive,Te caería bien,3,4,14,14
29,y tengo ganas de ir a verte a donde carajo estés,2,Positive,y tengo ganas de ir a verte a donde carajo estés,11,11,48,48
81,Vas a ver que te voy a cachar rico,2,Positive,Vas a ver que te voy a cachar rico,9,10,34,34
310,Un cachaco que pasaba por ahí le sacó la pisto...,0,Negative,Un cachaco que pasaba por ahí le sacó la pisto...,31,40,168,168


In [40]:
#Inference

In [41]:
#Evaluate the Model Qualitatively (Human Evaluation)

In [42]:
#Evaluate the Model Quantitatively (with F1 Metric)

In [43]:
# First ten test rows: the raw texts and their human-assigned labels.
test_head = ds['test'][0:10]
texts = test_head['text']
human_baseline_labels = test_head['label']

In [44]:
texts

['\x97¿Qué te pasa oye? ¿Por qué estás así todo achorado?',
 'No seas maricón',
 'es un chibolo',
 'No me chupo con mayores',
 'Un soplón de porquería',
 'Eres un enfermo, conchatumadre',
 'El próximo sábado hay una pichanga',
 'Fui un huevón',
 'El día que ellos vengan, tú zafas al toque',
 'cuando venga lo voy a cuadrar']

In [45]:
human_baseline_labels

[0, 0, 1, 1, 0, 0, 1, 0, 1, 0]

In [46]:
#original_model_summaries = []
model_classifications = []

In [47]:
# Classify each sample text with the fine-tuned model.
# Start from an empty list so re-running this cell does not append duplicates.
model_classifications = []

# Use whatever device the model already lives on instead of hard-coding
# 'cuda', so the cell also works on CPU-only machines.
device = model.device

model.eval()  # disable dropout for deterministic predictions
with torch.no_grad():  # inference only: no autograd graph needed
    for text in texts:
        # Apply the same tweet preprocessing that was used on the training
        # data (`preprocess_tweet` is imported in the preprocessing cell),
        # and truncate so long inputs cannot exceed the model's max length.
        encoded = tokenizer(
            preprocess_tweet(text, lang="es"),
            return_tensors="pt",
            truncation=True,
        ).to(device)
        logits = model(**encoded).logits
        # Softmax is monotonic, so the argmax of the logits is the predicted class.
        model_classifications.append(int(logits.argmax(dim=-1).item()))

In [48]:
model_classifications

[0, 0, 0, 1, 0, 0, 0, 0, 1, 0]