In [1]:
#EDA

In [2]:
import torch

In [3]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [4]:
import datasets
import evaluate

In [5]:
import numpy as np
from datasets import load_dataset

In [6]:
import ipywidgets as widgets

In [7]:
#Helper Functions

In [8]:
# label to name
def label2name(x):
    """Translate a numeric sentiment label into its display name.

    Returns "Negative" for 0, "Neutral" for 1 and "Positive" for 2.
    Any other value falls through and yields ``None``.
    """
    # Compare with ``==`` (rather than a dict lookup) so that any value that
    # merely compares equal to 0/1/2 is treated exactly like the integer.
    for code, name in ((0, "Negative"), (1, "Neutral"), (2, "Positive")):
        if x == code:
            return name
    return None

In [9]:

# Metric objects are loaded once and reused on every evaluation pass.
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Score one evaluation pass with macro F1 and macro recall.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class
    is the argmax over the last axis of ``logits``. Returns a single dict
    holding the merged results of both metric objects.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    for metric in (f1_metric, recall_metric):
        scores.update(
            metric.compute(predictions=predicted, references=labels, average="macro")
        )
    return scores

In [10]:
#Read Data


In [11]:
# Map each split name to its CSV file, then load all three splits in one call.
data_files = dict(train="train.csv", validation="val.csv", test="test.csv")
ds = load_dataset("csv", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count'],
        num_rows: 469
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count'],
        num_rows: 59
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count'],
        num_rows: 53
    })
})

In [13]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [14]:
ds["test"]["label"][:10]

[0, 1, 2, 0, 2, 0, 0, 1, 0, 1]

In [15]:
# Convert the training split to a pandas DataFrame and preview its first rows.
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,char_count,Character Count
0,"Nada de llantos, Ambrosio",0,Negative,Nada de llantos Ambrosio,4,25,25
1,"Ay, pobre, qué piña",0,Negative,Ay pobre qué piña,4,20,20
2,405 -Soy muy viejo,0,Negative,405 Soy muy viejo,4,18,18
3,ojalá que esté muerto ese huevón,0,Negative,ojalá que esté muerto ese huevón,6,32,32
4,"Qué rico, lechero eres",2,Positive,Qué rico lechero eres,4,23,23


In [16]:
# Fill the label_name column from the numeric label via label2name.
# The label-encoding step below is disabled.
#df["label"] = df["label"].apply(lambda x: label_encode(x))
df["label_name"] = df["label"].apply(label2name)

In [17]:
# clean text, lowercase and remove punctuation
#df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [18]:
df.head()

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,char_count,Character Count
0,"Nada de llantos, Ambrosio",0,Negative,Nada de llantos Ambrosio,4,25,25
1,"Ay, pobre, qué piña",0,Negative,Ay pobre qué piña,4,20,20
2,405 -Soy muy viejo,0,Negative,405 Soy muy viejo,4,18,18
3,ojalá que esté muerto ese huevón,0,Negative,ojalá que esté muerto ese huevón,6,32,32
4,"Qué rico, lechero eres",2,Positive,Qué rico lechero eres,4,23,23


In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "pysentimiento/robertuito-base-uncased"

# Tokenizer: cap its maximum sequence length at 128 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

# Sequence-classification model with a three-way output (labels 0, 1, 2).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# columns = ['input_ids', 'attention_mask', 'label']
# ds.set_format(type='torch', columns=columns)
# ds

In [21]:
from pysentimiento.preprocessing import preprocess_tweet
def _preprocess_example(ex):
    """Return the example's text run through preprocess_tweet with lang="es"."""
    return {"text": preprocess_tweet(ex["text"], lang="es")}


preprocessed_ds = ds.map(_preprocess_example)

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

In [22]:
def _tokenize_batch(batch):
    """Tokenize the batch's "text" column with padding and truncation enabled."""
    return tokenizer(batch["text"], padding=True, truncation=True)


# Tokenize every split in batches of 32 examples.
tokenized_ds = preprocessed_ds.map(_tokenize_batch, batched=True, batch_size=32)

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

In [23]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 469
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 59
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 53
    })
})

In [24]:
tokenized_ds['train']['input_ids'][0]

[0, 817, 413, 13090, 2194, 1848, 937, 414, 2, 1, 1, 1, 1, 1]

In [25]:
tokenized_ds['train']['attention_mask'][0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [26]:
tokenized_ds['train']['text'][0]

'Nada de llantos, Ambrosio'

In [27]:
#!pip install ipdb

In [28]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [29]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration: five epochs, 32 examples per device per step,
# with an evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    do_eval=True,
    evaluation_strategy="epoch",
)

# Collator built from the same tokenizer used to encode the dataset.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the train split; report compute_metrics on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

In [30]:
trainer.train()

{'eval_loss': 0.8873393535614014, 'eval_f1': 0.23913043478260868, 'eval_recall': 0.3333333333333333, 'eval_runtime': 0.1103, 'eval_samples_per_second': 534.93, 'eval_steps_per_second': 72.533, 'epoch': 1.0}
{'eval_loss': 0.8017032146453857, 'eval_f1': 0.43240740740740735, 'eval_recall': 0.4401154401154401, 'eval_runtime': 0.1031, 'eval_samples_per_second': 572.369, 'eval_steps_per_second': 77.609, 'epoch': 2.0}
{'eval_loss': 0.752916693687439, 'eval_f1': 0.5872486772486772, 'eval_recall': 0.5746753246753247, 'eval_runtime': 0.1074, 'eval_samples_per_second': 549.55, 'eval_steps_per_second': 74.515, 'epoch': 3.0}
{'eval_loss': 0.7457030415534973, 'eval_f1': 0.6005797101449275, 'eval_recall': 0.5923520923520923, 'eval_runtime': 0.1088, 'eval_samples_per_second': 542.086, 'eval_steps_per_second': 73.503, 'epoch': 4.0}
{'eval_loss': 0.7489935755729675, 'eval_f1': 0.6128801796284956, 'eval_recall': 0.6024531024531025, 'eval_runtime': 0.1063, 'eval_samples_per_second': 555.085, 'eval_steps_p

TrainOutput(global_step=75, training_loss=0.5435673014322917, metrics={'train_runtime': 8.3758, 'train_samples_per_second': 279.973, 'train_steps_per_second': 8.954, 'train_loss': 0.5435673014322917, 'epoch': 5.0})

In [31]:
#Evaluate on the held-out test set

In [32]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.825030505657196, 'eval_f1': 0.6683418222976797, 'eval_recall': 0.6626262626262626, 'eval_runtime': 0.0683, 'eval_samples_per_second': 776.278, 'eval_steps_per_second': 102.527, 'epoch': 5.0}


{'eval_loss': 0.825030505657196,
 'eval_f1': 0.6683418222976797,
 'eval_recall': 0.6626262626262626,
 'eval_runtime': 0.0683,
 'eval_samples_per_second': 776.278,
 'eval_steps_per_second': 102.527,
 'epoch': 5.0}

In [33]:
#model.load_state_dict(torch.load(f'./_BERT_epoch_3.model', map_location=torch.device('cpu')))

In [34]:
#Error Analysis

In [35]:
# Convert the validation split of the raw dataset (ds) to a pandas DataFrame.
val_df = ds['validation'].to_pandas()
#val_df

In [36]:
from tqdm.notebook import tqdm

In [37]:
# step by step predictions on dataframe
# We do this to view predictions in the pandas dataframe and easily filter them and perform error analysis.
pred_final = []

# for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
#     predictions = []

#     text = row["text"]
#     encoded_data_test_single = tokenizer.batch_encode_plus([text], 
#     add_special_tokens=config.add_special_tokens, 
#     return_attention_mask=config.return_attention_mask, 
#     pad_to_max_length=config.pad_to_max_length, 
#     max_length=config.seq_length,
#     return_tensors=config.return_tensors
#     )
#     input_ids_test = encoded_data_test_single['input_ids']
#     attention_masks_test = encoded_data_test_single['attention_mask']

    
#     inputs = {'input_ids':      input_ids_test.to(device),
#               'attention_mask':attention_masks_test.to(device),
#              }

#     with torch.no_grad():        
#         outputs = model(**inputs)
    
#     logits = outputs[0]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)
#     predictions = np.concatenate(predictions, axis=0)
#     pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

In [38]:
df.sample(n = 30)

Unnamed: 0,text,label,label_name,tokenized_text,sent_token_length,char_count,Character Count
55,Roci cacha riquísimo,2,Positive,Roci cacha riquísimo,3,20,20
73,No vale la pena,0,Negative,No vale la pena,4,15,15
33,"No me toques, caracho",0,Negative,No me toques caracho,4,21,21
445,mi vieja viene volando y tiene pico,0,Negative,mi vieja viene volando y tiene pico,7,35,35
425,"Sería mostro, flaco",2,Positive,Sería mostro flaco,3,20,20
229,Vamos al toque,1,Neutral,Vamos al toque,3,14,14
210,Aliancista y la conchatumadre,0,Negative,Aliancista y la conchatumadre,4,29,29
9,Fúmate un troncho y ándate a dormir,0,Negative,Fúmate un troncho y ándate a dormir,7,35,35
352,"Si la vieja se despierta, nos mata",0,Negative,Si la vieja se despierta nos mata,7,35,35
70,¿cómo? es que no tengo plata,0,Negative,¿cómo es que no tengo plata,6,28,28


In [39]:
#Inference

In [40]:
#Evaluate the Model Qualitatively (Human Evaluation)

In [41]:
#Evaluate the Model Quantitatively (with F1 Metric)

In [42]:
# First ten rows of the test split: the texts to classify and their labels.
first_ten = ds['test'][0:10]
texts = first_ten['text']
human_baseline_labels = first_ten['label']

In [43]:
texts

['\x97Calla, huevón \x97dijo Gustavo\x97',
 'Pero qué tontito habías sido tú, papacito',
 'Ese es mi cachorro',
 '¿Y qué chucha va a pasar?',
 'El panetón es bien rico',
 '\x97Ni que yo fuera un gran cojudo, hombre',
 '\x97Estás verde, choche',
 'Tengo plata',
 'Odio la resaca',
 'YUTAY:  Yo cuando era chibolo']

In [44]:
#original_model_summaries = []
model_classifications = []

In [45]:
# Classify each sample text with the fine-tuned model.
# Reset the accumulator here so re-running this cell does not append duplicates.
model_classifications = []

# Use whatever device the model already lives on instead of assuming a GPU.
device = model.device
model.eval()  # make sure dropout is disabled for prediction

with torch.no_grad():  # inference only: no autograd graph needed
    for text in texts:
        # Apply the same preprocessing the training/evaluation splits went
        # through, so the model sees inputs in the form it was tuned on.
        cleaned = preprocess_tweet(text, lang="es")
        encoded = tokenizer(cleaned, return_tensors="pt", truncation=True).to(device)
        logits = model(**encoded).logits
        # softmax is monotonic, so the argmax of the logits is the same class
        # as the argmax of the probabilities; store it as a plain int.
        model_classifications.append(int(logits.argmax(dim=-1).item()))

In [46]:
model_classifications

[0, 0, 2, 0, 2, 0, 0, 1, 0, 1]