In [1]:
import torch
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset
import ipywidgets as widgets
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from pysentimiento.preprocessing import preprocess_tweet
from sklearn.metrics import classification_report

from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix


2024-03-28 15:55:26.457510: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-28 15:55:26.492000: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-28 15:55:26.492027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-28 15:55:26.493126: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-28 15:55:26.499252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructio

In [2]:
#pip install torch==2.0.0
#pip install pysentimiento==0.7.2
#pip install evaluate==0.4.0
#pip install datasets==2.14.5

In [3]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [4]:
#https://huggingface.co/docs/evaluate/base_evaluator

In [5]:
#Three versions of RoBERTuito were
#trained: a cased version which preserves the case found
#in the original tweets, an uncased version, and a deacc
#version, which lower-cases and removes accents on
#tweets.

In [6]:
#Helper Functions

In [7]:
# label to name
def label2name(x):
    if x == 0:
        return "Negative"
    if x == 1:
        return "Neutral"
    if x == 2:
        return "Positive"

In [8]:

f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")

def compute_metrics (eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    
    results = {}
    results.update(f1_metric.compute(predictions=preds, references = labels, average="macro"))
    results.update(recall_metric.compute(predictions=preds, references = labels, average="macro"))
    return results

In [9]:
#Read Data

In [10]:
data_files = {"train": "data/train.csv", "validation": "data/val.csv", "test": "data/test.csv"}
ds = load_dataset("csv", data_files=data_files)

In [11]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 7594
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 2374
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 1899
    })
})

In [12]:
#push to hugging face

In [13]:
#https://huggingface.co/settings/tokens

In [14]:
# from huggingface_hub import notebook_login
# notebook_login()

In [15]:
#ds.push_to_hub("jairleo95/social-media-peruvian-sentiment")

In [16]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'text_original': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [17]:
ds["test"]["label"][:10]

[1, 0, 0, 0, 0, 2, 1, 0, 2, 2]

In [18]:
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_name,text_original,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
0,no yuli con eso se va con pendeivis 😞,1,Neutral,No Yuli con eso se va con pendeivis 😞,no yuli con eso se va con pendeivis 😞,9,12,37,37
1,este es más bruto... cosas que necesita el pe...,0,Negative,Este es más bruto... Cosas que necesita el Pe...,este es más bruto cosas que necesita el pe...,27,35,144,144
2,bakan tu video pero el chambar es riko pero pa...,2,Positive,Bakan tu video pero el chambar es riko pero pa...,bakan tu video pero el chambar es riko pero pa...,53,67,267,267
3,no sabes que hacer para llamar la atención par...,0,Negative,no sabes que hacer para llamar la atención par...,no sabes que hacer para llamar la atención par...,18,20,93,93
4,"pe chino dame chamba mano, almenos pa limpiart...",1,Neutral,"Pe CHINO DAME CHAMBA MANO, ALMENOS PA LIMPIART...",pe chino dame chamba mano almenos pa limpiart...,11,14,61,61


In [19]:
# encode label and mapping label name
#df["label"] = df["label"].apply(lambda x: label_encode(x))
df["label_name"] = df["label"].apply(lambda x: label2name(x))

In [20]:
# clean text, lowercase and remove punk
#df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [21]:
df.sample(10)

Unnamed: 0,text,label,label_name,text_original,tokenized_text,sent_token_length,sent_bert_token_length,char_count,Character Count
1633,eso sí es marketing señores vayan a comprar ...,0,Negative,Eso sí es marketing señores Vayan a comprar ...,eso sí es marketing señores vayan a comprar ...,11,14,60,60
320,maria pia copello o haces el ridiculo estas h...,2,Positive,Maria Pia Copello o haces el ridiculo estas h...,maria pia copello o haces el ridiculo estas h...,35,43,195,195
3766,selena enriquez amor ya no vamos a esconder la...,1,Neutral,Selena Enriquez amor ya no vamos a esconder la...,selena enriquez amor ya no vamos a esconder la...,20,25,110,110
1661,eres tu causa segundo emilio cabrera peña ?? 🤣🤣🤣,1,Neutral,Eres tu causa Segundo Emilio Cabrera Peña ?? 🤣🤣🤣🤣,eres tu causa segundo emilio cabrera peña 🤣🤣🤣,9,9,48,48
5950,"mis amigos son misios y tacaños carajo, pero t...",2,Positive,"Mis amigos son misios y tacaños carajo, pero T...",mis amigos son misios y tacaños carajo pero t...,22,27,94,94
5508,luis enrique rivas salazar no llega a la nota ...,1,Neutral,Luis Enrique Rivas Salazar no llega a la nota ...,luis enrique rivas salazar no llega a la nota ...,21,31,123,123
257,frank docha colazón no se por qué me acorde de...,1,Neutral,Frank Docha colazón No se por qué me acorde de...,frank docha colazón no se por qué me acorde de...,33,38,161,161
5839,debe ser jodido trabajar en la bolsa,1,Neutral,Debe ser jodido trabajar en la bolsa,debe ser jodido trabajar en la bolsa,7,7,36,36
18,jaja así me sentí esa vez que se juntaron evel...,1,Neutral,Jajajaja Así me sentí esa vez que se juntaron ...,jaja así me sentí esa vez que se juntaron evel...,24,31,116,116
6729,viva el. perú carajo 👍👍😁😁,2,Positive,Viva el. Perú carajo 👍👍😁😁❤❤,viva el perú carajo 👍👍😁😁,5,5,25,25


In [22]:
model_name = "pysentimiento/robertuito-base-deacc"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/435M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/859k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [23]:
# columns = ['input_ids', 'attention_mask', 'label']
# ds.set_format(type='torch', columns=columns)
# ds

In [24]:
preprocessed_ds = ds.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

In [25]:
tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(
        batch["text"], padding=True, truncation=True
        ),
    batched=True, batch_size=32
)

Map:   0%|          | 0/7594 [00:00<?, ? examples/s]

Map:   0%|          | 0/2374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

In [26]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7594
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2374
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'text_original', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1899
    })
})

In [27]:
tokenized_ds['train']['input_ids'][0][::10]

[0, 614, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [28]:
tokenized_ds['train']['attention_mask'][0][::10]

[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [29]:
tokenized_ds['train']['text'][10]

'tu hermano gemelo manolo... sobrado la haces como su doble..   emoji cara con mano sobre la boca emoji  emoji cara con mano sobre la boca emoji'

In [30]:
#!pip install ipdb

In [31]:
#import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [32]:
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="robertuito_deacc/checkpoints",
    warmup_ratio=0.1,
    learning_rate=5e-5,
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [33]:
trainer.train()

{'eval_loss': 0.6313991546630859, 'eval_f1': 0.7198534892139952, 'eval_recall': 0.7223918970370447, 'eval_runtime': 12.5359, 'eval_samples_per_second': 189.377, 'eval_steps_per_second': 23.692, 'epoch': 1.0}
{'eval_loss': 0.6540045738220215, 'eval_f1': 0.7314954208992441, 'eval_recall': 0.7329860853834665, 'eval_runtime': 11.2227, 'eval_samples_per_second': 211.535, 'eval_steps_per_second': 26.464, 'epoch': 2.0}
{'loss': 0.5715, 'grad_norm': 2.7226808071136475, 'learning_rate': 3.221288515406163e-05, 'epoch': 2.1}
{'eval_loss': 0.8846428394317627, 'eval_f1': 0.7200668848797278, 'eval_recall': 0.7196258639770946, 'eval_runtime': 11.404, 'eval_samples_per_second': 208.172, 'eval_steps_per_second': 26.043, 'epoch': 3.0}
{'eval_loss': 1.145634412765503, 'eval_f1': 0.7269458376826164, 'eval_recall': 0.7266206827496301, 'eval_runtime': 11.4615, 'eval_samples_per_second': 207.129, 'eval_steps_per_second': 25.913, 'epoch': 4.0}
{'loss': 0.1362, 'grad_norm': 0.27891427278518677, 'learning_rate'

TrainOutput(global_step=1190, training_loss=0.3031648683948677, metrics={'train_runtime': 629.1552, 'train_samples_per_second': 60.351, 'train_steps_per_second': 1.891, 'train_loss': 0.3031648683948677, 'epoch': 5.0})

In [34]:
# Save trained model
trainer.model.save_pretrained("robertuito_model/model")

In [35]:
#Test on validation set

In [36]:
evaluation_results = trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 1.2779923677444458, 'eval_f1': 0.7199614421342672, 'eval_recall': 0.720481119839072, 'eval_runtime': 10.146, 'eval_samples_per_second': 187.168, 'eval_steps_per_second': 23.458, 'epoch': 5.0}


In [37]:
evaluation_results

{'eval_loss': 1.2779923677444458,
 'eval_f1': 0.7199614421342672,
 'eval_recall': 0.720481119839072,
 'eval_runtime': 10.146,
 'eval_samples_per_second': 187.168,
 'eval_steps_per_second': 23.458,
 'epoch': 5.0}

In [38]:
predictions = trainer.predict(tokenized_ds["test"])
predictions

PredictionOutput(predictions=array([[ 2.1242645 ,  1.1369343 , -3.8242478 ],
       [-1.7109395 , -3.0079389 ,  4.684851  ],
       [ 3.2433321 , -0.76485264, -2.360845  ],
       ...,
       [-2.5961633 , -3.2600904 ,  5.8412876 ],
       [-3.290533  , -2.3925846 ,  5.176704  ],
       [ 5.5061054 , -2.2806666 , -2.993916  ]], dtype=float32), label_ids=array([1, 0, 0, ..., 2, 1, 0]), metrics={'test_loss': 1.2779923677444458, 'test_f1': 0.7199614421342672, 'test_recall': 0.720481119839072, 'test_runtime': 10.1967, 'test_samples_per_second': 186.236, 'test_steps_per_second': 23.341})

In [39]:
len(predictions[0])

1899

In [40]:
true_labels = tokenized_ds["test"]['label']

In [41]:
len(true_labels)

1899

In [42]:
predicted_labels = [np.argmax(pred) for pred in predictions[0]]
classification_rep = classification_report(true_labels, predicted_labels, digits=5)

print(classification_rep)

              precision    recall  f1-score   support

           0    0.80164   0.84217   0.82141       811
           1    0.61569   0.55435   0.58341       552
           2    0.74545   0.76493   0.75506       536

    accuracy                        0.73670      1899
   macro avg    0.72093   0.72048   0.71996      1899
weighted avg    0.73173   0.73670   0.73350      1899



In [43]:
#model.load_state_dict(torch.load(f'./_BERT_epoch_3.model', map_location=torch.device('cpu')))

In [44]:
#Error Analisys

In [45]:
val_df = ds['validation'].to_pandas()
#val_df

In [46]:
# step by step predictions on dataframe
# We do this to view predictions in the pandas dataframe and easily filter them and perform error analysis.
pred_final = []

for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
    predictions = []

    text = row["text"]
    encoded_data_test_single = tokenizer.batch_encode_plus([text], 
    # add_special_tokens=config.add_special_tokens, 
    # return_attention_mask=config.return_attention_mask, 
    # pad_to_max_length=config.pad_to_max_length, 
    max_length=128,
    # return_tensors=config.return_tensors
    return_tensors="pt"
    )
    input_ids_test = encoded_data_test_single['input_ids']
    attention_masks_test = encoded_data_test_single['attention_mask']

    
    inputs = {'input_ids':      input_ids_test.to(device),
              'attention_mask':attention_masks_test.to(device),
             }

    with torch.no_grad():        
        outputs = model(**inputs)
    
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)
    predictions = np.concatenate(predictions, axis=0)
    pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

  0%|          | 0/2374 [00:00<?, ?it/s]

In [47]:
# add pred into val_df
val_df["pred"] = pred_final

In [48]:
#  Add control column for easier wrong and right predictions
control = val_df.pred.values == val_df.label.values
val_df["control"] = control

In [49]:
# filtering false predictions
val_df = val_df[val_df.control == False]

In [50]:
# label to intent mapping
name2label = {"Negative":0,
              "Neutral":1,
             "Positive":2
             }
label2name = {v: k for k, v in name2label.items()}

val_df["pred_name"] = val_df.pred.apply(lambda x: label2name.get(x)) 

In [51]:

# We create a confusion matrix to better observe the classes that the model confuses.
pred_name_values = val_df.pred_name.values
label_values = val_df.label_name.values
confmat = confusion_matrix(label_values, pred_name_values, labels=list(name2label.keys()))

In [52]:
confmat

array([[  0, 215,  38],
       [128,   0, 137],
       [ 50, 170,   0]])

In [53]:
df_confusion_val = pd.crosstab(label_values, pred_name_values)
df_confusion_val

col_0,Negative,Neutral,Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,0,215,38
Neutral,128,0,137
Positive,50,170,0


In [54]:
# save confissuan matrix df
df_confusion_val.to_csv("val_df_confusion.csv")

## Inference

In [55]:
tokenized_ds['train']['text'][:30]

['no yuli con eso se va con pendeivis  emoji cara decepcionada emoji',
 'este es más bruto... cosas que necesita el  perú obio que se importara.. todo lo que tienes que decir para que gané  kk.. mariposa  emoji mariposa emoji  gorda... !',
 'bakan tu video pero el chambar es riko pero para puesto 1 no va ah, el caldo blanco es mucho mas sabroso, sin mencionar la infinidad de sopas, caldos y chupes q se prepara en arequipa como el chaque de tripas, el puchero o el timpo de rabos etc etc, saludos compare  emoji pulgar hacia arriba emoji',
 'no sabes que hacer para llamar la atención para que te dean bola en tus canciones copiada  emoji cara vomitando emoji  emoji cara vomitando emoji  emoji cara vomitando emoji',
 'pe chino dame chamba mano, almenos pa limpiarte las tabas :v emoji cara de por favor emoji',
 'casaca de cuero?? cómo abran sudado esas alicias  emoji cara revolviéndose de la risa emoji  emoji cara revolviéndose de la risa emoji  emoji cara revolviéndose de la risa emoji',
 '

In [56]:
encoded_data_test = tokenizer.batch_encode_plus(
    val_df.text.values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length,
    max_length=128, 
    return_tensors=config.return_tensors
)

NameError: name 'config' is not defined

In [None]:
#Evaluate the Model Qualitatively (Human Evaluation)

In [None]:
#Evaluate the Model Quantitatively (with F1 Metric)

In [None]:
texts = val_df[0:20]['text']
human_baseline_labels = val_df[0:20]['label']

In [None]:
texts

In [None]:
human_baseline_labels

In [None]:
#original_model_summaries = []
model_classifications = []

In [None]:
for _, text in enumerate(texts):
    input_ids = tokenizer(text, return_tensors="pt", max_length=128).input_ids.to('cuda')
    logits = model(input_ids).logits
    probabilities = logits.softmax(dim=-1).tolist()[0]
    model_classifications.append(np.argmax(probabilities).flatten()[0])

In [None]:
model_classifications