In [20]:
#EDA

In [None]:
import torch

In [None]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [3]:
import datasets
import evaluate

In [4]:
import numpy as np
from datasets import load_dataset

In [21]:
import ipywidgets as widgets

In [22]:
#Helper Functions

In [24]:

import re
import string
from typing import Union, List

class CleanText():
    """ clearing text except digits () . , word character """ 

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        self.clean_pattern =clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:

        if isinstance(text, str):
            docs = [[text]]

        if isinstance(text, list):
            docs = text

        text = [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs]

        return text
    
def remove_emoji(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def tokenize(text):
    """ basic tokenize method with word character, non word character and digits """
    text = re.sub(r" +", " ", str(text))
    text = re.split(r"(\d+|[a-zA-ZğüşıöçĞÜŞİÖÇ]+|\W)", text)
    text = list(filter(lambda x: x != '' and x != ' ', text))
    sent_tokenized = ' '.join(text)
    return sent_tokenized

regex = re.compile('[%s]' % re.escape(string.punctuation))

def remove_punct(text):
    text = regex.sub(" ", text)
    return text

clean = CleanText()

In [37]:
# label to name
def label2name(x):
    if x == 0:
        return "Negative"
    if x == 1:
        return "Neutral"
    if x == 2:
        return "Positive"

In [6]:

f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")

def compute_metrics (eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    
    results = {}
    results.update(f1_metric.compute(predictions=preds, references = labels, average="macro"))
    results.update(recall_metric.compute(predictions=preds, references = labels, average="macro"))
    return results

In [25]:
#Read Data


In [8]:
data_files = {"train": "train.csv", "validation": "val.csv", "test": "test.csv"}
ds = load_dataset("csv", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [26]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 382
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 48
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 43
    })
})

AttributeError: 'DatasetDict' object has no attribute 'columns'

In [10]:
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [30]:
ds["test"]["label"][:10]

[0, 1, 2, 0, 2, 0, 2, 0, 2, 0]

In [31]:
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text,label
0,"Lo traigo ahorita, vas a ver",1
1,No seas fregado,0
2,"Yo soy Misterio, conchasumadre",0
3,"Cómo quieres te entienda, carajo",0
4,"Llámame más tarde, el viejo está aquí",1


In [36]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

# count of ratings
fig = px.histogram(df,
             x = 'label',
             title = 'Histogram of Sentiment Classification',
             template = 'ggplot2',
             color = 'label',
             color_discrete_sequence= px.colors.sequential.Blues_r,
             opacity = 0.8,
             height = 525,
             width = 835,
            )

fig.update_yaxes(title='Count')
fig.show()

In [38]:
# encode label and mapping label name
#df["label"] = df["label"].apply(lambda x: label_encode(x))
df["label_name"] = df["label"].apply(lambda x: label2name(x))

In [39]:
# clean text, lowercase and remove punk
df["text"] = df["text"].apply(lambda x: remove_punct(clean(remove_emoji(x).lower())[0][0]))

In [40]:
df.head()

Unnamed: 0,text,label,label_name
0,lo traigo ahorita vas a ver,1,Neutral
1,no seas fregado,0,Negative
2,yo soy misterio conchasumadre,0,Negative
3,c mo quieres te entienda carajo,0,Negative
4,ll mame m s tarde el viejo est aqu,1,Neutral


In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "pysentimiento/robertuito-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from pysentimiento.preprocessing import preprocess_tweet
preprocessed_ds = ds.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

In [14]:
tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(batch["text"], padding=False, truncation=True),
    batched=True, batch_size=32
)

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

In [15]:
#!pip install ipdb

In [16]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [17]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="test_trainer",
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [18]:
trainer.train()

{'eval_loss': 0.8630229830741882, 'eval_f1': 0.24561403508771928, 'eval_recall': 0.3333333333333333, 'eval_runtime': 0.0871, 'eval_samples_per_second': 551.355, 'eval_steps_per_second': 68.919, 'epoch': 1.0}
{'eval_loss': 0.7565199732780457, 'eval_f1': 0.5050505050505051, 'eval_recall': 0.5, 'eval_runtime': 0.0908, 'eval_samples_per_second': 528.86, 'eval_steps_per_second': 66.108, 'epoch': 2.0}
{'eval_loss': 0.6930788159370422, 'eval_f1': 0.6913053778725421, 'eval_recall': 0.6333333333333333, 'eval_runtime': 0.0852, 'eval_samples_per_second': 563.481, 'eval_steps_per_second': 70.435, 'epoch': 3.0}
{'eval_loss': 0.6542004346847534, 'eval_f1': 0.6217583807716635, 'eval_recall': 0.5976190476190476, 'eval_runtime': 0.0841, 'eval_samples_per_second': 570.846, 'eval_steps_per_second': 71.356, 'epoch': 4.0}
{'eval_loss': 0.6425381898880005, 'eval_f1': 0.6126768534238823, 'eval_recall': 0.5976190476190476, 'eval_runtime': 0.0858, 'eval_samples_per_second': 559.575, 'eval_steps_per_second': 69

TrainOutput(global_step=60, training_loss=0.6369218826293945, metrics={'train_runtime': 6.1665, 'train_samples_per_second': 309.74, 'train_steps_per_second': 9.73, 'train_loss': 0.6369218826293945, 'epoch': 5.0})

In [19]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.581362783908844, 'eval_f1': 0.7186813186813187, 'eval_recall': 0.7140740740740741, 'eval_runtime': 0.06, 'eval_samples_per_second': 717.257, 'eval_steps_per_second': 100.082, 'epoch': 5.0}


{'eval_loss': 0.581362783908844,
 'eval_f1': 0.7186813186813187,
 'eval_recall': 0.7140740740740741,
 'eval_runtime': 0.06,
 'eval_samples_per_second': 717.257,
 'eval_steps_per_second': 100.082,
 'epoch': 5.0}