In [1]:
#!pip install pysentimiento transformers datasets accelerate evaluate

In [2]:
import torch

In [3]:
import datasets
import evaluate

In [4]:
import numpy as np
from datasets import load_dataset

In [5]:
import ipywidgets as widgets

In [6]:

# Hugging Face `evaluate` metric objects, loaded once at notebook level and
# looked up by name inside compute_metrics on every evaluation pass.
f1_metric, recall_metric = (
    evaluate.load(metric_id) for metric_id in ("f1", "recall")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1 and recall.

    Args:
        eval_pred: Pair that unpacks as ``(logits, labels)``. The index of the
            largest logit along the last axis is used as the predicted class.

    Returns:
        dict: The F1 metric's result merged with the recall metric's result,
        in that order.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # Both metrics receive exactly the same arguments.
    shared_kwargs = dict(predictions=predicted, references=references, average="macro")
    return {
        **f1_metric.compute(**shared_kwargs),
        **recall_metric.compute(**shared_kwargs),
    }

In [7]:
# Load the local CSV datasets (train / validation / test splits).

In [8]:
# Map each split name to its CSV file (relative paths), then build a single
# DatasetDict holding one Dataset per split.
data_files = dict(train="train.csv", validation="val.csv", test="test.csv")
ds = load_dataset("csv", data_files=data_files)

In [9]:
# Display the DatasetDict summary: split names, columns, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 333
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 42
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 37
    })
})

In [10]:
# Inspect the column names and dtypes of the training split.
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [11]:
# Peek at the first ten labels of the test split.
ds["test"]["label"][:10]

[0, 2, 2, 2, 1, 1, 0, 0, 0, 1]

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

model_name = "pysentimiento/robertuito-base-uncased"

# The checkpoint ships without a classification head, so `from_pretrained`
# creates one with random weights (see the "newly initialized" warning below).
# Seed the Python / NumPy / PyTorch RNGs first so the head starts from the
# same weights on every fresh run and the reported metrics can be reproduced.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # labels seen in the data are 0, 1 and 2
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Cap inputs at 128 tokens; tokenizer calls with truncation=True and no
# explicit max_length cut sequences to this limit.
tokenizer.model_max_length = 128

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from pysentimiento.preprocessing import preprocess_tweet

# Apply pysentimiento's Spanish tweet preprocessing to the "text" column of
# every split, one example at a time; only "text" is replaced.
preprocessed_ds = ds.map(
    lambda example: {"text": preprocess_tweet(example["text"], lang="es")}
)

In [14]:
# Tokenize the preprocessed text in batches of 32. Sequences are truncated
# but deliberately left unpadded at this stage.
tokenized_ds = preprocessed_ds.map(
    lambda examples: tokenizer(examples["text"], truncation=True, padding=False),
    batch_size=32,
    batched=True,
)

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

In [15]:
#!pip install ipdb

In [16]:
#import torch
#device = "cuda" if torch.cuda.is_available() else "cpu"
#model.cuda()

In [17]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration: 5 epochs, 32 examples per device per step, with an
# evaluation pass at the end of every epoch. Outputs go to ./test_trainer.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    do_eval=True,
    evaluation_strategy="epoch",
)

# Fine-tune on the train split and validate on the validation split. The
# collator pads each batch, since tokenization left the sequences unpadded.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

In [18]:
# Fine-tune the model; validation metrics are reported after each epoch.
trainer.train()

{'eval_loss': 0.8534154295921326, 'eval_f1': 0.32659932659932656, 'eval_recall': 0.375, 'eval_runtime': 0.076, 'eval_samples_per_second': 552.434, 'eval_steps_per_second': 78.919, 'epoch': 1.0}
{'eval_loss': 0.7370036840438843, 'eval_f1': 0.551048951048951, 'eval_recall': 0.5190740740740741, 'eval_runtime': 0.0688, 'eval_samples_per_second': 610.216, 'eval_steps_per_second': 87.174, 'epoch': 2.0}
{'eval_loss': 0.6634579300880432, 'eval_f1': 0.6087542087542087, 'eval_recall': 0.5798148148148148, 'eval_runtime': 0.0692, 'eval_samples_per_second': 606.832, 'eval_steps_per_second': 86.69, 'epoch': 3.0}
{'eval_loss': 0.6282252073287964, 'eval_f1': 0.7183250013438692, 'eval_recall': 0.6868518518518517, 'eval_runtime': 0.0697, 'eval_samples_per_second': 602.529, 'eval_steps_per_second': 86.076, 'epoch': 4.0}
{'eval_loss': 0.6220354437828064, 'eval_f1': 0.7183250013438692, 'eval_recall': 0.6868518518518517, 'eval_runtime': 0.0686, 'eval_samples_per_second': 612.452, 'eval_steps_per_second': 87

TrainOutput(global_step=55, training_loss=0.567222040349787, metrics={'train_runtime': 4.609, 'train_samples_per_second': 361.251, 'train_steps_per_second': 11.933, 'train_loss': 0.567222040349787, 'epoch': 5.0})

In [19]:
# Final check: score the fine-tuned model on the test split.
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 0.6792051196098328, 'eval_f1': 0.6785502079619726, 'eval_recall': 0.6277056277056278, 'eval_runtime': 0.0417, 'eval_samples_per_second': 887.638, 'eval_steps_per_second': 119.951, 'epoch': 5.0}


{'eval_loss': 0.6792051196098328,
 'eval_f1': 0.6785502079619726,
 'eval_recall': 0.6277056277056278,
 'eval_runtime': 0.0417,
 'eval_samples_per_second': 887.638,
 'eval_steps_per_second': 119.951,
 'epoch': 5.0}