In [1]:
import torch
from transformers import AutoTokenizer
#from finetune_vs_scratch.preprocessing import special_tokens


In [2]:
import datasets
import evaluate
import numpy as np
from datasets import load_dataset

In [3]:
# Extra tokens registered with the tokenizer vocabulary further down.
# NOTE(review): presumably these mirror the placeholders emitted by the tweet
# preprocessing step — confirm against its actual output.
special_tokens = [
    "@usuario",
    "url",
    "hashtag",
    "emoji",
]

In [4]:
from transformers import AutoModelForSequenceClassification

model_name = 'dccuchile/albert-base-spanish-finetuned-mldoc'

# Load the checkpoint with a 3-way classification head.
# NOTE(review): this checkpoint is already fine-tuned on another task, so its
# stored head may not have 3 outputs; ignore_mismatched_sizes=True lets the head
# be re-initialised in that case instead of failing on a shape mismatch — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128  # length cap applied when tokenizing with truncation=True

# Register the extra tokens, then grow the embedding matrix to match the new
# vocabulary size: without the resize, ids of newly added tokens would index
# past the end of the model's embedding table.
num_added = tokenizer.add_tokens(special_tokens)
if num_added:
    model.resize_token_embeddings(len(tokenizer))

Downloading (…)lve/main/config.json:   0%|          | 0.00/562 [00:00<?, ?B/s]

OSError: AIDA-UPM/BERTuit-base does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [None]:

# Metric objects are created once at module level and reused by compute_metrics.
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Return macro-averaged F1 and recall for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        dict: The merged outputs of ``f1_metric.compute`` and
        ``recall_metric.compute``, both called with ``average="macro"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    # Same order as before (F1 first, then recall) so the merged dict is unchanged.
    for metric in (f1_metric, recall_metric):
        scores.update(
            metric.compute(predictions=predicted, references=labels, average="macro")
        )
    return scores

In [None]:
# CSV file backing each split (relative paths).
data_files = {
    "train": "data/train.csv",
    "validation": "data/val.csv",
    "test": "data/test.csv",
}
ds = load_dataset("csv", data_files=data_files)

In [None]:
# Show the loaded splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 573
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 64
    })
})

In [None]:
# (label id, display name) pairs, checked in order by label2name.
_LABEL_NAMES = ((0, "Negative"), (1, "Neutral"), (2, "Positive"))


def label2name(x):
    """Map a numeric label id to its display name.

    Args:
        x: Label id; compared with ``==`` against 0, 1 and 2 in that order.

    Returns:
        "Negative", "Neutral" or "Positive" for 0, 1 or 2 respectively;
        ``None`` for any other value.
    """
    for label_id, name in _LABEL_NAMES:
        if x == label_id:
            return name
    return None

In [None]:
# Inspect the column names and dtypes of the training split.
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [None]:
from pysentimiento.preprocessing import preprocess_tweet


def _preprocess_example(ex):
    """Return a replacement ``text`` value produced by ``preprocess_tweet`` with ``lang="es"``."""
    return {"text": preprocess_tweet(ex["text"], lang="es")}


# Overwrite each example's text with its preprocessed form.
preprocessed_ds = ds.map(_preprocess_example)

In [None]:
def _tokenize_batch(batch):
    """Tokenize a batch of texts, truncating to the tokenizer's maximum length."""
    # No padding here: the Trainer is built with DataCollatorWithPadding, which
    # pads every training/eval batch to its own longest sequence. Padding each
    # map-batch as well only stores pad tokens that get padded over again later.
    return tokenizer(batch["text"], truncation=True)


tokenized_ds = preprocessed_ds.map(_tokenize_batch, batched=True, batch_size=32)

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Run configuration: 5 epochs, train batches of 32 per device, and an
# evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    evaluation_strategy="epoch",
    do_eval=True,
)

# Collator that pads the examples of each batch using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the validation split is evaluated after each epoch.
trainer.train()

{'eval_loss': 0.8674662709236145, 'eval_f1': 0.2679012345679012, 'eval_recall': 0.3508771929824561, 'eval_runtime': 0.1552, 'eval_samples_per_second': 457.457, 'eval_steps_per_second': 57.987, 'epoch': 1.0}
{'eval_loss': 0.7881259918212891, 'eval_f1': 0.4970588235294118, 'eval_recall': 0.5238095238095238, 'eval_runtime': 0.1709, 'eval_samples_per_second': 415.377, 'eval_steps_per_second': 52.653, 'epoch': 2.0}
{'eval_loss': 1.0402268171310425, 'eval_f1': 0.6168179646440516, 'eval_recall': 0.5964912280701754, 'eval_runtime': 0.1555, 'eval_samples_per_second': 456.607, 'eval_steps_per_second': 57.88, 'epoch': 3.0}
{'eval_loss': 0.9365906119346619, 'eval_f1': 0.6764227642276422, 'eval_recall': 0.6766917293233083, 'eval_runtime': 0.1644, 'eval_samples_per_second': 431.863, 'eval_steps_per_second': 54.743, 'epoch': 4.0}
{'eval_loss': 1.0495860576629639, 'eval_f1': 0.6653945537666468, 'eval_recall': 0.6528822055137845, 'eval_runtime': 0.1471, 'eval_samples_per_second': 482.625, 'eval_steps_p

TrainOutput(global_step=90, training_loss=0.5045969645182292, metrics={'train_runtime': 25.4305, 'train_samples_per_second': 112.66, 'train_steps_per_second': 3.539, 'train_loss': 0.5045969645182292, 'epoch': 5.0})

In [None]:
# Score the trained model on the test split with the same metrics.
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 1.114953875541687, 'eval_f1': 0.6810143476810143, 'eval_recall': 0.671945701357466, 'eval_runtime': 0.1483, 'eval_samples_per_second': 431.596, 'eval_steps_per_second': 53.95, 'epoch': 5.0}


{'eval_loss': 1.114953875541687,
 'eval_f1': 0.6810143476810143,
 'eval_recall': 0.671945701357466,
 'eval_runtime': 0.1483,
 'eval_samples_per_second': 431.596,
 'eval_steps_per_second': 53.95,
 'epoch': 5.0}