In [10]:
import torch
from transformers import AutoTokenizer
#from finetune_vs_scratch.preprocessing import special_tokens


In [12]:
import datasets
import evaluate
import numpy as np
from datasets import load_dataset

In [4]:
# Extra vocabulary entries that are registered with the tokenizer later in the notebook.
special_tokens = [
    "@usuario",
    "url",
    "hashtag",
    "emoji",
]

In [23]:
from transformers import AutoModelForSequenceClassification
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'

# Pretrained checkpoint with a sequence-classification head sized for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)

# Matching tokenizer; inputs are capped at 128 tokens when truncation is enabled.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128
num_added_tokens = tokenizer.add_tokens(special_tokens)

# add_tokens() enlarges the tokenizer vocabulary, so the model's embedding
# matrix has to be enlarged as well. Without this, the ids assigned to the new
# tokens point past the end of the embedding table and the forward pass fails
# as soon as one of them appears in the input.
model.resize_token_embeddings(len(tokenizer))

# Keep the number of newly added tokens as the cell's displayed value.
num_added_tokens

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

4

In [13]:

# Load the F1 and recall metrics once; compute_metrics reuses these objects on
# every evaluation pass.
f1_metric, recall_metric = (evaluate.load(metric_id) for metric_id in ("f1", "recall"))

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and recall for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A single dict merging the dicts returned by ``f1_metric.compute`` and
        ``recall_metric.compute`` (in that order).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    # Both metrics take the same arguments, so evaluate them in one loop.
    for metric in (f1_metric, recall_metric):
        scores.update(
            metric.compute(
                predictions=predicted_classes,
                references=labels,
                average="macro",
            )
        )
    return scores

In [14]:
# One CSV file per split; load_dataset returns them together keyed by split name.
data_files = dict(
    train="data/train.csv",
    validation="data/val.csv",
    test="data/test.csv",
)
ds = load_dataset("csv", data_files=data_files)

In [15]:
# Display the loaded splits with their column names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 573
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'tokenized_text', 'sent_token_length', 'sent_bert_token_length', 'char_count', 'Character Count'],
        num_rows: 64
    })
})

In [16]:
# label to name
def label2name(x):
    """Translate an integer class id into its sentiment name.

    Args:
        x: Class id; 0, 1 and 2 are the recognised values.

    Returns:
        "Negative", "Neutral" or "Positive" for 0, 1 and 2 respectively,
        or None when ``x`` equals none of them.
    """
    id_name_pairs = ((0, "Negative"), (1, "Neutral"), (2, "Positive"))
    for class_id, class_name in id_name_pairs:
        if x == class_id:
            return class_name
    return None

In [17]:
# Show the column names and dtypes of the training split.
ds["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_name': Value(dtype='string', id=None),
 'tokenized_text': Value(dtype='string', id=None),
 'sent_token_length': Value(dtype='int64', id=None),
 'sent_bert_token_length': Value(dtype='int64', id=None),
 'char_count': Value(dtype='int64', id=None),
 'Character Count': Value(dtype='int64', id=None)}

In [18]:
from pysentimiento.preprocessing import preprocess_tweet


def _preprocess_example(ex):
    """Return a replacement ``text`` column holding the preprocessed tweet."""
    return {"text": preprocess_tweet(ex["text"], lang="es")}


# Applied example by example to every split; the other columns are left as they are.
preprocessed_ds = ds.map(_preprocess_example)

In [19]:
def _tokenize_batch(batch):
    """Tokenize the ``text`` column of one batch with padding and truncation enabled."""
    return tokenizer(batch["text"], padding=True, truncation=True)


# Tokenize every split in batches of 32 examples.
tokenized_ds = preprocessed_ds.map(_tokenize_batch, batched=True, batch_size=32)

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

In [25]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Fine-tuning configuration: 5 epochs, 32 examples per device per step, with an
# evaluation pass at the end of every epoch. Outputs go to "test_trainer".
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    do_eval=True,
    evaluation_strategy="epoch",
)

# Collator that pads each batch using the notebook's tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the "train" split and evaluate on "validation" using compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

In [26]:
# Run fine-tuning; validation metrics are reported after each epoch.
trainer.train()

{'eval_loss': 0.9116514921188354, 'eval_f1': 0.394524959742351, 'eval_recall': 0.4385964912280702, 'eval_runtime': 0.1391, 'eval_samples_per_second': 510.573, 'eval_steps_per_second': 64.72, 'epoch': 1.0}
{'eval_loss': 0.7890140414237976, 'eval_f1': 0.5445840069643079, 'eval_recall': 0.5313283208020051, 'eval_runtime': 0.137, 'eval_samples_per_second': 518.3, 'eval_steps_per_second': 65.7, 'epoch': 2.0}
{'eval_loss': 0.871826171875, 'eval_f1': 0.5974774774774775, 'eval_recall': 0.587719298245614, 'eval_runtime': 0.134, 'eval_samples_per_second': 529.713, 'eval_steps_per_second': 67.147, 'epoch': 3.0}
{'eval_loss': 0.9862819314002991, 'eval_f1': 0.5943019943019943, 'eval_recall': 0.587719298245614, 'eval_runtime': 0.1311, 'eval_samples_per_second': 541.542, 'eval_steps_per_second': 68.646, 'epoch': 4.0}
{'eval_loss': 1.009502649307251, 'eval_f1': 0.6276668445343144, 'eval_recall': 0.6203007518796992, 'eval_runtime': 0.1309, 'eval_samples_per_second': 542.395, 'eval_steps_per_second': 68

TrainOutput(global_step=90, training_loss=0.47921028137207033, metrics={'train_runtime': 23.7794, 'train_samples_per_second': 120.483, 'train_steps_per_second': 3.785, 'train_loss': 0.47921028137207033, 'epoch': 5.0})

In [27]:
# Score the fine-tuned model on the test split.
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 1.1125614643096924, 'eval_f1': 0.6377777777777779, 'eval_recall': 0.6184012066365008, 'eval_runtime': 0.1588, 'eval_samples_per_second': 402.97, 'eval_steps_per_second': 50.371, 'epoch': 5.0}


{'eval_loss': 1.1125614643096924,
 'eval_f1': 0.6377777777777779,
 'eval_recall': 0.6184012066365008,
 'eval_runtime': 0.1588,
 'eval_samples_per_second': 402.97,
 'eval_steps_per_second': 50.371,
 'epoch': 5.0}