In [1]:
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install datasets



In [2]:
binary = False

# Carregando Tweets


In [3]:
import pandas as pd

In [4]:
data_df = pd.read_csv("/content/drive/MyDrive/Mestrado/transformers/data/tweetsentbr_train.csv", index_col=0)

In [5]:
label2text = {
    0: "neutro",
    1: "positivo",
    2: "negativo"
}

In [6]:
data_df["label"] = data_df["label"].replace(label2text)

In [7]:
data_df.head()

Unnamed: 0,label,text
5689,positivo,Simplesmente magnífica @fbbreal no Video Show ...
5386,positivo,"Saiu, mas saiu ahazando. Master Chef BR"
869,positivo,Já tô pronta pro Master Chef BR
5432,positivo,Muito amor por é o tchan Altas Horas
24,positivo,QUE PROGRAMA INCRÍVEL O @SBTTheNoite FEZ HOJE!...


In [8]:
if binary:
  data_df = data_df[data_df["label"] != "neutro"]

In [9]:
data_df['label'].unique()

array(['positivo', 'negativo', 'neutro'], dtype=object)

# Carregando PTT5

In [10]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

In [11]:
model_name = "unicamp-dl/ptt5-base-portuguese-vocab"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Datasets para os textos

In [13]:
from datasets import Dataset

In [14]:
ds = Dataset.from_pandas(data_df)

In [15]:
ds

Dataset({
    features: ['label', 'text', '__index_level_0__'],
    num_rows: 9849
})

In [16]:
# 2-3 tokens são o suficiente para codificar os tokens alvo
tokenizer(["positivo", "neutro", "negativo"])

{'input_ids': [[7767, 1], [15154, 1], [10976, 1]], 'attention_mask': [[1, 1], [1, 1], [1, 1]]}

In [17]:
def tokenize_and_prepare_for_generation(examples):

  input_strs = [f"tweet: {tweet}" for tweet in examples["text"]]

  # truncation=True, padding="max_length", max_length=123 para truncar e padronizar os tamanhos de tokens!!!
  input_ids = tokenizer(input_strs)

  label_ids = tokenizer(
            examples["label"],
            max_length=3, # calibrar?
            padding="max_length"
  )

  outputs = input_ids
  outputs["labels"] = label_ids["input_ids"]

  return outputs


In [18]:
ds = ds.map(tokenize_and_prepare_for_generation, batched=True, remove_columns=ds.column_names)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [19]:
ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 9849
})

In [20]:
example = ds[0]

In [21]:
example

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids': [2398,
  1528,
  629,
  46,
  22643,
  105,
  31,
  14295,
  15277,
  31,
  22224,
  245,
  4663,
  5933,
  19,
  7966,
  3218,
  544,
  7776,
  9,
  13,
  62,
  2024,
  8348,
  31,
  13379,
  44,
  44,
  44,
  44,
  397,
  112,
  77,
  839,
  839,
  31,
  2,
  29747,
  29747,
  29747,
  1],
 'labels': [7767, 1, 0]}

In [22]:
tokenizer.decode(example["input_ids"])

'tweet: Simplesmente magnífica @fbbreal no Video Show Ao Vivo o que são essas pernas lindaaaaaas Brasillllll <unk>❤❤❤</s>'

In [23]:
tokenizer.decode(example["labels"])

'positivo</s><pad>'

In [24]:
ds = ds.with_format("torch")

In [25]:
# Split de treino e teste (20%)
ds = ds.train_test_split(0.1)

In [26]:
from transformers import DataCollatorWithPadding

# Collator para Dynamic Padding
collator = DataCollatorWithPadding(tokenizer, padding="longest")

# Configurando Trainer

In [27]:
from transformers import Trainer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW
from datasets import load_metric
import math

In [28]:
bs = 32
epochs = 5
lr = 1e-4
warmup_steps = math.ceil((len(ds["train"])/bs) * epochs * 0.1) #10% of train data for warm-up
train_steps = int(epochs * len(ds["train"])/bs)

In [29]:
warmup_steps, train_steps

(139, 1385)

In [30]:
optimizer = AdamW(model.parameters(), lr=lr) 
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps, num_cycles=0.5)

In [31]:
import numpy as np
import torch
#debug
logits = None

def compute_metrics(eval_pred):
    global logits
    logits, labels = eval_pred
    predictions = np.argmax(logits[0], axis=2)

    string_y_hat = [tokenizer.decode(pred).strip() for pred in predictions]
    string_y = [tokenizer.decode(target_y).strip() for target_y in labels]

    acc = torch.Tensor([str_y_hat == str_y for str_y_hat, str_y in zip(string_y_hat, string_y)]).float().mean()
    return {"acc": acc.item()}

In [32]:
output_dir = '/content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/binary' if binary else \
             '/content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes'

In [33]:
output_dir

'/content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes'

In [34]:
training_args = TrainingArguments(
    output_dir=output_dir,                                                                  # output directory
    num_train_epochs=epochs,                                                                # total number of training epochs
    per_device_train_batch_size=bs,                                                         # batch size per device during training
    per_device_eval_batch_size=bs,                                                          # batch size for evaluation
    # warmup_steps=warmup_steps,                                                              # number of warmup steps for learning rate scheduler
    weight_decay=0.001,                                                                     # strength of weight decay
    save_strategy="epoch",                                                                  # checkpoint save interval
    evaluation_strategy="epoch",                                                            # evaluation interval
    logging_dir='./logs',                                                                   # directory for storing logs
    logging_steps=200,
)


In [35]:
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    data_collator=collator
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 8864
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1385


Epoch,Training Loss,Validation Loss,Acc
1,5.1649,0.245032,0.684264
2,0.2504,0.253419,0.690355
3,0.1993,0.23209,0.724873
4,0.1744,0.23929,0.720812
5,0.1674,0.246281,0.723858


***** Running Evaluation *****
  Num examples = 985
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-277
Configuration saved in /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-277/config.json
Model weights saved in /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-277/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 985
  Batch size = 32
Saving model checkpoint to /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-554
Configuration saved in /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-554/config.json
Model weights saved in /content/drive/MyDrive/Mestrado/transformers/trained_models/ptt5_tweetsentbr/all_classes/checkpoint-554/pytorch_model.bin
***** Running Evaluation *****