In [1]:
!pip3 install transformers
!pip3 install datasets
!pip3 install transformers-interpret

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.5 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 50.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.7 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [2]:
binary = False

# Preparando Dataset

In [3]:
from datasets import Dataset
import pandas as pd

In [5]:
data_df = pd.read_csv("/content/drive/MyDrive/Mestrado/transformers/data/tweetsentbr/tweetsentbr_test.csv", index_col=0)

In [6]:
if binary:
  data_df = data_df[data_df["label"] != 0]
  data_df["label"] -= 1 # 0 será a classe positiva, 1 será a classe negativa

In [7]:
ds = Dataset.from_pandas(data_df)

In [8]:
ds

Dataset({
    features: ['label', 'text', '__index_level_0__'],
    num_rows: 2463
})

# Carregando modelos

In [9]:
from transformers import BertTokenizerFast, BertForSequenceClassification

In [10]:
model = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizerFast.from_pretrained(model)

Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [11]:
path_to_checkpoint = "/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/binary/checkpoint-520" if binary else \
                     "/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/all_classes/checkpoint-695"
path_to_checkpoint

'/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/all_classes/checkpoint-695'

In [12]:
model = BertForSequenceClassification.from_pretrained(path_to_checkpoint)

# Pre-processamento de Datasets

In [13]:
from transformers import DataCollatorWithPadding

In [14]:
def tokenizer_function(example):
  # Em caso de uma tarefa de classificação de pares de texto, modificar este valor de retorno
  # truncation=True, padding="max_length", max_length=123 para truncar e padronizar os tamanhos de tokens!!!
  return tokenizer(
      example["text"], truncation=True
  )

In [15]:
# Tokenizando todos os elementos do conjunto de dados em batches
ds = ds.map(tokenizer_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
# Mantendo apenas colunas cujos nomes coincidam com os métodos forward dos transformers
ds = ds.remove_columns(column_names=["text", "__index_level_0__"])
ds = ds.rename_column("label", "labels")
ds = ds.with_format("torch")

In [17]:
ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
    num_rows: 2463
})

In [18]:
# Collator para Dynamic Padding
collator = DataCollatorWithPadding(tokenizer, padding="longest")

# Configurando Trainer para predição

In [19]:
from transformers import Trainer

In [20]:
path_to_checkpoint = "/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/binary/checkpoint-520" if binary else \
                     "/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/all_classes/checkpoint-695"

In [21]:
path_to_checkpoint

'/content/drive/MyDrive/Mestrado/transformers/trained_models/bertimbau_tweetsentbr/all_classes/checkpoint-695'

In [22]:
trainer = Trainer(
    model,
    data_collator=collator
)

In [23]:
predictions = trainer.predict(ds)

***** Running Prediction *****
  Num examples = 2463
  Batch size = 8


In [24]:
logits = predictions.predictions
final_preds = logits.argmax(axis=1)

In [25]:
data_df["prediction"] = final_preds

In [26]:
suffix = "binary" if binary else "all_classes"
data_df.to_csv(f"/content/drive/MyDrive/Mestrado/transformers/data/predicoes/bertimbau_{suffix}_preds.csv")

# Avaliação em termos de métricas de classificação

In [27]:
from sklearn.metrics import classification_report

In [28]:
real = data_df["label"].values
preds = data_df["prediction"].values

In [29]:
print(classification_report(real, preds))

              precision    recall  f1-score   support

           0       0.59      0.53      0.56       627
           1       0.79      0.82      0.81      1109
           2       0.71      0.74      0.73       727

    accuracy                           0.72      2463
   macro avg       0.70      0.70      0.70      2463
weighted avg       0.72      0.72      0.72      2463



# Interpretando resultados

In [30]:
from transformers_interpret import SequenceClassificationExplainer
import numpy as np

In [31]:
model.config.id2label = {0: 'neutro', 1: 'positivo', 2: 'negativo'}

In [32]:
cls_explainer = SequenceClassificationExplainer(
    model,
    tokenizer
)

In [45]:
sample = data_df["text"].values[np.random.randint(len(data_df))]
sample

'Se eu fosse apresentador, acho que adoraria trabalhar em um cenário igual ao É De Casa... super agradável, tudo integrado com áreas externas'

In [46]:
cls_explainer(sample)

[('[CLS]', 0.0),
 ('Se', -0.025149665246053655),
 ('eu', 0.02607725204838802),
 ('fosse', -0.07925034576033128),
 ('apresentador', 0.04090519074907411),
 (',', -0.02436781343735907),
 ('acho', 0.1532408164751208),
 ('que', 0.030602686429584608),
 ('ado', 0.38616608199110525),
 ('##raria', 0.26163367381130803),
 ('trabalhar', 0.2233238959256869),
 ('em', 0.04472121189331127),
 ('um', 0.07280667875143941),
 ('cenário', 0.0837538693251109),
 ('igual', 0.03957241273725108),
 ('ao', -0.023351806822572086),
 ('É', 0.17730485247111524),
 ('De', -0.010541378155751888),
 ('Casa', 0.047868978094962655),
 ('.', 0.08047185514681976),
 ('.', 0.13035800398363398),
 ('.', 0.14573994056349424),
 ('super', 0.4745771168175958),
 ('agradável', 0.5402867174455881),
 (',', 0.21426071430626473),
 ('tudo', 0.17115657593768205),
 ('integrado', 0.07743382668826494),
 ('com', 0.06350962868281645),
 ('áreas', -0.01738003393128907),
 ('externas', 0.025064118541632895),
 ('[SEP]', 0.0)]

In [47]:
cls_explainer.predicted_class_name

'positivo'

In [48]:
_ = cls_explainer.visualize()

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,positivo (0.95),positivo,3.33,"[CLS] Se eu fosse apresentador , acho que ado ##raria trabalhar em um cenário igual ao É De Casa . . . super agradável , tudo integrado com áreas externas [SEP]"
,,,,
