In [1]:
# Install the Hugging Face `transformers` package into the runtime.
!pip install transformers
# Install Git LFS and register its hooks so LFS-tracked files are fetched on clone.
!sudo apt-get install git-lfs
!git lfs install
# Clone the neuralmind/bert-large-portuguese-cased repository into the working directory.
# NOTE(review): the visible cells load 'neuralmind/bert-base-portuguese-cased' by name
# and never reference this clone directory -- confirm whether the clone is still needed.
!git clone https://huggingface.co/neuralmind/bert-large-portuguese-cased




git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 69 not upgraded.
Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'bert-large-portuguese-cased'...
remote: Enumerating objects: 36, done.[K
remote: Total 36 (delta 0), reused 0 (delta 0), pack-reused 36 (from 1)[K
Unpacking objects: 100% (36/36), 102.71 KiB | 5.41 MiB/s, done.
Filtering content: 100% (2/2), 2.49 GiB | 105.25 MiB/s, done.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForPreTraining
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler
import time, datetime
import torch.nn.functional as F

In [3]:
# Read the annotated corpus; later cells use its 'normalized_text' and
# 'label final' columns.
DATASET_PATH = "../data/hatebr_and_rationales.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
# Target fraction of the full dataset for each split.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1

# Stage 1: carve off the combined test+validation hold-out from the training data.
holdout_fraction = TEST_SIZE + VAL_SIZE
x_train, x_test_val, y_train, y_test_val = train_test_split(
    df['normalized_text'],
    df['label final'],
    test_size=holdout_fraction,
    random_state=0,
)

# Stage 2: divide the hold-out into test and validation. The second output of
# train_test_split receives `test_size`, so that share goes to validation.
val_share_of_holdout = VAL_SIZE / holdout_fraction
x_test, x_val, y_test, y_val = train_test_split(
    x_test_val,
    y_test_val,
    test_size=val_share_of_holdout,
    random_state=0,
)



In [5]:
# define as funcoes para preparar o dataset para dar entrada para o modelo

def tokenize_corpus(df, tokenizer, max_len):
    """Encode every document of a corpus into fixed-length model inputs.

    Parameters
    ----------
    df : iterable of str
        Documents to encode. Despite the name, any iterable of strings is
        accepted (the callers in this notebook pass arrays/lists of texts).
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len : int
        Length every sequence is truncated or padded to.

    Returns
    -------
    tuple of (torch.Tensor, torch.Tensor)
        ``(input_ids, attention_masks)``: the per-document encodings
        concatenated along dimension 0, in input order. For an empty corpus
        both tensors are empty ``torch.long`` tensors of shape ``(0, max_len)``.
    """
    input_ids = []
    attention_masks = []

    for doc in df:
        encoded_dict = tokenizer.encode_plus(
                            doc,  # document to encode
                            add_special_tokens=True,  # add '[CLS]' at the start and '[SEP]' at the end
                            max_length=max_len,  # maximum sequence length
                            truncation=True,  # truncate long messages
                            padding='max_length',  # pad shorter sequences up to max_len
                            return_attention_mask=True,  # build the attention masks
                            return_tensors='pt'  # return PyTorch tensors
                       )

        input_ids.append(encoded_dict['input_ids'])
        # The attention mask tells real tokens apart from padding.
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list; return well-formed empty tensors
        # so callers can handle an empty corpus uniformly.
        return (torch.empty((0, max_len), dtype=torch.long),
                torch.empty((0, max_len), dtype=torch.long))

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)
     
def prepare_dataset(features, labels):
    """Bundle tokenized texts and their labels into a ``TensorDataset``.

    ``features`` and ``labels`` must expose ``.values`` (e.g. pandas Series).
    Texts are encoded with the module-level ``tokenizer`` at a fixed length of
    512; labels are copied into an int64 column vector of shape ``(n, 1)``.

    Returns a ``TensorDataset`` yielding ``(input_ids, attention_mask, label)``.
    """
    # Encode the messages into ids and masks.
    token_ids, masks = tokenize_corpus(features.values, tokenizer, 512)

    # np.array copies, so the resulting tensor never aliases the caller's data.
    label_column = np.array(labels.values, dtype=np.int64).reshape(-1, 1)

    return TensorDataset(token_ids, masks, torch.from_numpy(label_column))

In [6]:
# Tokenizer paired with the base Portuguese BERT checkpoint used for the classifier.
TOKENIZER_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Two-class sequence classifier on top of the base Portuguese BERT checkpoint.
# return_dict=False makes the forward pass return a plain tuple, which is why
# predict_fn reads the logits at index 0.
MODEL_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
    return_dict=False,
)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Overwrite the model's parameters (including the freshly initialised
# classification head) with the state dict stored on disk, mapping all tensors
# to CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports
# it -- TODO confirm.
model.load_state_dict(torch.load("../models/bertimbau-base/my_model", map_location=torch.device('cpu')))

<All keys matched successfully>

In [9]:
def predict_fn(text, batch_size=None):
    """Return class probabilities for a collection of texts.

    This is the prediction callable handed to the SHAP explainer.

    Parameters
    ----------
    text : sequence of str
        Texts to score; must support ``len`` and iteration.
    batch_size : int, optional
        Number of texts per forward pass. ``None`` (the default) scores all
        texts in a single batch, which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), n_classes)`` holding the softmax of the
        model logits, rows in input order. An empty input yields an array of
        shape ``(0, model.config.num_labels)``.
    """
    n_texts = len(text)
    if n_texts == 0:
        # DataLoader rejects batch_size=0 and torch.cat rejects an empty list,
        # so answer the empty case directly.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    model.eval()
    # Tokenize the messages (fixed length of 512 tokens).
    padded_tokens, attention_masks = tokenize_corpus(text, tokenizer, 512)
    tensor_df = TensorDataset(padded_tokens, attention_masks)

    test_dataloader = DataLoader(tensor_df,
                                 batch_size=batch_size or n_texts,
                                 shuffle=False)

    # Collect the logits of every batch; keeping only the last batch would
    # silently drop rows whenever more than one batch is produced.
    batch_logits = []
    # Gradients are only needed for training, not for inference.
    with torch.no_grad():
        for input_ids, input_mask in test_dataloader:
            outputs = model(input_ids=input_ids.cpu(),
                            attention_mask=input_mask.cpu())
            # The model returns a tuple (return_dict=False); logits come first.
            batch_logits.append(outputs[0].detach().cpu())

    probabilities = F.softmax(torch.cat(batch_logits, dim=0), dim=-1)
    return probabilities.numpy()

In [10]:
# Positional indices (into y_train) of the first 350 training rows whose label is 1.
N_INSTANCES = 350
positive_positions = np.where(y_train == 1)[0]
instances = positive_positions[:N_INSTANCES]

In [11]:
import shap

# Names for the two output columns of predict_fn, in class-index order.
class_names = ["Non-hate", "Hate"]

# The text masker is given the regex r"\W+" as its tokenizer.
masker = shap.maskers.Text(tokenizer=r"\W+")
explainer = shap.Explainer(
    predict_fn,
    masker=masker,
    output_names=class_names,
)

# Explain the selected training texts. This is the expensive step: the recorded
# run below took roughly four hours.
texts_to_explain = x_train.iloc[instances].tolist()
shap_values = explainer(texts_to_explain)

2024-09-14 23:44:00.437392: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 23:44:00.437576: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 23:44:00.592274: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


  0%|          | 0/306 [00:00<?, ?it/s]

PartitionExplainer explainer:   1%|          | 1/100 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:   4%|▍         | 4/100 [13:51<4:22:06, 163.81s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:   5%|▌         | 5/100 [14:39<3:10:33, 120.35s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:   6%|▌         | 6/100 [22:30<6:19:59, 242.55s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:   9%|▉         | 9/100 [24:33<2:24:10, 95.06s/it] 

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  10%|█         | 10/100 [27:20<2:56:05, 117.40s/it]

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  11%|█         | 11/100 [27:28<2:04:25, 83.88s/it] 

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  12%|█▏        | 12/100 [36:51<5:36:58, 229.76s/it]

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  13%|█▎        | 13/100 [39:42<5:07:24, 212.00s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  14%|█▍        | 14/100 [41:40<4:22:56, 183.45s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  15%|█▌        | 15/100 [41:52<3:06:41, 131.78s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  16%|█▌        | 16/100 [44:24<3:13:20, 138.10s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  17%|█▋        | 17/100 [53:20<5:56:33, 257.75s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  18%|█▊        | 18/100 [53:42<4:15:05, 186.66s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  19%|█▉        | 19/100 [56:00<3:52:18, 172.08s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  20%|██        | 20/100 [56:21<2:49:11, 126.89s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  21%|██        | 21/100 [57:00<2:12:18, 100.48s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  22%|██▏       | 22/100 [1:06:19<5:09:30, 238.08s/it]

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  23%|██▎       | 23/100 [1:08:00<4:12:46, 196.96s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  24%|██▍       | 24/100 [1:08:48<3:12:50, 152.24s/it]

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  25%|██▌       | 25/100 [1:08:56<2:16:02, 108.84s/it]

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  26%|██▌       | 26/100 [1:10:37<2:11:17, 106.46s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  27%|██▋       | 27/100 [1:12:53<2:20:31, 115.50s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  28%|██▊       | 28/100 [1:13:23<1:47:48, 89.85s/it] 

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  29%|██▉       | 29/100 [1:15:03<1:49:56, 92.91s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  30%|███       | 30/100 [1:18:10<2:21:21, 121.16s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  31%|███       | 31/100 [1:27:01<4:40:43, 244.10s/it]

  0%|          | 0/72 [00:00<?, ?it/s]

PartitionExplainer explainer:  32%|███▏      | 32/100 [1:28:06<3:35:44, 190.37s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  33%|███▎      | 33/100 [1:28:19<2:32:55, 136.95s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  34%|███▍      | 34/100 [1:37:23<4:45:02, 259.12s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  35%|███▌      | 35/100 [1:42:14<4:51:15, 268.85s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  36%|███▌      | 36/100 [1:42:52<3:32:46, 199.48s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  37%|███▋      | 37/100 [1:43:38<2:41:15, 153.57s/it]

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  39%|███▉      | 39/100 [1:46:31<1:53:45, 111.89s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  40%|████      | 40/100 [1:47:53<1:42:46, 102.77s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  41%|████      | 41/100 [1:48:14<1:16:56, 78.25s/it] 

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  42%|████▏     | 42/100 [1:48:21<55:08, 57.04s/it]  

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  43%|████▎     | 43/100 [1:53:11<2:00:23, 126.73s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  44%|████▍     | 44/100 [1:53:23<1:26:15, 92.41s/it] 

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  45%|████▌     | 45/100 [1:57:07<2:00:45, 131.73s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  46%|████▌     | 46/100 [2:04:54<3:29:05, 232.32s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  47%|████▋     | 47/100 [2:09:50<3:42:18, 251.68s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  48%|████▊     | 48/100 [2:10:37<2:44:54, 190.28s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  49%|████▉     | 49/100 [2:11:08<2:00:52, 142.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  50%|█████     | 50/100 [2:11:29<1:28:15, 105.91s/it]

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  52%|█████▏    | 52/100 [2:15:14<1:19:39, 99.58s/it] 

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  53%|█████▎    | 53/100 [2:17:29<1:26:24, 110.30s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  54%|█████▍    | 54/100 [2:18:16<1:10:00, 91.31s/it] 

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  55%|█████▌    | 55/100 [2:26:04<2:33:11, 204.26s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  56%|█████▌    | 56/100 [2:34:50<3:40:30, 300.70s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  57%|█████▋    | 57/100 [2:37:07<3:00:21, 251.66s/it]

  0%|          | 0/380 [00:00<?, ?it/s]

PartitionExplainer explainer:  58%|█████▊    | 58/100 [2:42:32<3:11:33, 273.67s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  59%|█████▉    | 59/100 [2:45:39<2:49:17, 247.75s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  60%|██████    | 60/100 [2:46:27<2:05:04, 187.62s/it]

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  61%|██████    | 61/100 [2:50:11<2:09:04, 198.57s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  62%|██████▏   | 62/100 [2:52:08<1:50:15, 174.09s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  63%|██████▎   | 63/100 [2:53:28<1:30:01, 145.99s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  64%|██████▍   | 64/100 [2:53:49<1:05:07, 108.55s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  65%|██████▌   | 65/100 [2:54:01<46:22, 79.51s/it]   

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  66%|██████▌   | 66/100 [2:54:22<35:08, 62.01s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  67%|██████▋   | 67/100 [2:57:26<54:12, 98.55s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  68%|██████▊   | 68/100 [2:57:38<38:39, 72.49s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  70%|███████   | 70/100 [2:58:19<22:12, 44.40s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  71%|███████   | 71/100 [2:59:39<26:42, 55.25s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  72%|███████▏  | 72/100 [3:06:46<1:17:44, 166.58s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  73%|███████▎  | 73/100 [3:06:58<54:08, 120.33s/it]  

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  74%|███████▍  | 74/100 [3:07:18<39:09, 90.36s/it] 

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  75%|███████▌  | 75/100 [3:07:40<29:00, 69.60s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  76%|███████▌  | 76/100 [3:08:00<21:59, 55.00s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  77%|███████▋  | 77/100 [3:16:19<1:12:03, 187.98s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  79%|███████▉  | 79/100 [3:16:35<33:31, 95.78s/it] 

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  81%|████████  | 81/100 [3:16:46<15:41, 49.56s/it]

  0%|          | 0/110 [00:01<?, ?it/s]

PartitionExplainer explainer:  82%|████████▏ | 82/100 [3:18:25<19:18, 64.35s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  84%|████████▍ | 84/100 [3:18:49<09:51, 36.97s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  85%|████████▌ | 85/100 [3:19:26<09:16, 37.11s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  86%|████████▌ | 86/100 [3:19:38<06:51, 29.42s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  87%|████████▋ | 87/100 [3:28:26<38:47, 179.05s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  88%|████████▊ | 88/100 [3:29:13<27:52, 139.39s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  89%|████████▉ | 89/100 [3:29:52<20:02, 109.30s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  90%|█████████ | 90/100 [3:31:48<18:33, 111.30s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  91%|█████████ | 91/100 [3:34:02<17:43, 118.22s/it]

  0%|          | 0/72 [00:00<?, ?it/s]

PartitionExplainer explainer:  92%|█████████▏| 92/100 [3:35:07<13:37, 102.17s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  93%|█████████▎| 93/100 [3:39:59<18:33, 159.03s/it]

  0%|          | 0/182 [00:00<?, ?it/s]

PartitionExplainer explainer:  94%|█████████▍| 94/100 [3:42:28<15:36, 156.08s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  95%|█████████▌| 95/100 [3:51:17<22:20, 268.09s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  97%|█████████▋| 97/100 [3:51:50<06:55, 138.51s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  98%|█████████▊| 98/100 [3:58:52<07:27, 223.63s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  99%|█████████▉| 99/100 [3:59:13<02:42, 162.79s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer: 100%|██████████| 100/100 [3:59:25<00:00, 117.39s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer: 101it [3:59:45, 143.86s/it]


In [12]:
# Write one line per explained instance: a bracketed, comma-separated list of
# (token, SHAP value) tuples for the "Hate" class (class_names[1]).
hate_label = class_names[1]

# Explicit UTF-8 so accented Portuguese tokens are written the same way on
# every platform, regardless of the locale's default encoding.
with open('../results/results_shap_bertimbau_0_a_350.txt', 'w', encoding='utf-8') as f:
    for i in range(len(instances)):
        # Slice the explanation once per instance instead of once per token.
        explanation = shap_values[i, :, hate_label]

        # Convert NumPy scalars to native str/float so the tuple repr stays
        # "('token', 0.123)" instead of "(np.str_('token'), np.float64(0.123))"
        # under NumPy >= 2.
        pairs = [
            str((str(word), float(score)))
            for word, score in zip(explanation.data, explanation.values)
        ]
        f.write('[' + ', '.join(pairs) + ']\n')