In [1]:
# Environment setup: install the `transformers` package and git-lfs, then clone
# the neuralmind/bert-large-portuguese-cased repository from the Hugging Face Hub.
# NOTE(review): the tokenizer/model cells below load
# 'neuralmind/bert-base-portuguese-cased' by name, not this local *large* clone —
# confirm the clone is still needed.
# NOTE(review): `lime` is imported further down but is not installed here.
!pip install transformers
!sudo apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/neuralmind/bert-large-portuguese-cased




git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'bert-large-portuguese-cased'...
remote: Enumerating objects: 36, done.[K
remote: Total 36 (delta 0), reused 0 (delta 0), pack-reused 36 (from 1)[K
Unpacking objects: 100% (36/36), 102.71 KiB | 1.24 MiB/s, done.
Filtering content: 100% (2/2), 2.49 GiB | 59.25 MiB/s, done.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForPreTraining
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler
import time, datetime
import torch.nn.functional as F

In [3]:
# Load the dataset CSV; the split cell below reads its 'normalized_text' (input text)
# and 'label final' (target) columns.
df = pd.read_csv("../data/hatebr_and_rationales.csv")

In [4]:
# Target proportions for the train / test / validation partitions.
TRAIN_SIZE, TEST_SIZE, VAL_SIZE = 0.8, 0.1, 0.1

# Stage 1: separate the training data from the combined test+validation hold-out.
x_train, x_test_val, y_train, y_test_val = train_test_split(
    df['normalized_text'],
    df['label final'],
    test_size=TEST_SIZE + VAL_SIZE,
    random_state=0,
)

# Stage 2: divide the hold-out into test and validation. The fraction is expressed
# relative to the hold-out, so validation takes VAL_SIZE / (TEST_SIZE + VAL_SIZE) of it.
x_test, x_val, y_test, y_val = train_test_split(
    x_test_val,
    y_test_val,
    test_size=VAL_SIZE / (TEST_SIZE + VAL_SIZE),
    random_state=0,
)

In [5]:
# Helper functions that prepare the dataset to be fed into the model

def tokenize_corpus(df, tokenizer, max_len):
    """Tokenize a collection of documents into fixed-length model inputs.

    Args:
        df: Iterable of text documents. Despite the name it is not a DataFrame:
            callers in this notebook pass a NumPy array or a list of strings.
        tokenizer: Hugging Face tokenizer; it is called once on the whole batch.
        max_len: Length every document is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each a tensor of shape
        ``(n_documents, max_len)``. The attention mask tells real tokens apart
        from padding. An empty corpus yields two empty ``(0, max_len)`` long
        tensors instead of raising.
    """
    docs = list(df)

    # Nothing to encode: return correctly shaped empty tensors rather than
    # asking the tokenizer (or torch.cat) to build a batch out of nothing.
    if not docs:
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        docs,
        add_special_tokens=True,       # wrap each document in [CLS] ... [SEP]
        max_length=max_len,            # maximum sequence length
        truncation=True,               # cut documents longer than max_len
        padding='max_length',          # pad shorter documents up to max_len
        return_attention_mask=True,    # mask separating tokens from padding
        return_tensors='pt',           # PyTorch tensors
    )

    return encoded['input_ids'], encoded['attention_mask']
     
def prepare_dataset(features, labels, max_len=512):
    """Build a TensorDataset of token ids, attention masks and labels.

    Relies on the notebook-level ``tokenizer`` and on ``tokenize_corpus``.

    Args:
        features: Object exposing ``.values`` with the raw texts (e.g. a pandas Series).
        labels: Object exposing ``.values`` with integer class labels, aligned
            with ``features``.
        max_len: Sequence length used for truncation/padding. Defaults to 512,
            the value that used to be hard-coded here.

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``;
        the labels are stored as an int64 column of shape ``(n, 1)``.
    """
    padded_tokens, attention_masks = tokenize_corpus(features.values, tokenizer, max_len)

    # Column vector of int64 labels so it lines up row-by-row with the token tensors.
    target = np.array(labels.values, dtype=np.int64).reshape(-1, 1)

    return TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

In [6]:
# Load the tokenizer for the 'neuralmind/bert-base-portuguese-cased' checkpoint,
# the same checkpoint name the classifier cell below is built from.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Build a 2-label sequence-classification model on top of the base checkpoint.
# return_dict=False is why predict_fn indexes the forward output with [0] to get the logits.
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased',
                                            num_labels=2, return_dict=False)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Restore the fine-tuned weights, mapping every tensor to the CPU so no GPU is required.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load("../models/bertimbau-base/my_model", map_location=torch.device('cpu')))

<All keys matched successfully>

In [9]:
from sklearn import metrics
def predict_fn(text, batch_size=32):
    """Return class probabilities for raw texts (the classifier function given to LIME).

    Uses the notebook-level ``model``, ``tokenizer`` and ``tokenize_corpus``.

    Args:
        text: Sequence of strings to classify. A single string is treated as a
            one-element batch instead of being iterated character by character.
        batch_size: Number of texts sent through the model per forward pass.
            Results from all batches are concatenated, so this only bounds
            memory use.

    Returns:
        NumPy array of shape ``(len(text), n_classes)`` holding softmax
        probabilities, with rows in the same order as ``text``.
    """
    if isinstance(text, str):
        text = [text]

    model.eval()
    # Run on whatever device the model lives on instead of assuming the CPU.
    device = next(model.parameters()).device

    # NOTE(review): every text is padded to 512 tokens, so the inference cost
    # does not shrink for short texts.
    padded_tokens, attention_masks = tokenize_corpus(text, tokenizer, 512)
    loader = DataLoader(TensorDataset(padded_tokens, attention_masks),
                        batch_size=batch_size,
                        shuffle=False)  # keep output rows aligned with the input order

    batch_logits = []
    # Inference only: gradients are needed for training, not for prediction.
    with torch.no_grad():
        for input_ids, input_mask in loader:
            outputs = model(input_ids=input_ids.to(device),
                            attention_mask=input_mask.to(device))
            # The model returns a tuple; its first element holds the logits.
            batch_logits.append(outputs[0].detach().cpu())

    probabilities = F.softmax(torch.cat(batch_logits, dim=0), dim=-1)

    if device.type == 'cuda':
        torch.cuda.empty_cache()
    return probabilities.numpy()

In [10]:
# Positional indices (within the training split) of the first 350 examples whose
# label equals 1; the LIME cell below looks them up with x_train.iloc and names
# class 1 "Hate".
instances = np.where(y_train == 1)[0][0:350]

In [11]:
from lime.lime_text import LimeTextExplainer

# Explanation settings, named so the cost/quality trade-off is visible in one place.
LIME_NUM_FEATURES = 10  # features reported per explanation
LIME_NUM_SAMPLES = 200  # perturbed texts scored by predict_fn per explanation
LIME_SEED = 0           # LIME samples perturbations at random; seed it for reproducible runs

class_names = ["Non-hate", "Hate"]
explainer = LimeTextExplainer(class_names=class_names, random_state=LIME_SEED)

# One explanation per selected training instance, in the order given by `instances`.
explainers = []
for i in instances:
    t0 = time.time()
    exp = explainer.explain_instance(
        x_train.iloc[i],
        predict_fn,
        num_features=LIME_NUM_FEATURES,
        num_samples=LIME_NUM_SAMPLES,
        labels=(1,),  # explain only class index 1 ("Hate")
    )
    explainers.append(exp)
    print(time.time() - t0)  # seconds spent on this instance

260.1972904205322
262.2761564254761
287.9599552154541
255.44984436035156
289.2286500930786
257.30707120895386
257.8127067089081
296.07847929000854
259.18457531929016
292.3346793651581
259.9335811138153
286.3807294368744
256.8715851306915
257.4681746959686
292.27583837509155
266.8023862838745
289.581839799881
257.62728214263916
287.41899251937866
266.81165170669556
261.3011283874512
295.53925347328186
263.5869584083557
294.03266763687134
267.79990696907043
279.9943890571594
268.174352645874
259.13414001464844
288.17861580848694
258.27813601493835
305.064284324646
258.1805930137634
260.09206199645996
294.17777037620544
258.07041907310486
293.87223076820374
267.97010111808777
289.34834265708923
258.31370735168457
264.26147532463074
290.6542546749115
259.8263702392578
302.8007516860962
264.2261731624603
296.9959237575531
269.63784885406494
262.93362379074097
289.81308102607727
264.61209201812744
298.3873550891876
257.1751458644867
286.8626437187195
259.33730030059814
256.51754212379456
293

In [12]:
# Persist the explanations, one per line, as the text form of each `as_list()` result.
# The explicit UTF-8 encoding keeps the output from depending on the platform's
# default locale encoding (the explained texts are Portuguese and may contain accents).
with open('../results/results_lime_bertimbau_0_a_350.txt', 'w', encoding='utf-8') as f:
    for exp in explainers:
        f.write(str(exp.as_list()) + '\n')
    