In [None]:
import pandas as pd
import numpy as np
import math
import random 
import time
import string
import glob

def clean_text(text):
    """Replace double-encoding artifacts in ``text`` with the intended letters.

    Each garbled two-character sequence listed below (an ``Ã`` followed by
    a second character) is swapped for the accented letter paired with it.
    Text that contains none of these sequences is returned unchanged.

    Args:
        text: The string to repair.

    Returns:
        The string with every listed sequence replaced.
    """
    # (garbled sequence, intended character) pairs, applied in this order.
    mojibake_fixes = (
        ('Ã¡', 'á'), ('Ã©', 'é'), ('Ã\xad', 'í'), ('Ã³', 'ó'), ('Ãº', 'ú'),
        ('Ã£', 'ã'), ('Ãµ', 'õ'), ('Ã¢', 'â'), ('Ãª', 'ê'), ('Ã´', 'ô'),
        ('Ã§', 'ç'), ('Ã ', 'à'), ('Ãš', 'Ú'), ('Ã\x81', 'Á'), ('Ã‰', 'É'),
        ('Ã\x8d', 'Í'), ('Ã“', 'Ó'),
    )
    for garbled, intended in mojibake_fixes:
        text = text.replace(garbled, intended)
    return text

def read_files(file, method):
    """Parse one explanation file and append its ranked tokens to ``pred[method]``.

    Every non-blank line of ``file`` is evaluated as a Python expression
    yielding a sequence of items whose element 0 is a token string and
    whose element 1 is a numeric weight. For each line, the items are
    ordered by weight (highest first), items with a weight of zero or less
    are dropped, spaces are removed from the remaining tokens, and the
    resulting list is appended to the module-level ``pred[method]``.

    Args:
        file: Path of the file to read.
        method: Key of the ``pred`` entry to extend; it must already exist.
    """
    with open(file, 'r', encoding="latin1") as handle:
        for raw_line in handle.readlines():
            # Ignore lines that are empty or contain only whitespace.
            if not raw_line.strip():
                continue

            # The file is opened as latin1, so round-trip each line back to
            # bytes and decode it as UTF-8, then patch any leftover
            # double-encoding artifacts.
            decoded = raw_line.encode('latin1').decode('utf-8', errors='replace')
            fixed_line = clean_text(decoded)

            # NOTE(review): eval() executes whatever the file contains; only
            # run this on files you produced yourself. If the lines are plain
            # literals, ast.literal_eval would be a safer parser -- confirm
            # against the actual file contents before switching.
            ranked = sorted(eval(fixed_line), key=lambda item: item[1], reverse=True)

            pred[method].append(
                [item[0].replace(' ', '') for item in ranked if item[1] > 0]
            )
                
# Ranked explanation tokens per method, filled in by read_files():
# pred[method] receives one token list per non-blank line read.
pred = {}

for method in ['lime', 'shap']:
    pred[method] = []
    paths = glob.glob('../results/bertimbau_' + method + '*')
    print(paths)

    # Read the files in sorted-name order so the append order is stable.
    for filename in sorted(paths):
        read_files(filename, method)

In [None]:
def _split_list_repr(cell):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b']."""
    return cell.strip('[]').replace("'", "").split(", ")


# Load the dataset; the two rationale columns are stored as stringified
# lists and are converted to Python lists of strings while reading.
df = pd.read_csv(
    '../data/hatebr_and_rationales.csv',
    index_col=0,
    converters={
        "rationales_offensive_1_normalized": _split_list_repr,
        "rationales_offensive_2_normalized": _split_list_repr,
    },
)


In [None]:
## Split into train, test and validation subsets
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1
from sklearn.model_selection import train_test_split

# Step 1: set aside the combined test + validation share of the data.
holdout_share = TEST_SIZE + VAL_SIZE
x_train, x_test_val, y_train, y_test_val = train_test_split(
    df['normalized_text'],
    df['label final'],
    test_size=holdout_share,
    random_state=0,
)

# Step 2: divide the held-out share between test and validation.
x_test, x_val, y_test, y_val = train_test_split(
    x_test_val,
    y_test_val,
    test_size=VAL_SIZE / holdout_share,
    random_state=0,
)


In [None]:
def get_x_rationales(method):
    """Build one text per instance made only of its rationale tokens.

    Args:
        method: Key into the module-level ``pred`` mapping.

    Returns:
        A list with one string per entry of ``pred[method]``: that entry's
        tokens joined by single spaces.
    """
    return [' '.join(tokens) for tokens in pred[method]]

def get_x():
    """Return the ``normalized_text`` strings of the selected training rows.

    Rows are picked from ``x_train`` by position using the module-level
    ``instances``, joined against ``df`` on ``normalized_text``, and the
    ``normalized_text`` column of the joined frame is returned as a list.

    NOTE(review): ``instances`` is not defined in the cells visible here --
    confirm it is set before this function is called.
    """
    selected = x_train.iloc[instances].to_frame()
    lookup = df.set_index('normalized_text')
    joined = selected.join(lookup, on='normalized_text')
    return joined['normalized_text'].tolist()

def remove_rationales(method):
    """Return the instances from ``get_x()`` with their top rationale tokens removed.

    Punctuation is stripped from every instance first. ``n`` is the mean
    number of tokens per entry of ``pred[method]``, rounded up. For each
    (token list, instance) pair, the first ``n`` tokens are removed:

    * a token that equals one of the space-separated words of the instance
      removes the first such word;
    * a token that matches no whole word is instead deleted as a substring,
      everywhere it occurs in the re-joined text.

    Pairs are formed with ``zip``, so the result has as many entries as the
    shorter of ``pred[method]`` and ``get_x()``.

    Args:
        method: Key into the module-level ``pred`` mapping.

    Returns:
        A list of strings, one per paired instance. Empty when
        ``pred[method]`` is empty.
    """
    # Build the punctuation-stripping table once, not once per instance.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped_instances = [text.translate(punctuation_table) for text in get_x()]

    # np.mean of an empty list is NaN and math.ceil(NaN) raises, so fall
    # back to 0 when there are no rationales at all.
    rationale_lengths = [len(tokens) for tokens in pred[method]]
    n = math.ceil(np.mean(rationale_lengths)) if rationale_lengths else 0

    new_X = []
    for tokens, text in zip(pred[method], stripped_instances):
        words = text.split(' ')

        # Tokens that are not a whole word of this instance are handled
        # by substring replacement after the words are re-joined.
        unmatched = []
        for token in tokens[:n]:
            try:
                words.remove(token)
            except ValueError:
                # list.remove raises ValueError when the token is absent.
                unmatched.append(token)

        remaining = ' '.join(words)
        for token in unmatched:
            remaining = remaining.replace(token, '')

        new_X.append(remaining)

    return new_X

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForPreTraining
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler
import time, datetime
import torch.nn.functional as F

In [None]:
# Functions that prepare the dataset as input for the model

def tokenize_corpus(df, tokenizer, max_len):
    """Encode every document in ``df`` and stack the results.

    Args:
        df: Iterable of documents to encode, one per row of the output.
        tokenizer: Object exposing ``encode_plus``; called once per document.
        max_len: Value passed as ``max_length``; documents are truncated
            and padded to it.

    Returns:
        A pair ``(input_ids, attention_masks)``: the per-document
        ``'input_ids'`` and ``'attention_mask'`` tensors, each concatenated
        along dimension 0.
    """
    encodings = [
        tokenizer.encode_plus(
            doc,                         # document to encode
            add_special_tokens=True,     # add the '[CLS]' start and '[SEP]' end tokens
            max_length=max_len,          # maximum sequence length
            truncation=True,             # truncate long messages
            padding='max_length',        # pad shorter sequences up to the maximum length
            return_attention_mask=True,  # also build the attention mask
            return_tensors='pt',         # return PyTorch tensors
        )
        for doc in df
    ]

    all_input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    # The attention mask tells padding apart from real tokens.
    all_attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return all_input_ids, all_attention_masks
     
def prepare_dataset(features, labels, max_len=512):
    """Tokenize ``features`` and bundle them with ``labels`` in a TensorDataset.

    Uses the module-level ``tokenizer``.

    Args:
        features: Object whose ``.values`` holds the documents to tokenize
            (for example a pandas Series of strings).
        labels: Object whose ``.values`` holds the integer class labels,
            one per document.
        max_len: Sequence length passed to ``tokenize_corpus``. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask, target)``,
        where ``target`` is an int64 tensor reshaped to one column.
    """
    # Tokenize the messages.
    padded_tokens, attention_masks = tokenize_corpus(features.values, tokenizer, max_len)
    # Turn the labels into a single-column int64 array.
    target = np.array(labels.values, dtype=np.int64).reshape(-1, 1)
    return TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

In [None]:
# Tokenizer of the 'neuralmind/bert-base-portuguese-cased' checkpoint;
# prepare_dataset() and predict_fn() read it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased'
)

In [None]:
# Two-label sequence classifier built on the same checkpoint as the
# tokenizer. With return_dict=False the forward pass returns a tuple,
# which is why predict_fn() reads the logits from index 0.
model = AutoModelForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    num_labels=2,
    return_dict=False,
)

In [None]:
# Overwrite the model's weights with the saved ones, mapped onto the CPU.
# NOTE(review): torch.load unpickles the file by default; only load
# checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        "../models/bertimbau-base/my_model",
        map_location=torch.device('cpu'),
    )
)

In [None]:
def predict_fn(text, batch_size=None):
    """Return the model's class probabilities for each document in ``text``.

    Uses the module-level ``model`` and ``tokenizer``. The documents are
    tokenized to length 512 and run through the model on the CPU without
    gradient tracking. A softmax over the last dimension is applied to
    element 0 of the model output.

    Args:
        text: Sized sequence of documents (``len(text)`` is used).
        batch_size: Number of documents per forward pass. ``None`` (the
            default) keeps the previous behaviour of one batch holding the
            whole input; pass a smaller value to bound memory use on large
            inputs.

    Returns:
        A numpy array with one row of probabilities per document, in input
        order. The values are already probabilities -- do not apply softmax
        to them again.
    """
    model.eval()
    # Tokenize the messages.
    padded_tokens, attention_masks = tokenize_corpus(text, tokenizer, 512)
    dataset = TensorDataset(padded_tokens, attention_masks)

    if batch_size is None:
        batch_size = len(text)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Collect every batch's probabilities so that no batch is dropped when
    # the input is processed in more than one batch.
    batch_probabilities = []
    for input_ids, input_mask in loader:
        # Gradients are only needed for training, not for inference.
        with torch.no_grad():
            outputs = model(input_ids=input_ids.cpu(),
                            attention_mask=input_mask.cpu())
        batch_probabilities.append(F.softmax(outputs[0].detach().cpu(), dim=-1))

    torch.cuda.empty_cache()
    return torch.cat(batch_probabilities, dim=0).numpy()

In [None]:
# Probabilities for the unmodified instances: the baseline for both metrics.
X = get_x()
pred_X = predict_fn(X)

for method in ['lime', 'shap']:
    # Instances with their top rationale tokens removed.
    X_ = remove_rationales(method)
    pred_X_ = predict_fn(X_)

    # Instances reduced to their rationale tokens only.
    x_rationales = get_x_rationales(method)
    pred_x_rationales = predict_fn(x_rationales)

    # Comprehensiveness: change in the index-1 probability once the
    # rationale tokens are removed from the instance.
    comprehensiveness = [
        full[1] - ablated[1] for full, ablated in zip(pred_X, pred_X_)
    ]
    print("Comprehensiveness method {} value {}".format(method, np.mean(comprehensiveness)))

    # Sufficiency: change in the index-1 probability when only the
    # rationale tokens are kept.
    sufficiency = [
        full[1] - kept[1] for full, kept in zip(pred_X, pred_x_rationales)
    ]
    print("Sufficiency method {} value {}".format(method, np.mean(sufficiency)))


Get the predicted probabilities for the test dataset, to be used for computing the ROC curve.

In [None]:
# Collect the test-split texts in the same way get_x() does for training rows.
df_instances = x_test.to_frame().join(df.set_index('normalized_text'), on='normalized_text')
X = df_instances['normalized_text'].tolist()

# predict_fn already returns softmax probabilities. Applying softmax to them
# a second time would squash every score into roughly [0.27, 0.73], so the
# values are used as returned.
probs = predict_fn(X)

# Probability assigned to the class at index 1, one value per test instance.
y_pred = probs[:, 1]

np.savetxt('../predictions/bertimbau_pred.out', y_pred, delimiter=',')