In [1]:
!pip install transformers datasets
!pip install pytorch_lightning



In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForSequenceClassification,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
     

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Location of the CSV holding the texts and labels (read relative to the notebook's working directory).
DATASET_CSV = "../data/hatebr_and_rationales.csv"

# Full dataset; the 'normalized_text' and 'label final' columns are used for the split below.
df = pd.read_csv(DATASET_CSV)

In [4]:
# Partition the data into train / test / validation subsets (80% / 10% / 10%).
from sklearn.model_selection import train_test_split

TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1

# First cut: separate the training rows from the combined test+validation hold-out.
holdout_fraction = TEST_SIZE + VAL_SIZE
x_train, x_test_val, y_train, y_test_val = train_test_split(
    df['normalized_text'],
    df['label final'],
    test_size=holdout_fraction,
    random_state=0,
)

# Second cut: divide the hold-out into test and validation parts.
# `test_size` here is the share of the hold-out that goes to the validation outputs.
val_share_of_holdout = VAL_SIZE / holdout_fraction
x_test, x_val, y_test, y_val = train_test_split(
    x_test_val,
    y_test_val,
    test_size=val_share_of_holdout,
    random_state=0,
)

In [5]:
from torch.utils.data import TensorDataset
import torch
import numpy as np
from transformers import T5Tokenizer

def tokenize_corpus(df, tokenizer, max_len):
    """Encode every document of a corpus and stack the results.

    Args:
        df: Iterable of documents; each item is passed on its own to
            ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Length every document is truncated / padded to.

    Returns:
        A pair ``(input_ids, attention_masks)``: the per-document
        ``'input_ids'`` and ``'attention_mask'`` tensors, each concatenated
        along dimension 0 in input order.
    """

    def _encode(document):
        # One document at a time, padded to exactly `max_len` and returned as PyTorch tensors.
        return tokenizer.encode_plus(
            document,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(document) for document in df]

    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks

def prepare_dataset(features, labels, tokenizer):
    """Build a ``TensorDataset`` of token ids, attention masks and labels.

    Args:
        features: Object whose ``.values`` holds the input texts
            (presumably a pandas Series - confirm against callers).
        labels: Object whose ``.values`` holds the targets; they are cast
            to ``int64``.
        tokenizer: Tokenizer forwarded to ``tokenize_corpus``.

    Returns:
        ``TensorDataset(token_ids, attention_masks, targets)`` where
        ``targets`` is a column tensor with one row per label.
    """
    # Tokenize the messages, padding/truncating each one to 512 tokens.
    token_ids, masks = tokenize_corpus(features.values, tokenizer, 512)

    # Turn the targets into an int64 column vector and wrap it as a tensor.
    target_column = np.array(labels.values, dtype=np.int64).reshape(-1, 1)
    target_tensor = torch.from_numpy(target_column)

    return TensorDataset(token_ids, masks, target_tensor)


In [6]:
from transformers import T5ForSequenceClassification, T5Tokenizer

# Hugging Face model identifier (a local directory path is accepted as well)
model_name_or_path = 'unicamp-dl/ptt5-base-portuguese-vocab'  # PTT5 base checkpoint; any other T5 identifier (e.g. 't5-small') could be used here

# Load the tokenizer that matches the checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

# Load the sequence-classification model. The loader reports the classification
# head weights as newly initialised for this checkpoint; the fine-tuned weights
# are loaded over the model in a later cell.
model = T5ForSequenceClassification.from_pretrained(model_name_or_path)

spiece.model:   0%|          | 0.00/756k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at unicamp-dl/ptt5-base-portuguese-vocab and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Load the fine-tuned checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load('../models/ptt5-fine-tuned/best-checkpoint.ckpt', map_location=torch.device('cpu'))

# The weights are stored under the 'state_dict' entry of the checkpoint.
state_dict = checkpoint['state_dict']

# The stored keys carry a leading 'model.' (presumably from the training wrapper
# that held the network in an attribute named `model` - confirm against the
# training code). Strip only that leading prefix: str.replace would also remove
# any later 'model.' inside a parameter name and silently corrupt the key.
WRAPPER_PREFIX = 'model.'
new_state_dict = {
    (k[len(WRAPPER_PREFIX):] if k.startswith(WRAPPER_PREFIX) else k): v
    for k, v in state_dict.items()
}

# Load the renamed weights into the model (strict by default, so any key mismatch raises).
model.load_state_dict(new_state_dict)

  checkpoint = torch.load('/kaggle/input/ptt5-fine-tuned/best-checkpoint.ckpt', map_location=torch.device('cpu'))


<All keys matched successfully>

In [8]:
import torch.nn.functional as F

def predict_fn(text, batch_size=32):
    """Return class probabilities for a collection of texts.

    This is the prediction callable handed to the explainer: it takes an
    iterable of strings and returns one row of probabilities per string.

    Args:
        text: Iterable of input strings (list or array).
        batch_size: Maximum number of texts sent through the model at once.
            Inputs no longer than this are processed in a single forward
            pass, exactly as before; longer inputs are split so memory use
            stays bounded.

    Returns:
        ``numpy.ndarray`` with one row per input text holding the softmax of
        the model logits. An empty input yields an array with zero rows.
    """
    model.eval()

    texts = list(text)
    if len(texts) == 0:
        # Nothing to score: torch.cat would fail on an empty list, so return an
        # empty result with the right number of columns instead.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    # Tokenize the messages, padding/truncating each one to 512 tokens.
    # NOTE(review): padding every text to 512 makes each forward pass as
    # expensive as the longest possible input; worth revisiting for speed.
    padded_tokens, attention_masks = tokenize_corpus(texts, tokenizer, 512)
    tensor_df = TensorDataset(padded_tokens, attention_masks)

    test_dataloader = DataLoader(tensor_df,
                                 batch_size=batch_size,
                                 shuffle=False)

    # Keep the probabilities of every batch; previously only the outputs of the
    # last batch would have survived the loop.
    batch_probs = []
    with torch.no_grad():
        for input_ids, attention_mask in test_dataloader:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Softmax over the last dimension turns the logits into probabilities.
            batch_probs.append(F.softmax(outputs.logits, dim=-1))

    # Return probabilities as a numpy array (LIME and SHAP expect numpy arrays)
    return torch.cat(batch_probs, dim=0).cpu().numpy()

In [9]:
# Positional indices of the training rows whose label equals 1; keep at most the first 350.
(label_one_positions,) = np.where(y_train == 1)
instances = label_one_positions[0:350]

In [10]:
import shap

# Output names, indexed by the column order of predict_fn's probabilities.
class_names = ["Non-hate", "Hate"]

# Text masker that splits the input on runs of non-word characters.
masker = shap.maskers.Text(tokenizer=r"\W+")

# Explainer built around the probability function defined above.
explainer = shap.Explainer(predict_fn, masker=masker, output_names=class_names)

# Explain the selected training texts (positional lookup via .iloc).
# This is the long-running step of the notebook.
texts_to_explain = x_train.iloc[instances].tolist()
shap_values = explainer(texts_to_explain)

  0%|          | 0/306 [00:00<?, ?it/s]

PartitionExplainer explainer:   1%|          | 1/100 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:   3%|▎         | 3/100 [37:32<20:23:35, 756.86s/it]

  0%|          | 0/2 [00:00<?, ?it/s]

PartitionExplainer explainer:   4%|▍         | 4/100 [37:43<11:53:40, 446.04s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:   5%|▌         | 5/100 [39:48<8:36:24, 326.15s/it] 

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:   6%|▌         | 6/100 [58:58<15:59:41, 612.57s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:   9%|▉         | 9/100 [1:04:23<6:08:04, 242.69s/it]

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  10%|█         | 10/100 [1:11:42<7:35:26, 303.63s/it]

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  11%|█         | 11/100 [1:12:03<5:21:19, 216.62s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  12%|█▏        | 12/100 [1:36:40<14:41:26, 600.99s/it]

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  13%|█▎        | 13/100 [1:42:59<12:53:56, 533.75s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  14%|█▍        | 14/100 [1:47:23<10:47:56, 452.05s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  15%|█▌        | 15/100 [1:47:53<7:40:11, 324.85s/it] 

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  16%|█▌        | 16/100 [1:52:58<7:26:35, 318.99s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  17%|█▋        | 17/100 [2:15:03<14:19:52, 621.60s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  18%|█▊        | 18/100 [2:15:33<10:06:28, 443.76s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  19%|█▉        | 19/100 [2:20:37<9:02:19, 401.72s/it] 

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  20%|██        | 20/100 [2:21:26<6:34:24, 295.80s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  21%|██        | 21/100 [2:22:55<5:07:36, 233.63s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  22%|██▏       | 22/100 [2:45:56<12:31:36, 578.16s/it]

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  23%|██▎       | 23/100 [2:49:39<10:05:10, 471.56s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  24%|██▍       | 24/100 [2:51:25<7:38:21, 361.86s/it] 

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  25%|██▌       | 25/100 [2:51:46<5:24:13, 259.38s/it]

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  26%|██▌       | 26/100 [2:55:28<5:06:18, 248.36s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  27%|██▋       | 27/100 [3:00:28<5:20:58, 263.81s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  28%|██▊       | 28/100 [3:01:36<4:06:01, 205.02s/it]

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  29%|██▉       | 29/100 [3:05:20<4:09:21, 210.73s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  30%|███       | 30/100 [3:12:14<5:17:04, 271.78s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  31%|███       | 31/100 [3:34:35<11:21:29, 592.60s/it]

  0%|          | 0/72 [00:00<?, ?it/s]

PartitionExplainer explainer:  32%|███▏      | 32/100 [3:37:01<8:39:44, 458.59s/it] 

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  33%|███▎      | 33/100 [3:37:31<6:08:18, 329.83s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  34%|███▍      | 34/100 [4:00:32<11:49:52, 645.35s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  35%|███▌      | 35/100 [4:11:29<11:42:41, 648.64s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  36%|███▌      | 36/100 [4:12:58<8:32:50, 480.78s/it] 

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  37%|███▋      | 37/100 [4:14:44<6:26:42, 368.30s/it]

  0%|          | 0/210 [00:00<?, ?it/s]

PartitionExplainer explainer:  39%|███▉      | 39/100 [4:21:09<4:26:34, 262.21s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  40%|████      | 40/100 [4:24:16<3:59:37, 239.62s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  41%|████      | 41/100 [4:25:04<2:59:15, 182.29s/it]

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  42%|████▏     | 42/100 [4:25:24<2:09:11, 133.64s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  43%|████▎     | 43/100 [4:36:12<4:33:36, 288.01s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  44%|████▍     | 44/100 [4:36:42<3:16:25, 210.46s/it]

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  45%|████▌     | 45/100 [4:44:55<4:30:46, 295.38s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  46%|████▌     | 46/100 [5:02:09<7:45:08, 516.82s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  47%|████▋     | 47/100 [5:12:56<8:11:07, 555.98s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  48%|████▊     | 48/100 [5:14:44<6:05:17, 421.48s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  49%|████▉     | 49/100 [5:15:52<4:28:12, 315.54s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  50%|█████     | 50/100 [5:16:41<3:16:10, 235.40s/it]

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  52%|█████▏    | 52/100 [5:25:07<2:57:57, 222.45s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  53%|█████▎    | 53/100 [5:30:09<3:13:07, 246.54s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  54%|█████▍    | 54/100 [5:31:56<2:36:44, 204.45s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  55%|█████▌    | 55/100 [5:49:07<5:39:28, 452.62s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  56%|█████▌    | 56/100 [6:10:44<8:37:40, 705.92s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  57%|█████▋    | 57/100 [6:15:43<6:58:28, 583.91s/it]

  0%|          | 0/380 [00:00<?, ?it/s]

PartitionExplainer explainer:  58%|█████▊    | 58/100 [6:27:52<7:19:03, 627.23s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  59%|█████▉    | 59/100 [6:34:53<6:26:19, 565.36s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  60%|██████    | 60/100 [6:36:41<4:45:32, 428.32s/it]

  0%|          | 0/272 [00:00<?, ?it/s]

PartitionExplainer explainer:  61%|██████    | 61/100 [6:44:57<4:51:32, 448.54s/it]

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  62%|██████▏   | 62/100 [6:49:18<4:08:21, 392.15s/it]

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  63%|██████▎   | 63/100 [6:52:22<3:23:19, 329.72s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  64%|██████▍   | 64/100 [6:53:10<2:27:12, 245.34s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  65%|██████▌   | 65/100 [6:53:40<1:45:20, 180.59s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  66%|██████▌   | 66/100 [6:54:29<1:20:00, 141.19s/it]

  0%|          | 0/240 [00:00<?, ?it/s]

PartitionExplainer explainer:  67%|██████▋   | 67/100 [7:01:25<2:03:05, 223.82s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  68%|██████▊   | 68/100 [7:01:55<1:28:15, 165.48s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  70%|███████   | 70/100 [7:03:32<51:10, 102.37s/it]  

  0%|          | 0/90 [00:00<?, ?it/s]

PartitionExplainer explainer:  71%|███████   | 71/100 [7:06:34<1:01:09, 126.52s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  72%|███████▏  | 72/100 [7:22:32<2:55:19, 375.71s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  73%|███████▎  | 73/100 [7:23:02<2:02:30, 272.24s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  74%|███████▍  | 74/100 [7:23:53<1:29:11, 205.83s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  75%|███████▌  | 75/100 [7:24:43<1:06:18, 159.14s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  76%|███████▌  | 76/100 [7:25:34<50:38, 126.59s/it]  

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  77%|███████▋  | 77/100 [7:47:58<3:08:28, 491.66s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  78%|███████▊  | 78/100 [7:48:29<2:09:36, 353.46s/it]

  0%|          | 0/2 [00:00<?, ?it/s]

PartitionExplainer explainer:  79%|███████▉  | 79/100 [7:48:40<1:27:50, 250.98s/it]

  0%|          | 0/6 [00:00<?, ?it/s]

PartitionExplainer explainer:  80%|████████  | 80/100 [7:49:00<1:00:32, 181.63s/it]

  0%|          | 0/2 [00:00<?, ?it/s]

PartitionExplainer explainer:  81%|████████  | 81/100 [7:49:11<41:18, 130.44s/it]  

  0%|          | 0/110 [00:00<?, ?it/s]

PartitionExplainer explainer:  82%|████████▏ | 82/100 [7:53:02<48:11, 160.64s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  83%|████████▎ | 83/100 [7:53:53<36:07, 127.49s/it]

  0%|          | 0/2 [00:00<?, ?it/s]

PartitionExplainer explainer:  84%|████████▍ | 84/100 [7:54:04<24:40, 92.56s/it] 

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  85%|████████▌ | 85/100 [7:55:33<22:54, 91.62s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer:  86%|████████▌ | 86/100 [7:56:04<17:06, 73.30s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  87%|████████▋ | 87/100 [8:19:32<1:42:40, 473.90s/it]

  0%|          | 0/56 [00:00<?, ?it/s]

PartitionExplainer explainer:  88%|████████▊ | 88/100 [8:21:18<1:12:41, 363.43s/it]

  0%|          | 0/42 [00:00<?, ?it/s]

PartitionExplainer explainer:  89%|████████▉ | 89/100 [8:22:45<51:25, 280.49s/it]  

  0%|          | 0/132 [00:00<?, ?it/s]

PartitionExplainer explainer:  90%|█████████ | 90/100 [8:27:05<45:44, 274.41s/it]

  0%|          | 0/156 [00:00<?, ?it/s]

PartitionExplainer explainer:  91%|█████████ | 91/100 [8:32:09<42:28, 283.14s/it]

  0%|          | 0/72 [00:00<?, ?it/s]

PartitionExplainer explainer:  92%|█████████▏| 92/100 [8:34:37<32:22, 242.77s/it]

  0%|          | 0/342 [00:00<?, ?it/s]

PartitionExplainer explainer:  93%|█████████▎| 93/100 [8:45:22<42:24, 363.51s/it]

  0%|          | 0/182 [00:00<?, ?it/s]

PartitionExplainer explainer:  94%|█████████▍| 94/100 [8:51:04<35:41, 356.95s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  95%|█████████▌| 95/100 [9:12:12<52:31, 630.21s/it]

  0%|          | 0/30 [00:00<?, ?it/s]

PartitionExplainer explainer:  97%|█████████▋| 97/100 [9:13:30<16:18, 326.04s/it]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  98%|█████████▊| 98/100 [9:29:34<17:14, 517.43s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer:  99%|█████████▉| 99/100 [9:30:23<06:17, 377.05s/it]

  0%|          | 0/12 [00:00<?, ?it/s]

PartitionExplainer explainer: 100%|██████████| 100/100 [9:30:53<00:00, 272.83s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

PartitionExplainer explainer: 101it [9:31:42, 343.03s/it]


In [11]:
# Write the attributions for the second class (class_names[1]) to a text file:
# one line per explained instance, formatted as "[(word, score), (word, score), ...]".
# The encoding is set explicitly because the tokens are Portuguese text and the
# platform default encoding is not guaranteed to represent them.
with open('../results/results_shap_ptt5_0_a_350.txt', 'w', encoding='utf-8') as f:
    for i in range(len(instances)):
        # Slice the explanation once per instance instead of once per token.
        explanation = shap_values[i, :, class_names[1]]
        pairs = ', '.join(
            str((word, score))
            for word, score in zip(explanation.data, explanation.values)
        )
        f.write('[' + pairs + ']\n')