In [1]:
!pip install transformers datasets
!pip install pytorch_lightning



In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForSequenceClassification,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
     

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the annotated corpus from CSV (the file name suggests HateBR plus
# rationales). The split cell reads its 'normalized_text' and 'label final'
# columns.
df = pd.read_csv("../data/hatebr_and_rationales.csv")

In [4]:
# Split the corpus into train / test / validation partitions.
# NOTE(review): TRAIN_SIZE is not referenced below; the train share is implied
# by holding out TEST_SIZE + VAL_SIZE in the first split.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1
from sklearn.model_selection import train_test_split

# First split: hold out (TEST_SIZE + VAL_SIZE) of the rows; the rest is the training set.
x_train, x_test_val, y_train, y_test_val = train_test_split(df['normalized_text'], df['label final'], test_size=TEST_SIZE + VAL_SIZE, random_state=0)
# Second split: divide the held-out rows. train_test_split returns the "train" part first,
# which is used here as the test set; the remaining VAL_SIZE/(TEST_SIZE + VAL_SIZE) share
# becomes the validation set.
x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=VAL_SIZE/(TEST_SIZE + VAL_SIZE), random_state=0)

In [5]:
from torch.utils.data import TensorDataset
import torch
import numpy as np
from transformers import T5Tokenizer

def tokenize_corpus(df, tokenizer, max_len):
    """Tokenize every document in an iterable and stack the results.

    Each document is encoded on its own with ``tokenizer.encode_plus``,
    requesting special tokens, truncation and padding to ``max_len``, and
    PyTorch tensors as output.

    Args:
        df: Iterable of documents to encode. Despite the name, it is only
            iterated, so a list or NumPy array of strings works.
        tokenizer: Object exposing an ``encode_plus`` method that returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Maximum (and padded) sequence length passed to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-document tensors
        concatenated along dimension 0. For an empty input, two empty
        ``(0, max_len)`` long tensors are returned instead of letting
        ``torch.cat`` fail on an empty list.
    """
    input_ids = []
    attention_masks = []

    for doc in df:
        # Tokenizing input text
        encoded_dict = tokenizer.encode_plus(
                            doc,
                            add_special_tokens=True,
                            max_length=max_len,
                            truncation=True,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt'
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Checked after the loop (rather than `if not df`) because `df` may be a
    # NumPy array, whose truth value is ambiguous.
    if not input_ids:
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

def prepare_dataset(features, labels, tokenizer, max_len=512):
    """Build a ``TensorDataset`` of tokenized texts and their labels.

    Args:
        features: Object whose ``.values`` attribute yields the raw texts
            (presumably a pandas Series - verify against the caller).
        labels: Object whose ``.values`` attribute can be cast to int64.
        tokenizer: Tokenizer forwarded to ``tokenize_corpus``.
        max_len: Sequence length used for truncation and padding. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        ``TensorDataset`` with three tensors per item: input ids, attention
        mask, and the label reshaped to a column (one label per row).
    """
    # Tokenize the input texts
    padded_tokens, attention_masks = tokenize_corpus(features.values, tokenizer, max_len)
    # Convert the targets to an int64 NumPy column vector
    target = np.array(labels.values, dtype=np.int64).reshape(-1, 1)
    tensor_df = TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

    return tensor_df


In [6]:
from transformers import T5ForSequenceClassification, T5Tokenizer

# Model to load, given as a Hugging Face identifier for a Portuguese T5
# checkpoint (presumably a local directory path would work as well).
model_name_or_path = 'unicamp-dl/ptt5-base-portuguese-vocab'  # e.g. 't5-small' for the generic English model

# Load the tokenizer that matches the checkpoint's vocabulary
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

# Load the model with a sequence-classification head. The recorded cell output
# reports that the classification_head weights are newly initialized at this point.
model = T5ForSequenceClassification.from_pretrained(model_name_or_path)

spiece.model:   0%|          | 0.00/756k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at unicamp-dl/ptt5-base-portuguese-vocab and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Load the fine-tuned checkpoint onto the CPU.
# NOTE(security): torch.load relies on pickle and can execute arbitrary code
# unless weights_only=True is used - only load checkpoints from a trusted source.
checkpoint = torch.load('../models/ptt5-fine-tuned/best-checkpoint.ckpt', map_location=torch.device('cpu'))

# Extract the state_dict from the checkpoint
state_dict = checkpoint['state_dict']

# The checkpoint keys carry a leading 'model.' (presumably the attribute under
# which the training wrapper stored the network - TODO confirm). Strip it only
# as a prefix: str.replace would also delete any later 'model.' inside a key.
_WRAPPER_PREFIX = 'model.'
new_state_dict = {
    (k[len(_WRAPPER_PREFIX):] if k.startswith(_WRAPPER_PREFIX) else k): v
    for k, v in state_dict.items()
}

# Load the state dictionary into the model (the returned report is the cell output)
model.load_state_dict(new_state_dict)

  checkpoint = torch.load('/kaggle/input/ptt5-fine-tuned/best-checkpoint.ckpt', map_location=torch.device('cpu'))


<All keys matched successfully>

In [8]:
import torch.nn.functional as F

def predict_fn(text, batch_size=32):
    """Return class probabilities for a collection of raw texts.

    This is the classifier function handed to LIME, which calls it with a list
    of perturbed strings.

    Args:
        text: Sequence of strings to classify. A bare string is treated as a
            single document rather than being iterated character by character.
        batch_size: Number of texts per forward pass. Bounding it keeps memory
            use independent of how many samples the caller sends at once.

    Returns:
        NumPy array with one row per input text holding the softmax of the
        model logits. An empty input yields an array with zero rows.
    """
    if isinstance(text, str):
        text = [text]
    if len(text) == 0:
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    model.eval()
    # Run on whatever device the model currently lives on.
    device = next(model.parameters()).device

    # Tokenize the messages
    padded_tokens, attention_masks = tokenize_corpus(text, tokenizer, 512)
    tensor_df = TensorDataset(padded_tokens, attention_masks)

    test_dataloader = DataLoader(tensor_df,
                                 batch_size=batch_size,
                                 shuffle=False)

    probabilities = []

    with torch.no_grad():
        for input_ids, attention_mask in test_dataloader:
            outputs = model(input_ids=input_ids.to(device),
                            attention_mask=attention_mask.to(device))
            # Softmax over the logits; collected per batch so that every batch
            # contributes to the result, not only the last one.
            probabilities.append(F.softmax(outputs.logits, dim=-1).cpu())

    # Return probabilities as a numpy array (LIME and SHAP expect numpy arrays)
    return torch.cat(probabilities, dim=0).numpy()

In [9]:
# Positional indices (suitable for .iloc) of the first 350 training rows whose label equals 1.
instances = np.flatnonzero(y_train == 1)[:350]

In [10]:
from lime.lime_text import LimeTextExplainer
class_names = ["Non-hate", "Hate"]
# LIME perturbs the text by random sampling; fix the seed (same value as the
# data split) so the explanations can be reproduced on a re-run.
explainer = LimeTextExplainer(class_names = class_names, random_state = 0)

# One explanation object per selected training instance, in the order of `instances`.
explainers = []
for i in instances:
    t0 = time.time()
    # Explain only label 1 ("Hate") with the 10 most influential tokens,
    # using 200 perturbed samples per instance.
    exp = explainer.explain_instance(x_train.iloc[i], predict_fn, num_features = 10, num_samples=200, labels=(1,))
    explainers.append(exp)
    # Progress log: positional index, then seconds spent on this instance.
    print(i)
    print(time.time() - t0)


89
667.9499032497406
90
644.1663982868195
93
637.2051944732666
96
641.046010017395
97
635.1210858821869
98
631.4849441051483
99
643.2390305995941
102
643.4410347938538
104
642.4352216720581
105
640.0326752662659
111
643.1889939308167
112
648.2784984111786
113
636.9806146621704
114
631.837176322937
119
640.1973872184753
121
636.6944844722748
122
639.5657041072845
126
625.7253947257996
129
642.4732925891876
130
636.4709210395813
132
641.7105338573456
133
633.1039388179779
136
636.0283403396606
137
632.0354776382446
138
641.9878396987915
141
639.5834531784058
142
633.9183983802795
145
637.5964210033417
146
632.6559965610504
149
631.2979068756104
150
632.6815078258514
151
634.145870923996
155
646.6161072254181
161
637.2703814506531
162
642.7830138206482
168
645.620306968689
174
636.563069820404
175
649.584755897522
178
635.598623752594
179
642.5767915248871
180
633.7724831104279
181
646.2289657592773
183
640.7887427806854
184
640.3912169933319
186
646.1738846302032
189
634.5664131641388
19

In [11]:
# Persist one explanation per line as the string form of its (token, weight) list.
results_path = '../results/results_lime_ptt5_0_a_350.txt'
# Create the output directory up front so a missing folder does not discard
# the explanations computed above.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
# Explicit UTF-8: the explained texts may contain characters that the
# platform default encoding cannot represent.
with open(results_path, 'w', encoding='utf-8') as f:
    for exp in explainers:
        f.write(str(exp.as_list()))
        f.write('\n')