In [1]:
import datasets
import os
import pickle
import plotly
import random
import re
import scipy
import seaborn as sns
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datasets import Dataset
from datasets.utils import disable_progress_bar
disable_progress_bar()

from lime import lime_text
from lime.lime_text import LimeTextExplainer

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

from simpletransformers.classification import ClassificationModel, ClassificationArgs

from termcolor import colored

from tqdm.auto import tqdm, trange

from transformers import AutoTokenizer, BigBirdForSequenceClassification, RobertaForSequenceClassification, pipeline, \
                            TrainingArguments, Trainer

from torch.utils.data import DataLoader

## Getting the predictor functions

In [6]:
# %cd /
tokenizer = AutoTokenizer.from_pretrained("bigbird")
#                                          cache_dir="data-imperial/cache")
# tokenizer.save_pretrained("data-imperial/bigbird/output/test_4_more_samples/checkpoint-481")

# %cd data-imperial/

In [2]:
model = BigBirdForSequenceClassification.from_pretrained("bigbird/output/ml/epoch-5")

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [4]:
explainer = LimeTextExplainer()

### Single label explainer

In [7]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 7

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

In [8]:
sample = pickle.load(open("saved/anxiety/96_text_samples.pickle", "rb"))

In [17]:
exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/anxiety/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)

Attention type 'block_sparse' is not possible if sequence_length: 546 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...





In [18]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 5

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/anxiety/depressed/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [19]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 0

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/anxiety/suicide/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [20]:
sample = pickle.load(open("saved/suicide/100_text_samples.pickle", "rb"))

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/suicide/desire/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [21]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 1

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/suicide/intent/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [22]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 2

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/suicide/capability/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




FileNotFoundError: [Errno 2] No such file or directory: 'saved/suicide/capability/exp_raw.pickle'

In [26]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

label_num = 3

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/suicide/timeframe/exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [7]:
## Single label predictor function

## Corresponding label numbers:
## 0 : desire
## 1 : intent
## 2 : capability
## 3 : timeframe
## 4 : substance
## 5 : depressed
## 6 : self-harm
## 7 : anxiety
## 8 : helpful

sample = pickle.load(open("saved/substance/100_text_samples.pickle", "rb"))

label_num = 4

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

def m3_predict_probs(sample):
    dataset = pd.DataFrame(sample, columns=['text'])
    dataset = Dataset.from_pandas(dataset)
    dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

    loader = DataLoader(dataset, batch_size=2)
    probs = None

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = outputs.logits.sigmoid().cpu().detach().numpy()
        probs = outputs if probs is None else np.concatenate((probs, outputs))
        
    prob_true = probs[:,label_num]    
    return np.transpose(np.vstack(([1-prob_true, prob_true]))) 

exp_list = []

for i in tqdm(sample.text):
    text = i
    
    exp = explainer.explain_instance(text, 
                                     m3_predict_probs, 
                                     num_features=100, 
                                     num_samples=4000)

    exp_list.extend(exp.as_list())
    
pickle.dump(exp_list, open("saved/substance/og_ml_exp_raw.pickle", "wb"))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)

Attention type 'block_sparse' is not possible if sequence_length: 448 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...





#### Testing whether more features slows LIME down

In [11]:
explainer = LimeTextExplainer()
interval = int(len(sample)/5)
import time

In [12]:
start = time.time()
exp = explainer.explain_instance(sample, 
                            m3_predict_probs, 
                            num_features=5, 
                            num_samples=4000)
print(time.time() - start)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))





floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448238472/work/aten/src/ATen/native/BinaryOps.cpp:467.)

Attention type 'block_sparse' is not possible if sequence_length: 607 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


706.3173854351044


In [13]:
start = time.time()
exp = explainer.explain_instance(sample, 
                            m3_predict_probs, 
                            num_features=500, 
                            num_samples=4000)
print(time.time() - start)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


705.2420830726624


## Segmented LIME Explanations

In [7]:
explainer = LimeTextExplainer()

In [8]:
## getting the predicted probabilities for the different segments of the conversations

def get_explanations(sample, parts, n_features, labels, cumulative=True):
    
    pred_df = pd.DataFrame(index=np.arange(1,parts+1))
    sample = " ".join([i if not re.match(r'[A-Z][a-z]+', i) else i.lower() for i in sample.split(" ")])
    sample_len = len(sample)
    interval = int(sample_len/parts)

    for k in tqdm(labels):

        def tokenization(batched_text):
            return tokenizer(batched_text['text'], padding = "longest", truncation=True, max_length = 2048)

        def m3_predict_probs(sample):
            dataset = pd.DataFrame(sample, columns=['text'])
            dataset = Dataset.from_pandas(dataset)
            dataset = dataset.map(tokenization, batched=True, batch_size=2, remove_columns=['text'])
            dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

            loader = DataLoader(dataset, batch_size=2)
            probs = None

            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                outputs = outputs.logits.sigmoid().cpu().detach().numpy()
                probs = outputs if probs is None else np.concatenate((probs, outputs))
                prob_true = probs[:,k]

            return np.transpose(np.vstack(([1-prob_true, prob_true]))) 
        
        def format_exp(exp):
            text = [f"{i[0]}: {i[1]:1.4f}" for i in exp]
            text = "<br>".join(text)
            return text

        def add_exp_to_df(sample_text, seg_num, label, pred_df):
            pred_prob = m3_predict_probs([sample_text])[0][1]
            pred_df.loc[seg_num, label] = pred_prob
            exp = explainer.explain_instance(sample_text, 
                                            m3_predict_probs, 
                                            num_features=n_features, 
                                            num_samples=4000)
            plot_text = f"{pred_prob:1.4f} <br>{format_exp(exp.as_list())}"
            pred_df.loc[seg_num, label+"_exp"] = plot_text

        l = labels[k]

        if cumulative: 
            for j in tqdm(pred_df.index[:-1]):
                add_exp_to_df(sample[:j*interval], j, l, pred_df)
            add_exp_to_df(sample, parts, l, pred_df)

        else: 
            for j in tqdm(pred_df.index[:-1]):
                add_exp_to_df(sample[(j-1)*interval: j*interval], j, l, pred_df)
            add_exp_to_df(sample[j*interval:], parts, l, pred_df)
    return pred_df

In [None]:
sample = pickle.load("saved/text_samples/sample7.pickle")

In [None]:
## Sample 1

parts = 5
n_features = 6
# sample = CHANGE_THIS
labels = {0: 'desire', 1:'intent', 2:'capability', 3:'timeframe', 
          4:'substance', 5:'depressed', 6:'self_harm', 7:'anxiety', 
          8:'helpful'}

pred_df = get_explanations(sample, parts, n_features, labels, cumulative=False)

In [56]:
pickle.dump(pred_df, open("saved/bb_exps/sample7_exp.pickle", "wb"))