In [1]:
!pip3 install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 30.3 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [2]:
!pip3 install captum

Collecting captum
  Downloading captum-0.4.0-py3-none-any.whl (1.4 MB)
[?25l[K     |▎                               | 10 kB 21.2 MB/s eta 0:00:01[K     |▌                               | 20 kB 23.5 MB/s eta 0:00:01[K     |▊                               | 30 kB 16.4 MB/s eta 0:00:01[K     |█                               | 40 kB 14.8 MB/s eta 0:00:01[K     |█▏                              | 51 kB 5.6 MB/s eta 0:00:01[K     |█▍                              | 61 kB 6.0 MB/s eta 0:00:01[K     |█▋                              | 71 kB 5.5 MB/s eta 0:00:01[K     |██                              | 81 kB 6.1 MB/s eta 0:00:01[K     |██▏                             | 92 kB 6.4 MB/s eta 0:00:01[K     |██▍                             | 102 kB 5.4 MB/s eta 0:00:01[K     |██▋                             | 112 kB 5.4 MB/s eta 0:00:01[K     |██▉                             | 122 kB 5.4 MB/s eta 0:00:01[K     |███                             | 133 kB 5.4 MB/s eta 0:00:01[K 

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, BertModel,AutoModel
from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
import torch
import matplotlib.pyplot as plt
import numpy as np
import os
import bz2
import transformers
import pandas as pd

In [4]:
# Use the first CUDA device when one is present, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Asking for the GPU name on a CPU-only runtime raises, so only query it
# when CUDA was actually selected above.
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

We will use the GPU: Tesla K80


In [5]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Project folder on the mounted drive, plus the locations derived from it.
ROOT_PATH = '/content/drive/MyDrive/Allen_NLP_hackathon'

# Directory holding the saved BERT models.
BERT_MODELS = os.path.join(ROOT_PATH, 'Models')

# CSV of NYT sentences with model predictions, stored under Data/.
_data_dir = os.path.join(ROOT_PATH, 'Data')
predicted_nyt_data = os.path.join(_data_dir, 'nyt_predicted_sentences_grace.csv')

In [7]:
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id, tokenizer):
    """Build the input ids for one text and a same-length reference sequence.

    The text is encoded without special tokens and truncated to at most 64
    ids; ``cls_token_id`` and ``sep_token_id`` are then placed around it by
    hand. The reference keeps those two boundary ids and replaces every text
    token with ``ref_token_id``.

    Returns:
        ``(input_ids, ref_input_ids, n_text_tokens)``: two tensors with a
        leading batch dimension of 1, created on the module-level ``device``,
        and the number of text tokens (special tokens not counted).
    """
    encoded_text = tokenizer.encode(text, add_special_tokens=False, max_length=64, truncation=True)
    n_text_tokens = len(encoded_text)

    # Real sequence: [CLS] text ... [SEP]
    real_sequence = [cls_token_id] + encoded_text + [sep_token_id]
    # Reference sequence: [CLS] ref ref ... [SEP]
    reference_sequence = [cls_token_id] + [ref_token_id] * n_text_tokens + [sep_token_id]

    input_ids = torch.tensor([real_sequence], device=device)
    ref_input_ids = torch.tensor([reference_sequence], device=device)
    return input_ids, ref_input_ids, n_text_tokens

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    """Build token-type ids for ``input_ids`` and an all-zero reference.

    Positions up to and including ``sep_ind`` get type 0, every later
    position gets type 1. The reference has the same shape and is all zeros.
    Both tensors are created on the module-level ``device``.
    """
    n_positions = input_ids.size(1)

    segment_flags = []
    for pos in range(n_positions):
        segment_flags.append(0 if pos <= sep_ind else 1)

    token_type_ids = torch.tensor([segment_flags], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    """Build position ids for ``input_ids`` and an all-zero reference.

    The real position ids count 0..seq_len-1; the reference uses position 0
    everywhere. Both are expanded to the shape of ``input_ids`` and created
    on the module-level ``device``.
    """
    n_positions = input_ids.size(1)

    position_ids = (
        torch.arange(n_positions, dtype=torch.long, device=device)
        .unsqueeze(0)
        .expand_as(input_ids)
    )
    # A random permutation (`torch.randperm(n_positions, device=device)`)
    # would be an alternative reference to the all-zero one used here.
    ref_position_ids = (
        torch.zeros(n_positions, dtype=torch.long, device=device)
        .unsqueeze(0)
        .expand_as(input_ids)
    )
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    """Return a mask of ones with the same shape and dtype as ``input_ids``."""
    attention_mask = torch.ones_like(input_ids)
    return attention_mask

def custom_forward(inputs):
    """Forward function used for attribution: returns ``predict(inputs)`` unchanged.

    To attribute towards a single class probability instead of the raw
    scores, return e.g. ``torch.softmax(scores, dim=1)[:, 1]`` here.
    """
    scores = predict(inputs)
    return scores

def predict(inputs):
    """Call the module-level ``model`` on ``inputs`` and return the first element of its output."""
    model_output = model(inputs)
    return model_output[0]

def get_attribution_for_test_set(lig, test_data_set, tokenizer, n_steps=5000, internal_batch_size=5):
    """Collect one attribution score per word-piece for every row of a data set.

    For each row, the text in column ``"sentence1"`` is turned into an
    input/reference id pair with ``construct_input_ref_pair`` (using the
    module-level ``ref_token_id``, ``sep_token_id`` and ``cls_token_id``),
    attributed with ``lig.attribute``, and the attribution of each text token
    is summed over its last dimension to give a single float.

    Args:
        lig: Attribution object exposing ``attribute(inputs=..., baselines=...,
            n_steps=..., internal_batch_size=..., return_convergence_delta=True)``
            and returning ``(attributions, delta)``.
        test_data_set: DataFrame with a ``"sentence1"`` column.
        tokenizer: Tokenizer providing ``tokenize`` (and whatever
            ``construct_input_ref_pair`` needs from it).
        n_steps: Number of integration steps passed to ``lig.attribute``.
        internal_batch_size: Internal batch size passed to ``lig.attribute``.

    Returns:
        dict with two parallel lists: ``"words"`` (word-piece strings) and
        ``"attribution"`` (one float per word-piece), concatenated over all rows.
    """
    words_ls = []
    attributions_ls = []

    for _, row in test_data_set.iterrows():
        clean_text = row["sentence1"]

        # The third value is the number of text tokens that survived
        # truncation, i.e. the sequence length without [CLS] and [SEP].
        input_ids, ref_input_ids, num_text_tokens = construct_input_ref_pair(
            clean_text, ref_token_id, sep_token_id, cls_token_id, tokenizer
        )

        attributions, _delta = lig.attribute(inputs=input_ids,
                                             baselines=ref_input_ids,
                                             n_steps=n_steps,
                                             internal_batch_size=internal_batch_size,
                                             return_convergence_delta=True)

        tokenized_sen = tokenizer.tokenize(clean_text)
        print(len(tokenized_sen))
        # Keep only the tokens that are actually present in input_ids, so the
        # word list cannot run past the attribution tensor.
        tokenized_sen = tokenized_sen[:num_text_tokens]

        # One score per sequence position: reduce the last dimension in a
        # single tensor op (assumes attributions is (1, seq_len, hidden) — TODO confirm).
        position_scores = attributions[0].sum(dim=-1).tolist()

        # Position 0 is [CLS], so text tokens start at 1. enumerate is used
        # instead of list.index because a word-piece that occurs more than
        # once must get the score at its own position, not at its first one.
        for position, word in enumerate(tokenized_sen, start=1):
            words_ls.append(word)
            attributions_ls.append(position_scores[position])

    return {"words": words_ls, "attribution": attributions_ls}

In [8]:
# load model
# Both the classifier weights and the tokenizer files are read from the
# "anger" sub-folder of the models directory.
anger_model_dir = BERT_MODELS+"/anger/"

# load tokenizer
tokenizer = BertTokenizer.from_pretrained(anger_model_dir)

# load model, move it to the selected device and switch to inference mode
model = BertForSequenceClassification.from_pretrained(anger_model_dir, output_hidden_states=True)
model.to(device)
model.eval()
# start from cleared gradients before any attribution is computed
model.zero_grad()

In [9]:
# Read the CSV at `predicted_nyt_data` into a DataFrame.
anger_data = pd.read_csv(predicted_nyt_data)

In [10]:
# Show which columns the loaded frame contains.
anger_data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'sentence1', 'anger_prediciton',
       'sadness_prediciton'],
      dtype='object')

In [11]:
# Number of rows in the loaded frame.
len(anger_data)

250

In [12]:
# Special token ids passed to construct_input_ref_pair when building the input and its reference.
ref_token_id = tokenizer.pad_token_id # Padding token id; stands in for every text token in the reference (baseline) sequence
sep_token_id = tokenizer.sep_token_id # Separator token id; appended after the text tokens
cls_token_id = tokenizer.cls_token_id # Classification token id; prepended before the text tokens

In [None]:
# Where the importance scores computed below will be saved.
anger_feature_importances_scores_nyt_data = os.path.join(ROOT_PATH, 'Results', 'anger_importance_scores_grace_60.csv')

# Layer Integrated Gradients over the BERT embedding layer, with
# custom_forward as the function being attributed.
lig = LayerIntegratedGradients(custom_forward, model.bert.embeddings)

# Rows are processed in slices; this run covers rows 40-59
# (a previous slice was anger_data[:20]).
rows_to_attribute = anger_data[40:60]
test_set_word_att_dict = get_attribution_for_test_set(lig, rows_to_attribute, tokenizer)

47
89
36
58
27
82


In [None]:
# Tabulate the word/score lists and rank them, highest attribution first.
word_score_table = pd.DataFrame(test_set_word_att_dict)
anger_features_importance_Scores = word_score_table.sort_values(by=["attribution"], ascending=False)
anger_features_importance_Scores

In [None]:
# Save the ranked scores to the CSV path in `anger_feature_importances_scores_nyt_data`.
# NOTE(review): with pandas defaults the DataFrame index is written as an extra
# unnamed first column — pass index=False if that column is not wanted.
anger_features_importance_Scores.to_csv(anger_feature_importances_scores_nyt_data)