In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from tqdm import tqdm

In [3]:
from captum.attr import LayerIntegratedGradients
import torch
from fastai.text import *

In [4]:
from nlp_cmp_utils import reset_model, prepare_captum_tensors, explain_tweet
from pathlib import Path

In [5]:
awd_path = Path('../imdb/')

In [6]:
pd.set_option('max_colwidth',100)

### get file data

In [8]:
filename = 'result_2020_11_15.json'

In [9]:
df = pd.read_json(filename)

In [10]:
df

Unnamed: 0,id,feature_id,label_text,entry_id,feature_text,type
0,27,1,{stunned},13,RT @Jwhitbrook: With the new Picard up on Amazon I can finally get a clean screenshot and ask......,pos
1,1,2,{love},5,"RT @klaushismydaddy: netflix: i love ALL of my programs equally!!! stranger things, lucifer, tig...",pos
2,2,3,{excellent},5,Just finished Picard on Amazon Prime. Was not expecting to cry. What an excellent show.,pos
3,3,5,"{you,suggest,go,do,so.}",5,"If you haven’t seen the amazon prime show “Upload”, I suggest you go do so. I really enjoyed thi...",pos
4,25,6,"{""https://t.co/l2HIKNQ95V\n\nWorks""}",13,@brexitblog_info @boblister_poole https://t.co/l2HIKNQ95V\n\nWorks both ways,pos
...,...,...,...,...,...,...
423,400,1549,{NEVER},265,RT @Floydbirman: @LoyalDefender2K I have NEVER watch ANY award for decades. Same as BBC Question...,neg
424,328,1556,"{out,of,touch}",167,RT @annesayer6: @LozzaFox @EquityUK I would suggest actors denounce Equity UK for being complete...,neg
425,410,1557,{disgracefully},265,@JohnSimpsonNews Not to mention anti SNP bias! It’s blatant in Scotland! There is no doubt the b...,neg
426,422,1565,{hate},348,RT @Ally__Cinnamon: Boris Johnson could say I fuckin hate the blacks man and there would be a de...,neg


In [11]:
df[['feature_text','label_text']]

Unnamed: 0,feature_text,label_text
0,RT @Jwhitbrook: With the new Picard up on Amazon I can finally get a clean screenshot and ask......,{stunned}
1,"RT @klaushismydaddy: netflix: i love ALL of my programs equally!!! stranger things, lucifer, tig...",{love}
2,Just finished Picard on Amazon Prime. Was not expecting to cry. What an excellent show.,{excellent}
3,"If you haven’t seen the amazon prime show “Upload”, I suggest you go do so. I really enjoyed thi...","{you,suggest,go,do,so.}"
4,@brexitblog_info @boblister_poole https://t.co/l2HIKNQ95V\n\nWorks both ways,"{""https://t.co/l2HIKNQ95V\n\nWorks""}"
...,...,...
423,RT @Floydbirman: @LoyalDefender2K I have NEVER watch ANY award for decades. Same as BBC Question...,{NEVER}
424,RT @annesayer6: @LozzaFox @EquityUK I would suggest actors denounce Equity UK for being complete...,"{out,of,touch}"
425,@JohnSimpsonNews Not to mention anti SNP bias! It’s blatant in Scotland! There is no doubt the b...,{disgracefully}
426,RT @Ally__Cinnamon: Boris Johnson could say I fuckin hate the blacks man and there would be a de...,{hate}


In [13]:
def extract_vals(df,ix):

    test_text = df.iloc[ix].feature_text
    test_sent = df.iloc[ix].type
    test_labels = df.iloc[ix].label_text[1:-1].split(',')
    
    return test_text, test_sent, test_labels

In [14]:
extract_vals(df,3)

('If you haven’t seen the amazon prime show “Upload”, I suggest you go do so. I really enjoyed this movie and it surrounds itself in virtual reality. I have laughed, felt on edge and the story has some twist and turns. Please if you haven’t seen it? Do so! I hope season 2 happens. https://t.co/Mf5bSi',
 'pos',
 ['you', 'suggest', 'go', 'do', 'so.'])

### compare with captum

In [15]:
model, embedding, data_clas = reset_model(awd_path)

In [16]:
ml = Lambda(lambda x : torch.softmax(model(x)[0],-1), )

In [17]:
lig = LayerIntegratedGradients(ml, embedding,)

In [21]:
# loop through all the tweets explain each via captum
results=[]
for ix in tqdm(range(df.shape[0])):
    
    test_text, test_sent, test_labels = extract_vals(df,ix)
    
    input_tensor = data_clas.one_item(test_text)[0].cpu()
    words = [data_clas.vocab.itos[i.item()] for i in input_tensor.squeeze(0)]
    
    
    attribs = explain_tweet(test_sent,input_tensor,lig)
    attr_data_df = pd.DataFrame(data=(zip(attribs, words)), columns=('score', 'word')).rename_axis('position')
    
    attr_data_df.reset_index(inplace=True)
    
    # remove 'xxunk','xxmaj' tags - lstm only
    word_dict = attr_data_df.loc[attr_data_df.word.isin(['xxunk','xxmaj'])==False].to_dict(orient='records')
    
    results.append({'model_dict':word_dict,'human_words':test_labels,'sentiment':test_sent})
                                 
res_df_cap = pd.DataFrame(results)
res_df_cap.head()

100%|██████████| 428/428 [05:17<00:00,  1.35it/s]


Unnamed: 0,model_dict,human_words,sentiment
0,"[{'position': 0, 'score': 0.0, 'word': 'xxbos'}, {'position': 1, 'score': 0.021161681041121483, ...",[stunned],pos
1,"[{'position': 0, 'score': 0.0, 'word': 'xxbos'}, {'position': 1, 'score': 0.02491801790893078, '...",[love],pos
2,"[{'position': 0, 'score': 0.0, 'word': 'xxbos'}, {'position': 2, 'score': -0.1196400374174118, '...",[excellent],pos
3,"[{'position': 0, 'score': 0.0, 'word': 'xxbos'}, {'position': 2, 'score': 0.031670838594436646, ...","[you, suggest, go, do, so.]",pos
4,"[{'position': 0, 'score': 0.0, 'word': 'xxbos'}, {'position': 4, 'score': 0.09788115322589874, '...","[""https://t.co/l2HIKNQ95V\n\nWorks""]",pos


In [22]:
# save out
res_df_cap.to_pickle('lstm_cap_scores.pkl')