In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

from transformers import BertModel, BertTokenizer
from copy import deepcopy
import torch.nn as nn
import torch
import re
import ipywidgets

from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

from models.bert.model import BERT
from trainers.tf_trainer import TransformerTrainer
# from models.longformer.model import Longformer
# from models.longformer.args import get_args
# from datasets.scar_longformer import SCAR_Longformer

import torch

# os.environ['CUDA_LAUNCH_BLOCKING'] = "0". Cuda launch blocking doesn't seem to help out of memory issues. 

In [2]:
# Instantiate our Model
# steps_per_epoch = scar_longformer.get_n_training()/config.batch_size
# model = Longformer(config, loss_fn, steps_per_epoch)

# Load from checkpoint
# ckpt_path = r'C:\Users\jjnunez\PycharmProjects\scar_nlp\temp_bert_ckpt\epoch=17-step=42569.ckpt'

# Survival 60 months
ckpt_path = os.path.join(r"C:\Users\jjnunez\PycharmProjects\scar_nlp\results\survic_mo_60\BERT\default\version_0", "BERT--epoch=33_val_bal_val_bal=0.84.ckpt")
# Psychiatry in 60 months
# ckpt_path = os.path.join(r"C:\Users\jjnunez\PycharmProjects\scar_nlp\results\dsplnic_PSYCHIATRY_60\BERT\default\version_7", "BERT--epoch=11_val_bal_val_bal=0.67.ckpt")

            
model = BERT.load_from_checkpoint(ckpt_path)
print("loaded model from checkpoint!")
# prints the learning_rate you used in this checkpoint

#model.eval()
#y_hat = model(x)
            
            


Some weights of the model checkpoint at C:\Users\jjnunez\PycharmProjects\hedwig-data\models\bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loaded model from checkpoint!


In [3]:
config = model.hparams.config

#device = "cpu" # Comment out to run on GPU
#config.device = device
model.to(config.device)
model.eval()
model.zero_grad()
# load tokenizer
pretrained_token_model_path = os.path.join(config.pretrained_dir, config.pretrained_file)
tokenizer = BertTokenizer.from_pretrained(pretrained_token_model_path)
BertTokenizer.from_pretrained(config.pretrained_file)

print('Weve loaded up a tokenizer and the model')

Weve loaded up a tokenizer and the model


In [4]:
def predict(inputs, attn_mask):
    return model(inputs, attn_mask)[1]

In [5]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

In [6]:
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):

    max_len = 2048 
    
    #text_ids = tokenizer.encode(
    #            text,
    #            None,
    #            add_special_tokens=True,
    #            max_length=max_len,
    #            padding='max_length',
    #            return_token_type_ids=False,
    #            #return_attention_mask=True,
    #            truncation=True,
    #            #return_tensors='pt'
    #    )
    #text_ids = tokenizer.encode(text, add_special_tokens=False)

    text_ids = tokenizer.encode(text,None,
                                add_special_tokens=True,
                                return_token_type_ids=False)
    
    # print(f'Here is the tokenized text: {tokenizer.convert_ids_to_tokens(text_ids)}')
    
    # construct input token ids
    input_ids = [cls_token_id] + text_ids + [sep_token_id]
    # construct reference token ids 
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=config.device), torch.tensor([ref_input_ids], device=config.device), len(text_ids)

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    seq_len = input_ids.size(1)
    token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=config.device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=config.device)# * -1
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=config.device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=config.device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=config.device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    return torch.ones_like(input_ids)

In [7]:
def custom_forward(inputs):
    preds = predict(inputs, attention_mask)
    # return torch.softmax(preds, dim = 1)[0][0].unsqueeze(-1)
    return preds.squeeze(-1)

In [8]:
lig = LayerIntegratedGradients(custom_forward,
                               model.bert.embeddings)

In [9]:
file = open('anon_bert.txt',mode='r')
text = file.read()
file.close()

In [10]:
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

indices = input_ids[0].detach().tolist()
all_tokens = [re.sub('Ä ', '',x) for x in tokenizer.convert_ids_to_tokens(indices)]
all_tokens = [re.sub('##', '',x) for x in tokenizer.convert_ids_to_tokens(indices)]

In [11]:
model(input_ids, attention_mask)

(0, tensor([[0.8814]], device='cuda:0', grad_fn=<SigmoidBackward>))

In [12]:
predict(input_ids, attention_mask)

tensor([[0.8814]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [13]:
custom_forward(input_ids)

tensor([0.8814], device='cuda:0', grad_fn=<SqueezeBackward1>)

In [14]:
n_steps = 50 # Set here. Highest so far for BERT == 20
print(f'We will attempt to run analysis with n_steps: {n_steps}')

We will attempt to run analysis with n_steps: 50


In [15]:
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines=ref_input_ids,
                                    return_convergence_delta=True,
                                    n_steps=n_steps,
                                    internal_batch_size=8)

In [16]:
score = predict(input_ids, attention_mask)

def one_or_zero(score):
    return 1 if score>=0.50 else 0

print(f'Predicted Answer: Probability of having emotional need: {score.cpu().detach().numpy()[0][0]}. Predicted label: {one_or_zero(score)}')
#print('Predicted Answer: ' + str(torch.argmax(score[0]).numpy()) + ', prob having 1 emotional need: ' + str(torch.softmax(score, dim = 1)[0][0].detach().numpy()))

Predicted Answer: Probability of having emotional need: 0.8814416527748108. Predicted label: 1


In [17]:
def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

In [18]:
attributions_sum = summarize_attributions(attributions)

In [19]:
# storing couple samples in an array for visualization purposes
print(score.cpu().detach().numpy()[0][0])
score_vis = viz.VisualizationDataRecord(
                        attributions_sum,
                        score.cpu().detach().numpy()[0][0],
                        int(one_or_zero(score)),
                        1,
                        text,
                        attributions_sum.sum(),       
                        all_tokens,
                        delta)

print('\033[1m', 'Visualization For Score', '\033[0m')
viz.visualize_text([score_vis])




0.88144165
[1m Visualization For Score [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.88),"REASON FOR REFERRAL Ms. Chan is a 41 year old nulligravid woman who has been referred for management of a large pelvic mass. HISTORY OF PRESENT ILLNESS For the past few years, Emma has noted more irregular cycles and had seen her family physician several times regarding this. She eventually went back to Taiwan to obtain imaging in late 2012. Apparently this showed that she had ovarian cysts. When she returned to Canada, she had a repeat pelvic ultrasound performed on April 14, 2012, revealing complex right adnexal lesion with a cystic component measuring 8.5 cm and a solid irregular marrow component measuring 3.3 cm. The uterus and left ovary appeared normal. She went on to see Dr. Cohen in late April and an endometrial biopsy was negative for malignancy. She had repeat ultrasound on May 22, 2013, which revealed that the right adnexal lesion was enlarging, now measuring 10.1 x 10.0 x 9.3 cm with multiple peripheral solid nodules with the largest measuring 4.5 cm, increased from 3.2 cm. The same day tumour markers revealed a slightly elevated CA125 of 42, CA19 9 of 64 and a normal CA15 3 and CEA. The patient is quite symptomatic from this mass and is noticing increasing lower abdominal pain especially on the right over the past 2 weeks. She finds that it is worse in the morning and has difficulty moving, but has been able to continue working after she discovered that a herbal supplement gives her relief. She has also noticed increasing bloating and bowel changes over the past 2 3 weeks. She now has loose bowel movements 3 4 times a day. She has increased urinary frequency and hesitancy. Her appetite is lower, but she has not had any weight loss. GYNECOLOGIC HISTORY She underwent menarche at age 13. As described previously, she has noticed shorter cycles over the past 2 years every 20 25 days. Her periods are light and she denies any dysmenorrhea. She denies any intermenstrual bleeding. She is nulligravid. She is sexually active and has no history of STIs. She had a history of CIN III in 2008 which was treated with LEEP and her Pap test has since been normal. She had her last mammogram about 3 4 years ago.",3.72,"[CLS] [CLS] reason for refer ral ms . chan is a 41 year old null ig ra vid woman who has been referred for management of a large pe l vic mass . history of present illness for the past few years , emma has noted more irregular cycles and had seen her family physician several times regarding this . she eventually went back to taiwan to obtain imaging in late 2012 . apparently this showed that she had o var ian cy sts . when she returned to canada , she had a repeat pe l vic ultrasound performed on april 14 , 2012 , revealing complex right ad ne xa l les ion with a cy stic component measuring 8 . 5 cm and a solid irregular marrow component measuring 3 . 3 cm . the ut erus and left o vary appeared normal . she went on to see dr . cohen in late april and an end ome tri al bio psy was negative for mali gnan cy . she had repeat ultrasound on may 22 , 2013 , which revealed that the right ad ne xa l les ion was en lar ging , now measuring 10 . 1 x 10 . 0 x 9 . 3 cm with multiple peripheral solid nod ules with the largest measuring 4 . 5 cm , increased from 3 . 2 cm . the same day tu mour markers revealed a slightly elevated ca 12 5 of 42 , ca 19 9 of 64 and a normal ca 15 3 and ce a . the patient is quite sy mpt oma tic from this mass and is noticing increasing lower abdominal pain especially on the right over the past 2 weeks . she finds that it is worse in the morning and has difficulty moving , but has been able to continue working after she discovered that a herbal supplement gives her relief . she has also noticed increasing b lo ating and bow el changes over the past 2 3 weeks . she now has loose bow el movements 3 4 times a day . she has increased ur ina ry frequency and he sit ancy . her appetite is lower , but she has not had any weight loss . g yne col og ic history she underwent men ar che at age 13 . as described previously , she has noticed shorter cycles over the past 2 years every 20 25 days . her periods are light and she denies any d ys men or rh ea . she denies any inter men st ru al bleeding . she is null ig ra vid . she is sexually active and has no history of st is . she had a history of ci n iii in 2008 which was treated with lee p and her pa p test has since been normal . she had her last ma mm og ram about 3 4 years ago . [SEP] [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.88),"REASON FOR REFERRAL Ms. Chan is a 41 year old nulligravid woman who has been referred for management of a large pelvic mass. HISTORY OF PRESENT ILLNESS For the past few years, Emma has noted more irregular cycles and had seen her family physician several times regarding this. She eventually went back to Taiwan to obtain imaging in late 2012. Apparently this showed that she had ovarian cysts. When she returned to Canada, she had a repeat pelvic ultrasound performed on April 14, 2012, revealing complex right adnexal lesion with a cystic component measuring 8.5 cm and a solid irregular marrow component measuring 3.3 cm. The uterus and left ovary appeared normal. She went on to see Dr. Cohen in late April and an endometrial biopsy was negative for malignancy. She had repeat ultrasound on May 22, 2013, which revealed that the right adnexal lesion was enlarging, now measuring 10.1 x 10.0 x 9.3 cm with multiple peripheral solid nodules with the largest measuring 4.5 cm, increased from 3.2 cm. The same day tumour markers revealed a slightly elevated CA125 of 42, CA19 9 of 64 and a normal CA15 3 and CEA. The patient is quite symptomatic from this mass and is noticing increasing lower abdominal pain especially on the right over the past 2 weeks. She finds that it is worse in the morning and has difficulty moving, but has been able to continue working after she discovered that a herbal supplement gives her relief. She has also noticed increasing bloating and bowel changes over the past 2 3 weeks. She now has loose bowel movements 3 4 times a day. She has increased urinary frequency and hesitancy. Her appetite is lower, but she has not had any weight loss. GYNECOLOGIC HISTORY She underwent menarche at age 13. As described previously, she has noticed shorter cycles over the past 2 years every 20 25 days. Her periods are light and she denies any dysmenorrhea. She denies any intermenstrual bleeding. She is nulligravid. She is sexually active and has no history of STIs. She had a history of CIN III in 2008 which was treated with LEEP and her Pap test has since been normal. She had her last mammogram about 3 4 years ago.",3.72,"[CLS] [CLS] reason for refer ral ms . chan is a 41 year old null ig ra vid woman who has been referred for management of a large pe l vic mass . history of present illness for the past few years , emma has noted more irregular cycles and had seen her family physician several times regarding this . she eventually went back to taiwan to obtain imaging in late 2012 . apparently this showed that she had o var ian cy sts . when she returned to canada , she had a repeat pe l vic ultrasound performed on april 14 , 2012 , revealing complex right ad ne xa l les ion with a cy stic component measuring 8 . 5 cm and a solid irregular marrow component measuring 3 . 3 cm . the ut erus and left o vary appeared normal . she went on to see dr . cohen in late april and an end ome tri al bio psy was negative for mali gnan cy . she had repeat ultrasound on may 22 , 2013 , which revealed that the right ad ne xa l les ion was en lar ging , now measuring 10 . 1 x 10 . 0 x 9 . 3 cm with multiple peripheral solid nod ules with the largest measuring 4 . 5 cm , increased from 3 . 2 cm . the same day tu mour markers revealed a slightly elevated ca 12 5 of 42 , ca 19 9 of 64 and a normal ca 15 3 and ce a . the patient is quite sy mpt oma tic from this mass and is noticing increasing lower abdominal pain especially on the right over the past 2 weeks . she finds that it is worse in the morning and has difficulty moving , but has been able to continue working after she discovered that a herbal supplement gives her relief . she has also noticed increasing b lo ating and bow el changes over the past 2 3 weeks . she now has loose bow el movements 3 4 times a day . she has increased ur ina ry frequency and he sit ancy . her appetite is lower , but she has not had any weight loss . g yne col og ic history she underwent men ar che at age 13 . as described previously , she has noticed shorter cycles over the past 2 years every 20 25 days . her periods are light and she denies any d ys men or rh ea . she denies any inter men st ru al bleeding . she is null ig ra vid . she is sexually active and has no history of st is . she had a history of ci n iii in 2008 which was treated with lee p and her pa p test has since been normal . she had her last ma mm og ram about 3 4 years ago . [SEP] [SEP]"
,,,,


In [20]:
vis_data_records_ig = []

vis_data_records_ig.append(viz.VisualizationDataRecord(
                        attributions_sum,
                        score.cpu().detach().numpy()[0][0],
                        int(one_or_zero(score)),
                        1,
                        '',
                        attributions_sum.sum(),       
                        all_tokens,
                        delta))

_ = viz.visualize_text(vis_data_records_ig)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.88),,3.72,"[CLS] [CLS] reason for refer ral ms . chan is a 41 year old null ig ra vid woman who has been referred for management of a large pe l vic mass . history of present illness for the past few years , emma has noted more irregular cycles and had seen her family physician several times regarding this . she eventually went back to taiwan to obtain imaging in late 2012 . apparently this showed that she had o var ian cy sts . when she returned to canada , she had a repeat pe l vic ultrasound performed on april 14 , 2012 , revealing complex right ad ne xa l les ion with a cy stic component measuring 8 . 5 cm and a solid irregular marrow component measuring 3 . 3 cm . the ut erus and left o vary appeared normal . she went on to see dr . cohen in late april and an end ome tri al bio psy was negative for mali gnan cy . she had repeat ultrasound on may 22 , 2013 , which revealed that the right ad ne xa l les ion was en lar ging , now measuring 10 . 1 x 10 . 0 x 9 . 3 cm with multiple peripheral solid nod ules with the largest measuring 4 . 5 cm , increased from 3 . 2 cm . the same day tu mour markers revealed a slightly elevated ca 12 5 of 42 , ca 19 9 of 64 and a normal ca 15 3 and ce a . the patient is quite sy mpt oma tic from this mass and is noticing increasing lower abdominal pain especially on the right over the past 2 weeks . she finds that it is worse in the morning and has difficulty moving , but has been able to continue working after she discovered that a herbal supplement gives her relief . she has also noticed increasing b lo ating and bow el changes over the past 2 3 weeks . she now has loose bow el movements 3 4 times a day . she has increased ur ina ry frequency and he sit ancy . her appetite is lower , but she has not had any weight loss . g yne col og ic history she underwent men ar che at age 13 . as described previously , she has noticed shorter cycles over the past 2 years every 20 25 days . her periods are light and she denies any d ys men or rh ea . she denies any inter men st ru al bleeding . she is null ig ra vid . she is sexually active and has no history of st is . she had a history of ci n iii in 2008 which was treated with lee p and her pa p test has since been normal . she had her last ma mm og ram about 3 4 years ago . [SEP] [SEP]"
,,,,
