In [1]:
import sys
sys.executable
import os

In [2]:
# Point Java-dependent tooling at the local JDK for this kernel session.
# Raw string: in a normal literal "\J" and "\j" are invalid escape sequences,
# which newer Python versions warn about; the resulting value is unchanged.
os.environ["JAVA_HOME"] = r"C:\Java\jdk1.8.0_221"

In [3]:
# !{sys.executable} -m pip install torch 
# !{sys.executable} -m pip install --upgrade tensorflow
# !{sys.executable} -m pip install --upgrade transformers 
# !{sys.executable} -m pip install --upgrade tokenizers
# !{sys.executable} -m pip install --upgrade datasets
# !{sys.executable} -m pip install --upgrade nltk
# !{sys.executable} -m pip install bert_score
# !{sys.executable} -m pip install seaborn
# !{sys.executable} -m pip list

In [5]:
import numpy

import glob
from pathlib import Path
import csv 

import torch 

from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DistilBertConfig, pipeline, LineByLineTextDataset

import pandas as pd
import numpy as np

In [None]:
# Load the evaluation sentences into a DataFrame so per-word probabilities can
# be computed for each sentence later on.
# Using the newline as the separator means every line is read as a single field.
# NOTE(review): recent pandas releases appear to reject sep='\n' with a
# ValueError -- confirm against the installed pandas version before relying on
# this cell in a fresh environment.
EVAL_SENTENCES_CSV = './telemed_Data_Evaluate_Big/sentence_data.csv'
# EVAL_SENTENCES_CSV = './telemed_Data_Evaluate/sentence_data.csv'  # smaller evaluation set
eval_data = pd.read_csv(EVAL_SENTENCES_CSV, sep='\n')

In [None]:
# Build the tokenizer from the project's WordPiece vocabulary file.
TELEMED_VOCAB_FILE = './telemed_Vocab/telemed-bert-wordpiece-vocab.txt'
tokenizer = DistilBertTokenizer(vocab_file=TELEMED_VOCAB_FILE)

In [None]:
# Load the fine-tuned masked-language model.

# Architecture description. Only `vocab_size` is taken from the tokenizer; the
# remaining values are spelled out explicitly.
# NOTE(review): `from_pretrained` below is not given this object, so the
# hyperparameters actually used presumably come from the saved checkpoint
# directory -- confirm they match.
config = DistilBertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    sinusoidal_pos_embds=False,
    n_layers=6,
    n_heads=12,
    dim=768,
    hidden_dim=3072,
    dropout=0.1,
    attention_dropout=0.1,
    activation='gelu',
    initializer_range=0.02,
    qa_dropout=0.1,
    seq_classif_dropout=0.2,
    pad_token_id=0
)

# `from_pretrained` is a classmethod, so call it on the class directly instead
# of first allocating a randomly initialised model only to invoke it.
modelTrained = DistilBertForMaskedLM.from_pretrained('./telemed_Model/')

# Keep the name `model` defined for any cell that still refers to it; it now
# points at the fine-tuned weights rather than at a throwaway random model.
model = modelTrained

In [None]:
# Fill-mask pipeline loaded from the saved pipeline directory.
fill_mask = pipeline(
    task='fill-mask',
    model='./telemed_Fill_Mask_Pipeline/',
)

In [None]:
# # There are no appreciable mineral opaque calculi within the urinary bladder on the available study.
# sequence = f"There are no appreciable mineral opaque {fill_mask.tokenizer.mask_token} within the urinary bladder on the available study."
# top_k = fill_mask(sequence)

# print("Complete sentence: There are no appreciable mineral opaque calculi within the urinary bladder on the available study.")
# print("\n")
# print("Sentence with exactly one token masked:")
# print(sequence)
# print("\n")
# print("k sentences with masked token filled and likelihood (probability?) of that token:")
# for item in top_k:
#     print(item['sequence'])
#     print(item['score'])
# #     print(item['token'])
# #     print(item['token_str'])

In [None]:
# # try another sentence 
# # sequence = f"There is a mild diffuse {tokenizer.mask_token} she will long pattern, the interstitial component of which is accentuated by expiratory phase of respiration."
# sequence = f"There is a mild diffuse {fill_mask.tokenizer.mask_token} lung pattern, the interstitial component of which is accentuated by expiratory phase of respiration." 
# # top_k = fill_mask(sequence)

# # print("Complete sentence: There is a mild diffuse broncho-interspace she will long pattern, the interstitial component of which is accentuated by expiratory phase of respiration.")
# # print("\n")
# # print("Sentence with exactly one token masked:")
# print(sequence)
# print(tokenizer(sequence))
# fill_mask(sequence)
# # print("\n")
# # print("k sentences with masked token filled and likelihood (probability?) of that token:")
# # for item in top_k:
# #     print(item['sequence'])
# #     print(item['score'])
    
# # parenchymal is an adjective!!
# # parenchyma is a noun - model knows a noun belongs in that slot

In [None]:
# # try another sentence 
# # sequence = f"Since the previous exam, there has been {tokenizer.mask_token} improvement in the previously described bronchointerstitial to alveolar lung pattern." 
# sequence = f"Since the previous exam, there has been {fill_mask.tokenizer.mask_token} improvement in the previously described bronchointerstitial to alveolar lung pattern." 

# top_k = fill_mask(sequence, targets = ['substantial'])

# print("Complete sentence: Since the previous exam, there has been substantial improvement in the previously described bronchointerstitial to alveolar lung pattern.")
# print("\n")
# print("Sentence with exactly one token masked:")
# print(sequence)
# print("\n")
# print("k sentences with masked token filled and likelihood (probability?) of that token:")
# for item in top_k:
#     print(item['sequence'])
#     print(item['score'])

In [None]:
# Mini UI: type a sentence, press Return, and for every token see how likely
# the model finds it when that token alone is masked out.
import ipywidgets as widgets
from IPython.display import display

# Output area that collects everything produced by the submit handler.
out = widgets.Output()

# Let the text box use the available width.
layout = widgets.Layout(width='auto')

# Named `sentence_input` (not `input`) so the builtin input() stays usable in
# the rest of the notebook.
sentence_input = widgets.Text(
    value='',
    placeholder='Enter sentence',
    description='Sentence:',
    layout=layout,
    disabled=False
)


def _single_mask_variants(token_list, mask_text):
    """Yield ``(token, sentence)`` pairs, one per position in ``token_list``.

    Each sentence is rebuilt from the tokens with the token at that position
    replaced by ``mask_text``.
    """
    for position, token in enumerate(token_list):
        before = tokenizer.convert_tokens_to_string(token_list[:position])
        after = tokenizer.convert_tokens_to_string(token_list[position + 1:])
        yield token, before + mask_text + after


def forward(_):
    """Submit handler: report per-token fill-mask probabilities for the typed sentence.

    The argument supplied by the widget framework is ignored; the text is read
    from ``sentence_input``. For every token the handler prints the probability
    the pipeline assigns to the actual token and the pipeline's top prediction.
    """
    # Do all the work inside the output context so that anything printed (and
    # any error raised while tokenizing or querying the pipeline) ends up in
    # the output area instead of disappearing inside the callback.
    with out:
        out.clear_output()
        sentence_text = sentence_input.value
        if len(sentence_text) == 0:
            return
        token_id_list = tokenizer(sentence_text).input_ids
        token_list = tokenizer.convert_ids_to_tokens(token_id_list, skip_special_tokens=True)
        # NOTE(review): tokens and mask string come from `tokenizer`, while the
        # pipeline uses its own tokenizer -- presumably the same vocabulary; confirm.
        mask_text = " " + tokenizer.mask_token + " "
        for actual, sentence_with_one_mask in _single_mask_variants(token_list, mask_text):
            # Score restricted to the token that was really there.
            mask_fill_output = fill_mask(sentence_with_one_mask, targets=actual)
            actual_token = mask_fill_output[0]['token_str']
            actual_token_prob = mask_fill_output[0]['score']
            # Unrestricted prediction; only the best candidate is reported.
            mask_fill_output_top1 = fill_mask(sentence_with_one_mask)
            print("Sentence w/ 1 token masked: " + sentence_with_one_mask)
            print("Actual word: " + actual_token)
            print("Probability of actual word: " + str(actual_token_prob))
            print("Most likely word: " + mask_fill_output_top1[0]['token_str'])
            print("Probability of most likely word: " + str(mask_fill_output_top1[0]['score']))
            print('\n')


# NOTE(review): `on_submit` is flagged as deprecated in newer ipywidgets
# releases; kept here so the "press Return to run" behaviour is unchanged.
sentence_input.on_submit(forward)
display(sentence_input, out)

# after you paste input just hit return.

In [None]:
# # write a function that sequentially masks each token in a sentence and then checks the topk tokens in fill-mask pipeline 
# # to see if one of those match the masked token

# sentence_text = "Since the previous exam, there has been substantial improvement in the previously described bronchointerstitial to alveolar lung pattern." 

# token_id_list = tokenizer(sentence_text).input_ids
# token_list = tokenizer.convert_ids_to_tokens(token_id_list, skip_special_tokens = True)
# print(token_list)
# print('\n')
# mask_text = " " + tokenizer.mask_token + " "

# # loop through all tokens in sentence
# for x in range(len(token_list)):
#     sentence_with_one_mask = tokenizer.convert_tokens_to_string(token_list[0:x]) +  mask_text  + tokenizer.convert_tokens_to_string(token_list[(x+1):len(token_list)])
#     print("Sentence w/ 1 token masked: " + sentence_with_one_mask)
#     mask_fill_output = fill_mask(sentence_with_one_mask, targets = token_list[x])
#     actual_token = mask_fill_output[0]['token_str']
#     actual_token_prob = mask_fill_output[0]['score']
#     print(actual_token + ": " + str(actual_token_prob))
#     print('\n')

In [None]:
# # this function returns the predicted most likely sentence
# def predict(sentence):
#     input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
#     outputs = modelTrained(input_ids, labels = input_ids)
#     loss, prediction_scores = outputs[:2]
#     print(prediction_scores.shape)
#     text = ''
#     for i in range(1, prediction_scores.shape[1]-1):
#         t = np.argmax(prediction_scores[0, i].tolist())
#         options = tokenizer.convert_ids_to_tokens([t])
#         text = text + ' ' + options[0]  
#     return text

# print(predict("Since the previous exam, there has been substantial improvement in the previously described bronchointerstitial to alveolar lung pattern."))

In [None]:
# def pred(sentence):
#     input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
#     outputs = modelTrained(input_ids, labels = input_ids)
#     loss, prediction_scores = outputs[:2] 
#     probabilities = torch.nn.functional.softmax(prediction_scores, dim = -1)
#     for i in range(1, prediction_scores.shape[1]-1): # 1 to N-1 to skip [CLS] & [SEP]
#         j = input_ids[0, i]
#         p = probabilities[0, i, j].tolist()
#         text = tokenizer.convert_ids_to_tokens([j])
#         print(text[0] + "     " + str(p))

In [None]:
# print(pred("Since the previous exam, there has been substantial improvement in the previously described bronchointerstitial to alveolar lung pattern."))

In [None]:
# Tokenize the evaluation file line by line with the project tokenizer.
EVAL_DATASET_ARGS = {
    "file_path": "./telemed_Data_Evaluate_Big/sentence_data.csv",
    "block_size": 128,
}
linebyline_eval_data = LineByLineTextDataset(tokenizer=tokenizer, **EVAL_DATASET_ARGS)

# Probability the model assigns to each actual token of the evaluation
# sentences (no token is masked; the full sentence is fed to the model).
from tqdm import tqdm

# Looping through all rows (about 200,000 per the original note) takes too
# long, so only this many examples are scored.
N_EVAL_EXAMPLES = 20_000

# Collected in a plain list and converted once at the end; appending to a
# NumPy array inside the loop copies the whole array on every token.
token_probs = []

# Dropout must be off for the probabilities to be meaningful and repeatable.
modelTrained.eval()

# No gradients are needed for scoring, so skip building the autograd graph.
with torch.no_grad():
    for example in tqdm(linebyline_eval_data.examples[:N_EVAL_EXAMPLES]):
        # The examples are already-encoded token ids (the token-search cell
        # below treats them as id lists), so they are used as they are.
        # NOTE(review): they presumably already start with [CLS] and end with
        # [SEP]; passing them through tokenizer.encode() again would wrap them
        # in a second pair of special tokens -- confirm for the installed
        # transformers version.
        ids = example["input_ids"] if isinstance(example, dict) else example
        input_ids = torch.as_tensor(ids, dtype=torch.long).unsqueeze(0)

        # Index 0 of the output holds the prediction scores when no labels are
        # passed (no loss is computed, none was used).
        prediction_scores = modelTrained(input_ids)[0]
        probabilities = torch.nn.functional.softmax(prediction_scores, dim=-1)

        # Positions 1 .. N-2: skip the first and last token ([CLS] & [SEP]).
        positions = torch.arange(1, input_ids.shape[1] - 1)
        actual_ids = input_ids[0, positions]
        token_probs.extend(probabilities[0, positions, actual_ids].tolist())

# Same layout as before: one row per token, a single float column.
prob_array = np.array(token_probs, dtype=float).reshape(-1, 1)

In [None]:
# Show the first two collected probabilities as a quick sanity check.
for row_index in (0, 1):
    print(prob_array[row_index])

In [None]:
import seaborn as sns

# Distribution of the probabilities assigned to the actual tokens.
# `distplot` is deprecated in current seaborn; `histplot` with a KDE overlay
# and density scaling draws the equivalent figure. The fallback keeps the cell
# working on seaborn versions that predate `histplot`.
probs_flat = prob_array.ravel()  # plot a flat 1-D sample rather than an (N, 1) array
if hasattr(sns, "histplot"):
    ax = sns.histplot(probs_flat, kde=True, stat="density")
else:
    ax = sns.distplot(probs_flat)
ax.set(
    xlabel="Probability assigned to the actual token",
    ylabel="Density",
    title="Model probability of actual tokens (evaluation sentences)",
);

In [None]:
# Show the first encoded evaluation example (same element as the first entry
# of the 20,000-row slice used for scoring).
first_example = linebyline_eval_data.examples[0]
print(first_example)

In [None]:
# Locate every occurrence of one token in the first few encoded sentences and
# print the tokens surrounding each occurrence.

# id of the token to look for
# token_id = tokenizer.convert_tokens_to_ids('\\')  # backslash
token_id = tokenizer.convert_tokens_to_ids(',')
print("The ID of the token we want to find in text is: " + str(token_id))

# one list of positions per sentence
token_index_list = []

# The sentences have different lengths, so they are kept as a plain list of id
# lists; turning ragged lists into a NumPy array fails on recent NumPy versions.
sentences_ids_array = linebyline_eval_data.examples[:7]

for example in sentences_ids_array:
    # Accept either a bare id sequence or a dict carrying "input_ids".
    raw_ids = example["input_ids"] if isinstance(example, dict) else example
    sentence_ids = [int(one_id) for one_id in raw_ids]

    # all positions at which the token occurs in this sentence
    index_list = [pos for pos, one_id in enumerate(sentence_ids) if one_id == token_id]
    token_index_list.append(index_list)

    for index in index_list:
        # Two tokens before and two after. Clamp the start at 0: a negative
        # start would count from the end of the list and return the wrong
        # (usually empty) window for matches near the beginning.
        window_start = max(index - 2, 0)
        print(tokenizer.convert_ids_to_tokens(sentence_ids[window_start:index + 3]))


In [None]:
# Positions of the searched token, one inner list per inspected sentence.
print(token_index_list)