In [237]:
from transformers import BertTokenizerFast, TFBertModel, BertConfig
from transformers import AutoTokenizer, AutoModel, AutoConfig
from ast import literal_eval
import pandas as pd
import numpy as np
import re
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import tensorflow_text as text

In [258]:
MODEL_NAME = 'bert-base-uncased'

In [77]:
class Preprocessing:
    def __init__(self):
        return

    def organizeData(self, featuresFile, patientNotesFile, trainFile):
        self.fDF = pd.read_csv(featuresFile)
        self.pnDF = pd.read_csv(patientNotesFile)
        self.tDF = pd.read_csv(trainFile)

        self.tDF['annotation_list'] = self.tDF['annotation'].apply(literal_eval)
        self.tDF['location_list'] = self.tDF['location'].apply(literal_eval)

        self.removeORs()
        self.removeSpecialChars()
        
        self.merged = self.tDF.merge(self.pnDF, how="left")
        self.merged = self.merged.merge(self.fDF, how="left")
        self.merged['annotation_length'] = self.tDF['annotation'].apply(len)

    def removeORs(self):
        self.fDF['feature_text'] = self.fDF['feature_text'].apply(lambda x: x.lower())
        for i in range(len(self.fDF['feature_text'])):
            self.fDF.at[i, 'feature_text'] = self.fDF['feature_text'][i].replace("-OR-", ";-").replace("-", " ")

    def removeSpecialChars(self):
        self.pnDF['pn_history'] = self.pnDF['pn_history'].apply(lambda x: x.lower())
        for i in range(len(self.pnDF['pn_history'])):
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("(", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(")", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(":", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(";", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("-", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("/", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\\", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\r\n", "  ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\'", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\"", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(",", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(".", " ")
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bzero\b', ' 0  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bone\b', ' 1 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\btwo\b', ' 2 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bthree\b', '  3  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bfour\b', ' 4  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bfive\b', ' 5  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bsix\b', ' 6 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bseven\b', '  7  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\beight\b', '  8  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bnine\b', ' 9  ', self.pnDF['pn_history'][i])

In [78]:
featuresFile = "..\\CSVs\\features.csv"
patientNotesFile = "..\\CSVs\\patient_notes.csv"
trainFile = "..\\CSVs\\train.csv"

prep = Preprocessing()
prep.organizeData(featuresFile, patientNotesFile, trainFile)

print(prep.fDF['feature_text'][0])
print(prep.pnDF['pn_history'][0])

family history of mi or family history of myocardial infarction
17 year old male  has come to the student health clinic complaining of heart pounding  mr  cleveland s mother has given verbal consent for a history  physical examination  and treatment   began 2 3 months ago sudden intermittent for 2 days lasting 3 4 min  worsening non allev aggrav   associated with dispnea on exersion and rest stressed out about school   reports fe feels like his heart is jumping out of his chest   ros denies chest pain dyaphoresis wt loss chills fever nausea vomiting pedal edeam   pmh non meds  aderol  from a friend  nkda   fh father had mi recently mother has thyroid dz   sh non smoker mariguana 5 6 months ago 3 beers on the weekend  basketball at school   sh no std


In [79]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(prep.fDF['feature_text'][0], prep.pnDF['pn_history'][0])

print("Total Tokens", len(tokens))
print(tokens)
print(tokenizer(prep.pnDF['pn_history'][0]))

Total Tokens 162
['family', 'history', 'of', 'mi', 'or', 'family', 'history', 'of', 'my', '##oca', '##rdial', 'in', '##far', '##ction', '17', 'year', 'old', 'male', 'has', 'come', 'to', 'the', 'student', 'health', 'clinic', 'complaining', 'of', 'heart', 'pounding', 'mr', 'cleveland', 's', 'mother', 'has', 'given', 'verbal', 'consent', 'for', 'a', 'history', 'physical', 'examination', 'and', 'treatment', 'began', '2', '3', 'months', 'ago', 'sudden', 'intermittent', 'for', '2', 'days', 'lasting', '3', '4', 'min', 'worse', '##ning', 'non', 'all', '##ev', 'ag', '##gra', '##v', 'associated', 'with', 'di', '##sp', '##nea', 'on', 'ex', '##ers', '##ion', 'and', 'rest', 'stressed', 'out', 'about', 'school', 'reports', 'fe', 'feels', 'like', 'his', 'heart', 'is', 'jumping', 'out', 'of', 'his', 'chest', 'ro', '##s', 'denies', 'chest', 'pain', 'd', '##ya', '##ph', '##ores', '##is', 'w', '##t', 'loss', 'chill', '##s', 'fever', 'nausea', 'vomiting', 'pedal', 'ed', '##ea', '##m', 'pm', '##h', 'non', 

In [73]:
tokenized_list = tokenizer(
        prep.merged.iloc[0].feature_text,
        prep.merged.iloc[0].pn_history,
        truncation=True,
        max_length=1000,
        padding='max_length',
        return_offsets_mapping=True
)
print(prep.merged.iloc[0].pn_history[668:693])

zipped = zip(tokenized_list.sequence_ids(), tokenized_list["offset_mapping"])

idx, (seq_id, offsets) = next(enumerate(zipped))
if not seq_id or seq_id == 0:
    print("Seq ID zero, so level is -1 also")

seq_id = 1 #assume
loc_list = [668, 693]

for idx, (seq_id, offsets)  in enumerate(zip(tokenized_list.sequence_ids(), tokenized_list["offset_mapping"])):
    token_start, token_end = offsets
    for feature_start, feature_end in [loc_list]:
        if token_start >= feature_start and token_end <= feature_end:
            print(f"Word {prep.merged.iloc[0].pn_history[token_start:token_end]}, label: 1")

mom with  thyroid disease
[(0, 0), (0, 6), (7, 14), (15, 17), (18, 20), (21, 23), (24, 30), (31, 38), (39, 41), (42, 44), (44, 47), (47, 52), (53, 55), (55, 58), (58, 63), (0, 0), (0, 2), (2, 3), (5, 7), (7, 9), (10, 11), (12, 20), (21, 25), (26, 29), (29, 32), (32, 38), (40, 47), (48, 55), (56, 57), (58, 59), (60, 66), (67, 69), (70, 82), (83, 91), (92, 94), (96, 101), (102, 109), (110, 118), (119, 122), (123, 125), (126, 128), (129, 134), (137, 138), (139, 143), (144, 147), (148, 154), (155, 156), (157, 163), (164, 168), (169, 172), (173, 175), (176, 183), (185, 188), (189, 193), (194, 198), (199, 202), (203, 208), (209, 217), (218, 221), (222, 226), (227, 229), (230, 232), (233, 235), (236, 240), (241, 246), (247, 249), (250, 254), (255, 258), (260, 263), (264, 267), (268, 272), (273, 276), (276, 281), (281, 285), (288, 290), (291, 295), (296, 303), (304, 307), (307, 310), (310, 312), (313, 316), (316, 320), (321, 324), (324, 328), (328, 329), (331, 340), (341, 343), (344, 349), (35

In [197]:
def create_model():
    ## BERT encoder
    configuration = BertConfig().from_pretrained('bert-base-uncased',output_attentions=False,output_hidden_states=False,return_dict =True)
    encoder = TFBertModel(configuration)#.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name = 'input_ids')
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name = 'token_type_ids')
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name = 'attention_mask')
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [198]:
max_len = 384
#configuration = BertConfig()
with tf.device('/cpu:0'):
    model = create_model()



In [185]:
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 384)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 384)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 384)]        0           []                               
                                                                                                  
 tf_bert_model_11 (TFBertModel)  TFBaseModelOutputWi  109482240  ['input_ids[0][0]',              
                                thPooling(last_hidd               'attention_mask[0][0]',   

In [62]:
print(prep.merged.drop(columns=['location']).iloc[0])

id                                                         00016_000
case_num                                                           0
pn_num                                                            16
feature_num                                                        0
annotation                          ['dad with recent heart attcak']
annotation_list                       [dad with recent heart attcak]
location_list                                              [696 724]
pn_history         hpi  17yo m presents with palpitations  patien...
feature_text       family history of mi or family history of myoc...
Name: 0, dtype: object


In [65]:
print(type(prep.merged['location_list'].iloc[0][0]))
print(prep.merged['location_list'].iloc[0])

<class 'str'>
['696 724']


In [180]:
def create_label(text, annotation_length, location_list):
    #tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    encoded = tokenizer(text,
                        add_special_tokens=False,
                        max_length=max_len,
                        padding="max_length",
                        return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return label

In [248]:
label = create_label(prep.merged['pn_history'][0], prep.merged['annotation_length'][0], prep.merged['location_list'][0])
label = np.asarray(label, dtype=int).reshape(1, max_len)[0]
print(label)
print(label.shape)

print("annotation location: ", prep.merged['location_list'][2])
print(prep.merged['pn_history'][2][203:217])

print(f'\nlabel = 1: {np.where(label == 1)}')
tokenized_text = tokenizer.tokenize(prep.merged['pn_history'][2], add_special_tokens=False)
print(f'tokenized_text[181:187]: {tokenized_text[40:42]}')

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  1  1  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

In [257]:
input_data = tokenizer(
        prep.merged.iloc[0].pn_history,
        prep.merged.iloc[0].feature_text,
        #truncation=True,
        max_length=max_len,
        padding='max_length',
        return_offsets_mapping=False
)
#print(input_data)
test_input_data = {}
test_input_data['input_ids'] = np.asarray(input_data['input_ids'], dtype=int).reshape(1,max_len)
test_input_data['token_type_ids'] = np.asarray(input_data['token_type_ids'], dtype=int).reshape(1,max_len)
test_input_data['attention_mask'] = np.asarray(input_data['attention_mask'], dtype=int).reshape(1,max_len)
input_data = tokenizer(
        prep.merged.iloc[40].pn_history,
        prep.merged.iloc[40].feature_text,
        #truncation=True,
        max_length=max_len,
        padding='max_length',
        return_offsets_mapping=False
)
test_input_data['input_ids']=np.append(test_input_data['input_ids'],np.asarray(input_data['input_ids'], dtype=int).reshape(1,max_len))
test_input_data['token_type_ids'] = np.asarray(input_data['token_type_ids'], dtype=int).reshape(1,max_len)
test_input_data['attention_mask'] = np.asarray(input_data['attention_mask'], dtype=int).reshape(1,max_len)
print(type(test_input_data['input_ids']))
print(test_input_data['input_ids'])

<class 'numpy.ndarray'>
[  101  6522  2072  2459  7677  1049  7534  2007 14412 23270 10708  5776
  4311  1017  1018  2706  1997 23852  4178  1997  2540  6012  9836  2041
  1997  2026  3108  1016  2420  3283  2076  1037  4715  2208  2018  2019
  2792  2021  2023  2051  2018  3108  3778  1998  2371  2004  2065  2002
  2020  2183  2000  3413  2041  2106  2025  4558  9530 18436  2791  1997
  3602  5776  2203  5668  2229  8273  7741  5587 21673  2140  3952  2000
  2817  1015  1017  2335  2566  2733  2077  3522  4715  2208  2165  5587
  2121  7941  2140  2305  2077  1998  2851  1997  2208 23439  2460  2791
  1997  3052 22939  8458 16610  2483  9016  2015 10720  2015 14978 16342
  3431  1999  3637  3431  1999  4432  4994 21419 29025  2078  3431  1999
  6812  2884  2030 24471  3981  2854 14243  7610  2232  2595  3904  1054
  2595  3594  2814  5587  2121  7941  2140  1042  2232  2595  3566  2007
 29610  4295  3611  2007  3522  2540  2012 13535  4817  2035  3904 10047
 23041 22318  2039  2000  3

In [252]:
print(test_input_data['attention_mask'].shape)
print(label.shape)
#configuration = BertConfig().from_pretrained('bert-base-uncased')
#model = AutoModel.from_pretrained('bert-base-uncased', config = configuration)
model.fit(
    test_input_data,
    label,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=1,
)

(384,)
(384,)


StagingError: in user code:

    File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    StagingError: Exception encountered when calling layer "tf_bert_model_16" (type TFBertModel).
    
    in user code:
    
        File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 887, in call  *
            outputs = self.bert(
        File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        StagingError: Exception encountered when calling layer "bert" (type TFBertMainLayer).
        
        in user code:
        
            File "C:\Users\Serhan\anaconda3\envs\tf-cpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 658, in call  *
                extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
        
            IndexError: list index out of range
        
        
        Call arguments received:
          • input_ids=tf.Tensor(shape=(1,), dtype=int32)
          • attention_mask=tf.Tensor(shape=(1,), dtype=int32)
          • token_type_ids=tf.Tensor(shape=(1,), dtype=int32)
          • position_ids=None
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
          • kwargs=<class 'inspect._empty'>
    
    
    Call arguments received:
      • input_ids=tf.Tensor(shape=(1,), dtype=int32)
      • attention_mask=tf.Tensor(shape=(1,), dtype=int32)
      • token_type_ids=tf.Tensor(shape=(1,), dtype=int32)
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True
      • kwargs=<class 'inspect._empty'>


In [259]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
config = AutoConfig.from_pretrained(MODEL_NAME)