In [16]:
from transformers import BertTokenizerFast, TFBertModel, BertConfig
from transformers import AutoTokenizer, AutoModel, AutoConfig, TFAutoModel
from ast import literal_eval
import pandas as pd
import numpy as np
import re
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import tensorflow_text as text
from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm

In [2]:
MODEL_NAME = 'bert-base-uncased'
SEQUENCE_LENGTH = 512

In [3]:
class Preprocessing:
    def __init__(self):
        return

    def organizeData(self, featuresFile, patientNotesFile, trainFile):
        self.fDF = pd.read_csv(featuresFile)
        self.pnDF = pd.read_csv(patientNotesFile)
        self.tDF = pd.read_csv(trainFile)

        self.tDF['annotation_list'] = self.tDF['annotation'].apply(literal_eval)
        self.tDF['location_list'] = self.tDF['location'].apply(literal_eval)

        self.removeORs()
        self.removeSpecialChars()
        
        self.merged = self.tDF.merge(self.pnDF, how="left")
        self.merged = self.merged.merge(self.fDF, how="left")
        #self.merged['annotation_length'] = self.tDF['annotation'].apply(len)

    def removeORs(self):
        self.fDF['feature_text'] = self.fDF['feature_text'].apply(lambda x: x.lower())
        for i in range(len(self.fDF['feature_text'])):
            self.fDF.at[i, 'feature_text'] = self.fDF['feature_text'][i].replace("-OR-", ";-").replace("-", " ")

    def removeSpecialChars(self):
        self.pnDF['pn_history'] = self.pnDF['pn_history'].apply(lambda x: x.lower())
        for i in range(len(self.pnDF['pn_history'])):
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("(", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(")", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(":", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(";", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("-", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("/", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\\", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\r\n", "  ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\'", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace("\"", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(",", " ")
            self.pnDF.at[i, 'pn_history'] = self.pnDF['pn_history'][i].replace(".", " ")
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bzero\b', ' 0  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bone\b', ' 1 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\btwo\b', ' 2 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bthree\b', '  3  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bfour\b', ' 4  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bfive\b', ' 5  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bsix\b', ' 6 ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bseven\b', '  7  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\beight\b', '  8  ', self.pnDF['pn_history'][i])
            self.pnDF.at[i, 'pn_history'] = re.sub(r'\bnine\b', ' 9  ', self.pnDF['pn_history'][i])

In [4]:
featuresFile = "..\\CSVs\\features.csv"
patientNotesFile = "..\\CSVs\\patient_notes.csv"
trainFile = "..\\CSVs\\train.csv"

prep = Preprocessing()
prep.organizeData(featuresFile, patientNotesFile, trainFile)

print(prep.fDF['feature_text'][0])
print(prep.pnDF['pn_history'][0])

family history of mi or family history of myocardial infarction
17 year old male  has come to the student health clinic complaining of heart pounding  mr  cleveland s mother has given verbal consent for a history  physical examination  and treatment   began 2 3 months ago sudden intermittent for 2 days lasting 3 4 min  worsening non allev aggrav   associated with dispnea on exersion and rest stressed out about school   reports fe feels like his heart is jumping out of his chest   ros denies chest pain dyaphoresis wt loss chills fever nausea vomiting pedal edeam   pmh non meds  aderol  from a friend  nkda   fh father had mi recently mother has thyroid dz   sh non smoker mariguana 5 6 months ago 3 beers on the weekend  basketball at school   sh no std


In [5]:
prep.merged.head(5)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_list,location_list,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],[dad with recent heart attcak],[696 724],hpi 17yo m presents with palpitations patien...,family history of mi or family history of myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],"[mom with ""thyroid disease]",[668 693],hpi 17yo m presents with palpitations patien...,family history of thyroid disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],[chest pressure],[203 217],hpi 17yo m presents with palpitations patien...,chest pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']","[intermittent episodes, episode]","[70 91, 176 183]",hpi 17yo m presents with palpitations patien...,intermittent symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],[felt as if he were going to pass out],[222 258],hpi 17yo m presents with palpitations patien...,lightheaded


In [6]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
tokens = tokenizer.tokenize(prep.fDF['feature_text'][0], prep.pnDF['pn_history'][0])

print("Total Tokens", len(tokens))
print(tokens)
print(tokenizer(prep.pnDF['pn_history'][0]))

Total Tokens 162
['family', 'history', 'of', 'mi', 'or', 'family', 'history', 'of', 'my', '##oca', '##rdial', 'in', '##far', '##ction', '17', 'year', 'old', 'male', 'has', 'come', 'to', 'the', 'student', 'health', 'clinic', 'complaining', 'of', 'heart', 'pounding', 'mr', 'cleveland', 's', 'mother', 'has', 'given', 'verbal', 'consent', 'for', 'a', 'history', 'physical', 'examination', 'and', 'treatment', 'began', '2', '3', 'months', 'ago', 'sudden', 'intermittent', 'for', '2', 'days', 'lasting', '3', '4', 'min', 'worse', '##ning', 'non', 'all', '##ev', 'ag', '##gra', '##v', 'associated', 'with', 'di', '##sp', '##nea', 'on', 'ex', '##ers', '##ion', 'and', 'rest', 'stressed', 'out', 'about', 'school', 'reports', 'fe', 'feels', 'like', 'his', 'heart', 'is', 'jumping', 'out', 'of', 'his', 'chest', 'ro', '##s', 'denies', 'chest', 'pain', 'd', '##ya', '##ph', '##ores', '##is', 'w', '##t', 'loss', 'chill', '##s', 'fever', 'nausea', 'vomiting', 'pedal', 'ed', '##ea', '##m', 'pm', '##h', 'non', 

In [7]:
tokenized_list = tokenizer(
        prep.merged.iloc[0].feature_text,
        prep.merged.iloc[0].pn_history,
        truncation=True,
        max_length=1000,
        padding='max_length',
        return_offsets_mapping=True
)
print(prep.merged.iloc[0].pn_history[668:693])

zipped = zip(tokenized_list.sequence_ids(), tokenized_list["offset_mapping"])

idx, (seq_id, offsets) = next(enumerate(zipped))
if not seq_id or seq_id == 0:
    print("Seq ID zero, so level is -1 also")

seq_id = 1 #assume
loc_list = [668, 693]

for idx, (seq_id, offsets)  in enumerate(zip(tokenized_list.sequence_ids(), tokenized_list["offset_mapping"])):
    token_start, token_end = offsets
    for feature_start, feature_end in [loc_list]:
        if token_start >= feature_start and token_end <= feature_end:
            print(f"Word {prep.merged.iloc[0].pn_history[token_start:token_end]}, label: 1")

mom with  thyroid disease
Seq ID zero, so level is -1 also
Word mom, label: 1
Word with, label: 1
Word thyroid, label: 1
Word disease, label: 1


In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
config = AutoConfig.from_pretrained(MODEL_NAME)

In [9]:
EMPTY =  'EMPTY'
CLASSES = [EMPTY,]+prep.fDF.feature_num.unique().tolist()
print(CLASSES)
label_encoder = LabelEncoder()
label_encoder.fit(CLASSES)
features = np.asarray(prep.merged['feature_num'], dtype = str)
print(features)
prep.merged['TARGET'] = label_encoder.transform(features)
N_CLASSES = len(label_encoder.classes_)
EMPTY_IDX = label_encoder.transform([EMPTY,]) [0]
print(prep.merged['TARGET'])

['EMPTY', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 700, 701, 702, 703, 704, 705, 706, 707, 708, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916]
['0' '1' '2' ... '914' '915' '916']
0          0
1          1
2         18
3         36
4         53
        ... 
14295    138
14296    139
14297    140
14298    141
14299    142
Name: TARGET, Length: 14300, dtype: int32


In [12]:
def decode_location(locations):
    for x in ["[","]","'"]:
        locations = locations.replace(x,'')
    locations = locations.replace(',',';')
    locations = locations.split(";")
    res = []
    for location in locations:
        if location:
            x,y = location.split()
            res.append((int(x),int(y)))
    return sorted(res,key=lambda x:x[0])


In [13]:
sequences, labels, masks = [], [], []
for g1 in tqdm(prep.merged.groupby('pn_num')):
    gdf = g1[1]
    pn_history  = gdf.iloc[0].pn_history

    tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
    sequence = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    label = np.array([EMPTY_IDX for _ in range(SEQUENCE_LENGTH)])

    # BUILD THE TARGET ARRAY
    offsets = tokens['offset_mapping']
    label_empty = True
    for index, row in gdf.iterrows():
        TARGET = row.TARGET
        for i, (w_start, w_end) in enumerate(offsets):
            for start,end in decode_location(row.location):
                if w_start < w_end and (w_start >= start) and (end >= w_end):
                    label[i] = TARGET
                    label_empty = False
                if w_start >= w_end:
                    break
    if not label_empty:
        sequences.append(sequence)
        masks.append(attention_mask)
        labels.append(label)

sequences = np.array(sequences).astype(np.int32)
masks = np.array(masks).astype(np.uint8)
labels = np.array(tf.keras.utils.to_categorical(labels,N_CLASSES)).astype(np.uint8)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [14]:
def build_model():
    
    tokens = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'attention', dtype=tf.int32)
    
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME,config=config)
    
    out = backbone(tokens, attention_mask=attention)[0]
    out = tf.keras.layers.Dropout(0.2)(out)
    out = tf.keras.layers.Dense(N_CLASSES, activation='softmax')(out)
    
    model = tf.keras.Model([tokens,attention],out)
    
    return model

In [18]:
with tf.device('/cpu:0'):
    model = build_model()

    callback = tf.keras.callbacks.EarlyStopping(monitor='loss',mode='min', patience=3)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                  loss=tf.keras.losses.categorical_crossentropy,metrics=['acc',])

    history = model.fit((sequences,masks),labels,
                        batch_size=12,
                        epochs=1,
                        callbacks=[callback,])

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


 1/84 [..............................] - ETA: 1:50:39 - loss: 5.4660 - acc: 3.2552e-04

KeyboardInterrupt: 