In [31]:
import numpy as np
import os
import pandas as pd
import re

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import time


In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
os.chdir("/content/drive/MyDrive/DLH project")

In [9]:
t1 = time.time()
clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")
# CHARTDATE and CHARTTIME apparrently have mixed types, but that does not matter for us
t2 = time.time()
print(t2 - t1)

  clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")


58.83007049560547


In [10]:
annotations_df = pd.read_csv("annotations.csv")
annotations_df.shape

(1610, 18)

In [11]:
# Get all documents wtih a combination of hospital admission ID and subject ID matching one
# used in the study
merged_df = pd.merge(annotations_df, clinical_notes_df, left_on = ["Hospital.Admission.ID", "subject.id"], right_on = ["HADM_ID", "SUBJECT_ID"])

In [12]:
merged_df.shape

(56839, 29)

In [13]:
merged_df["CATEGORY"].value_counts()

Nursing/other        21066
Radiology            13135
Nursing               7520
Physician             5059
ECG                   5026
Discharge summary     1976
Respiratory           1155
Echo                  1021
Nutrition              325
General                290
Rehab Services         166
Social Work             64
Case Management         36
Name: CATEGORY, dtype: int64

In [14]:
# Only the discharge summaries are relevant
merged_df = merged_df[merged_df["CATEGORY"] == "Discharge summary"]
merged_df.shape

(1976, 29)

In [15]:
# Number of unique combinations of hospital admission ID and subject ID
merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'}).shape

(1560, 3)

In [16]:
temp = merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'})

# Number of unique combinations of hospital admission ID and subject ID
# which have more than one discharge summary
print(temp[temp["count"] > 1].shape)

# Number of unique combinations of hospital admission ID and subject ID
# which have only one discharge summary
print(temp[temp["count"] == 1].shape)

(219, 3)
(1341, 3)


In [17]:
# A dataframe containing the hospital admission ID and subject ID for all
# patients used in the study and who have only one discharge summary in MIMIC-III
# We have no way of telling which set of labels corresponds to which discharge summary
# for each such combination of IDs, due to the labels being somewhat mislabeled (the chart.time
# field contains either a copy of the hospital admission ID or 999999 for each set of labels,
# instead of an actual time), so we can only use discharge summaries from these patients
ids_for_non_duplicate = temp[temp["count"] == 1][["HADM_ID", "SUBJECT_ID"]]

In [18]:
# This contains the hospital admission ID, subject ID, discharge summary text, and labels
# for all of the patients we will be using in our study
labelled_corpus_df = pd.merge(merged_df, ids_for_non_duplicate, left_on = ["HADM_ID", "SUBJECT_ID"], right_on = ["HADM_ID", "SUBJECT_ID"])
labelled_corpus_df = labelled_corpus_df[["HADM_ID",
                                         "SUBJECT_ID",
                                         "TEXT",
                                         "Advanced.Cancer",
                                         "Advanced.Heart.Disease",
                                         "Advanced.Lung.Disease",
                                         "Chronic.Neurological.Dystrophies",
                                         "Chronic.Pain.Fibromyalgia",
                                         "Alcohol.Abuse",
                                         "Other.Substance.Abuse",
                                         "Obesity",
                                         "Schizophrenia.and.other.Psychiatric.Disorders",
                                         "Depression"]]

In [19]:
labelled_corpus_df.head(1)

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression
0,118003.0,3644,Admission Date: [**2200-4-7**] Discharge ...,0,0,0,0,1,0,0,0,0,1


In [20]:
labelled_corpus_df.iloc[0]["TEXT"]

"Admission Date:  [**2200-4-7**]     Discharge Date:  [**2200-4-10**]\n\nDate of Birth:   [**2146-9-21**]     Sex:  F\n\nService:  CARDIAC INTENSIVE CARE MEDICINE\n\nCHIEF COMPLAINT:  The patient was admitted to the Cardiac\nIntensive Care Unit Medicine Service on [**2200-4-7**], with the\nchief complaint of acute myocardial infarction and fever.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 53 year old\nwhite female with a history of coronary artery disease,\nhypertension, hypercholesterolemia and two pack per day\ntobacco use with previous coronary artery bypass graft\nsurgery presenting to an outside hospital on [**2200-4-6**], with a\ntwo day history of fevers and confusion.  The patient had a\nCT scan of the chest at that time which revealed pneumonia by\nreport in the left lower lobe.\n\nWhile in the outside hospital Emergency Department, the\npatient complained of chest pain.  The patient states that\nshe has had this pain for approximately two weeks with no\nrelief.  She was

In [21]:
# From provided code
# Did not remove commas??
# Separates "patient's" into "patient 's"??? Purpose???? -> should be caught with n-grams
def clean_str(string):
    """
    Tokenization/string cleaning.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() # We include lower()??? # .lower() word2vec is case sensitive



In [22]:
clean_str(labelled_corpus_df.iloc[0]["TEXT"])

"admission date 2200 4 7 discharge date 2200 4 10 date of birth 2146 9 21 sex f service cardiac intensive care medicine chief complaint the patient was admitted to the cardiac intensive care unit medicine service on 2200 4 7 , with the chief complaint of acute myocardial infarction and fever history of present illness the patient is a 53 year old white female with a history of coronary artery disease , hypertension , hypercholesterolemia and two pack per day tobacco use with previous coronary artery bypass graft surgery presenting to an outside hospital on 2200 4 6 , with a two day history of fevers and confusion the patient had a ct scan of the chest at that time which revealed pneumonia by report in the left lower lobe while in the outside hospital emergency department , the patient complained of chest pain the patient states that she has had this pain for approximately two weeks with no relief she was given levofloxacin for apparent community acquired pneumonia and cardiac enzymes w

In [23]:
# Takes ~25 seconds once, then just 1.6344 seconds when run later?
t1 = time.time()
labelled_corpus_df["Cleaned Text"] = labelled_corpus_df.apply(lambda row : clean_str(row["TEXT"]), axis = 1)
labelled_corpus_df = labelled_corpus_df.drop(['TEXT'], axis=1)
t2 = time.time()
print(t2 - t1)

1.667518138885498


In [24]:
labelled_corpus_df.iloc[2]["Cleaned Text"]

"admission date 2167 5 19 discharge date 2167 6 11 date of birth 2112 10 13 sex m service medicine allergies penicillins attending first name3 ( lf ) 1828 chief complaint squamous cell carcinoma , bacteremia , need for peg and extraction of teeth major surgical or invasive procedure none major had extraction of all teeth , and placment of gastrostomy tube history of present illness mr known lastname 40332 is a 54 year old man with type i dm , ckd stage v on dialysis , coronary artery disease who was diagnosed a little over one month ago with scc of head and neck he is admitted now for management of multiple issues since his diagnosis , radiation oncology , medical oncology and dental have spent extensive time and energy arranging for treatment plan ultimately , decision made to pursue peg tube , followed by teeth extraction ( very poor dentition ) followed by radiation treatment and erbitux he was seen in the hospital clinic today for planned peg tube but gi unable to place due to conc

In [25]:
labelled_corpus_df.head(1)

Unnamed: 0,HADM_ID,SUBJECT_ID,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression,Cleaned Text
0,118003.0,3644,0,0,0,0,1,0,0,0,0,1,admission date 2200 4 7 discharge date 2200 4 ...


In [26]:
model = LogisticRegression(solver='liblinear')
#model = LogisticRegression(n_jobs=32) #differnet model takes longer time may be more accurate

In [27]:
#model fit for bag of words and mulitple n-grams
def bag_of_words(model, X_train, X_test, y_train, y_test):
    scores = np.zeros((10,5))
    #loop through the 10 different phenotypes
    for i in range(y_train.shape[1]):
        #Fit the model
        model.fit(X_train, y_train.iloc[:,i])
        #predit the score for each of the phenotypes
        y_predict = model.predict(X_test)
        #save the scores to be printed later for each phenotype
        scores[i][0] = metrics.accuracy_score(y_test.iloc[:,i], y_predict)
        scores[i][1] = metrics.precision_score(y_test.iloc[:,i], y_predict)
        scores[i][2] = metrics.recall_score(y_test.iloc[:,i], y_predict)
        scores[i][3] = metrics.f1_score(y_test.iloc[:,i], y_predict)
        scores[i][4] = metrics.roc_auc_score(y_test.iloc[:,i], y_predict)
    return scores

In [33]:
#returns X_train, X_test, y_train, y_test and feature_names 
#from labelled_corpus_df using the n_gram_max as max and 1 as min for connected words.
def get_train_test_data(n_gram_max):
    #get the text grouping for n-gram were there are n words connected
    vectorizer = CountVectorizer(ngram_range=(1,n_gram_max), dtype='int16') #using int16 to reduce size still plenty big
    X = vectorizer.fit_transform(labelled_corpus_df.loc[:,"Cleaned Text"])
    #split the data to 80% train and 20% test
    X_train, X_test, y_train, y_test = train_test_split(X, labelled_corpus_df.iloc[:,2:12],test_size=0.2)
    #return feature_names to show what names or words are connected
    feature_names = vectorizer.get_feature_names_out()
    return X_train, X_test, y_train, y_test, feature_names

In [34]:
#run the model for [1]-gram bag-of-words
t1 = time.time()
#get the text grouping for 1-gram
X_train, X_test, y_train, y_test, feature_names_1 = get_train_test_data(1) #the 1 in get_train_test_data means max 1 word connected

scores_1 = bag_of_words(model, X_train, X_test, y_train, y_test)
t2 = time.time()
time_1_gram = t2 - t1
print("time of [1]-gram:", time_1_gram)

time of [1]-gram: 10.505383014678955


In [40]:
#show first 10 feature_names for [1]-gram
print(feature_names_1[0:10])

['00' '000' '0000' '000mcg' '000mg' '000u' '000unit' '000units' '000wbc'
 '001']


In [36]:
#put in dataframe with labels for easier viewing 
df_scores_1 = pd.DataFrame(scores_1, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])
df_scores_1.style.set_caption("[1]-gram bag-of-words with time of: "+ str(round(time_1_gram,2)) + " seconds")

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.929368,0.736842,0.5,0.595745,0.739627
Advanced.Heart.Disease,0.862454,0.703704,0.395833,0.506667,0.679817
Advanced.Lung.Disease,0.907063,0.473684,0.375,0.418605,0.667092
Chronic.Neurological.Dystrophies,0.758364,0.522727,0.343284,0.414414,0.619662
Chronic.Pain.Fibromyalgia,0.836431,0.571429,0.48,0.521739,0.698904
Alcohol.Abuse,0.910781,0.777778,0.411765,0.538462,0.697372
Other.Substance.Abuse,0.929368,0.9,0.333333,0.486486,0.664601
Obesity,0.903346,0.5,0.115385,0.1875,0.551519
Schizophrenia.and.other.Psychiatric.Disorders,0.858736,0.75,0.403846,0.525,0.685794
Depression,0.758364,0.586207,0.453333,0.511278,0.664811


In [37]:
#run the model for [1-2]-gram bag-of-words
t1 = time.time()
X_train, X_test, y_train, y_test, feature_names_2 = get_train_test_data(2)  
scores_2 = bag_of_words(model, X_train, X_test, y_train, y_test)
t2 = time.time()
time_2_gram = t2 - t1
print("time of [1-2]-gram:", time_2_gram)

time of [1-2]-gram: 44.59006452560425


In [39]:
#show first 10 feature_names for [1-2]-gram
print(feature_names_2[0:10])

['00' '00 00' '00 16' '00 20' '00 21' '00 2112' '00 2122' '00 2140'
 '00 2177' '00 2199']


In [41]:
#put in dataframe with labels for easier viewing 
df_scores_2 = pd.DataFrame(scores_2, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])
df_scores_2.style.set_caption("[1-2]-gram bag-of-words with time of: "+ str(round(time_2_gram,2)) + " seconds")

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.903346,0.772727,0.447368,0.566667,0.712862
Advanced.Heart.Disease,0.899628,0.8,0.47619,0.597015,0.727082
Advanced.Lung.Disease,0.895911,0.6,0.2,0.3,0.591632
Chronic.Neurological.Dystrophies,0.806691,0.7,0.328125,0.446809,0.642111
Chronic.Pain.Fibromyalgia,0.821561,0.5625,0.346154,0.428571,0.640819
Alcohol.Abuse,0.907063,1.0,0.264706,0.418605,0.632353
Other.Substance.Abuse,0.921933,0.428571,0.15,0.222222,0.566968
Obesity,0.895911,0.5,0.035714,0.066667,0.515782
Schizophrenia.and.other.Psychiatric.Disorders,0.877323,0.6,0.324324,0.421053,0.644921
Depression,0.795539,0.54,0.457627,0.495413,0.674052


In [42]:
#run the model for [1-3]-gram bag-of-words
t1 = time.time()
X_train, X_test, y_train, y_test, feature_names_3 = get_train_test_data(3)  
scores_3 = bag_of_words(model, X_train, X_test, y_train, y_test)
t2 = time.time()
time_3_gram = t2 - t1
print("time of [1-3]-gram:", time_3_gram)

time of [1-3]-gram: 123.26719999313354


In [43]:
#show first 10 feature_names for [1-3]-gram
print(feature_names_3[0:10])

['00' '00 00' '00 00 on' '00 16' '00 16 furosemide' '00 20' '00 20 14'
 '00 21' '00 21 job' '00 2112']


In [44]:
#put in dataframe with labels for easier viewing 
df_scores_3 = pd.DataFrame(scores_3, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])
df_scores_3.style.set_caption("[1-3]-gram bag-of-words with time of: "+ str(round(time_3_gram,2)) + " seconds")

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.925651,1.0,0.310345,0.473684,0.655172
Advanced.Heart.Disease,0.821561,0.733333,0.354839,0.478261,0.658096
Advanced.Lung.Disease,0.895911,0.555556,0.172414,0.263158,0.577874
Chronic.Neurological.Dystrophies,0.817844,0.741935,0.359375,0.484211,0.660175
Chronic.Pain.Fibromyalgia,0.795539,0.6,0.20339,0.303797,0.582647
Alcohol.Abuse,0.899628,0.8125,0.351351,0.490566,0.66921
Other.Substance.Abuse,0.933086,0.769231,0.4,0.526316,0.693852
Obesity,0.925651,0.0,0.0,0.0,0.498
Schizophrenia.and.other.Psychiatric.Disorders,0.843866,0.647059,0.234043,0.34375,0.603508
Depression,0.736059,0.581395,0.320513,0.413223,0.613136


In [45]:
#run the model for [1-4]-gram bag-of-words
t1 = time.time()
X_train, X_test, y_train, y_test, feature_names_4 = get_train_test_data(4)  
scores_4 = bag_of_words(model, X_train, X_test, y_train, y_test)
t2 = time.time()
time_4_gram = t2 - t1
print("time of [1-4]-gram:", time_4_gram)

time of [1-4]-gram: 181.1189033985138


In [46]:
#show first 10 feature_names for [1-4]-gram
print(feature_names_4[0:10])

['00' '00 00' '00 00 on' '00 00 on 2146' '00 16' '00 16 furosemide'
 '00 16 furosemide 40' '00 20' '00 20 14' '00 20 14 job']


In [47]:
#put in dataframe with labels for easier viewing 
df_scores_4 = pd.DataFrame(scores_4, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])
df_scores_4.style.set_caption("[1-4]-gram bag-of-words with time of: "+ str(round(time_4_gram,2)) + " seconds")

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.903346,0.875,0.21875,0.35,0.607265
Advanced.Heart.Disease,0.895911,0.76,0.463415,0.575758,0.718549
Advanced.Lung.Disease,0.881041,0.666667,0.171429,0.272727,0.579304
Chronic.Neurological.Dystrophies,0.806691,0.727273,0.258065,0.380952,0.61454
Chronic.Pain.Fibromyalgia,0.802974,0.545455,0.218182,0.311688,0.585726
Alcohol.Abuse,0.947955,1.0,0.517241,0.681818,0.758621
Other.Substance.Abuse,0.921933,0.636364,0.291667,0.4,0.63767
Obesity,0.925651,0.5,0.05,0.090909,0.522992
Schizophrenia.and.other.Psychiatric.Disorders,0.881041,0.9375,0.326087,0.483871,0.660801
Depression,0.747212,0.644444,0.358025,0.460317,0.636459


In [48]:
#run the model for [1-5]-gram bag-of-words
t1 = time.time()
X_train, X_test, y_train, y_test, feature_names_5 = get_train_test_data(5)  
scores_5 = bag_of_words(model, X_train, X_test, y_train, y_test)
t2 = time.time()
time_5_gram = t2 - t1
print("time of [1-5]-gram:", time_5_gram)

  _warn_prf(average, modifier, msg_start, len(result))


time of [1-5]-gram: 243.22261357307434


In [49]:
#show first 10 feature_names for [1-5]-gram
print(feature_names_5[0:10])

['00' '00 00' '00 00 on' '00 00 on 2146' '00 00 on 2146 11' '00 16'
 '00 16 furosemide' '00 16 furosemide 40' '00 16 furosemide 40 mg' '00 20']


In [50]:
#put in dataframe with labels for easier viewing 
df_scores_5 = pd.DataFrame(scores_5, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])
df_scores_5.style.set_caption("[1-5]-gram bag-of-words with time of: "+ str(round(time_5_gram,2)) + " seconds")

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.925651,1.0,0.2,0.333333,0.6
Advanced.Heart.Disease,0.855019,0.6,0.27907,0.380952,0.621836
Advanced.Lung.Disease,0.869888,0.5,0.028571,0.054054,0.512149
Chronic.Neurological.Dystrophies,0.821561,0.789474,0.254237,0.384615,0.617595
Chronic.Pain.Fibromyalgia,0.828996,0.588235,0.204082,0.30303,0.586132
Alcohol.Abuse,0.892193,1.0,0.275,0.431373,0.6375
Other.Substance.Abuse,0.895911,0.8,0.235294,0.363636,0.613392
Obesity,0.918216,0.0,0.0,0.0,0.5
Schizophrenia.and.other.Psychiatric.Disorders,0.840149,0.846154,0.211538,0.338462,0.601161
Depression,0.739777,0.6,0.272727,0.375,0.599905
