In [1]:
import numpy as np
import os
import pandas as pd
import re

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import time


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# New Section

# New Section

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
os.chdir("/content/drive/MyDrive/DLH project")

In [5]:
t1 = time.time()
clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")
# CHARTDATE and CHARTTIME apparrently have mixed types, but that does not matter for us
t2 = time.time()
print(t2 - t1)

  clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")


89.28294348716736


In [6]:
annotations_df = pd.read_csv("annotations.csv")
annotations_df.shape

(1610, 18)

In [7]:
# Get all documents wtih a combination of hospital admission ID and subject ID matching one
# used in the study
merged_df = pd.merge(annotations_df, clinical_notes_df, left_on = ["Hospital.Admission.ID", "subject.id"], right_on = ["HADM_ID", "SUBJECT_ID"])

In [8]:
merged_df.shape

(56839, 29)

In [9]:
merged_df["CATEGORY"].value_counts()

Nursing/other        21066
Radiology            13135
Nursing               7520
Physician             5059
ECG                   5026
Discharge summary     1976
Respiratory           1155
Echo                  1021
Nutrition              325
General                290
Rehab Services         166
Social Work             64
Case Management         36
Name: CATEGORY, dtype: int64

In [10]:
# Only the discharge summaries are relevant
merged_df = merged_df[merged_df["CATEGORY"] == "Discharge summary"]
merged_df.shape

(1976, 29)

In [11]:
# Number of unique combinations of hospital admission ID and subject ID
merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'}).shape

(1560, 3)

In [12]:
temp = merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'})

# Number of unique combinations of hospital admission ID and subject ID
# which have more than one discharge summary
print(temp[temp["count"] > 1].shape)

# Number of unique combinations of hospital admission ID and subject ID
# which have only one discharge summary
print(temp[temp["count"] == 1].shape)

(219, 3)
(1341, 3)


In [13]:
# A dataframe containing the hospital admission ID and subject ID for all
# patients used in the study and who have only one discharge summary in MIMIC-III
# We have no way of telling which set of labels corresponds to which discharge summary
# for each such combination of IDs, due to the labels being somewhat mislabeled (the chart.time
# field contains either a copy of the hospital admission ID or 999999 for each set of labels,
# instead of an actual time), so we can only use discharge summaries from these patients
ids_for_non_duplicate = temp[temp["count"] == 1][["HADM_ID", "SUBJECT_ID"]]

In [14]:
# This contains the hospital admission ID, subject ID, discharge summary text, and labels
# for all of the patients we will be using in our study
labelled_corpus_df = pd.merge(merged_df, ids_for_non_duplicate, left_on = ["HADM_ID", "SUBJECT_ID"], right_on = ["HADM_ID", "SUBJECT_ID"])
labelled_corpus_df = labelled_corpus_df[["HADM_ID",
                                         "SUBJECT_ID",
                                         "TEXT",
                                         "Advanced.Cancer",
                                         "Advanced.Heart.Disease",
                                         "Advanced.Lung.Disease",
                                         "Chronic.Neurological.Dystrophies",
                                         "Chronic.Pain.Fibromyalgia",
                                         "Alcohol.Abuse",
                                         "Other.Substance.Abuse",
                                         "Obesity",
                                         "Schizophrenia.and.other.Psychiatric.Disorders",
                                         "Depression"]]

In [15]:
labelled_corpus_df.head(1)

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression
0,118003.0,3644,Admission Date: [**2200-4-7**] Discharge ...,0,0,0,0,1,0,0,0,0,1


In [16]:
labelled_corpus_df.iloc[0]["TEXT"]

"Admission Date:  [**2200-4-7**]     Discharge Date:  [**2200-4-10**]\n\nDate of Birth:   [**2146-9-21**]     Sex:  F\n\nService:  CARDIAC INTENSIVE CARE MEDICINE\n\nCHIEF COMPLAINT:  The patient was admitted to the Cardiac\nIntensive Care Unit Medicine Service on [**2200-4-7**], with the\nchief complaint of acute myocardial infarction and fever.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 53 year old\nwhite female with a history of coronary artery disease,\nhypertension, hypercholesterolemia and two pack per day\ntobacco use with previous coronary artery bypass graft\nsurgery presenting to an outside hospital on [**2200-4-6**], with a\ntwo day history of fevers and confusion.  The patient had a\nCT scan of the chest at that time which revealed pneumonia by\nreport in the left lower lobe.\n\nWhile in the outside hospital Emergency Department, the\npatient complained of chest pain.  The patient states that\nshe has had this pain for approximately two weeks with no\nrelief.  She was

In [17]:
# From provided code
# Did not remove commas??
# Separates "patient's" into "patient 's"??? Purpose???? -> should be caught with n-grams
def clean_str(string):
    """
    Tokenization/string cleaning.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() # We include lower()??? # .lower() word2vec is case sensitive



In [18]:
clean_str(labelled_corpus_df.iloc[0]["TEXT"])

"admission date 2200 4 7 discharge date 2200 4 10 date of birth 2146 9 21 sex f service cardiac intensive care medicine chief complaint the patient was admitted to the cardiac intensive care unit medicine service on 2200 4 7 , with the chief complaint of acute myocardial infarction and fever history of present illness the patient is a 53 year old white female with a history of coronary artery disease , hypertension , hypercholesterolemia and two pack per day tobacco use with previous coronary artery bypass graft surgery presenting to an outside hospital on 2200 4 6 , with a two day history of fevers and confusion the patient had a ct scan of the chest at that time which revealed pneumonia by report in the left lower lobe while in the outside hospital emergency department , the patient complained of chest pain the patient states that she has had this pain for approximately two weeks with no relief she was given levofloxacin for apparent community acquired pneumonia and cardiac enzymes w

In [19]:
# Takes ~25 seconds once, then just 1.6344 seconds when run later?
t1 = time.time()
labelled_corpus_df["Cleaned Text"] = labelled_corpus_df.apply(lambda row : clean_str(row["TEXT"]), axis = 1)
labelled_corpus_df = labelled_corpus_df.drop(['TEXT'], axis=1)
t2 = time.time()
print(t2 - t1)

3.030029773712158


In [20]:
labelled_corpus_df.iloc[2]["Cleaned Text"]

"admission date 2167 5 19 discharge date 2167 6 11 date of birth 2112 10 13 sex m service medicine allergies penicillins attending first name3 ( lf ) 1828 chief complaint squamous cell carcinoma , bacteremia , need for peg and extraction of teeth major surgical or invasive procedure none major had extraction of all teeth , and placment of gastrostomy tube history of present illness mr known lastname 40332 is a 54 year old man with type i dm , ckd stage v on dialysis , coronary artery disease who was diagnosed a little over one month ago with scc of head and neck he is admitted now for management of multiple issues since his diagnosis , radiation oncology , medical oncology and dental have spent extensive time and energy arranging for treatment plan ultimately , decision made to pursue peg tube , followed by teeth extraction ( very poor dentition ) followed by radiation treatment and erbitux he was seen in the hospital clinic today for planned peg tube but gi unable to place due to conc

In [21]:
labelled_corpus_df.head(1)

Unnamed: 0,HADM_ID,SUBJECT_ID,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression,Cleaned Text
0,118003.0,3644,0,0,0,0,1,0,0,0,0,1,admission date 2200 4 7 discharge date 2200 4 ...


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(labelled_corpus_df.loc[:,"Cleaned Text"])
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.shape

(1341, 33607)

In [25]:
df_bow_sklearn.head()

Unnamed: 0,00,000,0000,000mcg,000mg,000u,000unit,000units,000wbc,001,...,zosysn,zovirax,zyban,zydis,zygoma,zygomatic,zymar,zyprexa,zyrtec,zyvox
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#labelled_corpus_df.loc[:,"Advanced.Heart.Disease"]

In [27]:
#show only the 10 pyototypes
labelled_corpus_df.iloc[:,2:12].head(1)

Unnamed: 0,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression
0,0,0,0,0,1,0,0,0,0,1


In [28]:
#model = LogisticRegression(solver='liblinear')
#model = LogisticRegression(solver='lbfgs', max_iter=150)#, random_state=0 
model = LogisticRegression(n_jobs=2)

In [29]:
#rondomly split the data into training and testing parts using train_test_split from sklearn
X_train, X_test, y_train, y_test = train_test_split(df_bow_sklearn, labelled_corpus_df.iloc[:,2:12])
print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

X_train (1005, 33607)
X_test (336, 33607)
y_train (1005, 10)
y_test (336, 10)


In [30]:
t1 = time.time()
scores = np.zeros((10,5))
#loop through the 10 different phenotypes
for i in range(y_train.shape[1]):
    #Fit the model
    model.fit(X_train, y_train.iloc[:,i])
    #predit the score for each of the phenotypes
    y_predict = model.predict(X_test)
    #save the scores to be printed later for each phenotype
    scores[i][0] = metrics.accuracy_score(y_test.iloc[:,i], y_predict)
    scores[i][1] = metrics.precision_score(y_test.iloc[:,i], y_predict)
    scores[i][2] = metrics.recall_score(y_test.iloc[:,i], y_predict)
    scores[i][3] = metrics.f1_score(y_test.iloc[:,i], y_predict)
    scores[i][4] = metrics.roc_auc_score(y_test.iloc[:,i], y_predict)
    #print out the resulting accuracy
    print("L.R. Accuracy of: %.3f for Phenotype:" %metrics.accuracy_score(y_test.iloc[:,i], y_predict), labelled_corpus_df.columns[i+2])
    
t2 = time.time()
print("time:", t2 - t1)

L.R. Accuracy of: 0.905 for Phenotype: Advanced.Cancer
L.R. Accuracy of: 0.824 for Phenotype: Advanced.Heart.Disease
L.R. Accuracy of: 0.914 for Phenotype: Advanced.Lung.Disease
L.R. Accuracy of: 0.804 for Phenotype: Chronic.Neurological.Dystrophies
L.R. Accuracy of: 0.792 for Phenotype: Chronic.Pain.Fibromyalgia
L.R. Accuracy of: 0.887 for Phenotype: Alcohol.Abuse
L.R. Accuracy of: 0.929 for Phenotype: Other.Substance.Abuse
L.R. Accuracy of: 0.932 for Phenotype: Obesity
L.R. Accuracy of: 0.854 for Phenotype: Schizophrenia.and.other.Psychiatric.Disorders
L.R. Accuracy of: 0.768 for Phenotype: Depression
time: 99.20718622207642


In [31]:
#put in dataframe with labels for easier viewing 
df_scores = pd.DataFrame(scores, index=labelled_corpus_df.iloc[:,2:12].columns,
                        columns=["Accuracy","Precision", "Recall", "F1", "roc_auc"])

In [32]:
df_scores

Unnamed: 0,Accuracy,Precision,Recall,F1,roc_auc
Advanced.Cancer,0.904762,0.727273,0.380952,0.5,0.680272
Advanced.Heart.Disease,0.824405,0.576923,0.447761,0.504202,0.682988
Advanced.Lung.Disease,0.91369,0.666667,0.342857,0.45283,0.661462
Chronic.Neurological.Dystrophies,0.803571,0.634615,0.4125,0.5,0.669141
Chronic.Pain.Fibromyalgia,0.791667,0.45,0.272727,0.339623,0.595623
Alcohol.Abuse,0.886905,0.846154,0.392857,0.536585,0.689286
Other.Substance.Abuse,0.928571,0.692308,0.529412,0.6,0.751461
Obesity,0.931548,0.642857,0.333333,0.439024,0.658576
Schizophrenia.and.other.Psychiatric.Disorders,0.854167,0.666667,0.419355,0.514851,0.685955
Depression,0.767857,0.604938,0.515789,0.556818,0.691505
