In [1]:
import pandas as pd

In [2]:
#Load raw data into dfs 
admissions_df = pd.read_csv("../Data/ADMISSIONS.csv")
diag_df =  pd.read_csv("../Data/DIAGNOSES_ICD.csv")
events_df = pd.read_csv("../Data/NOTEEVENTS.csv",dtype={"CHARTTIME":"string", "STORETIME":"string"})
patients_df = pd.read_csv("../Data/PATIENTS.csv")

In [5]:

#This cell gets the first ICU HADM ID of the first ICU visit for each patients and also removes patients under 15
#Most of this was taken from the previous notebook
admissions_sorted_df = admissions_df.sort_values("ADMITTIME").reset_index(drop = True).groupby("SUBJECT_ID").first().reset_index()

age_df = admissions_sorted_df.merge(patients_df, on = "SUBJECT_ID")
age_df["ADMITTIME"] = pd.to_datetime(age_df["ADMITTIME"], dayfirst=True, errors='coerce')
age_df["DOB"] = pd.to_datetime(age_df["DOB"], dayfirst=True, errors='coerce')
age_df["AGE"] = (age_df["ADMITTIME"].values - age_df["DOB"].values)
age_df["AGE"] = age_df["AGE"].dt.days/365
age_df = age_df[age_df["AGE"] >= 15]
age_df = age_df[["SUBJECT_ID","HADM_ID"]]
age_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID
1,3,145834
2,4,185777
4,6,107064
7,9,150750
9,11,194540


In [6]:
#This cell removes missing values from the events data frame 
#and then concatenates the text into one row for each HADM ID
events_no_na = events_df.dropna(subset = ["HADM_ID"])
events_no_na = events_no_na.loc[events_no_na["ISERROR"] != 1]

events_grouped  = events_no_na.groupby("HADM_ID")["TEXT"].apply(lambda x: " ".join(x)).reset_index()
events_grouped["HADM_ID"] = events_grouped["HADM_ID"].astype(int)

events_grouped.head()

Unnamed: 0,HADM_ID,TEXT
0,100001,Admission Date: [**2117-9-11**] ...
1,100003,Admission Date: [**2150-4-17**] ...
2,100006,Admission Date: [**2108-4-6**] Discharg...
3,100007,Admission Date: [**2145-3-31**] ...
4,100009,Admission Date: [**2162-5-16**] ...


In [7]:
#This cell combines the diagnostic codes and concatenated text from the previous cell
#The output is one row for each individual ICD9 code

#Get the ICD9 codes of the first icu admission for each patient
icd9_df = age_df.merge(diag_df,on = "HADM_ID")


icd9_df = icd9_df.merge(events_grouped,on = "HADM_ID")[["ICD9_CODE","TEXT"]]

icd9_df.head()


Unnamed: 0,ICD9_CODE,TEXT
0,389,Admission Date: [**2101-10-20**] Discharg...
1,78559,Admission Date: [**2101-10-20**] Discharg...
2,5849,Admission Date: [**2101-10-20**] Discharg...
3,4275,Admission Date: [**2101-10-20**] Discharg...
4,41071,Admission Date: [**2101-10-20**] Discharg...


In [11]:
#This cell first categorises the ICD9 codes as seen in figure 8 of the Farsight paper
#And then groups the ICD9 categories for each HADM ID into a list and then drops duplicate rows
#The final output is a dataframe with columns: TEXT, ICD9_GROUPS with each row corresponding to one HADM ID (HADM ID column removed in final dataframe)

#Takes the first 3 digits of each icd9 code for easier categorising
final_df = icd9_df.copy()
final_df["ICD9_CODE"] = final_df["ICD9_CODE"].astype(str).apply(lambda x: x[0:3])

#df.isna or df.dropna couldn't find nan
final_df = final_df.loc[final_df["ICD9_CODE"] != "nan"]

#Converts V and E to 1000 for easier categorising
final_df["ICD9_CODE"] = final_df["ICD9_CODE"].apply(lambda x: "1000" if (x[0] == "V" or x[0] == "E") else x).astype(int)

#Categorises the icd9 codes using bins
icd9_bins = [0,139,239,279,289,319,389,459,519,579,629,677,709,739,759,789,796,799,999,1000]
final_df["ICD9_GROUPS"] = pd.cut(final_df["ICD9_CODE"], bins = icd9_bins, right = True, labels = False)



#Removes duplicates
final_df = final_df.drop_duplicates(subset = ["ICD9_GROUPS","TEXT"])

#Puts the cateogrised ICD9 codes into a list by grouping by the text 
final_df = final_df.groupby("TEXT")["ICD9_GROUPS"].apply(list).reset_index()

print(final_df.shape)
final_df.head(25)


(36370, 2)
