In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Handling DIAGNOSES_ICD.csv
# Load the diagnoses table and collapse it to one row per (SUBJECT_ID, HADM_ID),
# with SEQ_NUM and ICD9_CODE each joined into a single comma-separated string.
df = pd.read_csv("DIAGNOSES_ICD.csv")
df.drop(columns=['ROW_ID'], inplace=True)
df.dropna(subset=['HADM_ID', 'SUBJECT_ID'], inplace=True)
# Cast to str so the values can be joined; missing entries become the text 'nan'.
for text_col in ('SEQ_NUM', 'ICD9_CODE'):
    df[text_col] = df[text_col].astype(str)


def merge(a):
    """Join an iterable of strings with commas."""
    return ",".join(a)


df = (
    df.groupby(['SUBJECT_ID', 'HADM_ID'])
    .agg({'SEQ_NUM': merge, 'ICD9_CODE': merge})
    .reset_index()
)
# Keep only the first remaining row for each SUBJECT_ID.
df.drop_duplicates(subset='SUBJECT_ID', keep="first", inplace=True)
# reset_index() keeps the previous index as an 'index' column (drop is not set).
diag_icd_df = df.sort_values(by='HADM_ID', ascending=True).reset_index()
df = None
print(diag_icd_df)

In [None]:
# Handling LABEVENTS.csv
# Only the four columns used downstream are loaded.
fields = ['SUBJECT_ID','HADM_ID','ITEMID','VALUENUM']
# on_bad_lines="warn" (pandas >= 1.3) replaces error_bad_lines=False /
# warn_bad_lines=True, which were deprecated in pandas 1.3 and removed in 2.0:
# malformed rows are skipped and reported with a warning.
df = pd.read_csv("LABEVENTS.csv", on_bad_lines="warn", encoding="iso-8859-1", usecols=fields)
lab_events_df = df
# Non-numeric HADM_ID values become NaN and those rows are then discarded.
lab_events_df['HADM_ID'] = pd.to_numeric(lab_events_df['HADM_ID'], errors='coerce')
lab_events_df = lab_events_df.dropna(subset=['HADM_ID'])

In [None]:
# Order lab events by admission and renumber the rows; the old index is kept
# as an 'index' column because reset_index() is called without drop=True.
lab_events_df = lab_events_df.sort_values(by='HADM_ID').reset_index()
print(lab_events_df)

In [None]:
# Handling D_LABITEMS.csv

fields = ['ITEMID','LABEL']
# on_bad_lines="warn" (pandas >= 1.3) replaces error_bad_lines=False /
# warn_bad_lines=True, which were deprecated in pandas 1.3 and removed in 2.0:
# malformed rows are skipped and reported with a warning.
df = pd.read_csv("D_LABITEMS.csv", on_bad_lines="warn", encoding="iso-8859-1", usecols=fields)
labitems_df = df
# Non-numeric ITEMID values become NaN and those rows are then discarded.
labitems_df['ITEMID'] = pd.to_numeric(labitems_df['ITEMID'], errors='coerce')
labitems_df = labitems_df.dropna(subset=['ITEMID'])
# reset_index() keeps the previous index as an 'index' column (drop is not set).
labitems_df = labitems_df.sort_values(['ITEMID'], ascending=True).reset_index()
print(labitems_df)

In [None]:
# Creating final.csv
# Give diag_icd_df one column per lab ITEMID, initialised to 0; the merge loop
# further down overwrites these zeros with measured values.
for lab_item_id in labitems_df['ITEMID'].tolist():
    diag_icd_df[lab_item_id] = 0

In [None]:
# Show the diagnoses frame, now including the zero-initialised lab columns.
print(diag_icd_df)

In [None]:
# Copy lab values into diag_icd_df with a merge-join: both frames were sorted by
# HADM_ID and re-indexed 0..n-1 above, so one forward-only cursor `j` over
# lab_events_df is enough. When an admission has several results for the same
# ITEMID, the one visited last wins.
MAX_ADMISSIONS = 1000  # only this many leading admissions are filled
j = 0
for i in range(min(MAX_ADMISSIONS, len(diag_icd_df))):
    hadmId = diag_icd_df.at[i, 'HADM_ID']
    while j < len(lab_events_df) and lab_events_df.at[j, 'HADM_ID'] <= hadmId:
        if lab_events_df.at[j, 'HADM_ID'] == hadmId:
            itemId = lab_events_df.at[j, 'ITEMID']
            value = lab_events_df.at[j, 'VALUENUM']
            # Looking the column up first keeps the KeyError for an ITEMID that
            # has no pre-created column.
            if diag_icd_df[itemId].dtype.kind != 'f':
                # Lab columns start as integer zeros; widen once so float (or
                # NaN) lab values can be stored without an implicit upcast.
                diag_icd_df[itemId] = diag_icd_df[itemId].astype(float)
            # .at writes into the frame itself. The previous chained form
            # diag_icd_df[itemId][i] = value may write to a temporary copy and
            # does nothing when pandas Copy-on-Write is enabled.
            diag_icd_df.at[i, itemId] = value
        j = j + 1
    print(i)

In [None]:
# Inspect the frame after the lab values have been written into it.
print(diag_icd_df)

In [None]:
# Persist the merged table. The index is written too (to_csv default), so it
# appears as the first, unnamed column of final.csv.
diag_icd_df.to_csv("final.csv")

In [None]:
# Handling fullInputData.csv - 1000 samples from 35,500 records (final.csv)
# Select top 1000 records from final.csv to generate fullInputData.csv
df = pd.read_csv("fullInputData.csv")
# The two leading columns are removed by position.
# NOTE(review): presumably these are the index columns carried over from
# final.csv - confirm against the actual file.
leading_columns = df.columns[[0, 1]]
df.drop(columns=leading_columns, inplace=True)
full_input_data_df = df

In [None]:
# Per column, count the rows holding a usable (non-zero, non-missing) value.
# fillna() returns a new frame, so its result must be used: as a bare statement
# it left the NaNs in place, and NaN is truthy under astype(bool), which made
# missing values count as present. full_input_data_df itself is not modified.
counts = full_input_data_df.fillna(0).astype(bool).sum(axis=0)

In [None]:
# Remove every lab column whose count of usable values is below 700.
for itemid in labitems_df['ITEMID']:
    column_name = str(itemid)  # columns read back from CSV are named by the ITEMID text
    if counts[column_name] >= 700:
        continue
    full_input_data_df.drop(columns=[column_name], inplace=True)

In [None]:
# Show the frame that remains after the sparse lab columns were dropped.
print(full_input_data_df)

In [None]:
# SEQ_NUM is not used by any later step.
full_input_data_df.drop(columns=['SEQ_NUM'], inplace=True)

In [None]:
# Human-readable names for the lab ITEMID columns; keys that are not present
# as columns are ignored by rename().
lab_item_labels = {
    '50868': 'Anion Gap',
    '50882': 'Bicarbonate',
    '50893': 'Calcium',
    '50902': 'Chloride',
    '50931': 'Glucose',
    '50960': 'Magnesium',
    '50970': 'Phosphate',
    '50971': 'Potassium',
    '50983': 'Sodium',
    '51006': 'Urea Nitrogen',
    '51221': 'Hematocrit',
    '51222': 'Hemoglobin',
    '51237': 'INR',
    '51244': 'Lymphocytes',
    '51248': 'MCH',
    '51249': 'MCHC',
    '51250': 'MCV',
    '51254': 'Monocytes',
    '51256': 'Neutrophils',
    '51265': 'Platelet Count',
    '51274': 'PT',
    '51275': 'PTT',
    '51277': 'RDW',
    '51279': 'Red Blood Cells',
    '51301': 'White Blood Cells',
}
full_input_data_df.rename(columns=lab_item_labels, inplace=True)

In [None]:
# Zero is the "no measurement" placeholder the lab columns were initialised
# with, so turn it into NaN and impute each numeric column with its mean.
# Note that a genuine measurement of exactly 0 is treated as missing as well.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
full_input_data_df = full_input_data_df.replace(0, np.nan)
# numeric_only=True skips the text ICD9_CODE column; without it, mean() raises
# TypeError on pandas >= 2.0.
full_input_data_df.fillna(full_input_data_df.mean(numeric_only=True), inplace=True)

In [None]:
# Convert ICD9_CODE to output
# (first category, last category, chapter index) for the numeric ICD-9 chapters.
_ICD9_CHAPTERS = (
    (1, 139, 1),     # Infectious and Parasitic Diseases
    (140, 239, 2),   # Neoplasms
    (240, 279, 3),   # Endocrine, Nutritional, Metabolic, Immunity
    (280, 289, 4),   # Blood and Blood-Forming Organs
    (290, 319, 5),   # Mental Disorders
    (320, 389, 6),   # Nervous System and Sense Organs
    (390, 459, 7),   # Circulatory System
    (460, 519, 8),   # Respiratory System
    (520, 579, 9),   # Digestive System
    (580, 629, 10),  # Genitourinary System
    # Chapter 11 runs to 679; stopping at 677 left 678/679 unmapped (None).
    (630, 679, 11),  # Pregnancy, Childbirth, and the Puerperium
    (680, 709, 12),  # Skin and Subcutaneous Tissue
    (710, 739, 13),  # Musculoskeletal System and Connective Tissue
    (740, 759, 14),  # Congenital Anomalies
    (760, 779, 15),  # Conditions Originating in the Perinatal Period
    (780, 789, 16),  # Symptoms
    (790, 796, 17),  # Nonspecific Abnormal Findings
    (797, 799, 18),  # Ill-defined and Unknown Causes of Morbidity and Mortality
    (800, 999, 19),  # Injury and Poisoning
)


def get_label(idc9_Code):
    """Map a numeric ICD-9 diagnosis code to its chapter index.

    Args:
        idc9_Code: ICD-9 code as a string. Only the first three characters
            (the category) are used, e.g. '4019' is treated as 401.

    Returns:
        An int from 1 to 19 identifying the chapter, or None when the
        category falls outside 1-999 (for example '000').

    Raises:
        ValueError: if the first three characters do not form an integer,
            as with codes that start with 'V' or 'E'.
    """
    num = int(idc9_Code[:3])
    for low, high, label in _ICD9_CHAPTERS:
        if low <= num <= high:
            return label
    return None

In [None]:
# Build one 20-slot indicator vector per row: slot 0 is set for 'V'/'E' codes
# and for the 'nan' placeholder, slots 1-19 for the chapter from get_label().
output_classes = []
for row_idx, row in full_input_data_df.iterrows():
    chapter_ids = []
    for code in str(row['ICD9_CODE']).split(','):
        if code[0] == 'V' or code[0] == 'E' or code == 'nan':
            chapter_ids.append(0)
        else:
            chapter_ids.append(get_label(code))
    flags = [0] * 20
    for chapter_id in chapter_ids:
        flags[chapter_id] = 1
    output_classes.append(flags)
output_classes

In [None]:
output_classes = np.array(output_classes)

# Column k of output_classes becomes the k-th named indicator column.
label_columns = [
    'Supplemental_V_Codes',
    'Infectious_and_Parasitic_Diseases',
    'Neoplasms',
    'Endocrine_Nutritional_Metabolic_Immunity',
    'Blood_and_Blood_Forming_Organs',
    'Mental_Disorders',
    'Nervous_System_and_Sense_Organs',
    'Circulatory_System',
    'Respiratory_System',
    'Digestive_System',
    'Genitourinary_System',
    'Pregnancy_Childbirth_and_the_Puerperium',
    'Skin_and_Subcutaneous_Tissue',
    'Musculoskeletal_System_and_Connective_Tissue',
    'Congenital_Anomalies',
    'Conditions_Originating_in_the_Perinatal_Period',
    'Symptoms',
    'Nonspecific_Abnormal_Findings',
    'Ill_defined_and_Unknown_Causes_of_Morbidity_and_Mortality',
    'Injury_and_Poisoning',
]
dataset = pd.DataFrame(
    {name: output_classes[:, position] for position, name in enumerate(label_columns)}
)

In [None]:
# The raw code string has been turned into indicator columns, so drop it.
full_input_data_df = full_input_data_df.drop(columns=['ICD9_CODE'])

In [None]:
# Put the feature columns and the indicator columns side by side and save
# the result without the row index.
frames = [full_input_data_df, dataset]
final_df = pd.concat(objs=frames, axis="columns")
final_df.to_csv(path_or_buf='ProcessedFinalData.csv', index=False)

In [None]:
# Preview the first rows of the combined feature/label table.
final_df.head()