# Start

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
import re

pd.DataFrame.iteritems = pd.DataFrame.items
pd.options.display.max_rows = 4000

2023-09-30 05:24:24.143117: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-09-30 05:24:24.143166: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-09-30 05:24:24.145428: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-30 05:24:24.312555: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# function that performs basic regex operations on column of interest
def basic_regex(df, column_name):  
    df[column_name] = df[column_name].str.replace('[+]', 'pos', regex=True)  # replace any + the string pos. - is not used to indicate negative values
    df[column_name] = df[column_name].str.replace('[^a-zA-Z0-9]', ' ', regex=True)  # replace any non alphanumeric characters w/ space
    df[column_name] = df[column_name].str.replace(' +', ' ', regex=True)  # replace empty spaces > 1 to normal spaces = 1
    df[column_name] = df[column_name].str.lower()  # convert to lower-case
    df[column_name] = df[column_name].str.strip()  # strip trailing and leading spaces
    
def org_name_regex(df, column_name): 
    df[column_name] = df[column_name].str.replace(r'\bstaph\b', 'staphylococcus', regex=True)  # convert staph to staphylococcus
    df[column_name] = df[column_name].str.replace(r'\bspecies\b', 'sp', regex=True)  # convert species to sp
    df[column_name] = df[column_name].str.replace(r'\bcoagulase\b', 'coag', regex=True)  # convert bcoagulase to coag
    df[column_name] = df[column_name].str.replace(r'\bpositive\b', 'pos', regex=True)  # convert positive to pos
    df[column_name] = df[column_name].str.replace(r'\bnegative\b', 'neg', regex=True)  # convert negative to neg
    # convert all gram negative rod variations to just gram negative rods
    df[column_name] = df[column_name].str.replace(r'gram neg rod[^a-zA-Z]*[a-zA-Z]*', 'gram neg rods', regex=True)
    df[column_name] = df[column_name].str.replace(r'^pos for ', '', regex=True)  # remove pos for from sentences
    df[column_name] = df[column_name].apply(lambda x: np.nan if isinstance(x, str) and 'presumptive' in x else x)  # change presumptive obs to NA
    
def spec_type_desc_regex(df, column_name): 
    df[column_name] = df[column_name].apply(lambda x: np.nan if isinstance(x, str) and 'xxx' in x else x)  # change xxx obs to NA
    
# Preprocess microbiologyevents
def preprocess_microbiologyevents(microbiologyevents):
    micro_subset = microbiologyevents[['HADM_ID', 'ORG_NAME', 'SPEC_TYPE_DESC', 'INTERPRETATION']]
    micro_subset = micro_subset[micro_subset["ORG_NAME"] != 'escherichia coli']  # omit E coli 
    micro_subset = micro_subset.drop_duplicates(subset=["HADM_ID", "ORG_NAME"]).copy()  # drop duplicates of HADM_ID and ORG_NAME
    
    # perform basic regex and data cleaning
    basic_regex(micro_subset, 'ORG_NAME')
    basic_regex(micro_subset, 'SPEC_TYPE_DESC')
    # more specific data cleaning 
    org_name_regex(micro_subset, 'ORG_NAME')
    spec_type_desc_regex(micro_subset, 'SPEC_TYPE_DESC')
    
    # convert to one-hot; note: all microbes will only have 0 or 1 as we deleted duplicates, however, interepretion will be a sum of values.
    micro_oh = pd.get_dummies(micro_subset, columns=['ORG_NAME', 'SPEC_TYPE_DESC', 'INTERPRETATION'], prefix='', prefix_sep='')
    wide_micro = micro_oh.groupby('HADM_ID').sum().reset_index()  # get sum of occurances by HADMID
    
    return wide_micro

# function to preprocess prescription data
def preprocess_prescriptions(prescriptions):
    routes = {"Oral and Enteral": ["ORAL", "PO", "G TUBE", "J TUBE", "ENTERAL TUBE ONLY ? NOT ORAL",
                                   "OG", "NG", "NG/OG", "PO OR ENTERAL TUBE", "PO/NG", "PO/OG"],
              
              "Respiratory and Inhalation": ["AERO", "IH", "INHALATION", "NEB", "ET", "NU"],
              
              "Ocular and Otic": ["AS", "AU", "BOTH EARS", "LEFT EAR", "RIGHT EAR", "OD", "OS",
                                  "OU", "BOTH EYES", "LEFT EYE", "RIGHT EYE"],
              
              "Dermal and Transdermal": ["ID", "SC", "SUBCUT", "TD", "BUCCAL", "BU", "SL", "AXILLARY"],
              
              "Intravenous and Intramuscular": ["IV", "IM", "IV BOLUS", "IV DRIP", "IVPCA", "IJ"],
              
              "Other Invasive Routes": ["ED", "IA", "IC", "IT", "IO", "IP", "IR", "IVT", "PR", "INTRAPERICARDIAL"],
              
              "Miscellaneous and Specific Methods": ["EX-VIVO", "LOCK", "IRR", "DIALYS", "DWELL"]
             }
    
    prescriptions_subset = prescriptions[['HADM_ID', 'ROUTE']]
    # apply dictionary grouping; inverting the routes dictionary speeds up cleaning
    inverted_routes = {route: key for key, value in routes.items() for route in value}
    prescriptions_subset['ROUTE_GROUP'] = prescriptions_subset['ROUTE'].map(inverted_routes)
    prescriptions_subset = prescriptions_subset[['HADM_ID', 'ROUTE_GROUP']]
    # perform one hot encoding and take the sum to get the number of administered drugs per HADM_ID
    prescriptions_oh = pd.get_dummies(prescriptions_subset, columns=['ROUTE_GROUP'], prefix='', prefix_sep='')
    prescriptions_onehot_sum = prescriptions_oh.groupby('HADM_ID').sum().reset_index()
    
    return prescriptions_onehot_sum

In [235]:
preprocess_microbiologyevents(microbiologyevents)

Unnamed: 0,HADM_ID,2nd isolate,abiotrophia granulicatella sp,achromobacter alcaligenes dentrificans,achromobacter alcaligenes xylosoxidans ss dentrificans,acidfast bacilli,acinetobacter baumannii,acinetobacter baumannii complex,acinetobacter sp,acremonium sp,...,urine kidney,urine suprapubic aspirate,varicella zoster culture,viral culture r o cytomegalovirus,viral culture r o herpes simplex virus,worm,I,P,R,S
0,100001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,100007,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,100009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48614,199993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48615,199994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
48616,199995,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48617,199998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
microbiologyevents = pd.read_csv('/Deep Learning/capstone_project/Data/MICROBIOLOGYEVENTS.csv')

In [214]:
microbiologyevents['ORG_NAME'].sort_values().unique()

array(['ABSCESS', 'ANORECTAL/VAGINAL CULTURE', 'ARTHROPOD', 'ASPIRATE',
       'BILE', 'BIOPSY', 'BLOOD', 'BLOOD BAG FLUID', 'BLOOD CULTURE',
       'BLOOD CULTURE ( MYCO/F LYTIC BOTTLE)',
       'BLOOD CULTURE (POST-MORTEM)', 'BLOOD CULTURE - NEONATE',
       'BONE MARROW', 'BONE MARROW - CYTOGENETICS', 'BRONCHIAL BRUSH',
       'BRONCHIAL BRUSH - PROTECTED', 'BRONCHIAL WASHINGS',
       'BRONCHOALVEOLAR LAVAGE', 'Blood (CMV AB)', 'Blood (EBV)',
       'Blood (Malaria)', 'Blood (Toxo)', 'C, E, & A Screening',
       'CATHETER TIP-IV', 'CORNEAL EYE SCRAPINGS', 'CRE Screen',
       'CSF;SPINAL FLUID', 'DIALYSIS FLUID',
       'DIRECT ANTIGEN TEST FOR VARICELLA-ZOSTER VIRUS',
       'Direct Antigen Test for Herpes Simplex Virus Types 1 & 2', 'EAR',
       'EYE', 'FECAL SWAB', 'FLUID RECEIVED IN BLOOD CULTURE BOTTLES',
       'FLUID WOUND', 'FLUID,OTHER', 'FOOT CULTURE', 'FOREIGN BODY',
       'GASTRIC ASPIRATE', 'IMMUNOLOGY', 'Immunology (CMV)',
       'Influenza A/B by DFA', 'Influenza 

In [231]:
# perform basic regex
basic_regex(microbiologyevents, 'ORG_NAME')
basic_regex(microbiologyevents, 'SPEC_TYPE_DESC')

org_name_regex(microbiologyevents, 'ORG_NAME')
spec_type_desc_regex(microbiologyevents, 'SPEC_TYPE_DESC')

micro_filtered = microbiologyevents[['HADM_ID', 'ORG_NAME', 'SPEC_TYPE_DESC', 'INTERPRETATION']]
micro_filtered = micro_filtered.drop_duplicates(subset=["HADM_ID", "ORG_NAME"]).copy()

#display(micro_filtered['ORG_NAME'].value_counts().head(300))
micro_filtered

Unnamed: 0,HADM_ID,ORG_NAME,SPEC_TYPE_DESC,INTERPRETATION
0,170324,pseudomonas aeruginosa,bronchoalveolar lavage,
1,170324,,sputum,
6,175533,,urine,
14,175533,staphylococcus coag neg,urine,
19,195700,,blood culture neonate,
...,...,...,...,...
631686,197851,,mrsa screen,
631687,167791,,mrsa screen,
631688,147562,,blood culture,
631694,174582,,mrsa screen,


In [232]:
micro_oh = pd.get_dummies(micro_filtered, columns=['ORG_NAME', 'SPEC_TYPE_DESC', 'INTERPRETATION'], prefix='', prefix_sep='')
micro_onehot = micro_oh.groupby('HADM_ID').sum().reset_index()  # convert true false to zero 0

In [23]:
# function to preprocess prescription data
def preprocess_prescriptions(prescriptions):
    routes = {"Oral and Enteral": ["ORAL", "PO", "G TUBE", "J TUBE", "ENTERAL TUBE ONLY ? NOT ORAL",
                                   "OG", "NG", "NG/OG", "PO OR ENTERAL TUBE", "PO/NG", "PO/OG"],
              
              "Respiratory and Inhalation": ["AERO", "IH", "INHALATION", "NEB", "ET", "NU"],
              
              "Ocular and Otic": ["AS", "AU", "BOTH EARS", "LEFT EAR", "RIGHT EAR", "OD", "OS",
                                  "OU", "BOTH EYES", "LEFT EYE", "RIGHT EYE"],
              
              "Dermal and Transdermal": ["ID", "SC", "SUBCUT", "TD", "BUCCAL", "BU", "SL", "AXILLARY"],
              
              "Intravenous and Intramuscular": ["IV", "IM", "IV BOLUS", "IV DRIP", "IVPCA", "IJ"],
              
              "Other Invasive Routes": ["ED", "IA", "IC", "IT", "IO", "IP", "IR", "IVT", "PR", "INTRAPERICARDIAL"],
              
              "Miscellaneous and Specific Methods": ["EX-VIVO", "LOCK", "IRR", "DIALYS", "DWELL"]
             }
    
    prescriptions_subset = prescriptions[['HADM_ID', 'ROUTE']]
    # apply dictionary grouping; inverting the routes dictionary speeds up cleaning
    inverted_routes = {route: key for key, value in routes.items() for route in value}
    prescriptions_subset['ROUTE_GROUP'] = prescriptions_subset['ROUTE'].map(inverted_routes)
    prescriptions_subset = prescriptions_subset[['HADM_ID', 'ROUTE_GROUP']]
    # perform one hot encoding and take the sum to get the number of administered drugs per HADM_ID
    prescriptions_oh = pd.get_dummies(prescriptions_subset, columns=['ROUTE_GROUP'], prefix='', prefix_sep='')
    prescriptions_onehot_sum = prescriptions_oh.groupby('HADM_ID').sum().reset_index()
    
    return prescriptions_onehot_sum

In [5]:
microbiologyevents = pd.read_csv('/Deep Learning/capstone_project/Data/MICROBIOLOGYEVENTS.csv')
prescriptions = pd.read_csv('/Deep Learning/capstone_project/Data/PRESCRIPTIONS.csv', low_memory=False) 

In [6]:
prescriptions_oh = preprocess_prescriptions(prescriptions)
microbiologyevents_oh = preprocess_microbiologyevents(microbiologyevents)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prescriptions_subset['ROUTE_GROUP'] = prescriptions_subset['ROUTE'].map(inverted_routes)


In [9]:
prescriptions_oh.shape

(50216, 8)

In [10]:
microbiologyevents_oh.shape

(48740, 417)

# join micro and prescriptions

In [11]:
def outer_join(df1, df2, on):
    merged_df = pd.merge(df1, df2, on='HADM_ID', how='outer')
    merged_df.fillna(0, inplace=True)
    
    return merged_df

In [7]:
micro_prescription = pd.merge(microbiologyevents_oh, prescriptions_oh, on='HADM_ID', how='outer')

# Lab stuff

In [8]:
lab_items = pd.read_csv('/Deep Learning/capstone_project/Data/D_LABITEMS.csv')  

In [9]:
lab_events = pd.read_csv('/Deep Learning/capstone_project/Data/LABEVENTS.csv', usecols=['HADM_ID', 'ITEMID', 'FLAG'])  

In [10]:
lab_merged = pd.merge(lab_events, lab_items, on='ITEMID', how='left')  # Merge lab_event with lab_item using a left merge

In [11]:
def preprocess_lab(lab_merged):
    lab_merged.dropna(subset=['HADM_ID'], inplace=True) # remove rows where HADM ID is NA
    lab_subset = lab_merged[["HADM_ID", "LABEL", "FLAG"]]
    # Set 'FLAG' values to NaN where it is not 'abnormal'
    lab_subset.loc[lab_subset['FLAG'] != 'abnormal', 'FLAG'] = np.nan
    # convert to one-hot; note: all microbes will only have 0 or 1 as we deleted duplicates, however, interepretion will be a sum of values.
    lab_oh = pd.get_dummies(lab_subset, columns=["LABEL", "FLAG"], prefix='', prefix_sep='')
    wide_micro = micro_oh.groupby('HADM_ID').sum().reset_index()  # get sum of occurances by HADMID
    return wide_lab

In [6]:
lab_merged['FLAG'].sort_values().unique()

NameError: name 'lab_merged' is not defined

In [12]:
lab_subset = preprocess_lab(lab_merged)
lab_subset

MemoryError: Unable to allocate 11.6 GiB for an array with shape (561, 22245034) and data type bool

In [14]:
# Merge lab_event with lab_item using a left merge
lab_merged = pd.merge(lab_events, lab_items, on='ITEMID', how='left')

In [15]:
lab_merged

Unnamed: 0,HADM_ID,ITEMID,CHARTTIME,FLAG,ROW_ID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,,50820,2101-10-12 16:07:00,,21,pH,Blood,Blood Gas,11558-4
1,,50800,2101-10-12 18:17:00,,1,SPECIMEN TYPE,BLOOD,BLOOD GAS,
2,,50802,2101-10-12 18:17:00,,3,Base Excess,Blood,Blood Gas,11555-0
3,,50804,2101-10-12 18:17:00,,5,Calculated Total CO2,Blood,Blood Gas,34728-6
4,,50808,2101-10-12 18:17:00,abnormal,9,Free Calcium,Blood,Blood Gas,1994-3
...,...,...,...,...,...,...,...,...,...
27854050,103219.0,50882,2109-12-30 01:40:00,,83,Bicarbonate,Blood,Chemistry,1963-8
27854051,103219.0,50885,2109-12-30 01:40:00,abnormal,86,"Bilirubin, Total",Blood,Chemistry,1975-2
27854052,103219.0,50902,2109-12-30 01:40:00,,103,Chloride,Blood,Chemistry,2075-0
27854053,103219.0,50911,2109-12-30 01:40:00,,112,"Creatine Kinase, MB Isoenzyme",Blood,Chemistry,6773-6


In [None]:
def preprocess_lab(lab_merged):
    lab_subset = lab_merged[["HADM_ID", "LABEL", "FLAG"]]
    lab_subset = lab_subset[lab_subset["FLAG"] == "abnormal"]
    lab_subset = lab_subset[["HADM_ID", "LABEL"]]
    lab_subset.columns = ["HADM_ID", "lab_flag"]

    lab_subset = lab_subset.drop_duplicates(subset=["HADM_ID", "lab_flag"]).copy()
    lab_subset['present'] = 1
    wide_lab = lab_subset.pivot(index='HADM_ID', columns='lab_flag', values='present')
    wide_lab = wide_lab.fillna(0).reset_index()

    lab_columns = wide_lab.columns.difference(['HADM_ID'])
    wide_lab[lab_columns] = wide_lab[lab_columns].fillna(0)

    return wide_lab