# All Data Raw - Preparation

The following code pulls data from the MIMIC-III dataset, stored on a local server, using SQL queries. Before this notebook is run for the first time, the query *all_data_raw.sql* must be executed. <br>
The notebook will then merge the all_data_raw tables with missing information from other tables.

## Setup

In [1]:
# Import libraries

# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time
import psycopg2

In [2]:
# Setup Repository
# repo_info.txt holds the path to the repository on its first line.
with open("repo_info.txt", "r") as repo_info_file:
    # readline() keeps the trailing line break; strip it so that it does not
    # end up in the middle of every path built below.
    path_to_repo = repo_info_file.readline().strip()

# The paths are built by plain concatenation, so path_to_repo is expected to
# end with a path separator.
path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"

In [8]:
# Load the connection settings for the local PgAdmin server from a text file.
# Expected layout, one value per line: user, database name, schema name, password.
with open("pg_admin_info.txt", "r") as credentials_file:
    pg_credentials = credentials_file.readlines()

# Surrounding whitespace (including the line break) is removed from every value
sqluser = pg_credentials[0].strip()
dbname = pg_credentials[1].strip()
schema_name = pg_credentials[2].strip()
password = pg_credentials[3].strip()

## Queries

In [65]:
%%time

# Connect to local postgres version of mimic
con = psycopg2.connect(host='localhost', database=dbname, user=sqluser, password=password)
cur = con.cursor()

# SQL query (gets 1st icd9_code from diagnoses_icd table for each hadm_id)
query = \
"""
select hadm_id, icd9_code from diagnoses_icd where seq_num = 1
"""

# Perform SQL query
cur.execute('SET search_path to ' + schema_name)
icd9_code = pd.read_sql_query(query,con)

#SQL Query (gets all icd9_code from procedures_icd)
query = \
"""
select hadm_id, icd9_code from procedures_icd
"""

# Perform SQL query
proc_icd9 = pd.read_sql_query(query,con)

#SQL Query (gets all icd9_code from diagnoses_icd)
query = \
"""
select hadm_id, icd9_code from diagnoses_icd
"""

# Perform SQL query
diag_icd9 = pd.read_sql_query(query,con)

#SQL Query (gets all icd9_code from diagnoses_icd)
query = \
"""
select * from all_data3
"""

# Perform SQL query
df = pd.read_sql_query(query,con)

con.close()

Wall time: 40.2 s


## Data Processing

In [67]:
%%time
df['icd9_code'] = 0
check =[]
for hadm_id in set(df['hadm_id'].values):
    if hadm_id in icd9_code['hadm_id'].values:
        df.loc[df['hadm_id']==hadm_id, 'icd9_code'] = icd9_code.loc[(icd9_code['hadm_id']==hadm_id), 'icd9_code'].values[0]
    else:
        check.append(hadm_id)

Wall time: 13min 16s


In [68]:
%%time
df['proc_icd9'] = 0
for hadm_id in set(proc_icd9.hadm_id):
    test = proc_icd9[proc_icd9['hadm_id'] == hadm_id]
    df.loc[df['hadm_id'] == hadm_id, 'proc_icd9'] = test.icd9_code.str.cat(sep=',')

Wall time: 19min 18s


In [69]:
%%time
df['diag_icd9'] = 0
for hadm_id in set(diag_icd9.hadm_id):
    test = diag_icd9[diag_icd9['hadm_id'] == hadm_id]
    df.loc[df['hadm_id'] == hadm_id, 'diag_icd9'] = test.icd9_code.str.cat(sep=',')

Wall time: 21min 49s


In [79]:
# NOTE(review): the `copy` module is imported but not used in this cell
import copy

# Work on a copy so that `df` keeps the merged, not yet recoded data
data = df.copy()

# Derived columns start out as empty strings and are filled in by later cells
for derived_column in ("age_cat", "type_stay", "prev_adm", "dest_discharge", "emergency_dpt", "icd_chapter"):
    data[derived_column] = ''

"\nfor idx in range(len(data)):\n    df.loc[idx, 'age_cat'] = Age_cat(data, idx)\n    df.loc[idx, 'THS_cat'] = THS_cat(data, idx)\n    df.loc[idx, 'OP'] = OP(data, idx)\n    df.loc[idx, 'emergency_dpt'] = emergency_dpt(data, idx)\n    df.loc[idx, 'dest_discharge'] = dest_discharge(data, idx)\n    df.loc[idx, 'prev_adm'] = prev_adm(data, idx, 14)\n    df.loc[idx, 'icd_chapters'] = icd9_chapters(data, idx)\nTHS_cat2(data)\n"

In [81]:
%%time
# gender recoding
data.replace({'F':'2-Female', 'M': '1-Male'}, inplace = True)

Wall time: 6.48 s


In [82]:
%%time
# age_cat
data.loc[data['age']<18, 'age_cat'] = 1
data.loc[(data['age']>=18) & (df['age'] < 45), 'age_cat'] = 2
data.loc[(data['age']>=45) & (df['age'] < 65), 'age_cat'] = 3
data.loc[(data['age']>=65) & (df['age'] < 85), 'age_cat'] = 4
data.loc[(data['age']>=85), 'age_cat'] = 5

Wall time: 126 ms


In [83]:
%%time
# type of stay
medical_stay = {'CMED', 'DENT', 'ENT', 'GU', 'GYN', 'MED', 'NB', 'NBB', 'NMED', 'OBS', 'OMED', 'PSYCH', 'TRAUM'}
surgical_stay = {'CSURG', 'NSURG', 'ORTHO', 'PSURG', 'SURG','TSURG', 'VSURG'}
obstetric_stay = {'OBS'}
data.loc[(data['curr_service'].isin(medical_stay)), 'type_stay'] = '1-Medical'
data.loc[(data['curr_service'].isin(surgical_stay)), 'type_stay'] = '3-Surgical'
data.loc[(data['curr_service'].isin(obstetric_stay)), 'type_stay'] = '2-Obstetrics'

Wall time: 729 ms


In [84]:
%%time
# Origin of patient
home = {'PHYS REFERRAL/NORMAL DELI', 'HMO REFERRAL/SICK', 'TRANSFER FROM SKILLED NUR'}
data.loc[data['admission_location'].isin(home), 'origin_patient'] = '1-Home'
data.loc[~data['admission_location'].isin(home), 'origin_patient'] = '2-Other'

Wall time: 253 ms


In [85]:
%%time
# admission type
urgent = {'URGENT', 'EMERGENCY'}
data.loc[data['admission_type'].isin(urgent), 'emergency_dpt'] = 'Yes'
data.loc[~data['admission_type'].isin(urgent), 'emergency_dpt'] = 'No'

Wall time: 368 ms


In [86]:
%%time
# discharge location
dest = {'HOME', 'HOME HEALTH CARE'}
data.loc[data['discharge_location'].isin(dest), 'dest_discharge'] = '1-Home'
data.loc[~data['discharge_location'].isin(dest), 'dest_discharge'] = '2-Other'

Wall time: 280 ms


In [87]:
%%time
for subject in set(data.subject_id):
    df_adm = data[data['subject_id'] == subject]
    for i in df_adm.index:
        cur_date = df_adm.loc[i, 'admittime']
        hadm_id = df_adm.loc[i, 'hadm_id']
        subject_id = df_adm.loc[i, 'subject_id']
        df_prev_adm = df_adm[(df_adm.admittime < cur_date) & (df_adm['hadm_id'] != hadm_id)]
        test = df_prev_adm['emergency_dpt'] == 'Yes'
        if len(df_prev_adm) == 0:
            data.loc[i, 'prev_adm'] = '1-No hospitalization'
        elif test.any():
            data.loc[i, 'prev_adm'] = '3-At least one with emergency'
        else:
            data.loc[i, 'prev_adm'] = '2-At least one non emergency'

Wall time: 5h 26min 56s


In [88]:
import datetime

# Print a timestamp to record when this point of the notebook was reached
now = datetime.datetime.now()
print("Current date and time: ", str(now), sep="\n")

Current date and time: 
2022-11-09 15:44:39.835854


In [89]:
%%time
data['icd9_code'] = data['icd9_code'].astype(str)
for idx in data.index:
    if data.loc[idx, 'icd9_code'][0] == 'E':
        data.loc[idx, 'icd_chapter'] = 19
    elif data.loc[idx, 'icd9_code'][0] == 'V':
        data.loc[idx, 'icd_chapter'] = 18
    elif 0 < int(data.loc[idx, 'icd9_code'][0:3]) < 140:
        data.loc[idx, 'icd_chapter'] = 1
    elif 140 <= int(data.loc[idx, 'icd9_code'][0:3]) < 240:
        data.loc[idx, 'icd_chapter'] = 2
    elif 240 <= int(data.loc[idx, 'icd9_code'][0:3]) < 280:
        data.loc[idx, 'icd_chapter'] = 3
    elif 280 <= int(data.loc[idx, 'icd9_code'][0:3]) < 290:
        data.loc[idx, 'icd_chapter'] = 4
    elif 290 <= int(data.loc[idx, 'icd9_code'][0:3]) < 320:
        data.loc[idx, 'icd_chapter'] = 5
    elif 320 <= int(data.loc[idx, 'icd9_code'][0:3]) < 390:
        data.loc[idx, 'icd_chapter'] = 6
    elif 390 <= int(data.loc[idx, 'icd9_code'][0:3]) < 460:
        data.loc[idx, 'icd_chapter'] = 7
    elif 460 <= int(data.loc[idx, 'icd9_code'][0:3]) < 520:
        data.loc[idx, 'icd_chapter'] = 8
    elif 520 <= int(data.loc[idx, 'icd9_code'][0:3]) < 580:
        data.loc[idx, 'icd_chapter'] = 9
    elif 580 <= int(data.loc[idx, 'icd9_code'][0:3]) < 630:
        data.loc[idx, 'icd_chapter'] = 10
    elif 630 <= int(data.loc[idx, 'icd9_code'][0:3]) < 680:
        data.loc[idx, 'icd_chapter'] = 11
    elif 680 <= int(data.loc[idx, 'icd9_code'][0:3]) < 710:
        data.loc[idx, 'icd_chapter'] = 12
    elif 710 <= int(data.loc[idx, 'icd9_code'][0:3]) < 740:
        data.loc[idx, 'icd_chapter'] = 13
    elif 740 <= int(data.loc[idx, 'icd9_code'][0:3]) < 760:
        data.loc[idx, 'icd_chapter'] = 14
    elif 760 <= int(data.loc[idx, 'icd9_code'][0:3]) < 780:
        data.loc[idx, 'icd_chapter'] = 15
    elif 780 <= int(data.loc[idx, 'icd9_code'][0:3]) < 800:
        data.loc[idx, 'icd_chapter'] = 16
    elif 800 <= int(data.loc[idx, 'icd9_code'][0:3]) < 1000:
        data.loc[idx, 'icd_chapter'] = 17 

Wall time: 5h 37min 16s


In [33]:
# Write the prepared frame to a gzip-compressed CSV in the raw data folder (row index omitted)
output_path = f'{path_to_raw}all_data_raw.csv.gzip'
data.to_csv(output_path, compression='gzip', index=False)