In [1]:
import numpy as np
import pandas as pd
import sqlite3 # library for working with sqlite database
conn = sqlite3.connect("data.db") # Create a connection to the on-disk database

In [2]:
pd.read_sql("SELECT * FROM sqlite_master where type='table'", conn)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,patients,patients,2,"CREATE TABLE patients(ROW_ID INT,SUBJECT_ID IN..."
1,table,admissions,admissions,639,"CREATE TABLE admissions(ROW_ID INT,SUBJECT_ID ..."
2,table,icustays,icustays,3515,"CREATE TABLE icustays(ROW_ID INT,SUBJECT_ID IN..."
3,table,diagnoses_icd,diagnoses_icd,4944,"CREATE TABLE diagnoses_icd(ROW_ID INT,SUBJECT_..."
4,table,drgcodes,drgcodes,9061,"CREATE TABLE drgcodes(ROW_ID INT,SUBJECT_ID IN..."


In [3]:
pd.read_sql("""SELECT * FROM admissions LIMIT 10""", conn)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1
5,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,,,V-TACH,0,1
6,27,27,134931,2191-11-30 22:16:00,2191-12-03 14:45:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,CATHOLIC,,WHITE,,,NEWBORN,0,1
7,28,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
8,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,UNKNOWN/NOT SPECIFIED,,,UNSTABLE ANGINA\CATH,0,1
9,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,CATHOLIC,MARRIED,WHITE,,,STATUS EPILEPTICUS,1,1


In [4]:
pat_df = pd.read_sql("SELECT * FROM patients", conn)
# adm_df = pd.read_sql("SELECT * FROM admissions", conn)
pat_df.shape[0]

46520

In [5]:
pat_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0


In [6]:
# 'First' admission if it is a patient's first ever admission
# Exclude patients who die during their 'first' visit

one_admission_deaths = pd.read_sql("""SELECT SUBJECT_ID FROM admissions
WHERE HOSPITAL_EXPIRE_FLAG IS TRUE
GROUP BY SUBJECT_ID
HAVING COUNT(SUBJECT_ID) = 1
""", conn)
print(one_admission_deaths.shape[0])
# print(one_admission_deaths.head())

one_admission_survivors = pd.read_sql("""SELECT SUBJECT_ID FROM admissions
WHERE HOSPITAL_EXPIRE_FLAG IS FALSE
GROUP BY SUBJECT_ID
HAVING COUNT(SUBJECT_ID) = 1
ORDER BY CAST(SUBJECT_ID AS UNSIGNED) ASC
""", conn)
print(one_admission_survivors.shape[0])
print(one_admission_survivors.head())

mult_admissions = pd.read_sql("""SELECT SUBJECT_ID FROM admissions
GROUP BY SUBJECT_ID
HAVING COUNT(SUBJECT_ID) > 1
ORDER BY CAST(SUBJECT_ID AS UNSIGNED) ASC
""", conn)
print(mult_admissions.shape[0])
print(mult_admissions.head())

# Concatenate all subject IDs from patients who survived their first visit
patients_cols = pd.concat([one_admission_survivors, mult_admissions], axis = 0)
print(patients_cols.shape[0])
patients_cols[0:10]

5772
35522
   SUBJECT_ID
0           2
1           3
2           4
3           5
4           6
7537
   SUBJECT_ID
0          17
1          21
2          23
3          34
4          36
43059


Unnamed: 0,SUBJECT_ID
0,2
1,3
2,4
3,5
4,6
5,7
6,8
7,10
8,11
9,13


In [7]:
# Only take table info for subjects that are included in our design matrix
# Filter via inner join
admissions_feats = pd.read_sql("""SELECT adm.SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME
                               FROM admissions adm
                               INNER JOIN patients pat
                               ON adm.SUBJECT_ID = pat.SUBJECT_ID
                               ORDER BY pat.SUBJECT_ID ASC
                               """, conn)
icustays_feats = pd.read_sql("""SELECT icu.SUBJECT_ID, HADM_ID, LAST_CAREUNIT, LOS
                             FROM icustays icu
                             INNER JOIN patients pat
                             ON icu.SUBJECT_ID = pat.SUBJECT_ID
                             ORDER BY pat.SUBJECT_ID ASC
                             """, conn)
diagnoses_icd_feats = pd.read_sql("""SELECT diag.SUBJECT_ID, HADM_ID, ICD9_CODE
                                  FROM diagnoses_icd diag
                                  INNER JOIN patients pat
                                  ON diag.SUBJECT_ID = pat.SUBJECT_ID
                                  ORDER BY pat.SUBJECT_ID ASC
                                  """, conn)
# Left joined DRG codes because a lot of DRG severity/mortality data is missing
drgcodes_feats = pd.read_sql("""SELECT drg.SUBJECT_ID, HADM_ID, DRG_SEVERITY, DRG_MORTALITY 
                             FROM drgcodes drg
                             LEFT JOIN patients pat
                             ON drg.SUBJECT_ID = pat.SUBJECT_ID
                             ORDER BY pat.SUBJECT_ID ASC
                             """, conn)

In [8]:
admissions_feats.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00


In [9]:
icustays_feats.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,LAST_CAREUNIT,LOS
0,2,163353,NICU,0.0918
1,3,145834,MICU,6.0646
2,4,185777,MICU,1.6785
3,5,178980,NICU,0.0844
4,6,107064,SICU,3.6729


In [10]:
diagnoses_icd_feats.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,2,163353,V3001
1,2,163353,V053
2,2,163353,V290
3,3,145834,0389
4,3,145834,78559


In [11]:
drgcodes_feats[15:25]

Unnamed: 0,SUBJECT_ID,HADM_ID,DRG_SEVERITY,DRG_MORTALITY
15,16,103251,,
16,17,194023,1.0,1.0
17,17,194023,,
18,17,161087,,
19,17,161087,3.0,3.0
20,18,188822,,
21,19,109235,3.0,3.0
22,19,109235,,
23,20,157681,,
24,21,111970,,


In [12]:
# Generate additional columns:
# Append truth labels column (Was patient readmitted within 30 days?)
# Append 'ICU stay?' column

In [13]:
subj_adm = admissions_feats["SUBJECT_ID"].value_counts()
subj_adm = subj_adm.sort_index()

mult_adm_subj = subj_adm.index[subj_adm > 1]

# Instantiate to all zeros
truth = np.zeros((len(subj_adm)))

subj_adm_idx = list(subj_adm.index)

for subject in mult_adm_subj:
    temp_df = admissions_feats[admissions_feats["SUBJECT_ID"] == subject]
    
    # Get time between first discharge and second admission
    first_disc_time = temp_df["DISCHTIME"].iloc[0]
    second_adm_time = temp_df["ADMITTIME"].iloc[1]
    days_to_readm = (pd.to_datetime(second_adm_time) - pd.to_datetime(first_disc_time)).days
    
    # If readmitted within 30 days, set truth to 1
    if days_to_readm <= 30:
        truth[subj_adm_idx.index(subject)] = 1
        
truth_df = pd.DataFrame({'SUBJECT_ID': subj_adm_idx, 'TARGET': np.array(truth, dtype='int')})