# Preprocessing: procedures_icd Table

In [1]:
import os
# Move the working directory two levels up. Every relative path used in the
# cells below (./mimic-iv-1.0/..., ./data/...) is resolved against the new cwd.
# NOTE(review): not idempotent -- re-running this cell climbs two more levels.
# The chdir runs before the project import below, presumably so that
# hosp_preprocess_util can be found from the new cwd -- TODO confirm.
os.chdir('../../')
# NOTE(review): wildcard import. timestamp_cohort_data, called later, is not
# defined in this notebook, so it presumably comes from this module; an
# explicit import would make the dependency visible.
from hosp_preprocess_util import *
# NOTE(review): pickle is not referenced by name in any cell shown here.
import pickle
import pandas as pd

In [2]:
# Read just the admission id and admission timestamp from the admissions
# table; admittime is parsed to datetime so it can take part in date
# arithmetic in a later cell.
admit_data = "./mimic-iv-1.0/core/admissions.csv.gz"
adm = pd.read_csv(
    admit_data,
    parse_dates=['admittime'],
    usecols=['hadm_id', 'admittime'],
)
adm.head()

Unnamed: 0,hadm_id,admittime
0,21038362,2139-09-26 14:16:00
1,24941086,2123-10-07 23:56:00
2,21965160,2147-01-14 09:00:00
3,24709883,2165-12-27 17:33:00
4,23272159,2122-08-28 08:48:00


### Getting the Procedures table into long format with timedelta

In [4]:
# Build the procedures frame with the project helper timestamp_cohort_data,
# keyed on each procedure's chartdate and the cohort's base_anchor_year.
# The displayed result carries timedelta_days / timedelta_years columns.
# NOTE(review): the helper's exact semantics live outside this notebook -- confirm there.
proc = timestamp_cohort_data(
    "./mimic-iv-1.0/hosp/procedures_icd.csv.gz",
    './data/cohort.gzip',
    'chartdate',
    'base_anchor_year',
    dtypes=None,
    usecols=None,
)
proc.head()

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years
516275,10000032,25742920,1,2180-08-06,5491,9,2180,0,2174,2016,2014,2180,2409,6.0
516273,10000032,22595853,1,2180-05-07,5491,9,2180,0,2174,2016,2014,2180,2318,6.0
516274,10000032,22841357,1,2180-06-27,5491,9,2180,0,2174,2016,2014,2180,2369,6.0
473014,10000280,25852320,1,2151-03-18,8938,9,2151,0,2151,2010,2008,2151,76,0.0
248724,10000560,28979390,1,2189-10-16,5551,9,2189,0,2189,2010,2008,2189,288,0.0


In [5]:
# Attach each admission's admittime to its procedure rows. The inner join
# keeps only procedures whose hadm_id also appears in the admissions table.
proc_merged = pd.merge(proc, adm, on='hadm_id', how='inner')

In [7]:
# Preview the first rows to confirm admittime was joined onto the procedures.
proc_merged.head(n=5)

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years,admittime
0,10000032,25742920,1,2180-08-06,5491,9,2180,0,2174,2016,2014,2180,2409,6.0,2180-08-05 23:44:00
1,10000032,22595853,1,2180-05-07,5491,9,2180,0,2174,2016,2014,2180,2318,6.0,2180-05-06 22:23:00
2,10000032,22841357,1,2180-06-27,5491,9,2180,0,2174,2016,2014,2180,2369,6.0,2180-06-26 18:27:00
3,10000280,25852320,1,2151-03-18,8938,9,2151,0,2151,2010,2008,2151,76,0.0,2151-03-18 03:28:00
4,10000560,28979390,1,2189-10-16,5551,9,2189,0,2189,2010,2008,2189,288,0.0,2189-10-15 10:30:00


In [11]:
# Offset of each procedure's chartdate from the admission timestamp.
# NOTE(review): chartdate is displayed without a time-of-day while admittime
# has one, so a procedure charted on the admission day can produce a negative
# offset (e.g. "-1 days +20:32:00" in the preview) -- confirm downstream
# consumers expect that.
proc_merged['proc_time_from_admit'] = proc_merged['chartdate'].sub(proc_merged['admittime'])

In [12]:
# Preview the first rows, now including the proc_time_from_admit column.
proc_merged.head(n=5)

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years,admittime,proc_time_from_admit
0,10000032,25742920,1,2180-08-06,5491,9,2180,0,2174,2016,2014,2180,2409,6.0,2180-08-05 23:44:00,0 days 00:16:00
1,10000032,22595853,1,2180-05-07,5491,9,2180,0,2174,2016,2014,2180,2318,6.0,2180-05-06 22:23:00,0 days 01:37:00
2,10000032,22841357,1,2180-06-27,5491,9,2180,0,2174,2016,2014,2180,2369,6.0,2180-06-26 18:27:00,0 days 05:33:00
3,10000280,25852320,1,2151-03-18,8938,9,2151,0,2151,2010,2008,2151,76,0.0,2151-03-18 03:28:00,-1 days +20:32:00
4,10000560,28979390,1,2189-10-16,5551,9,2189,0,2189,2010,2008,2189,288,0.0,2189-10-15 10:30:00,0 days 13:30:00


In [13]:
# Rebind `proc` to the merged frame so the remaining cells work on it.
# This is an alias, not a copy: both names refer to the same DataFrame.
# NOTE(review): after this cell, re-running the merge cell would join the
# already-merged frame against `adm` again; re-run from the load cell instead.
proc = proc_merged

In [14]:
# Report how many distinct procedure codes exist per ICD version, followed by
# the number of rows recorded under each version.
icd9_codes = proc.loc[proc['icd_version'] == 9, 'icd_code']
icd10_codes = proc.loc[proc['icd_version'] == 10, 'icd_code']
print("# Unique ICD9 Procedures:  ", icd9_codes.dropna().nunique())
print("# Unique ICD10 Procedures: ", icd10_codes.dropna().nunique())

version_counts = proc['icd_version'].value_counts()
print("\nValue counts of each ICD version:\n", version_counts)

# Unique ICD9 Procedures:   2549
# Unique ICD10 Procedures:  4932

Value counts of each ICD version:
 9     466147
10     44906
Name: icd_version, dtype: int64


### Saving the long-format dataset

In [15]:
# Save two versions of procedures dataset; one with all procedure codes (ICD9 & 10), and then one with only ICD10 codes.
# Both files share the same columns, and rows with a missing value in any of them are dropped before writing.
proc_out_cols = ['subject_id', 'hadm_id', 'icd_code', 'chartdate', 'admittime', 'proc_time_from_admit']

# to_csv does not create missing parent directories, so make sure the target exists first.
os.makedirs("./data/long_format/proc", exist_ok=True)

proc[proc_out_cols].dropna().to_csv("./data/long_format/proc/preproc_proc.csv.gz", compression='gzip', index=False)

# Select rows and columns in one .loc call rather than chaining two indexers.
proc.loc[proc.icd_version == 10, proc_out_cols].dropna().to_csv("./data/long_format/proc/preproc_proc_icd10.csv.gz", compression='gzip', index=False)

In [16]:
# Sanity check: read back the ICD-10 extract written by the save cell above and
# preview it. The pickle path previously loaded here is not written by any cell
# in this notebook, so the cell failed with FileNotFoundError.
pd.read_csv("./data/long_format/proc/preproc_proc_icd10.csv.gz", compression='gzip').head()