# Data Preparation

## Setup

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time

In [6]:
# Setup Repository
with open("repo_info.txt", "r") as repo_info:
    path_to_repo = repo_info.readline()

path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"

## Import Data

In [7]:
# We firstly import the data resulting from our queries
df = pd.read_csv(f'{path_to_raw}all_data2_1.csv').drop(columns=['Unnamed: 0','Unnamed: 0.1'])

  df = pd.read_csv(f'{path_to_data}all_data2_1.csv').drop(columns=['Unnamed: 0','Unnamed: 0.1'])


In [10]:
# Then we drop any duplicates
df = df.drop_duplicates(subset=['hadm_id']).reset_index()

In [11]:
# And print info our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36141 entries, 0 to 36140
Data columns (total 72 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  36141 non-null  int64  
 1   hadm_id                36141 non-null  int64  
 2   subject_id             36141 non-null  int64  
 3   admittime              36141 non-null  object 
 4   dischtime              36141 non-null  object 
 5   deathtime              4693 non-null   object 
 6   ethnicity              36141 non-null  object 
 7   admission_type         36141 non-null  object 
 8   admission_location     36141 non-null  object 
 9   insurance              36141 non-null  object 
 10  religion               36019 non-null  object 
 11  marital_status         34445 non-null  object 
 12  discharge_location     36141 non-null  object 
 13  costcenter             36141 non-null  object 
 14  cpt_code               36141 non-null  object 
 15  fi

In [12]:
df.head()

Unnamed: 0,index,hadm_id,subject_id,admittime,dischtime,deathtime,ethnicity,admission_type,admission_location,insurance,...,readmit_last_careunit,icd9_code,age_cat,THS_cat,prev_adm,dest_discharge,emergency_dpt,admit_loc_cat,icd_chapter,OP
0,0,100003,54610,2150-04-17 15:34:00,2150-04-21 17:30:00,,WHITE,EMERGENCY,EMERGENCY ROOM ADMIT,Private,...,,53100,3,1-Medical,1-No hospitalization,1-Home,Yes,,9,2-Other
1,6,100006,9895,2108-04-06 15:49:00,2108-04-18 17:18:00,,BLACK/AFRICAN AMERICAN,EMERGENCY,EMERGENCY ROOM ADMIT,Private,...,,49320,3,,1-No hospitalization,1-Home,Yes,,8,2-Other
2,14,100007,23018,2145-03-31 05:33:00,2145-04-07 12:40:00,,WHITE,EMERGENCY,EMERGENCY ROOM ADMIT,Private,...,,56081,4,3-Surgical,1-No hospitalization,1-Home,Yes,,9,2-Other
3,17,100009,533,2162-05-16 15:56:00,2162-05-21 13:37:00,,WHITE,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,Private,...,,41401,3,3-Surgical,1-No hospitalization,1-Home,Yes,,7,2-Other
4,25,100010,55853,2109-12-10 07:15:00,2109-12-14 16:45:00,,WHITE,ELECTIVE,PHYS REFERRAL/NORMAL DELI,Private,...,,1890,3,1-Medical,1-No hospitalization,1-Home,No,,2,1-Home


In [17]:
# Convert the deathtime column to a datetime-column
df['deathtime'] = pd.to_datetime(df.deathtime, errors = 'coerce')
# Remove any observation with a recorded death case
df_clean = df[np.isnat(df['deathtime'])].copy()
print(f"Number of observations with a deathtime: {df.shape[0] - df_clean.shape[0]}")

Number of observations with a deathtime: 4693


In [19]:
# We now select only the columns we need, remove the rest and check again for duplicates
vars = ['admit_loc_cat','admittime','albumin_min','cpt_code','costcenter','curr_service','deathtime','discharge_location','dischtime','dob','diag_icd9','first_careunit','first_wardid','icd9_code','icu_los','last_careunit','last_wardid','next_readmit_dt','patientweight','prev_service','proc_icd9','readmit_last_careunit','subject_id','THS_cat']
print(f"N. of observations - pre duplicates removal: {df_clean.shape[0]}")
df_clean = df_clean.drop(columns=vars).drop_duplicates()
print(f"N. of observations - post duplicates removal: {df_clean.shape[0]}")

N. of observations - pre duplicates removal: 31448
N. of observations - post duplicates removal: 31448


### Process NOTEEVENTS to get discharge notes

In [19]:
# Now load the NOTEEVENTS
text = pd.read_csv(f'{path_to_raw}NOTEEVENTS.csv', low_memory = False)

In [20]:
text.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [35]:
# Subset to only discharge summaries
df_adm_text = text[text['CATEGORY'] == 'Discharge summary'][['HADM_ID','TEXT']].set_index('HADM_ID')

In [None]:
# Then merge it with the tabular dataset
df_full = pd.merge(df_clean.set_index('hadm_id'), df_adm_text, left_index=True, right_index=True, indicator = True)
print(f"Check merges:\n{df_full._merge.value_counts()}")
df_full.drop(columns = ['_merge'], inplace = True)

In [54]:
# Finally save the file to a CSV
df_full.to_csv(f'{path_to_processed}df_mixed.csv.gzip', compression = 'gzip', index = False)