# Data Cleaning

## Setup

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time

In [2]:
pd.set_option('display.max_rows', None)  ###
pd.set_option('display.max_columns', None)  ###
pd.set_option('display.width', None)  ###
pd.set_option('display.max_colwidth', None)  ###

In [3]:
# Setup Repository
with open("repo_info.txt", "r") as repo_info:
    path_to_repo = repo_info.readline()

path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_icd = f"{path_to_data}icd_codes/"

In [4]:
# PARAMETERS -----------------------------------
# Set to True if we want to include deaths
death_incl = True
death_tag = np.where(death_incl,"_death", "")

## Import Data

In [5]:
raw_df = pd.read_csv(f"{path_to_processed}df_mixed{death_tag}.csv.gzip", compression = 'gzip', low_memory=False)

In [6]:
# Convert the columns to datetime values
raw_df['dischtime'] = pd.to_datetime(raw_df['dischtime'])
raw_df['admittime'] = pd.to_datetime(raw_df['admittime'])

In [7]:
# Calculate LOS
raw_df['los'] = raw_df.dischtime - raw_df.admittime

In [8]:
raw_df.columns

Index(['index', 'hadm_id', 'subject_id', 'admittime', 'dischtime', 'deathtime',
       'ethnicity', 'admission_type', 'admission_location', 'insurance',
       'religion', 'marital_status', 'discharge_location', 'costcenter',
       'cpt_code', 'first_careunit', 'last_careunit', 'first_wardid',
       'last_wardid', 'icu_los', 'prev_service', 'curr_service', 'gender',
       'age', 'dob', 'urea_n_min', 'urea_n_max', 'urea_n_mean',
       'platelets_min', 'platelets_max', 'platelets_mean', 'magnesium_max',
       'albumin_min', 'calcium_min', 'resprate_min', 'resprate_max',
       'resprate_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'hr_min',
       'hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
       'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max',
       'temp_mean', 'sapsii', 'sofa', 'urine_min', 'urine_mean', 'urine_max',
       'patientweight', 'icd9_code', 'proc_icd9', 'diag_icd9', 'age_cat',
       'type_stay', 'prev_adm', 'dest_discharg

In [9]:
# Keep only relevant columns
df = raw_df[['hadm_id', 'subject_id', 'ethnicity', 'admission_type',
             'admission_location', 'insurance', 'religion', 'marital_status', 'discharge_location',
             'icu_los','gender', 'age','urea_n_min', 'urea_n_max', 'urea_n_mean','platelets_min', 
             'platelets_max', 'platelets_mean', 'magnesium_max','albumin_min', 'calcium_min', 
             'resprate_min', 'resprate_max','resprate_mean', 'glucose_min', 'glucose_max', 
             'glucose_mean', 'hr_min','hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
             'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max','temp_mean', 'sapsii', 
             'sofa', 'urine_min', 'urine_mean', 'urine_max', 'patientweight',
             'age_cat','type_stay', 'prev_adm', 'dest_discharge', 'emergency_dpt', 'icd_chapter', 
             'origin_patient', 'los', 'text']]

## Explore and clean categorical variables

In [10]:
# Keep only categorical variables
var_cat = df.columns[df.dtypes == "object"]
df_cat = df[var_cat]

In [11]:
# Check how many modalities each variable has
df_cat.nunique()

ethnicity                41
admission_type            3
admission_location        8
insurance                 5
religion                 19
marital_status            7
discharge_location       17
gender                    2
type_stay                 3
prev_adm                  3
dest_discharge            2
emergency_dpt             2
origin_patient            2
text                  35268
dtype: int64

In [12]:
# Check the categories present in each variable
for col in var_cat:
    print(df_cat[col].value_counts(), '\n')

WHITE                                                       25605
BLACK/AFRICAN AMERICAN                                       2832
UNKNOWN/NOT SPECIFIED                                        2399
HISPANIC OR LATINO                                            884
OTHER                                                         749
UNABLE TO OBTAIN                                              671
ASIAN                                                         500
PATIENT DECLINED TO ANSWER                                    336
ASIAN - CHINESE                                               177
HISPANIC/LATINO - PUERTO RICAN                                161
WHITE - RUSSIAN                                               139
BLACK/CAPE VERDEAN                                            126
MULTI RACE ETHNICITY                                           77
BLACK/HAITIAN                                                  65
HISPANIC/LATINO - DOMINICAN                                    63
ASIAN - AS

In [12]:
# Harmonize the ETHNICITY column
df.loc[df['ethnicity'].str.contains('WHITE', case=False), 'ethnicity'] = 'white'
df.loc[df['ethnicity'].str.contains('BLACK', case=False), 'ethnicity'] = 'black'
df.loc[df['ethnicity'].str.contains('ASIAN', case=False), 'ethnicity'] = 'asian'
df.loc[df['ethnicity'].str.contains('HISPANIC|PORTUGUESE|SOUTH AMERICAN', case=False), 'ethnicity'] = 'hispanic'
df.loc[df['ethnicity'].str.contains('UNABLE TO OBTAIN|PATIENT DECLINED TO ANSWER|UNKNOWN/NOT SPECIFIED', case=False), 'ethnicity'] = 'unknown'
df.loc[~df['ethnicity'].str.contains('white|black|asian|hispanic|unknown', case=False), 'ethnicity'] = 'other'

In [13]:
# Check now what we have as categories
df['ethnicity'].value_counts()

white       25863
unknown      3406
black        3047
hispanic     1232
other         895
asian         825
Name: ethnicity, dtype: int64

In [14]:
# Harmonize the discharge location column
df.loc[df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'home'
df.loc[~df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'other'

In [15]:
# Drop columns we will not need
df.drop(columns=['discharge_location'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['discharge_location'], inplace = True)


In [16]:
# Harmonize the marital status column
df['marital_status'].fillna('unknown', inplace=True)
df.loc[df['marital_status'].str.contains('MARRIED|LIFE PARTNER', case=False, na='unknown'),'marital_status'] = 'couple'
df.loc[df['marital_status'].str.contains('DIVORCED|SEPARATED', case=False), 'marital_status'] = 'separated'
df.loc[df['marital_status'].str.contains('unknown', case=False), 'marital_status'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['marital_status'].fillna('unknown', inplace=True)


In [17]:
# And check now the results
df['marital_status'].value_counts()

couple       16737
SINGLE        8717
WIDOWED       5367
separated     2635
unknown       1812
Name: marital_status, dtype: int64

In [18]:
# Harmonize the religion column
df['religion'].fillna('unknown', inplace=True)
df.loc[df['religion'].str.contains('NOT SPECIFIED|UNOBTAINABLE|unknown', case=False), 'religion'] = 'undefined'
df.loc[~df['religion'].str.contains('CATHOLIC|PROTESTANT QUAKER|JEWISH|undefined', case=False), 'religion'] = 'other'
df.religion = df.religion.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['religion'].fillna('unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.religion = df.religion.str.lower()


In [19]:
# And check now the results
df['religion'].value_counts()

catholic             13034
undefined            11202
protestant quaker     4497
jewish                3401
other                 3134
Name: religion, dtype: int64

In [20]:
# Harmonize the admission location column
df.loc[df['admission_location'].str.contains('EMERGENCY ROOM ADMIT|PHYS REFERRAL', case=False), 'admission_location'] = 'home'
df.loc[~df['admission_location'].str.contains('home', case=False), 'admission_location'] = 'other'

In [21]:
# And check now the results
df['admission_location'].value_counts()

home     21212
other    14056
Name: admission_location, dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35268 entries, 0 to 35267
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   hadm_id             35268 non-null  int64          
 1   subject_id          35268 non-null  int64          
 2   ethnicity           35268 non-null  object         
 3   admission_type      35268 non-null  object         
 4   admission_location  35268 non-null  object         
 5   insurance           35268 non-null  object         
 6   religion            35268 non-null  object         
 7   marital_status      35268 non-null  object         
 8   icu_los             35268 non-null  float64        
 9   gender              35268 non-null  object         
 10  age                 35268 non-null  float64        
 11  urea_n_min          35253 non-null  float64        
 12  urea_n_max          35253 non-null  float64        
 13  urea_n_mean         35253 non-n

In [23]:
# Replace the age category column with the proper names
df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})


In [24]:
# And check the Age Distribution
df['age_cat'].value_counts()

65-84 years    15133
45-64 years    11672
18-44 years     4368
85+ years       4095
Name: age_cat, dtype: int64

## ICD9 Codes

In [25]:
# Load the ICD Codes
icd = pd.read_csv(f'{path_to_icd}icd_chapter.txt', sep='\t', low_memory=False)

In [26]:
icd.head()

Unnamed: 0,Chapter,Code Range,Description
0,1,001-139,Infectious Parasitic
1,2,140-239,Neoplasms
2,3,240-279,Endocrine Nutritional Metabolic Immunity Disorders
3,4,280-289,Blood & Blood-Forming Organs
4,5,290-319,Mental Disorders


In [27]:
icd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chapter      19 non-null     int64 
 1   Code Range   19 non-null     object
 2   Description  19 non-null     object
dtypes: int64(1), object(2)
memory usage: 584.0+ bytes


In [28]:
# Generate a dictionary for all ICD codes
icd_dict = {key:value for key,value in zip(icd['Chapter'],icd['Description'])}

In [29]:
icd_dict

{1: 'Infectious Parasitic ',
 2: 'Neoplasms',
 3: 'Endocrine Nutritional Metabolic Immunity Disorders',
 4: 'Blood & Blood-Forming Organs',
 5: 'Mental Disorders',
 6: 'Nervous System & Sense Organs',
 7: 'Circulatory System',
 8: 'Respiratory System',
 9: 'Digestive System',
 10: 'Genitourinary System',
 11: 'Complications Pregnancy Childbirth Puerperium',
 12: 'Skin Subcutaneous Tissue',
 13: 'Musculoskeletal System Connective Tissue',
 14: 'Congenital Anomalies',
 15: 'Certain Conditions Originating Perinatal Period',
 16: 'Symptoms Signs Ill-Defined Conditions',
 17: 'Injury Poisoning',
 18: 'Supp Factors Health Status',
 19: 'Supp External Causes Injury Poisoning'}

In [30]:
# Finally replace each ICD chapter with its proper description
df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)


In [31]:
# Finally save the file to a CSV
df.to_csv(f'{path_to_processed}df_mixed_discharge{death_tag}.csv.gzip', compression = 'gzip', index = False)