# Data Cleaning

## Setup

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time

In [2]:
pd.set_option('display.max_rows', None)  ###
pd.set_option('display.max_columns', None)  ###
pd.set_option('display.width', None)  ###
pd.set_option('display.max_colwidth', None)  ###

In [3]:
# Setup Repository
with open("repo_info.txt", "r") as repo_info:
    path_to_repo = repo_info.readline()

path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_icd = f"{path_to_data}icd_codes/"

## Import Data

In [4]:
raw_df = pd.read_csv(f"{path_to_processed}df_mixed.csv.gzip", compression = 'gzip', low_memory=False)

In [5]:
# Convert the columns to datetime values
raw_df['dischtime'] = pd.to_datetime(raw_df['dischtime'])
raw_df['admittime'] = pd.to_datetime(raw_df['admittime'])

In [6]:
# Calculate LOS
raw_df['los'] = raw_df.dischtime - raw_df.admittime

In [7]:
raw_df.columns

Index(['index', 'hadm_id', 'subject_id', 'admittime', 'dischtime', 'deathtime',
       'ethnicity', 'admission_type', 'admission_location', 'insurance',
       'religion', 'marital_status', 'discharge_location', 'costcenter',
       'cpt_code', 'first_careunit', 'last_careunit', 'first_wardid',
       'last_wardid', 'icu_los', 'prev_service', 'curr_service', 'gender',
       'age', 'dob', 'urea_n_min', 'urea_n_max', 'urea_n_mean',
       'platelets_min', 'platelets_max', 'platelets_mean', 'magnesium_max',
       'albumin_min', 'calcium_min', 'resprate_min', 'resprate_max',
       'resprate_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'hr_min',
       'hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
       'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max',
       'temp_mean', 'sapsii', 'sofa', 'urine_min', 'urine_mean', 'urine_max',
       'patientweight', 'icd9_code', 'proc_icd9', 'diag_icd9', 'age_cat',
       'type_stay', 'prev_adm', 'dest_discharg

In [8]:
# Keep only relevant columns
df = raw_df[['hadm_id', 'subject_id', 'ethnicity', 'admission_type',
             'admission_location', 'insurance', 'religion', 'marital_status', 'discharge_location',
             'icu_los','gender', 'age','urea_n_min', 'urea_n_max', 'urea_n_mean','platelets_min', 
             'platelets_max', 'platelets_mean', 'magnesium_max','albumin_min', 'calcium_min', 
             'resprate_min', 'resprate_max','resprate_mean', 'glucose_min', 'glucose_max', 
             'glucose_mean', 'hr_min','hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
             'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max','temp_mean', 'sapsii', 
             'sofa', 'urine_min', 'urine_mean', 'urine_max', 'patientweight',
             'age_cat','type_stay', 'prev_adm', 'dest_discharge', 'emergency_dpt', 'icd_chapter', 
             'origin_patient', 'los', 'text']]

## Explore and clean categorical variables

In [9]:
# Keep only categorical variables
var_cat = df.columns[df.dtypes == "object"]
df_cat = df[var_cat]

In [10]:
# Check how many modalities each variable has
df_cat.nunique()

ethnicity                41
admission_type            3
admission_location        8
insurance                 5
religion                 19
marital_status            7
discharge_location       16
gender                    2
type_stay                 3
prev_adm                  3
dest_discharge            2
emergency_dpt             2
origin_patient            2
text                  30985
dtype: int64

In [26]:
# Check the categories present in each variable
for col in var_cat:
    print(df_cat[col].value_counts(), '\n')

WHITE                                                       22544
BLACK/AFRICAN AMERICAN                                       2568
UNKNOWN/NOT SPECIFIED                                        1951
HISPANIC OR LATINO                                            806
OTHER                                                         656
UNABLE TO OBTAIN                                              553
ASIAN                                                         423
PATIENT DECLINED TO ANSWER                                    299
ASIAN - CHINESE                                               156
HISPANIC/LATINO - PUERTO RICAN                                151
WHITE - RUSSIAN                                               121
BLACK/CAPE VERDEAN                                            118
MULTI RACE ETHNICITY                                           69
HISPANIC/LATINO - DOMINICAN                                    61
BLACK/HAITIAN                                                  59
ASIAN - AS

In [11]:
# Harmonize the ETHNICITY column
df.loc[df['ethnicity'].str.contains('WHITE', case=False), 'ethnicity'] = 'white'
df.loc[df['ethnicity'].str.contains('BLACK', case=False), 'ethnicity'] = 'black'
df.loc[df['ethnicity'].str.contains('ASIAN', case=False), 'ethnicity'] = 'asian'
df.loc[df['ethnicity'].str.contains('HISPANIC|PORTUGUESE|SOUTH AMERICAN', case=False), 'ethnicity'] = 'hispanic'
df.loc[df['ethnicity'].str.contains('UNABLE TO OBTAIN|PATIENT DECLINED TO ANSWER|UNKNOWN/NOT SPECIFIED', case=False), 'ethnicity'] = 'unknown'
df.loc[~df['ethnicity'].str.contains('white|black|asian|hispanic|unknown', case=False), 'ethnicity'] = 'other'

In [12]:
# Check now what we have as categories
df['ethnicity'].value_counts()

white       22778
unknown      2803
black        2766
hispanic     1140
other         788
asian         710
Name: ethnicity, dtype: int64

In [14]:
# Harmonize the discharge location column
df.loc[df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'home'
df.loc[~df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'other'

In [16]:
# Drop columns we will not need
df.drop(columns=['discharge_location'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['discharge_location'], inplace = True)


In [17]:
# Harmonize the marital status column
df['marital_status'].fillna('unknown', inplace=True)
df.loc[df['marital_status'].str.contains('MARRIED|LIFE PARTNER', case=False, na='unknown'),'marital_status'] = 'couple'
df.loc[df['marital_status'].str.contains('DIVORCED|SEPARATED', case=False), 'marital_status'] = 'separated'
df.loc[df['marital_status'].str.contains('unknown', case=False), 'marital_status'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['marital_status'].fillna('unknown', inplace=True)


In [18]:
# And check now the results
df['marital_status'].value_counts()

couple       14806
SINGLE        7903
WIDOWED       4566
separated     2347
unknown       1363
Name: marital_status, dtype: int64

In [22]:
# Harmonize the religion column
df['religion'].fillna('unknown', inplace=True)
df.loc[df['religion'].str.contains('NOT SPECIFIED|UNOBTAINABLE|unknown', case=False), 'religion'] = 'undefined'
df.loc[~df['religion'].str.contains('CATHOLIC|PROTESTANT QUAKER|JEWISH|undefined', case=False), 'religion'] = 'other'
df.religion = df.religion.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['religion'].fillna('unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.religion = df.religion.str.lower()


In [23]:
# And check now the results
df['religion'].value_counts()

catholic             11571
undefined             9682
protestant quaker     3999
jewish                2928
other                 2805
Name: religion, dtype: int64

In [24]:
# Harmonize the admission location column
df.loc[df['admission_location'].str.contains('EMERGENCY ROOM ADMIT|PHYS REFERRAL', case=False), 'admission_location'] = 'home'
df.loc[~df['admission_location'].str.contains('home', case=False), 'admission_location'] = 'other'

In [25]:
# And check now the results
df['admission_location'].value_counts()

home     18691
other    12294
Name: admission_location, dtype: int64

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30985 entries, 0 to 30984
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   hadm_id             30985 non-null  int64          
 1   subject_id          30985 non-null  int64          
 2   ethnicity           30985 non-null  object         
 3   admission_type      30985 non-null  object         
 4   admission_location  30985 non-null  object         
 5   insurance           30985 non-null  object         
 6   religion            30985 non-null  object         
 7   marital_status      30985 non-null  object         
 8   icu_los             30985 non-null  float64        
 9   gender              30985 non-null  object         
 10  age                 30985 non-null  float64        
 11  urea_n_min          30976 non-null  float64        
 12  urea_n_max          30976 non-null  float64        
 13  urea_n_mean         30976 non-n

In [27]:
# Replace the age category column with the proper names
df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})


In [28]:
# And check the Age Distribution
df['age_cat'].value_counts()

65-84 years    13060
45-64 years    10580
18-44 years     4080
85+ years       3265
Name: age_cat, dtype: int64

## ICD9 Codes

In [29]:
# Load the ICD Codes
icd = pd.read_csv(f'{path_to_icd}icd_chapter.txt', sep='\t', low_memory=False)

In [30]:
icd.head()

Unnamed: 0,Chapter,Code Range,Description
0,1,001-139,Infectious Parasitic
1,2,140-239,Neoplasms
2,3,240-279,Endocrine Nutritional Metabolic Immunity Disorders
3,4,280-289,Blood & Blood-Forming Organs
4,5,290-319,Mental Disorders


In [31]:
icd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chapter      19 non-null     int64 
 1   Code Range   19 non-null     object
 2   Description  19 non-null     object
dtypes: int64(1), object(2)
memory usage: 584.0+ bytes


In [32]:
# Generate a dictionary for all ICD codes
icd_dict = {key:value for key,value in zip(icd['Chapter'],icd['Description'])}

In [33]:
icd_dict

{1: 'Infectious Parasitic ',
 2: 'Neoplasms',
 3: 'Endocrine Nutritional Metabolic Immunity Disorders',
 4: 'Blood & Blood-Forming Organs',
 5: 'Mental Disorders',
 6: 'Nervous System & Sense Organs',
 7: 'Circulatory System',
 8: 'Respiratory System',
 9: 'Digestive System',
 10: 'Genitourinary System',
 11: 'Complications Pregnancy Childbirth Puerperium',
 12: 'Skin Subcutaneous Tissue',
 13: 'Musculoskeletal System Connective Tissue',
 14: 'Congenital Anomalies',
 15: 'Certain Conditions Originating Perinatal Period',
 16: 'Symptoms Signs Ill-Defined Conditions',
 17: 'Injury Poisoning',
 18: 'Supp Factors Health Status',
 19: 'Supp External Causes Injury Poisoning'}

In [34]:
# Finally replace each ICD chapter with its proper description
df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)


In [35]:
# Finally save the file to a CSV
df.to_csv(f'{path_to_processed}df_mixed_discharge.csv.gzip', compression = 'gzip', index = False)