# Data Cleaning

## Setup

In [None]:
# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time

In [2]:
pd.set_option('display.max_rows', None)  ###
pd.set_option('display.max_columns', None)  ###
pd.set_option('display.width', None)  ###
pd.set_option('display.max_colwidth', None)  ###

In [None]:
# Setup Repository
with open("repo_info.txt", "r") as repo_info:
    path_to_repo = repo_info.readline()

path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_icd = f"{path_to_data}icd_codes/"

## Import Data

In [4]:
raw_df = pd.read_csv(f"{path_to_processed}df_mixed.csv.gzip", compression = 'gzip', low_memory=False)

In [8]:
# Keep only relevant columns
df = raw_df[['HADM_ID', 'subject_id', 'ethnicity', 'admission_type',
             'admission_location', 'insurance', 'religion', 'marital_status', 'discharge_location',
             'icu_los','gender', 'age','urea_n_min', 'urea_n_max', 'urea_n_mean','platelets_min', 
             'platelets_max', 'platelets_mean', 'magnesium_max','albumin_min', 'calcium_min', 
             'resprate_min', 'resprate_max','resprate_mean', 'glucose_min', 'glucose_max', 
             'glucose_mean', 'hr_min','hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
             'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max','temp_mean', 'sapsii', 
             'sofa', 'urine_min', 'urine_mean', 'urine_max', 'patientweight', 'readmit_dt',
             'age_cat','THS_cat', 'prev_adm', 'dest_discharge', 'emergency_dpt','admit_loc_cat', 'icd_chapter', 'OP', 'los', 'discharge']]

In [9]:
# Then rename certain columns
df.rename(columns = {"OP":"origin_patient","THS_cat":"type_stay"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Explore and clean categorical variables

In [12]:
# Keep only categorical variables
var_cat = df.columns[df.dtypes == "object"]
df_cat = df[var_cat]

In [13]:
# Check how many modalities each variable has
df_cat.nunique()

ethnicity             41
admission_type         3
admission_location     8
insurance              5
religion              19
marital_status         7
discharge_location    17
gender                 2
type_stay              3
prev_adm               3
dest_discharge         2
emergency_dpt          2
origin_patient         2
dtype: int64

In [15]:
# Check the categories present in each variable
for col in var_cat:
    print(df_cat[col].value_counts(), '\n')

WHITE                                                       28814
BLACK/AFRICAN AMERICAN                                       3183
UNKNOWN/NOT SPECIFIED                                        2937
HISPANIC OR LATINO                                            996
OTHER                                                         847
UNABLE TO OBTAIN                                              743
ASIAN                                                         559
PATIENT DECLINED TO ANSWER                                    370
ASIAN - CHINESE                                               188
HISPANIC/LATINO - PUERTO RICAN                                171
WHITE - RUSSIAN                                               151
BLACK/CAPE VERDEAN                                            133
MULTI RACE ETHNICITY                                           84
BLACK/HAITIAN                                                  68
HISPANIC/LATINO - DOMINICAN                                    66
ASIAN - AS

In [16]:
# Harmonize the ETHNICITY column
df.loc[df['ethnicity'].str.contains('WHITE', case=False), 'ethnicity'] = 'white'
df.loc[df['ethnicity'].str.contains('BLACK', case=False), 'ethnicity'] = 'black'
df.loc[df['ethnicity'].str.contains('ASIAN', case=False), 'ethnicity'] = 'asian'
df.loc[df['ethnicity'].str.contains('HISPANIC|PORTUGUESE|SOUTH AMERICAN', case=False), 'ethnicity'] = 'hispanic'
df.loc[df['ethnicity'].str.contains('UNABLE TO OBTAIN|PATIENT DECLINED TO ANSWER|UNKNOWN/NOT SPECIFIED', case=False), 'ethnicity'] = 'unknown'
df.loc[~df['ethnicity'].str.contains('white|black|asian|hispanic|unknown', case=False), 'ethnicity'] = 'other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [22]:
# Check now what we have as categories
df['ethnicity'].value_counts()

white       29103
unknown      4050
black        3408
hispanic     1368
other        1007
asian         905
Name: ethnicity, dtype: int64

In [23]:
# Drop useless columns
df.drop(columns='dest_discharge', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
# Harmonize the discharge location column
df.loc[df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'home'
df.loc[~df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False), 'dest_discharge'] = 'other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)


In [26]:
# And check now the results
df['discharge_location'].value_counts()

other    21912
home     17929
Name: dest_discharge, dtype: int64

In [27]:
# Harmonize the marital status column
df['marital_status'].fillna('unknown', inplace=True)
df.loc[df['marital_status'].str.contains('MARRIED|LIFE PARTNER', case=False, na='unknown'),'marital_status'] = 'couple'
df.loc[df['marital_status'].str.contains('DIVORCED|SEPARATED', case=False), 'marital_status'] = 'separated'
df.loc[df['marital_status'].str.contains('unknown', case=False), 'marital_status'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [30]:
# And check now the results
df['marital_status'].value_counts()

couple       18870
SINGLE        9768
WIDOWED       6054
separated     3010
unknown       2139
Name: marital_status, dtype: int64

In [31]:
# Harmonize the religion column
df['religion'].fillna('unknown', inplace=True)
df.loc[df['religion'].str.contains('NOT SPECIFIED|UNOBTAINABLE|unknown', case=False), 'religion'] = 'undefined'
df.loc[~df['religion'].str.contains('CATHOLIC|PROTESTANT QUAKER|JEWISH|undefined', case=False), 'religion'] = 'other'

In [33]:
# And check now the results
df['religion'].value_counts()

CATHOLIC             14720
undefined            12569
PROTESTANT QUAKER     5087
JEWISH                3863
other                 3602
Name: religion, dtype: int64

In [35]:
# Harmonize the admission location column
df.loc[df['admission_location'].str.contains('EMERGENCY ROOM ADMIT|PHYS REFERRAL', case=False), 'admission_location'] = 'home'
df.loc[~df['admission_location'].str.contains('home', case=False), 'admission_location'] = 'other'

In [37]:
# And check now the results
df['admission_location'].value_counts()

home     24155
other    15686
Name: admission_location, dtype: int64

# CHECK HERE IF WE ACTUALLY DROPPED IT

In [40]:
# Drop columns we will not need
df_final = df.drop(columns=['discharge_location','admit_loc_cat'])

In [41]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39841 entries, 0 to 39840
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   HADM_ID             39841 non-null  float64
 1   subject_id          39841 non-null  int64  
 2   ethnicity           39841 non-null  object 
 3   admission_type      39841 non-null  object 
 4   admission_location  39841 non-null  object 
 5   insurance           39841 non-null  object 
 6   religion            39841 non-null  object 
 7   marital_status      39841 non-null  object 
 8   icu_los             39841 non-null  float64
 9   gender              39841 non-null  object 
 10  age                 39841 non-null  float64
 11  urea_n_min          39826 non-null  float64
 12  urea_n_max          39826 non-null  float64
 13  urea_n_mean         39826 non-null  float64
 14  platelets_min       39821 non-null  float64
 15  platelets_max       39821 non-null  float64
 16  plat

In [43]:
# Replace the age category column with the proper names
df_final['age_cat'] = df_final['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})

In [44]:
# And check the Age Distribution
df_final['age_cat'].value_counts()

65-84 years    17198
45-64 years    13169
18-44 years     4870
85+ years       4539
< 18 years        65
Name: age_cat, dtype: int64

## ICD9 Codes

In [45]:
# Load the ICD Codes
icd = pd.read_csv(f'{path_to_icd}icd_chapter.txt', sep='\t', low_memory=False)

In [46]:
icd.head()

Unnamed: 0,Chapter,Code Range,Description
0,1,001-139,Infectious Parasitic
1,2,140-239,Neoplasms
2,3,240-279,Endocrine Nutritional Metabolic Immunity Disorders
3,4,280-289,Blood & Blood-Forming Organs
4,5,290-319,Mental Disorders


In [47]:
icd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chapter      19 non-null     int64 
 1   Code Range   19 non-null     object
 2   Description  19 non-null     object
dtypes: int64(1), object(2)
memory usage: 584.0+ bytes


In [48]:
# Generate a dictionary for all ICD codes
icd_dict = {key:value for key,value in zip(icd['Chapter'],icd['Description'])}

In [49]:
icd_dict

{1: 'Infectious Parasitic ',
 2: 'Neoplasms',
 3: 'Endocrine Nutritional Metabolic Immunity Disorders',
 4: 'Blood & Blood-Forming Organs',
 5: 'Mental Disorders',
 6: 'Nervous System & Sense Organs',
 7: 'Circulatory System',
 8: 'Respiratory System',
 9: 'Digestive System',
 10: 'Genitourinary System',
 11: 'Complications Pregnancy Childbirth Puerperium',
 12: 'Skin Subcutaneous Tissue',
 13: 'Musculoskeletal System Connective Tissue',
 14: 'Congenital Anomalies',
 15: 'Certain Conditions Originating Perinatal Period',
 16: 'Symptoms Signs Ill-Defined Conditions',
 17: 'Injury Poisoning',
 18: 'Supp Factors Health Status',
 19: 'Supp External Causes Injury Poisoning'}

In [50]:
# Finally replace each ICD chapter with its proper description
df_final['icd_chapter'] = df_final['icd_chapter'].replace(icd_dict)

In [None]:
# Finally save the file to a CSV
df_final.to_csv(f'{path_to_processed}df_mixed_discharge.csv.gzip', compression = 'gzip', index = False)