# Data Cleaning

## Setup

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time

In [2]:
pd.set_option('display.max_rows', None)  ###
pd.set_option('display.max_columns', None)  ###
pd.set_option('display.width', None)  ###
pd.set_option('display.max_colwidth', None)  ###

In [None]:
# # Setup Repository
# with open("repo_info.txt", "r") as repo_info:
#     path_to_repo = repo_info.readline()

# path_to_data = f"{path_to_repo}data/"
# path_to_raw = f"{path_to_data}raw/"
# path_to_processed = f"{path_to_data}processed/"
# path_to_icd = f"{path_to_data}icd_codes/"

## Import Data

In [3]:
path_to_processed= 'processed_data'
raw_df = pd.read_csv(f"{path_to_processed}/df_mixed.csv.gzip", compression = 'gzip', low_memory=False)

In [4]:
# Convert the columns to datetime values
raw_df['dischtime'] = pd.to_datetime(raw_df['dischtime'])
raw_df['admittime'] = pd.to_datetime(raw_df['admittime'])

In [5]:
# Calculate LOS
raw_df['los'] = raw_df.dischtime - raw_df.admittime

In [6]:
raw_df.columns

Index(['hadm_id', 'subject_id', 'admittime', 'dischtime', 'deathtime', 'race',
       'admission_type', 'admission_location', 'insurance', 'language',
       'marital_status', 'discharge_location', 'first_careunit',
       'last_careunit', 'icu_los', 'prev_service', 'curr_service', 'gender',
       'age', 'urea_n_min', 'urea_n_max', 'urea_n_mean', 'platelets_min',
       'platelets_max', 'platelets_mean', 'magnesium_max', 'albumin_min',
       'calcium_min', 'resprate_min', 'resprate_max', 'resprate_mean',
       'glucose_min', 'glucose_max', 'glucose_mean', 'hr_min', 'hr_max',
       'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean', 'diasbp_min',
       'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max', 'temp_mean',
       'sapsii', 'sofa_24hours', 'urine_min', 'urine_mean', 'urine_max',
       'charlson_comorbidity_index', 'patientweight', 'icd_code', 'proc_icd',
       'diag_icd', 'age_cat', 'type_stay', 'prev_adm', 'dest_discharge',
       'emergency_dpt', 'icd_chapter', 'origin

In [29]:
# Keep only relevant columns
df = raw_df[['hadm_id', 'subject_id', 'race', 'admission_type',
             'admission_location', 'insurance', 'language', 'marital_status', 'discharge_location',
             'icu_los','gender', 'age','urea_n_min', 'urea_n_max', 'urea_n_mean','platelets_min', 
             'platelets_max', 'platelets_mean', 'magnesium_max','albumin_min', 'calcium_min', 
             'resprate_min', 'resprate_max','resprate_mean', 'glucose_min', 'glucose_max', 
             'glucose_mean', 'hr_min','hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
             'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max','temp_mean', 'sapsii', 
             'sofa_24hours', 'urine_min', 'urine_mean', 'urine_max', 'patientweight',
             'age_cat','type_stay', 'prev_adm', 'dest_discharge', 'emergency_dpt', 'icd_chapter', 
             'origin_patient', 'los', 'discharge_text','radiology_text']]

## Explore and clean categorical variables

In [30]:
# Keep only categorical variables
var_cat = df.columns[df.dtypes == "object"]
df_cat = df[var_cat]

In [31]:
# Check how many modalities each variable has
df_cat.nunique()

race                     33
admission_type            9
admission_location       11
insurance                 3
language                  2
marital_status            4
discharge_location       13
gender                    2
type_stay                 3
prev_adm                  3
dest_discharge            2
emergency_dpt             2
origin_patient            1
discharge_text        42673
radiology_text        41656
dtype: int64

In [1]:
# Check the categories present in each variable
for col in var_cat:
    print(df_cat[col].value_counts(), '\n')

In [33]:
# Harmonize the ETHNICITY column
df.loc[df['race'].str.contains('WHITE', case=False), 'race'] = 'white'
df.loc[df['race'].str.contains('BLACK', case=False), 'race'] = 'black'
df.loc[df['race'].str.contains('ASIAN', case=False), 'race'] = 'asian'
df.loc[df['race'].str.contains('HISPANIC|PORTUGUESE|SOUTH AMERICAN', case=False), 'race'] = 'hispanic'
df.loc[df['race'].str.contains('UNABLE TO OBTAIN|PATIENT DECLINED TO ANSWER|UNKNOWN/NOT SPECIFIED', case=False), 'race'] = 'unknown'
df.loc[~df['race'].str.contains('white|black|asian|hispanic|unknown', case=False), 'race'] = 'other'

In [34]:
# Check now what we have as categories
df['race'].value_counts()

white       29743
black        4076
UNKNOWN      3788
hispanic     1771
other        1602
asian        1167
unknown       864
Name: race, dtype: int64

In [35]:
# Harmonize the discharge location column
df.loc[df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False, na=False), 'dest_discharge'] = 'home'
df.loc[~df['discharge_location'].str.contains('home|LEFT AGAINST MEDICAL ADVI', case=False, na=False), 'dest_discharge'] = 'other'

In [36]:
# Drop columns we will not need
df.drop(columns=['discharge_location'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['discharge_location'], inplace = True)


In [37]:
# Harmonize the marital status column
df['marital_status'].fillna('unknown', inplace=True)
df.loc[df['marital_status'].str.contains('MARRIED|LIFE PARTNER', case=False, na='unknown'),'marital_status'] = 'couple'
df.loc[df['marital_status'].str.contains('DIVORCED|SEPARATED', case=False), 'marital_status'] = 'separated'
df.loc[df['marital_status'].str.contains('SINGLE', case=False), 'marital_status'] = 'single'
df.loc[df['marital_status'].str.contains('WIDOWED', case=False), 'marital_status'] = 'widowed'
df.loc[df['marital_status'].str.contains('unknown', case=False), 'marital_status'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['marital_status'].fillna('unknown', inplace=True)


In [38]:
# And check now the results
df['marital_status'].value_counts()

couple       19785
single       11672
widowed       5440
separated     3174
unknown       2940
Name: marital_status, dtype: int64

In [39]:
# Harmonize the religion column
df['language'].fillna('unknown', inplace=True)
df.loc[df['language'].str.contains('ENGLISH', case=False), 'language'] = 'English'
df.loc[~df['language'].str.contains('ENGLISH', case=False), 'language'] = 'Other'
df.language = df.language.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['language'].fillna('unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.language = df.language.str.lower()


In [40]:
# And check now the results
df['language'].value_counts()

english    38644
other       4367
Name: language, dtype: int64

In [44]:
# Harmonize the admission type column
df.loc[df['admission_type'].str.contains('EMERGENCY ROOM|URGENT|EW EMER\.', case=False), 'admission_type'] = 'emergency'
df.loc[df['admission_type'].str.contains('PHYSICIAN REFERRAL|CLINIC REFERRAL|OBSERVATION ADMIT|AMBULATORY SURGERY TRANSFER', case=False), 'admission_type'] = 'physician referral'
df.loc[df['admission_type'].str.contains('TRANSFER FROM SKILLED NURSING FACILITY|PACU|TRANSFER FROM HOSPITAL|INTERNAL TRANSFER TO OR FROM PSYCH|PROCEDURE SITE', case=False), 'admission_type'] = 'transfer from facility'
df.loc[df['admission_type'].str.contains('WALK-IN/SELF REFERRAL', case=False), 'admission_type'] = 'self-admission'
df['admission_type'].fillna('Other', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['admission_type'].fillna('Other', inplace=True)


In [45]:
# And check now the results
df['admission_type'].value_counts()

emergency                 20146
physician referral        12216
transfer from facility    10076
self-admission              573
Name: admission_type, dtype: int64

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43011 entries, 0 to 43010
Data columns (total 54 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   hadm_id             43011 non-null  int64          
 1   subject_id          43011 non-null  int64          
 2   race                43011 non-null  object         
 3   admission_type      43011 non-null  object         
 4   admission_location  43011 non-null  object         
 5   insurance           43011 non-null  object         
 6   language            43011 non-null  object         
 7   marital_status      43011 non-null  object         
 8   icu_los             43011 non-null  float64        
 9   gender              43011 non-null  object         
 10  age                 43011 non-null  int64          
 11  urea_n_min          42940 non-null  float64        
 12  urea_n_max          42940 non-null  float64        
 13  urea_n_mean         42940 non-n

In [47]:
# Replace the age category column with the proper names
df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_cat'] = df['age_cat'].replace({1:'< 18 years',2:'18-44 years',3:'45-64 years',4:'65-84 years',5:'85+ years'})


In [48]:
# And check the Age Distribution
df['age_cat'].value_counts()

65-84 years    18070
45-64 years    15317
18-44 years     5543
85+ years       4081
Name: age_cat, dtype: int64

## ICD CODE

In [49]:
icd = pd.read_csv("D:\python\data science mission\icd.csv") #The icd file

In [50]:
icd.head()

Unnamed: 0,Chapter,Code Range,Description
0,1,001-139,Infectious and Parasitic Diseases
1,2,140-239,Neoplasms
2,3,240-279,"Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders"
3,4,280-289,Diseases of the Blood and Blood-forming Organs and Certain Disorders involving the Immune Mechanism
4,5,290-319,Mental Disorders


In [51]:
icd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chapter      42 non-null     int64 
 1   Code Range   42 non-null     object
 2   Description  42 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.1+ KB


In [52]:
# Generate a dictionary for all ICD codes
icd_dict = {key:value for key,value in zip(icd['Chapter'],icd['Description'])}

In [53]:
icd_dict

{1: 'Infectious and Parasitic Diseases',
 2: 'Neoplasms ',
 3: 'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders ',
 4: 'Diseases of the Blood and Blood-forming Organs and Certain Disorders involving the Immune Mechanism',
 5: 'Mental Disorders ',
 6: 'Diseases of the Nervous System ',
 7: 'Diseases of the Sense Organs',
 8: 'Diseases of the Circulatory System ',
 9: 'Diseases of the Respiratory System ',
 10: 'Diseases of the Digestive System ',
 11: 'Diseases of the Genitourinary System ',
 12: 'Complications of Pregnancy, Childbirth, and the Puerperium ',
 13: 'Diseases of the Skin and Subcutaneous Tissue ',
 14: 'Diseases of the Musculoskeletal System and Connective Tissue ',
 15: 'Congenital Anomalies ',
 16: 'Certain Conditions originating in the Perinatal Period ',
 17: 'Symptoms, Signs, and Ill-defined Conditions ',
 18: 'Injury and Poisoning ',
 19: 'Supp Factors Health Status ',
 20: 'Supp External Causes Injury Poisoning ',
 21: 'Certain Infectious and P

In [54]:
# Replace each ICD chapter with its proper description
df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['icd_chapter'] = df['icd_chapter'].replace(icd_dict)


In [55]:
# Finally save the file to a CSV
path_to_processed = "processed_data"
df.to_csv(f'{path_to_processed}/df_mixed_discharge.csv.gzip', compression = 'gzip', index = False)
df.to_csv(f'{path_to_processed}/df_mixed_discharge.csv', index = False)