In [None]:
#######################################################################################
#
# Perform data preprocessing for ICU and hospitalization prediction
# using pandas
# 
#######################################################################################

In [None]:
# Input data, downloaded from
# https://www.kaggle.com/tanmoyx/covid19-patient-precondition-dataset
# FILENAME is the path of the CSV file that gets loaded with pd.read_csv.
FILENAME = "source/covid.csv"

# Location for the generated data files. Output file names are appended to
# this prefix by plain string concatenation, so the trailing slash matters.
TARGET = "../visualization/data/"

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the original data and inspect its structure.
# `df` is the working dataframe that the following cells filter step by step.
#

df = pd.read_csv(FILENAME)
print("Original dataframe shape: ", df.shape)
# last expression of the cell: shows the first rows as the cell output
df.head()

In [None]:
# Verification only: list every record coded as male (sex == 2) that is also
# coded as pregnant (pregnancy == 1). The expected result is an empty frame.
#

df.loc[df["sex"].eq(2) & df["pregnancy"].eq(1)]

In [None]:
# Remove patients whose date of entry is later than the day they died
# (an impossible ordering).
#

from datetime import datetime  # not used by this cell; kept so the name stays available to any other cell

# Only rows with a real date_died can be checked. "9999-99-99" is skipped
# (presumably the placeholder for patients without a death date - confirm
# against the dataset description).
# .copy() gives an independent frame, so converting its columns below does not
# touch `df`.
predf = df[df.date_died != "9999-99-99"].copy()
print(predf.shape)

# Convert the day-month-year strings to datetime.
# errors='coerce' turns an unparsable value into NaT, which never compares as
# "greater than", so such a row is simply kept. The previous errors='ignore'
# returned the original strings on any failure, which silently turned the
# comparison below into a lexicographic string comparison.
for date_column in ("entry_date", "date_died"):
    predf[date_column] = pd.to_datetime(predf[date_column], format='%d-%m-%Y', errors='coerce')

# rows where the entry date lies after the date of death
predf_invalid = predf[predf.entry_date > predf.date_died]
print(predf_invalid.shape)

# drop those rows from the working dataframe (predf shares df's index labels)
df = df.drop(index=predf_invalid.index)

print(df.shape)

In [None]:
# Keep only the patients whose COVID-19 test result is positive (covid_res == 1).
#

df = df.loc[df["covid_res"].eq(1)]

print("COVID-19 only dataframe shape: ", df.shape)
df.head()

In [None]:
# Column selections used by the preprocessing steps. The three public lists are
# assembled from shared pieces; the order of each list is significant because
# it becomes the column order of the generated CSV files.

# pre-existing condition / risk-factor columns shared by every selection
_condition_columns = [
    'diabetes',
    'copd',
    'asthma',
    'inmsupr',
    'hypertension',
    'other_disease',
    'cardiovascular',
    'obesity',
    'renal_chronic',
    'tobacco',
]

# features in the order used by the combined and the icu selections
_feature_columns = ['age', 'sex', 'pneumonia'] + _condition_columns

# every feature plus both labels ('icu' and 'patient_type')
all_selected_columns = _feature_columns + ['icu', 'patient_type']

# features plus the 'icu' label
icu_columns = _feature_columns + ['icu']

# features plus the 'patient_type' label; this selection uses its own leading order
hospitalization_columns = ['sex', 'patient_type', 'pneumonia', 'age'] + _condition_columns


In [None]:
# Take an independent copy of the relevant feature and label columns of df,
# so the recoding done later does not write back into df.

df_all = df.loc[:, all_selected_columns].copy()
print("Selected dataframe shape: ", df_all.shape)
df_all.head()

In [None]:
# Convert the coded values to 0, 1 and 2:
#   0 = no (female for sex), 1 = yes (male for sex), 2 = unknown (raw 97, 98, 99).
# The age column holds real ages and is not recoded.
#

# sex:          female 1 => 0, male 2 => 1
# patient_type: no_hospitalization 1 => 0, yes_hospitalization 2 => 1
# A dict replace maps both codes in a single pass, so the freshly written 1 is
# not remapped.
shifted_code_map = {1: 0, 2: 1}
for shifted_column in ('sex', 'patient_type'):
    df_all[shifted_column] = df_all[shifted_column].replace(shifted_code_map)

# yes/no columns (including the icu label): no 2 => 0, yes stays 1
yes_no_columns = [
    'pneumonia',
    'diabetes',
    'copd',
    'asthma',
    'inmsupr',
    'hypertension',
    'other_disease',
    'cardiovascular',
    'obesity',
    'renal_chronic',
    'tobacco',
    'icu',   # label
]
df_all[yes_no_columns] = df_all[yes_no_columns].replace(2, 0)

# Replace all unknown values (97, 98 and 99) with 2 in every column except age.
# This has to run after the recoding above, otherwise the new 2 would be
# remapped as well. Leaving age out keeps the ages 97, 98 and 99 as they are,
# without swapping them to temporary stand-in values and back.
coded_columns = [column for column in df_all.columns if column != 'age']
df_all[coded_columns] = df_all[coded_columns].replace([97, 98, 99], 2)

print("Dataframe shape: ", df_all.shape)
df_all.head()

In [None]:
# Write the recoded dataframe to a csv file (header row, no index column).

df_all.to_csv(path_or_buf=TARGET + 'covid_cleaned.csv', header=True, index=False)

In [None]:
# icu preprocessing

df_icu = df_all[icu_columns].copy()

# Keep only the rows that contain no unknown value (== 2) in any feature or in
# the icu label. age is left out of the check, so a value of 2 there does not
# remove the row.
df_icu = df_icu[df_icu.drop(columns='age').ne(2).all(axis=1)]

print("Dataframe shape: ", df_icu.shape)
df_icu.head()

In [None]:
# Write the icu prediction data to csv; the same frame is stored under two
# file names.

for icu_csv_name in ('icu_cleaned.csv', 'cleaned_with_icu_preprocessed_no_noise_corrected.csv'):
    df_icu.to_csv(TARGET + icu_csv_name, index=False, header=True)

In [None]:
# hospitalization preprocessing

df_hospitalization = df_all[hospitalization_columns]

# Keep only the rows that contain no unknown value (== 2) in any feature or in
# the patient_type label. age is left out of the check, so a value of 2 there
# does not remove the row.
df_hospitalization = df_hospitalization[
    df_hospitalization.drop(columns='age').ne(2).all(axis=1)
]

print("Dataframe shape: ", df_hospitalization.shape)
df_hospitalization.head()

In [None]:
# Write the hospitalization prediction data to csv; the same frame is stored
# under two file names.

for hospitalization_csv_name in ('hospitalization_cleaned.csv', 'cleaned_with_hosp_modified.csv'):
    df_hospitalization.to_csv(TARGET + hospitalization_csv_name, index=False, header=True)

# Generate sample batches: five random samples of 1% of the rows each, written
# to hospitalization_sample_1.csv ... hospitalization_sample_5.csv.
# Each batch uses its own fixed random_state, so a re-run of the notebook
# writes the same files again while the batches still differ from one another.

for batch_number in range(1, 6):
    sample_batch = df_hospitalization.sample(frac=0.01, random_state=batch_number)
    sample_batch.to_csv(
        TARGET + 'hospitalization_sample_' + str(batch_number) + '.csv',
        index=False,
        header=True,
    )

In [None]:
#############################################################################################