In [1]:
import time

import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

random_seed = 100

# 1. Load data

In [2]:
origin_data = pd.read_csv("data/covid_death_dataset.csv")

In [3]:
origin_data.describe(include='all')

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
unique,,,,,401,,,,,,,,,,,,,,,,
top,,,,,9999-99-99,,,,,,,,,,,,,,,,
freq,,,,,971633,,,,,,,,,,,,,,,,
mean,1.632194,8.980565,1.499259,1.190765,,79.52288,3.346831,41.7941,49.76558,2.186404,2.260569,2.242626,2.298132,2.128989,2.435143,2.26181,2.125176,2.25718,2.214333,5.305653,79.55397
std,0.4822084,3.723278,0.4999997,0.3929041,,36.86889,11.91288,16.90739,47.51073,5.424242,5.132258,5.114089,5.462843,5.236397,6.646676,5.19485,5.175445,5.135354,5.323097,1.881165,36.82307
min,1.0,1.0,1.0,1.0,,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,,97.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
50%,2.0,12.0,1.0,1.0,,97.0,2.0,40.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,97.0
75%,2.0,12.0,2.0,1.0,,97.0,2.0,53.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0


In [4]:
origin_data.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,2,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,2,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,2,2,2,1,2,2,2,2,2,3,97


# 2. Data processing

The dataset was provided by the Mexican government (https://datos.gob.mx/busca/dataset/informacion-referente-a-casos-covid-19-en-mexico). This dataset contains an enormous number of anonymized patient-related information including pre-conditions. The raw dataset consists of 21 unique features and 1,048,576 unique patients. In the Boolean features, 1 means "yes" and 2 means "no". values as 97 and 99 are missing data.

classification_final: covid test findings. Values 1-3 mean that the patient was diagnosed with covid in different degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.  
sex: 1 for female and 2 for male.  
age: of the patient.  
patient type: type of care the patient received in the unit. 1 for returned home and 2 for hospitalization.  
pneumonia: whether the patient already have air sacs inflammation or not.  
pregnancy: whether the patient is pregnant or not.  
diabetes: whether the patient has diabetes or not.  
copd: Indicates whether the patient has Chronic obstructive pulmonary disease or not.  
asthma: whether the patient has asthma or not.  
inmsupr: whether the patient is immunosuppressed or not.  
hypertension: whether the patient has hypertension or not.  
cardiovascular: whether the patient has heart or blood vessels related disease.   
renal chronic: whether the patient has chronic renal disease or not.  
other disease: whether the patient has other disease or not.  
obesity: whether the patient is obese or not.  
tobacco: whether the patient is a tobacco user.  
usmr: Indicates whether the patient treated medical units of the first, second or third level.  
medical unit: type of institution of the National Health System that provided the care.  
intubed: whether the patient was connected to the ventilator.  
icu: Indicates whether the patient had been admitted to an Intensive Care Unit.  
date died: If the patient died indicate the date of death, and 9999-99-99 otherwise.  

In [5]:
def code_converter(coded_value):
    if coded_value == 1:
        value = "Yes"
    elif coded_value == 2:
        value = "No"
    else:
        value = "Unknown"
    return value

In [6]:
origin_data.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,2,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,2,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,2,2,2,1,2,2,2,2,2,3,97


In [7]:
data = origin_data.copy()
data['SEX'] = origin_data['SEX'].apply(lambda x: "female" if x==1 else "male")
data['PATIENT_TYPE'] = origin_data['PATIENT_TYPE'].apply(lambda x: "Return_home" if x==1 else "Hospitalization")
data['INTUBED'] = origin_data['INTUBED'].apply(code_converter)

data['PNEUMONIA'] = origin_data['PNEUMONIA'].apply(code_converter)
data['PREGNANT'] = origin_data['PREGNANT'].apply(code_converter)
data['DIABETES'] = origin_data['DIABETES'].apply(code_converter)
data['COPD'] = origin_data['COPD'].apply(code_converter)
data['ASTHMA'] = origin_data['ASTHMA'].apply(code_converter)

data['INMSUPR'] = origin_data['INMSUPR'].apply(code_converter)
data['HIPERTENSION'] = origin_data['HIPERTENSION'].apply(code_converter)
data['OTHER_DISEASE'] = origin_data['OTHER_DISEASE'].apply(code_converter)
data['CARDIOVASCULAR'] = origin_data['CARDIOVASCULAR'].apply(code_converter)

data['OBESITY'] = origin_data['OBESITY'].apply(code_converter)
data['RENAL_CHRONIC'] = origin_data['RENAL_CHRONIC'].apply(code_converter)
data['TOBACCO'] = origin_data['TOBACCO'].apply(code_converter)
data['CLASIFFICATION_FINAL'] = origin_data['CLASIFFICATION_FINAL'].apply(code_converter)
data['ICU'] = origin_data['ICU'].apply(code_converter)

# 3. Feature engineering

## 3.1 Features

In [8]:
data.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,female,Return_home,03/05/2020,Unknown,Yes,65,No,No,No,No,No,Yes,No,No,No,No,No,Unknown,Unknown
1,2,1,male,Return_home,03/06/2020,Unknown,Yes,72,Unknown,No,No,No,No,Yes,No,No,Yes,Yes,No,Unknown,Unknown
2,2,1,male,Hospitalization,09/06/2020,Yes,No,55,Unknown,Yes,No,No,No,No,No,No,No,No,No,Unknown,No
3,2,1,female,Return_home,12/06/2020,Unknown,No,53,No,No,No,No,No,No,No,No,No,No,No,Unknown,Unknown
4,2,1,male,Return_home,21/06/2020,Unknown,No,68,Unknown,Yes,No,No,No,Yes,No,No,No,No,No,Unknown,Unknown


In [9]:
def one_hot_encode_column(df, column_name):
    one_hot = pd.get_dummies(df[column_name], prefix=column_name)
    df = df.drop(column_name,axis = 1)
    df = df.join(one_hot)
    return df

In [10]:
feature_df = data.copy()
feature_df = one_hot_encode_column(feature_df, 'USMER')
feature_df = one_hot_encode_column(feature_df, 'MEDICAL_UNIT')
feature_df = one_hot_encode_column(feature_df, 'SEX')
feature_df = one_hot_encode_column(feature_df, 'PATIENT_TYPE')
feature_df = one_hot_encode_column(feature_df, 'INTUBED')

feature_df = one_hot_encode_column(feature_df, 'PNEUMONIA')
feature_df = one_hot_encode_column(feature_df, 'PREGNANT')
feature_df = one_hot_encode_column(feature_df, 'DIABETES')
feature_df = one_hot_encode_column(feature_df, 'COPD')
feature_df = one_hot_encode_column(feature_df, 'ASTHMA')

feature_df = one_hot_encode_column(feature_df, 'INMSUPR')
feature_df = one_hot_encode_column(feature_df, 'HIPERTENSION')
feature_df = one_hot_encode_column(feature_df, 'OTHER_DISEASE')
feature_df = one_hot_encode_column(feature_df, 'CARDIOVASCULAR')

feature_df = one_hot_encode_column(feature_df, 'OBESITY')
feature_df = one_hot_encode_column(feature_df, 'RENAL_CHRONIC')
feature_df = one_hot_encode_column(feature_df, 'TOBACCO')
feature_df = one_hot_encode_column(feature_df, 'CLASIFFICATION_FINAL')
feature_df = one_hot_encode_column(feature_df, 'ICU')

## 3.2 Generate death data as y

In [11]:
print(f"# of death: {data[data['DATE_DIED'] != '9999-99-99'].shape[0]}, from total {data.shape[0]}. Death rate= {data[data['DATE_DIED'] != '9999-99-99'].shape[0]/data.shape[0]*100}%")
feature_df['DEATH'] = feature_df['DATE_DIED'].apply(lambda x: False if x=='9999-99-99' else True)

# of death: 76942, from total 1048575. Death rate= 7.337767923133777%


## 3.3 Feature selection

In [12]:
feature_df.head()

Unnamed: 0,DATE_DIED,AGE,USMER_1,USMER_2,MEDICAL_UNIT_1,MEDICAL_UNIT_2,MEDICAL_UNIT_3,MEDICAL_UNIT_4,MEDICAL_UNIT_5,MEDICAL_UNIT_6,MEDICAL_UNIT_7,MEDICAL_UNIT_8,MEDICAL_UNIT_9,MEDICAL_UNIT_10,MEDICAL_UNIT_11,MEDICAL_UNIT_12,MEDICAL_UNIT_13,SEX_female,SEX_male,PATIENT_TYPE_Hospitalization,PATIENT_TYPE_Return_home,INTUBED_No,INTUBED_Unknown,INTUBED_Yes,PNEUMONIA_No,PNEUMONIA_Unknown,PNEUMONIA_Yes,PREGNANT_No,PREGNANT_Unknown,PREGNANT_Yes,DIABETES_No,DIABETES_Unknown,DIABETES_Yes,COPD_No,COPD_Unknown,COPD_Yes,ASTHMA_No,ASTHMA_Unknown,ASTHMA_Yes,INMSUPR_No,INMSUPR_Unknown,INMSUPR_Yes,HIPERTENSION_No,HIPERTENSION_Unknown,HIPERTENSION_Yes,OTHER_DISEASE_No,OTHER_DISEASE_Unknown,OTHER_DISEASE_Yes,CARDIOVASCULAR_No,CARDIOVASCULAR_Unknown,CARDIOVASCULAR_Yes,OBESITY_No,OBESITY_Unknown,OBESITY_Yes,RENAL_CHRONIC_No,RENAL_CHRONIC_Unknown,RENAL_CHRONIC_Yes,TOBACCO_No,TOBACCO_Unknown,TOBACCO_Yes,CLASIFFICATION_FINAL_No,CLASIFFICATION_FINAL_Unknown,CLASIFFICATION_FINAL_Yes,ICU_No,ICU_Unknown,ICU_Yes,DEATH
0,03/05/2020,65,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True
1,03/06/2020,72,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,False,True,False,True
2,09/06/2020,55,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,False,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,True
3,12/06/2020,53,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True
4,21/06/2020,68,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True


In [13]:
feature_df.columns

Index(['DATE_DIED', 'AGE', 'USMER_1', 'USMER_2', 'MEDICAL_UNIT_1',
       'MEDICAL_UNIT_2', 'MEDICAL_UNIT_3', 'MEDICAL_UNIT_4', 'MEDICAL_UNIT_5',
       'MEDICAL_UNIT_6', 'MEDICAL_UNIT_7', 'MEDICAL_UNIT_8', 'MEDICAL_UNIT_9',
       'MEDICAL_UNIT_10', 'MEDICAL_UNIT_11', 'MEDICAL_UNIT_12',
       'MEDICAL_UNIT_13', 'SEX_female', 'SEX_male',
       'PATIENT_TYPE_Hospitalization', 'PATIENT_TYPE_Return_home',
       'INTUBED_No', 'INTUBED_Unknown', 'INTUBED_Yes', 'PNEUMONIA_No',
       'PNEUMONIA_Unknown', 'PNEUMONIA_Yes', 'PREGNANT_No', 'PREGNANT_Unknown',
       'PREGNANT_Yes', 'DIABETES_No', 'DIABETES_Unknown', 'DIABETES_Yes',
       'COPD_No', 'COPD_Unknown', 'COPD_Yes', 'ASTHMA_No', 'ASTHMA_Unknown',
       'ASTHMA_Yes', 'INMSUPR_No', 'INMSUPR_Unknown', 'INMSUPR_Yes',
       'HIPERTENSION_No', 'HIPERTENSION_Unknown', 'HIPERTENSION_Yes',
       'OTHER_DISEASE_No', 'OTHER_DISEASE_Unknown', 'OTHER_DISEASE_Yes',
       'CARDIOVASCULAR_No', 'CARDIOVASCULAR_Unknown', 'CARDIOVASCULAR_Yes',
  

In [14]:
selected_df = feature_df[['DEATH', 'AGE', 'USMER_1', 'USMER_2', 'MEDICAL_UNIT_1',
       'MEDICAL_UNIT_2', 'MEDICAL_UNIT_3', 'MEDICAL_UNIT_4', 'MEDICAL_UNIT_5',
       'MEDICAL_UNIT_6', 'MEDICAL_UNIT_7', 'MEDICAL_UNIT_8', 'MEDICAL_UNIT_9',
       'MEDICAL_UNIT_10', 'MEDICAL_UNIT_11', 'MEDICAL_UNIT_12',
       'MEDICAL_UNIT_13', 'SEX_female', 'SEX_male',
       'PATIENT_TYPE_Hospitalization', 'PATIENT_TYPE_Return_home',
       'INTUBED_No', 'INTUBED_Unknown', 'INTUBED_Yes', 'PNEUMONIA_No',
       'PNEUMONIA_Unknown', 'PNEUMONIA_Yes', 'PREGNANT_No', 'PREGNANT_Unknown',
       'PREGNANT_Yes', 'DIABETES_No', 'DIABETES_Unknown', 'DIABETES_Yes',
       'COPD_No', 'COPD_Unknown', 'COPD_Yes', 'ASTHMA_No', 'ASTHMA_Unknown',
       'ASTHMA_Yes', 'INMSUPR_No', 'INMSUPR_Unknown', 'INMSUPR_Yes',
       'HIPERTENSION_No', 'HIPERTENSION_Unknown', 'HIPERTENSION_Yes',
       'OTHER_DISEASE_No', 'OTHER_DISEASE_Unknown', 'OTHER_DISEASE_Yes',
       'CARDIOVASCULAR_No', 'CARDIOVASCULAR_Unknown', 'CARDIOVASCULAR_Yes',
       'OBESITY_No', 'OBESITY_Unknown', 'OBESITY_Yes', 'RENAL_CHRONIC_No',
       'RENAL_CHRONIC_Unknown', 'RENAL_CHRONIC_Yes', 'TOBACCO_No',
       'TOBACCO_Unknown', 'TOBACCO_Yes', 'CLASIFFICATION_FINAL_No',
       'CLASIFFICATION_FINAL_Unknown', 'CLASIFFICATION_FINAL_Yes', 'ICU_No',
       'ICU_Unknown', 'ICU_Yes']]

## 3.4 Train test data split

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [16]:
X = selected_df.drop("DEATH", axis=1)
y = selected_df[["DEATH"]]

In [17]:
X.describe(include='all')

Unnamed: 0,AGE,USMER_1,USMER_2,MEDICAL_UNIT_1,MEDICAL_UNIT_2,MEDICAL_UNIT_3,MEDICAL_UNIT_4,MEDICAL_UNIT_5,MEDICAL_UNIT_6,MEDICAL_UNIT_7,MEDICAL_UNIT_8,MEDICAL_UNIT_9,MEDICAL_UNIT_10,MEDICAL_UNIT_11,MEDICAL_UNIT_12,MEDICAL_UNIT_13,SEX_female,SEX_male,PATIENT_TYPE_Hospitalization,PATIENT_TYPE_Return_home,INTUBED_No,INTUBED_Unknown,INTUBED_Yes,PNEUMONIA_No,PNEUMONIA_Unknown,PNEUMONIA_Yes,PREGNANT_No,PREGNANT_Unknown,PREGNANT_Yes,DIABETES_No,DIABETES_Unknown,DIABETES_Yes,COPD_No,COPD_Unknown,COPD_Yes,ASTHMA_No,ASTHMA_Unknown,ASTHMA_Yes,INMSUPR_No,INMSUPR_Unknown,INMSUPR_Yes,HIPERTENSION_No,HIPERTENSION_Unknown,HIPERTENSION_Yes,OTHER_DISEASE_No,OTHER_DISEASE_Unknown,OTHER_DISEASE_Yes,CARDIOVASCULAR_No,CARDIOVASCULAR_Unknown,CARDIOVASCULAR_Yes,OBESITY_No,OBESITY_Unknown,OBESITY_Yes,RENAL_CHRONIC_No,RENAL_CHRONIC_Unknown,RENAL_CHRONIC_Yes,TOBACCO_No,TOBACCO_Unknown,TOBACCO_Yes,CLASIFFICATION_FINAL_No,CLASIFFICATION_FINAL_Unknown,CLASIFFICATION_FINAL_Yes,ICU_No,ICU_Unknown,ICU_Yes
count,1048575.0,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575,1048575
unique,,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
top,,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,True,False,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
freq,,662903,662903,1048424,1048406,1029400,734170,1041331,1007991,1047684,1038176,1010459,1040702,1042998,602995,1047579,525064,525064,848544,848544,889525,855869,1014919,892534,1032572,908537,535396,527265,1040444,920248,1045237,923586,1030510,1045572,1033513,1014024,1045596,1017003,1031001,1045171,1034405,882742,1045471,885846,1015490,1043530,1020535,1024730,1045499,1027806,885727,1045543,888759,1026665,1045569,1029671,960979,1045355,964199,1046724,1038123,1039974,872890,856032,1031717
mean,41.7941,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,16.90739,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,30.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,40.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,53.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [18]:
X.head()

Unnamed: 0,AGE,USMER_1,USMER_2,MEDICAL_UNIT_1,MEDICAL_UNIT_2,MEDICAL_UNIT_3,MEDICAL_UNIT_4,MEDICAL_UNIT_5,MEDICAL_UNIT_6,MEDICAL_UNIT_7,MEDICAL_UNIT_8,MEDICAL_UNIT_9,MEDICAL_UNIT_10,MEDICAL_UNIT_11,MEDICAL_UNIT_12,MEDICAL_UNIT_13,SEX_female,SEX_male,PATIENT_TYPE_Hospitalization,PATIENT_TYPE_Return_home,INTUBED_No,INTUBED_Unknown,INTUBED_Yes,PNEUMONIA_No,PNEUMONIA_Unknown,PNEUMONIA_Yes,PREGNANT_No,PREGNANT_Unknown,PREGNANT_Yes,DIABETES_No,DIABETES_Unknown,DIABETES_Yes,COPD_No,COPD_Unknown,COPD_Yes,ASTHMA_No,ASTHMA_Unknown,ASTHMA_Yes,INMSUPR_No,INMSUPR_Unknown,INMSUPR_Yes,HIPERTENSION_No,HIPERTENSION_Unknown,HIPERTENSION_Yes,OTHER_DISEASE_No,OTHER_DISEASE_Unknown,OTHER_DISEASE_Yes,CARDIOVASCULAR_No,CARDIOVASCULAR_Unknown,CARDIOVASCULAR_Yes,OBESITY_No,OBESITY_Unknown,OBESITY_Yes,RENAL_CHRONIC_No,RENAL_CHRONIC_Unknown,RENAL_CHRONIC_Yes,TOBACCO_No,TOBACCO_Unknown,TOBACCO_Yes,CLASIFFICATION_FINAL_No,CLASIFFICATION_FINAL_Unknown,CLASIFFICATION_FINAL_Yes,ICU_No,ICU_Unknown,ICU_Yes
0,65,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
1,72,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,False,True,False
2,55,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,False,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False
3,53,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
4,68,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,True,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False


In [19]:
X_scale = X.copy()
for col in X.columns:
    # scaler = MinMaxScaler(feature_range=(0, 1))
    # X_scale[col] = scaler.fit(X[col].values.reshape(-1, 1))
    X_scale[col] = X[col].astype(float)

In [20]:
test_rate = 0.20
random_state = 100
cv_folds = 5

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=test_rate, random_state=random_state)

In [22]:
print(f"Train data Death rate: {100*y_train[y_train['DEATH']].shape[0]/y_train.shape[0]}%, # of death: {y_train[y_train['DEATH']].shape[0]}")
print(f"Test data Death rate: {100*y_test[y_test['DEATH']].shape[0]/y_test.shape[0]}%, # of death: {y_test[y_test['DEATH']].shape[0]}")

Train data Death rate: 7.336027465846506%, # of death: 61539
Test data Death rate: 7.34472975228286%, # of death: 15403


In [23]:
import mlrose_hiive

In [92]:
hidden_nodes = [10, 3]
activation='relu'
max_iters = 500
iterations_list = [100, 200, 500, 1000, 2000]
bias=True
is_classifier=True
learning_rate=0.1
early_stopping=False
clip_max=5
restarts=0
schedule=mlrose_hiive.ExpDecay()
pop_size=200
mutation_prob=0.1
max_attempts=100
random_state=100
curve=True
algorithm_list = ['gradient_descent', 'random_hill_climb', 'simulated_annealing', 'genetic_alg']

In [93]:
result_dict = {}
model_curve = {}

In [95]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve

for algorithm in algorithm_list:

    print("algorithm:", algorithm)

    model = mlrose_hiive.NeuralNetwork(hidden_nodes=hidden_nodes,
                     activation=activation,
                     algorithm=algorithm,
                     max_iters=max_iters,
                     bias=bias,
                     is_classifier=is_classifier,
                     learning_rate=learning_rate,
                     early_stopping=early_stopping,
                     clip_max=clip_max,
                     restarts=restarts,
                     schedule=schedule,
                     pop_size=pop_size,
                     mutation_prob=mutation_prob,
                     max_attempts=max_attempts,
                     random_state=random_state,
                     curve=curve)
    
    scoring_func = make_scorer(lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'))
    train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(model, 
                                                          X_scale, y, 
                                                          cv=2, #stratified k-fold
                                                          n_jobs=4,
                                                          shuffle=True,
                                                          return_times=True,
                                                          scoring=scoring_func)
    
    result_dict[algorithm] = [algorithm, train_scores, test_scores, fit_times, score_times]
    model_curve[algorithm] = model.fitness_curve

algorithm: gradient_descent


KeyboardInterrupt: 

In [None]:
result_dict

In [97]:
model = mlrose_hiive.NeuralNetwork(hidden_nodes=hidden_nodes,
                     activation=activation,
                     algorithm='random_hill_climb',
                     max_iters=max_iters,
                     bias=bias,
                     is_classifier=is_classifier,
                     learning_rate=learning_rate,
                     early_stopping=early_stopping,
                     clip_max=clip_max,
                     restarts=restarts,
                     schedule=schedule,
                     pop_size=pop_size,
                     mutation_prob=mutation_prob,
                     max_attempts=max_attempts,
                     random_state=random_state,
                     curve=curve)

start_time = time.time()
model.fit(X_train, y_train)
end = time.time()

y_pred = model.predict(X_test)
y_train_accuracy = accuracy_score(y_test, y_pred)

In [98]:
y_train_accuracy

0.15916839520301362