## Importing Packages & Displaying Available Data Files

In [1]:
# Import required packages
import math
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, recall_score, precision_score, SCORERS

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Display settings: never truncate cell text and show up to 999 rows
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 999)

# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


/kaggle/input/characteristics-corona-patients/Characteristics_Corona_patients_version_6 - 19-7-2020.csv
/kaggle/input/characteristics-corona-patients/Characteristics_Corona_patients version 5 19-7-20.csv


## Loading Dataset

In [2]:
# Load version 6 of the patient-characteristics CSV from the Kaggle input directory.
# NOTE(review): this load emits a pandas DtypeWarning ("Columns (8) have mixed types").
# Passing dtype=... or low_memory=False would silence it, but would also change how the
# values of that column (presumably 'treatment') are parsed — check the later
# clean_treatment step, which branches on str vs. numeric values, before changing this.
raw_data = pd.read_csv("/kaggle/input/characteristics-corona-patients/Characteristics_Corona_patients_version_6 - 19-7-2020.csv")
raw_data

Columns (8) have mixed types.Specify dtype option on import or set low_memory=False.


Unnamed: 0,age,age_band,background_diseases_binary,country,origin,severity_illness,sex,smoking,treatment,confirmed_date_D,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
0,,,,0.0,france,0.0,0.0,,0,119.0,...,,,,,,,,,,
1,,,,0.0,france,1.0,0.0,,0,119.0,...,,,,,,,,,,
2,,,,0.0,france,1.0,1.0,,0,119.0,...,,,,,,,,,,
3,,,,0.0,france,0.0,0.0,,0,123.0,...,,,,,,,,,,
4,,,,0.0,france,1.0,0.0,,1,125.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106606,,,,,kerla,,,,0,247.0,...,,,,,,,,,,
1106607,,,,,kerla,,,,0,247.0,...,,,,,,,,,,
1106608,,,,,kerla,,,,0,247.0,...,,,,,,,,,,
1106609,,,,,kerla,,,,0,247.0,...,,,,,,,,,,


## Data Cleaning

In [3]:
# Work on a copy so the loaded raw_data frame is left unmodified by the cleaning below.
df = raw_data.copy()

def is_dead(row):
    """Return 1 if the row has a deceased date, 0 if that field is NaN.

    `row` is any mapping-like object (e.g. a DataFrame row) exposing a
    numeric 'deceased_date_D' entry.
    """
    deceased_day = row['deceased_date_D']
    return int(not math.isnan(deceased_day))

# Target flag: 1 when a deceased date is recorded, 0 when it is missing.
# Computed on the whole column at once instead of a Python-level call per row,
# and inserted directly as the first column.
df.insert(0, 'is_dead', df['deceased_date_D'].notna().astype('int64'))

# Remove date columns, derived durations and location/banding columns
# that are not used as features.
df = df.drop(columns=['confirmed_date_D',
                      'deceased_date_D',
                      'released_date_D',
                      'return_date_D',
                      'date_onset_symptoms_D',
                      'age_band',
                      'background_diseases_binary',
                      'country',
                      'origin',
                      'return_date_until_date_onset_symptoms',
                      'confirmed_date_until_released_date',
                      'confirmed_date_until_deceased_date'])

# Fraction of missing values per remaining column
df.isna().mean()

is_dead                                                      0.000000
age                                                          0.507619
severity_illness                                             0.780757
sex                                                          0.498654
smoking                                                      0.701655
treatment                                                    0.602841
date_onset_symptoms_until_confirmed_date                     0.549736
severity_illness_infectious_person                           0.999285
symptoms_pneumonia                                           0.998526
symptoms_sore_throat                                         0.998526
symptoms_fever                                               0.998526
symptoms_cough                                               0.998526
symptoms_breathing_difficulty                                0.998526
symptoms_organ_failure                                       0.998526
symptoms_respiratory

In [4]:
# Cleaning 'treatment' column
def clean_treatment(data):
    """Normalise one raw 'treatment' value to a numeric flag or NaN.

    Parameters
    ----------
    data : str or numeric
        A single cell value from the 'treatment' column.

    Returns
    -------
    1.0 for the string '1', NaN for any other string or for a NaN input,
    and the value itself for every other numeric input.

    NOTE(review): string values other than '1' (for example a textual '0')
    are turned into NaN — confirm that discarding them is intended.
    """
    if isinstance(data, str):
        # Among strings only the literal '1' is kept; everything else is missing.
        return 1.0 if data == '1' else np.nan
    if math.isnan(data):
        # np.nan (lowercase) exists in every NumPy release; the np.NaN alias
        # was removed in NumPy 2.0.
        return np.nan
    return data
    
# Run the element-wise cleaner over the whole 'treatment' column
df['treatment'] = df['treatment'].map(clean_treatment)

## Data Wrangling

In [5]:
# Independent copy of the cleaned frame; the missing-data checks in the next
# cells read from df2, so df itself is not touched by them.
df2 = df.copy()
df2

Unnamed: 0,is_dead,age,severity_illness,sex,smoking,treatment,date_onset_symptoms_until_confirmed_date,severity_illness_infectious_person,symptoms_pneumonia,symptoms_sore_throat,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
0,0,,0.0,0.0,,0.0,,,,,...,,,,,,,,,,
1,0,,1.0,0.0,,0.0,,,,,...,,,,,,,,,,
2,0,,1.0,1.0,,0.0,,,,,...,,,,,,,,,,
3,0,,0.0,0.0,,0.0,,,,,...,,,,,,,,,,
4,0,,1.0,0.0,,1.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106606,0,,,,,0.0,,,,,...,,,,,,,,,,
1106607,0,,,,,0.0,,,,,...,,,,,,,,,,
1106608,0,,,,,0.0,,,,,...,,,,,,,,,,
1106609,0,,,,,0.0,,,,,...,,,,,,,,,,


In [6]:
# Group the indicator columns by their name prefix
all_symptoms = [name for name in df2.columns if name.startswith("symptoms")]
all_bd = [name for name in df2.columns if name.startswith("background_diseases")]

# Restrict to rows where every symptom column is filled in, then look at how
# much of the first eight columns is still missing in that subset
temp = df2.dropna(how='any', subset=all_symptoms).reset_index(drop=True)

print("Length of symptoms data:", len(temp))

temp.iloc[:, :8].isna().sum() / len(temp)

Length of symptoms data: 1631


is_dead                                     0.000000
age                                         0.344574
severity_illness                            0.002452
sex                                         0.028204
smoking                                     1.000000
treatment                                   0.898835
date_onset_symptoms_until_confirmed_date    0.285101
severity_illness_infectious_person          0.986511
dtype: float64

In [7]:
# Restrict to rows where every background-disease column is filled in, then
# look at how much of the first eight columns is still missing in that subset
temp = df2.dropna(how='any', subset=all_bd).reset_index(drop=True)

print("Length of background_diseases data:", len(temp))

temp.iloc[:, :8].isna().sum() / len(temp)

Length of background_diseases data: 138746


is_dead                                     0.000000
age                                         0.000079
severity_illness                            0.789810
sex                                         0.000022
smoking                                     0.003315
treatment                                   0.001103
date_onset_symptoms_until_confirmed_date    0.001081
severity_illness_infectious_person          0.999950
dtype: float64

## Further Data Cleaning

In [8]:
### Drop the severity and symptom columns, then keep only complete rows
# The columns to remove are selected from whatever df currently contains, so
# re-running this cell is a no-op instead of raising KeyError on names that
# were already dropped.
severity_cols = ['severity_illness_infectious_person',
                 'severity_illness']

all_symptoms = [col for col in df.columns if col.startswith("symptoms")]

cols_to_drop = [col for col in df.columns
                if col in severity_cols or col in all_symptoms]

df = (
    df.drop(columns=cols_to_drop)
      .dropna(how='any')          # discard every row with any missing value
      .reset_index(drop=True)     # renumber rows 0..n-1 after filtering
)

df

Unnamed: 0,is_dead,age,sex,smoking,treatment,date_onset_symptoms_until_confirmed_date,background_diseases_diabetic_nephropathy,background_diseases_diabetes,background_diseases_hypertension,background_diseases_POTS,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
0,0,62.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,36.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,49.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,47.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138281,1,78.0,1.0,1.0,0.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138282,0,80.0,1.0,0.0,1.0,8.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138283,0,25.0,0.0,0.0,1.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138284,0,59.0,0.0,0.0,1.0,12.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Show the remaining column names as a plain Python list
df.columns.tolist()

In [10]:
temp = df.copy()

# Print each column's value distribution with its NaN count appended as an
# extra 'NaN' row. Series.append was removed in pandas 2.0, so the two pieces
# are joined with pd.concat instead.
for col in list(df.columns):
    print("Column:", col)
    counts = temp[col].value_counts()
    missing = pd.Series(temp[col].isna().sum(), index=['NaN'])
    print(pd.concat([counts, missing]))
    print("\n")

Column: is_dead
0      111777
1       26509
NaN         0
dtype: int64


Column: age
52.0     3463
48.0     3433
50.0     3417
51.0     3386
56.0     3358
49.0     3340
46.0     3328
53.0     3327
47.0     3315
54.0     3251
55.0     3249
57.0     3216
45.0     3157
58.0     3117
59.0     3072
60.0     2962
44.0     2943
61.0     2856
43.0     2842
63.0     2750
42.0     2680
40.0     2586
62.0     2565
41.0     2540
65.0     2443
39.0     2388
64.0     2351
66.0     2332
38.0     2329
37.0     2303
67.0     2215
68.0     2176
36.0     2131
35.0     2112
34.0     1991
70.0     1978
69.0     1937
33.0     1860
31.0     1825
32.0     1774
71.0     1701
30.0     1678
72.0     1643
73.0     1540
29.0     1488
28.0     1420
74.0     1409
75.0     1342
27.0     1292
76.0     1232
77.0     1146
26.0     1128
78.0     1108
25.0      936
80.0      923
79.0      882
24.0      800
81.0      724
82.0      685
83.0      634
23.0      614
84.0      573
85.0      482
22.0      480
21.0      394
86.0 

In [11]:
# Cast every column except the two numeric ones to bool in a single astype call
# (the value counts printed earlier suggest these columns hold 0/1 flags —
# any non-zero value becomes True).
numeric_cols = ("age", "date_onset_symptoms_until_confirmed_date")
df = df.astype({col: 'bool' for col in df.columns if col not in numeric_cols})

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138286 entries, 0 to 138285
Data columns (total 62 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   is_dead                                                    138286 non-null  bool   
 1   age                                                        138286 non-null  float64
 2   sex                                                        138286 non-null  bool   
 3   smoking                                                    138286 non-null  bool   
 4   treatment                                                  138286 non-null  bool   
 5   date_onset_symptoms_until_confirmed_date                   138286 non-null  float64
 6   background_diseases_diabetic_nephropathy                   138286 non-null  bool   
 7   background_diseases_diabetes                               138286 non-null  bool   

Unnamed: 0,is_dead,age,sex,smoking,treatment,date_onset_symptoms_until_confirmed_date,background_diseases_diabetic_nephropathy,background_diseases_diabetes,background_diseases_hypertension,background_diseases_POTS,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
0,False,62.0,False,False,False,3.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,31.0,False,False,True,3.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,36.0,False,True,True,2.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,49.0,False,False,True,2.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,47.0,False,False,True,3.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138281,True,78.0,True,True,False,4.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
138282,False,80.0,True,False,True,8.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
138283,False,25.0,False,False,True,12.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
138284,False,59.0,False,False,True,12.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Train-Test Split
# Column 0 is taken as the target (kept as a one-column frame), all later
# columns as features.
y = df.iloc[:, :1].copy()
X = df.iloc[:, 1:].copy()

# 80/20 shuffled split with a fixed seed
split_options = dict(test_size=0.2, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [13]:
# Put the training target back in front of the training features
df_train = pd.concat((y_train, X_train), axis="columns")
df_train

Unnamed: 0,is_dead,age,sex,smoking,treatment,date_onset_symptoms_until_confirmed_date,background_diseases_diabetic_nephropathy,background_diseases_diabetes,background_diseases_hypertension,background_diseases_POTS,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
57860,False,8.0,True,False,True,4.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
50485,False,67.0,False,False,False,5.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
77945,False,23.0,False,False,True,2.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
114448,True,41.0,False,False,False,8.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
28367,False,51.0,False,False,True,0.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,False,81.0,True,False,False,6.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
119879,False,42.0,False,False,False,10.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
103694,False,52.0,True,False,False,0.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
131932,False,64.0,False,False,True,10.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Put the test target back in front of the test features
df_test = pd.concat((y_test, X_test), axis="columns")
df_test

Unnamed: 0,is_dead,age,sex,smoking,treatment,date_onset_symptoms_until_confirmed_date,background_diseases_diabetic_nephropathy,background_diseases_diabetes,background_diseases_hypertension,background_diseases_POTS,...,background_diseases_encephalomalacia,background_diseases_hip_replacement,background_diseases_parkinson,background_diseases_gastrointestinal_bleeding,background_diseases_stenocardia,background_diseases_myeloma,background_diseases_azotemia,background_diseases_atrial_fibrillation,background_diseases_arrhythmia,background_diseases_hypothyroidism
41558,False,46.0,True,False,True,1.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
137228,False,42.0,True,False,True,12.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
47220,False,41.0,True,False,True,11.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
47294,False,24.0,False,False,True,4.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
96083,True,72.0,True,False,False,1.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50086,False,56.0,False,False,True,7.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
43165,False,50.0,False,False,True,2.0,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
104369,False,45.0,True,False,True,1.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
62738,False,64.0,False,False,True,6.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Write both splits to CSV in the working directory, without the row index
for frame, out_path in ((df_train, "Exception_Datasets (Train).csv"),
                        (df_test, "Exception_Datasets (Test).csv")):
    frame.to_csv(out_path, index=False)