In [256]:
import pandas as pd
# Load the dataset, using the "ID" column as the row index.
df = pd.read_csv("COVID19_data.csv", index_col="ID")
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

(2054, 12)


### Missing values in categorical variables

In [257]:
from collections import Counter

# For every object-typed column, print how often each distinct value
# (NaN included) occurs.
for var in df.columns:
  if df[var].dtype != object:
    continue
  print(var, Counter(df[var]))

SEX Counter({'MALE': 1227, 'FEMALE': 825, nan: 2})
EXITUS Counter({'NO': 1684, 'YES': 329, nan: 41})
DESTINATION Counter({nan: 1383, 'ADMISSION': 671})


### Zero values in numerical variables

In [258]:
# Count exact zeros in every non-object column. Later cells treat 0.0 as the
# "missing" marker for these measurements, so this shows how much of each
# column will end up imputed.
# A vectorised comparison is used instead of building a Counter over every
# distinct value just to read a single key.
for var in df.columns:
  if df[var].dtype != object:
    print(var, int((df[var] == 0).sum()))

AGE 0
DAYS_HOSPITAL 21
DAYS_ICU 1947
TEMP 467
HEART_RATE 448
GLUCOSE 2035
SAT_O2 422
BLOOD_PRES_SYS 750
BLOOD_PRES_DIAS 750


### Relevant variables

In [259]:
# Drop columns that carry almost no information: GLUCOSE is 0 in 2035 of the
# 2054 rows and DESTINATION is NaN in 1383 of them (see the counts above).
df.pop("GLUCOSE")
df.pop("DESTINATION")

# Values to predict: pop() removes each column from the feature frame and
# returns it as a Series.
# NOTE: pop() mutates df in place, so re-running this cell without reloading
# the CSV raises KeyError.
days_hosp = df.pop("DAYS_HOSPITAL")
days_icu = df.pop("DAYS_ICU")
exitus_col = df.pop("EXITUS")

print(exitus_col)

ID
1       NO
2       NO
3       NO
4       NO
5       NO
        ..
2050    NO
2051    NO
2052    NO
2053    NO
2054    NO
Name: EXITUS, Length: 2054, dtype: object


In [260]:
# For each remaining feature column, report whether it still has missing values.
print(df.isnull().any())

AGE                 True
SEX                 True
TEMP               False
HEART_RATE         False
SAT_O2             False
BLOOD_PRES_SYS     False
BLOOD_PRES_DIAS    False
dtype: bool


## Categorical Variables

In [261]:
# Boolean flag per column: True where the column holds Python objects
# (i.e. strings / categorical values)
cat_mask = (df.dtypes == object)

# Names of the flagged columns, in their original order
cat_cols = [name for name, is_cat in zip(df.columns, cat_mask) if is_cat]
print(cat_cols)

['SEX']


In [262]:
# Separate the categorical features from the numeric ones
df_cat = df.loc[:, cat_cols]
df_num = df.drop(columns=cat_cols)

In [263]:
def imput_categorical(cat_var, imputer):
  """Fit `imputer` on `cat_var` and return the imputed values as a DataFrame.

  Args:
    cat_var: DataFrame whose columns are to be imputed.
    imputer: object exposing `fit_transform(frame)`; it is (re)fitted on
      `cat_var` by this call.

  Returns:
    A new DataFrame built from the imputer's output, carrying the same
    column labels and index as `cat_var`.
  """
  filled_values = imputer.fit_transform(cat_var)
  return pd.DataFrame(filled_values, index=cat_var.index, columns=cat_var.columns)

In [264]:
from sklearn.impute import SimpleImputer

# Fill missing categories with the most frequent value of each column.
# The same imputer object is refitted on every call, so once this cell has
# run it holds the statistics of EXITUS, not of SEX.
imp_cat = SimpleImputer(strategy='most_frequent')
df_cat = imput_categorical(df_cat, imp_cat)
df_exitus = imput_categorical(exitus_col.to_frame(), imp_cat)

# Imputed features, followed by a per-column "any NaN left?" check
print(df_cat)
print(df_cat.isna().any())

# Same two views for the imputed target
print(df_exitus)
print(df_exitus.isna().any())

         SEX
ID          
1     FEMALE
2     FEMALE
3       MALE
4       MALE
5       MALE
...      ...
2050  FEMALE
2051  FEMALE
2052  FEMALE
2053    MALE
2054    MALE

[2054 rows x 1 columns]
SEX    False
dtype: bool
     EXITUS
ID         
1        NO
2        NO
3        NO
4        NO
5        NO
...     ...
2050     NO
2051     NO
2052     NO
2053     NO
2054     NO

[2054 rows x 1 columns]
EXITUS    False
dtype: bool


In [265]:
def categorical_to_onehot(cat_var, encoder):
  """Fit `encoder` on `cat_var` and return the encoded columns as a DataFrame.

  Args:
    cat_var: DataFrame of categorical columns to encode.
    encoder: object exposing `fit_transform(frame)` and
      `get_feature_names_out(input_names)`; it is (re)fitted by this call.

  Returns:
    A dense DataFrame with one column per name reported by
    `get_feature_names_out`, indexed like `cat_var`.
  """
  encoded = encoder.fit_transform(cat_var)
  if hasattr(encoded, "toarray"):
    # The encoder returned a SciPy sparse matrix, which pd.DataFrame cannot
    # lay out column by column; convert it to a dense array first.
    encoded = encoded.toarray()
  return pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(cat_var.columns.tolist()),
    index=cat_var.index
  )

In [266]:
from sklearn.preprocessing import OneHotEncoder

# Ask the encoder for a dense array so the result can be wrapped in a
# DataFrame. The keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installations.
try:
  ohe = OneHotEncoder(sparse_output=False)
except TypeError:
  ohe = OneHotEncoder(sparse=False)

# The encoder is refitted on each call; after this cell it is fitted on EXITUS.
df_cat_one = categorical_to_onehot(df_cat, ohe)
df_exitus_one = categorical_to_onehot(df_exitus, ohe)
print(df_exitus_one)

      EXITUS_NO  EXITUS_YES
ID                         
1           1.0         0.0
2           1.0         0.0
3           1.0         0.0
4           1.0         0.0
5           1.0         0.0
...         ...         ...
2050        1.0         0.0
2051        1.0         0.0
2052        1.0         0.0
2053        1.0         0.0
2054        1.0         0.0

[2054 rows x 2 columns]


## Numerical Variables

In [267]:
# For each numeric column, report whether it contains any NaN.
df_num.isna().any()

AGE                 True
TEMP               False
HEART_RATE         False
SAT_O2             False
BLOOD_PRES_SYS     False
BLOOD_PRES_DIAS    False
dtype: bool

In [268]:
# Mark missing ages with 0.0, the same sentinel the mean imputer further down
# is configured to treat as missing (missing_values=0.0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form operates on a temporary object
# under pandas Copy-on-Write and would leave df_num unchanged.
df_num['AGE'] = df_num['AGE'].fillna(0.0)
df_num.isna().any()

AGE                False
TEMP               False
HEART_RATE         False
SAT_O2             False
BLOOD_PRES_SYS     False
BLOOD_PRES_DIAS    False
dtype: bool

In [269]:
# Summary statistics of the numeric features before out-of-range values are masked.
df_num.describe()

Unnamed: 0,AGE,TEMP,HEART_RATE,SAT_O2,BLOOD_PRES_SYS,BLOOD_PRES_DIAS
count,2054.0,2054.0,2054.0,2054.0,2054.0,2054.0
mean,70.718598,28.386319,70.787731,73.39776,83.571568,48.32814
std,20.674469,15.419158,41.802038,37.863716,67.450853,44.225438
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,57.0,35.4,63.0,81.0,0.0,0.0
50%,68.0,36.4,84.0,93.0,115.0,64.0
75%,98.0,36.9,98.0,96.0,137.0,79.0
max,189.0,40.1,593.0,99.0,772.0,845.0


In [270]:
# Accepted (lower, upper) bounds, both inclusive, for each vital-sign column.
metrics_limits = dict(
  HEART_RATE=(40, 200),
  SAT_O2=(40, 100),
  BLOOD_PRES_SYS=(10, 200),
  BLOOD_PRES_DIAS=(10, 130),
)

def not_within(col, min_val, max_val):
  """Return a boolean mask that is True where `col` lies outside [min_val, max_val].

  Values equal to either bound count as inside. NaN compares False on both
  sides, so NaN entries are reported as not outside.
  """
  above_max = col > max_val
  below_min = col < min_val
  return below_min | above_max

#### Remove outlier values

In [278]:
# Overwrite every reading outside its accepted range with 0.0, the marker the
# mean imputer is configured to treat as missing.
for metric, (min_value, max_value) in metrics_limits.items():
  df_num.loc[not_within(df_num[metric], min_value, max_value), metric] = 0.0

df_num.describe()

Unnamed: 0,AGE,TEMP,HEART_RATE,SAT_O2,BLOOD_PRES_SYS,BLOOD_PRES_DIAS
count,2054.0,2054.0,2054.0,2054.0,2054.0,2054.0
mean,70.856585,36.739445,90.264339,92.427345,131.146585,75.023041
std,20.436992,0.721404,14.950901,6.054193,18.187327,10.385303
min,15.0,33.2,41.0,40.0,10.0,11.0
25%,57.0,36.3,82.0,92.0,125.0,72.0
50%,68.0,36.739445,90.264339,93.0,131.146585,75.023041
75%,98.0,36.9,98.0,96.0,137.0,79.0
max,189.0,40.1,190.0,99.0,200.0,127.0


In [272]:
# Replace every 0.0 (the missing-value marker used above) with the mean of the
# remaining values in its column, keeping the original labels and index.
imp_num = SimpleImputer(strategy='mean', missing_values=0.0)
df_num = pd.DataFrame(
  data=imp_num.fit_transform(df_num),
  index=df_num.index,
  columns=df_num.columns,
)
print(df_num.isna().any())
print(df_num)

AGE                False
TEMP               False
HEART_RATE         False
SAT_O2             False
BLOOD_PRES_SYS     False
BLOOD_PRES_DIAS    False
dtype: bool
             AGE       TEMP  HEART_RATE     SAT_O2  BLOOD_PRES_SYS  \
ID                                                                   
1      15.000000  37.000000   90.264339  92.000000      131.146585   
2      18.000000  37.300000  105.000000  97.000000      131.146585   
3      21.000000  38.500000  112.000000  95.000000       85.000000   
4      21.000000  39.200000  113.000000  97.000000      131.146585   
5      22.000000  36.300000   80.000000  92.000000      111.000000   
...          ...        ...         ...        ...             ...   
2050  189.000000  36.739445   90.264339  92.427345      131.146585   
2051   70.856585  36.500000   90.264339  92.427345      131.146585   
2052   70.856585  36.800000  190.000000  98.000000      131.146585   
2053   70.856585  36.739445  120.000000  93.000000      131.146585  

## Merge all the processed variables

In [273]:
# Put the one-hot encoded categorical columns and the imputed numeric columns
# side by side, matching rows on the shared 'ID' index.
df_processed = df_cat_one.merge(df_num, on='ID')

print(df_processed)

      SEX_FEMALE  SEX_MALE         AGE       TEMP  HEART_RATE     SAT_O2  \
ID                                                                         
1            1.0       0.0   15.000000  37.000000   90.264339  92.000000   
2            1.0       0.0   18.000000  37.300000  105.000000  97.000000   
3            0.0       1.0   21.000000  38.500000  112.000000  95.000000   
4            0.0       1.0   21.000000  39.200000  113.000000  97.000000   
5            0.0       1.0   22.000000  36.300000   80.000000  92.000000   
...          ...       ...         ...        ...         ...        ...   
2050         1.0       0.0  189.000000  36.739445   90.264339  92.427345   
2051         1.0       0.0   70.856585  36.500000   90.264339  92.427345   
2052         1.0       0.0   70.856585  36.800000  190.000000  98.000000   
2053         0.0       1.0   70.856585  36.739445  120.000000  93.000000   
2054         0.0       1.0   70.856585  36.800000   90.264339  92.427345   

      BLOOD

In [274]:
# Summary statistics of the merged, fully imputed feature table.
df_processed.describe()

Unnamed: 0,SEX_FEMALE,SEX_MALE,AGE,TEMP,HEART_RATE,SAT_O2,BLOOD_PRES_SYS,BLOOD_PRES_DIAS
count,2054.0,2054.0,2054.0,2054.0,2054.0,2054.0,2054.0,2054.0
mean,0.401655,0.598345,70.856585,36.739445,90.264339,92.427345,131.146585,75.023041
std,0.490352,0.490352,20.436992,0.721404,14.950901,6.054193,18.187327,10.385303
min,0.0,0.0,15.0,33.2,41.0,40.0,10.0,11.0
25%,0.0,0.0,57.0,36.3,82.0,92.0,125.0,72.0
50%,0.0,1.0,68.0,36.739445,90.264339,93.0,131.146585,75.023041
75%,1.0,1.0,98.0,36.9,98.0,96.0,137.0,79.0
max,1.0,1.0,189.0,40.1,190.0,99.0,200.0,127.0


## Feature Selection

In [275]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 5 features with the highest chi-squared score against the
# one-hot encoded EXITUS target.
fs_k_best_chi2 = SelectKBest(score_func=chi2, k=5).fit(df_processed, df_exitus_one)
col_filter = fs_k_best_chi2.get_support()  # boolean mask over the columns
df_k_best_chi2 = df_processed.loc[:, col_filter]

print(df_k_best_chi2)

      SEX_FEMALE         AGE     SAT_O2  BLOOD_PRES_SYS  BLOOD_PRES_DIAS
ID                                                                      
1            1.0   15.000000  92.000000      131.146585        75.023041
2            1.0   18.000000  97.000000      131.146585        75.023041
3            0.0   21.000000  95.000000       85.000000        47.000000
4            0.0   21.000000  97.000000      131.146585        75.023041
5            0.0   22.000000  92.000000      111.000000        70.000000
...          ...         ...        ...             ...              ...
2050         1.0  189.000000  92.427345      131.146585        75.023041
2051         1.0   70.856585  92.427345      131.146585        75.023041
2052         1.0   70.856585  98.000000      131.146585        75.023041
2053         0.0   70.856585  93.000000      131.146585        75.023041
2054         0.0   70.856585  92.427345      131.146585        75.023041

[2054 rows x 5 columns]


In [276]:
from functools import partial

from sklearn.feature_selection import SelectPercentile, mutual_info_classif

# Keep the top 40% of features ranked by mutual information with EXITUS.
# mutual_info_classif draws random numbers internally, so without a fixed
# random_state the scores (and therefore the selected columns) can change
# from one run to the next. Pin the seed to make the selection reproducible.
fs_perc_mi = SelectPercentile(
  partial(mutual_info_classif, random_state=0),
  percentile=40
)
fs_perc_mi.fit(df_processed, df_exitus['EXITUS'])
col_filter = fs_perc_mi.get_support()  # boolean mask over the columns
df_perc_mi = df_processed.iloc[:, col_filter]

print(df_perc_mi)

             AGE     SAT_O2  BLOOD_PRES_SYS
ID                                         
1      15.000000  92.000000      131.146585
2      18.000000  97.000000      131.146585
3      21.000000  95.000000       85.000000
4      21.000000  97.000000      131.146585
5      22.000000  92.000000      111.000000
...          ...        ...             ...
2050  189.000000  92.427345      131.146585
2051   70.856585  92.427345      131.146585
2052   70.856585  98.000000      131.146585
2053   70.856585  93.000000      131.146585
2054   70.856585  92.427345      131.146585

[2054 rows x 3 columns]


### Feature Scaling

In [277]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of the processed table with MinMaxScaler.
mms = MinMaxScaler()
# Alternative input: df_k_best_chi2, to scale only the chi2-selected features.
# NOTE: despite its name, df_num_mms is a NumPy array (not a DataFrame) and
# covers all 8 processed columns, including the one-hot SEX columns.
df_num_mms = mms.fit_transform(df_processed)
print(df_num_mms.shape)
print(df_num_mms)

(2054, 8)
[[1.         0.         0.         ... 0.88135593 0.6376136  0.55192277]
 [1.         0.         0.01724138 ... 0.96610169 0.6376136  0.55192277]
 [0.         1.         0.03448276 ... 0.93220339 0.39473684 0.31034483]
 ...
 [1.         0.         0.32101486 ... 0.98305085 0.6376136  0.55192277]
 [0.         1.         0.32101486 ... 0.89830508 0.6376136  0.55192277]
 [0.         1.         0.32101486 ... 0.88859907 0.6376136  0.55192277]]


## DATA EXPLORATION