## Demographic and Medical Condition data analysis + ML exploration

Data_Source = https://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/datos_abiertos_covid19.zip

In [60]:
# Initial imports
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import hvplot.pandas
from collections import Counter

In [47]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("./Resources/220614COVID19MEXICO.csv")
df.info()

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16192035 entries, 0 to 16192034
Data columns (total 40 columns):
 #   Column                 Dtype 
---  ------                 ----- 
 0   FECHA_ACTUALIZACION    object
 1   ID_REGISTRO            object
 2   ORIGEN                 int64 
 3   SECTOR                 int64 
 4   ENTIDAD_UM             int64 
 5   SEXO                   int64 
 6   ENTIDAD_NAC            int64 
 7   ENTIDAD_RES            int64 
 8   MUNICIPIO_RES          int64 
 9   TIPO_PACIENTE          int64 
 10  FECHA_INGRESO          object
 11  FECHA_SINTOMAS         object
 12  FECHA_DEF              object
 13  INTUBADO               int64 
 14  NEUMONIA               int64 
 15  EDAD                   int64 
 16  NACIONALIDAD           int64 
 17  EMBARAZO               int64 
 18  HABLA_LENGUA_INDIG     int64 
 19  INDIGENA               int64 
 20  DIABETES               int64 
 21  EPOC                   int64 
 22  ASMA                   int64 
 23  INMUS

In [3]:
cols=[]
for x in df.columns:
    cols.append(x)

In [4]:
cols

['FECHA_ACTUALIZACION',
 'ID_REGISTRO',
 'ORIGEN',
 'SECTOR',
 'ENTIDAD_UM',
 'SEXO',
 'ENTIDAD_NAC',
 'ENTIDAD_RES',
 'MUNICIPIO_RES',
 'TIPO_PACIENTE',
 'FECHA_INGRESO',
 'FECHA_SINTOMAS',
 'FECHA_DEF',
 'INTUBADO',
 'NEUMONIA',
 'EDAD',
 'NACIONALIDAD',
 'EMBARAZO',
 'HABLA_LENGUA_INDIG',
 'INDIGENA',
 'DIABETES',
 'EPOC',
 'ASMA',
 'INMUSUPR',
 'HIPERTENSION',
 'OTRA_COM',
 'CARDIOVASCULAR',
 'OBESIDAD',
 'RENAL_CRONICA',
 'TABAQUISMO',
 'OTRO_CASO',
 'TOMA_MUESTRA_LAB',
 'RESULTADO_LAB',
 'TOMA_MUESTRA_ANTIGENO',
 'RESULTADO_ANTIGENO',
 'CLASIFICACION_FINAL',
 'MIGRANTE',
 'PAIS_NACIONALIDAD',
 'PAIS_ORIGEN',
 'UCI']

In [5]:
# Update Columns in English
new_cols = ['DATE UPDATE',
 'REGISTRATION_ID',
 'SOURCE',
 'SECTOR',
 'ENTITY_UM',
 'SEX',
 'ENTITY_NAC',
 'ENTITY_RES',
 'MUNICIPALITY_RES',
 'TYPE_PATIENT',
 'ADMISSION DATE',
 'DATE_SYMPTOMS',
 'DATE_DEF',
 'INTUBATED',
 'PNEUMONIA',
 'AGE',
 'NATIONALITY',
 'PREGNANCY',
 'SPEAK_LANGUAGE_INDIG',
 'INDIGENOUS',
 'DIABETES',
 'COPD',
 'ASTHMA',
 'INMUSUPR',
 'HYPERTENSION',
 'OTHER_COM',
 'CARDIOVASCULAR',
 'OBESITY',
 'RENAL_CHRONIC',
 'SMOKING',
 'ANOTHER CASE',
 'TAKE_LAB_SAMPLE',
 'LAB_RESULT',
 'TAKE_SAMPLE_ANTIGEN',
 'RESULT_ANTIGEN',
 'FINAL_CLASSIFICATION',
 'MIGRANT',
 'COUNTRY_NATIONALITY',
 'COUNTRY OF ORIGIN',
 'ICU']

In [6]:
df.columns = new_cols
df.head()

Unnamed: 0,DATE UPDATE,REGISTRATION_ID,SOURCE,SECTOR,ENTITY_UM,SEX,ENTITY_NAC,ENTITY_RES,MUNICIPALITY_RES,TYPE_PATIENT,...,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
0,2022-06-14,z24953,1,12,9,1,9,9,10,1,...,1,1,2,2,97,7,99,México,97,97
1,2022-06-14,z23d9d,1,12,22,2,24,22,9,1,...,2,2,97,2,97,6,99,México,97,97
2,2022-06-14,z49a69,1,12,23,1,23,23,4,2,...,1,2,97,2,97,2,99,México,97,1
3,2022-06-14,z482b8,2,12,9,2,9,9,12,1,...,2,2,97,2,97,1,99,México,97,97
4,2022-06-14,z3bf80,2,12,8,2,8,8,37,1,...,2,1,1,2,97,3,99,México,97,97


In [7]:
# Drop date update, registration ID,SOURCE,SECTOR,ENTITY_UM,MUNICIPALITY_RES
df1 = df.drop(columns = ['DATE UPDATE','REGISTRATION_ID',"SOURCE","SECTOR","ENTITY_UM",'MUNICIPALITY_RES'])
df1

Unnamed: 0,SEX,ENTITY_NAC,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,...,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
0,1,9,9,1,2020-10-15,2020-10-15,9999-99-99,97,2,40,...,1,1,2,2,97,7,99,México,97,97
1,2,24,22,1,2021-01-05,2021-01-05,9999-99-99,97,2,29,...,2,2,97,2,97,6,99,México,97,97
2,1,23,23,2,2020-07-20,2020-07-17,2020-07-21,1,1,66,...,1,2,97,2,97,2,99,México,97,1
3,2,9,9,1,2020-10-16,2020-10-16,9999-99-99,97,2,41,...,2,2,97,2,97,1,99,México,97,97
4,2,8,8,1,2020-07-28,2020-07-20,9999-99-99,97,2,35,...,2,1,1,2,97,3,99,México,97,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16192030,2,13,15,1,2022-03-04,2022-03-04,9999-99-99,97,99,25,...,99,2,97,2,97,6,99,México,97,97
16192031,1,15,15,1,2022-03-04,2022-03-04,9999-99-99,97,99,23,...,99,2,97,2,97,6,99,México,97,97
16192032,2,15,15,1,2022-03-05,2022-03-05,9999-99-99,97,99,44,...,99,2,97,2,97,6,99,México,97,97
16192033,1,9,15,1,2022-03-05,2022-03-05,9999-99-99,97,99,48,...,99,2,97,2,97,6,99,México,97,97


In [8]:
df1[df1['TAKE_LAB_SAMPLE']==2].groupby(['RESULT_ANTIGEN']).count()

Unnamed: 0_level_0,SEX,ENTITY_NAC,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,...,SMOKING,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
RESULT_ANTIGEN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285,...,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285,3155285
2,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173,...,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173,6725173
97,582153,582153,582153,582153,582153,582153,582153,582153,582153,582153,...,582153,582153,582153,582153,582153,582153,582153,582153,582153,582153


In [9]:
# Drop who didn't do Lab test and don't have an Antigen result
reqd_Index = df1[(df1['TAKE_LAB_SAMPLE']==2) & (df1['RESULT_ANTIGEN'] == 97)].index.tolist()

In [10]:
len(reqd_Index)

582153

In [11]:
df2 = df1.drop(index = reqd_Index)

In [12]:
# Is the data valid? Age more than 121? remove edge cases;
reqd_Index2 = df2[(df2['AGE']>121) & (df2['ANOTHER CASE']==99)].index.tolist()

In [13]:
len(reqd_Index2)

50

In [14]:
df3 = df2.drop(index = reqd_Index2)

In [15]:
# Invalid entry, result not attribute to COVID, we should drop it.
df3[(df3['LAB_RESULT']==4) & (df3['RESULT_ANTIGEN']==97) ].groupby(['FINAL_CLASSIFICATION']).count()

Unnamed: 0_level_0,SEX,ENTITY_NAC,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,...,SMOKING,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
FINAL_CLASSIFICATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,14078,14078,14078,14078,14078,14078,14078,14078,14078,14078,...,14078,14078,14078,14078,14078,14078,14078,14078,14078,14078
2,2947,2947,2947,2947,2947,2947,2947,2947,2947,2947,...,2947,2947,2947,2947,2947,2947,2947,2947,2947,2947
4,11661,11661,11661,11661,11661,11661,11661,11661,11661,11661,...,11661,11661,11661,11661,11661,11661,11661,11661,11661,11661
5,90944,90944,90944,90944,90944,90944,90944,90944,90944,90944,...,90944,90944,90944,90944,90944,90944,90944,90944,90944,90944
6,88144,88144,88144,88144,88144,88144,88144,88144,88144,88144,...,88144,88144,88144,88144,88144,88144,88144,88144,88144,88144


In [16]:
reqd_Index3 = df3[(df3['LAB_RESULT']==4) & (df3['RESULT_ANTIGEN']==97) &(df3['FINAL_CLASSIFICATION'] != 1) &(df3['FINAL_CLASSIFICATION'] != 2) ].index.tolist()

In [17]:
len(reqd_Index3)

190749

In [18]:
df4 = df3.drop(index = reqd_Index3)

In [19]:
df4 = df4.drop(columns = 'ENTITY_NAC')

In [20]:
sample_df = df4.sample(20000)
sample_df= sample_df.reset_index(drop=True)

In [21]:
sample_df.sort_values('ADMISSION DATE',ascending = True)

Unnamed: 0,SEX,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,NATIONALITY,...,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
11440,2,9,1,2020-01-27,2020-01-26,9999-99-99,97,99,15,1,...,2,1,2,2,97,7,99,México,97,97
11484,1,29,1,2020-01-29,2020-01-28,9999-99-99,97,99,27,1,...,99,1,2,2,97,7,99,México,97,97
16146,1,30,2,2020-02-05,2020-02-05,9999-99-99,99,99,28,1,...,99,1,2,2,97,7,99,México,97,99
6780,2,25,2,2020-02-09,2020-02-09,9999-99-99,99,99,1,1,...,99,1,2,2,97,7,99,México,97,99
4964,2,15,2,2020-02-09,2020-02-08,9999-99-99,99,99,1,1,...,2,1,2,2,97,7,99,México,97,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19693,1,15,1,2022-06-13,2022-06-11,9999-99-99,97,2,42,1,...,2,1,3,1,1,3,99,México,97,97
3242,1,9,1,2022-06-13,2022-06-10,9999-99-99,97,2,76,1,...,1,2,97,1,2,7,99,México,97,97
15205,1,11,1,2022-06-13,2022-06-10,9999-99-99,97,2,5,1,...,2,2,97,1,2,7,99,México,97,97
13306,1,19,1,2022-06-13,2022-06-12,9999-99-99,97,2,30,1,...,2,2,97,1,1,3,99,México,97,97


In [22]:
sample_df.groupby(['DATE_DEF']).count()

Unnamed: 0_level_0,SEX,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,INTUBATED,PNEUMONIA,AGE,NATIONALITY,PREGNANCY,...,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
DATE_DEF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-04,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-04-13,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-04-17,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-04-18,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-04-20,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-21,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2022-03-23,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2022-03-26,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2022-04-16,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [23]:
# Encode the target: people with deceased date as 1, and people alive as 0
sample_df.loc[sample_df['DATE_DEF'] == "9999-99-99", 'Target'] = int(0) 
sample_df.loc[sample_df['DATE_DEF'] != "9999-99-99", 'Target'] = int(1) 

In [24]:
sample_df.groupby(['Target']).count()

Unnamed: 0_level_0,SEX,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,NATIONALITY,...,ANOTHER CASE,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,19471,19471,19471,19471,19471,19471,19471,19471,19471,19471,...,19471,19471,19471,19471,19471,19471,19471,19471,19471,19471
1.0,529,529,529,529,529,529,529,529,529,529,...,529,529,529,529,529,529,529,529,529,529


In [25]:
sample_df

Unnamed: 0,SEX,ENTITY_RES,TYPE_PATIENT,ADMISSION DATE,DATE_SYMPTOMS,DATE_DEF,INTUBATED,PNEUMONIA,AGE,NATIONALITY,...,TAKE_LAB_SAMPLE,LAB_RESULT,TAKE_SAMPLE_ANTIGEN,RESULT_ANTIGEN,FINAL_CLASSIFICATION,MIGRANT,COUNTRY_NATIONALITY,COUNTRY OF ORIGIN,ICU,Target
0,1,21,2,2020-12-21,2020-12-12,2020-12-21,2,1,50,1,...,1,2,2,97,7,99,México,97,2,1.0
1,1,30,1,2022-03-31,2022-03-30,9999-99-99,97,2,9,1,...,2,97,1,2,7,99,México,97,97,0.0
2,2,25,1,2022-01-17,2022-01-14,9999-99-99,97,2,47,1,...,2,97,1,2,7,99,México,97,97,0.0
3,1,9,1,2021-07-23,2021-07-21,9999-99-99,97,2,59,1,...,2,97,1,2,7,99,México,97,97,0.0
4,1,16,1,2021-04-09,2021-04-09,9999-99-99,97,2,21,1,...,2,97,1,2,7,99,México,97,97,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,3,1,2021-07-28,2021-07-26,9999-99-99,97,2,36,1,...,2,97,1,1,3,99,México,97,97,0.0
19996,1,6,1,2021-08-12,2021-08-09,9999-99-99,97,2,29,1,...,2,97,1,1,3,99,México,97,97,0.0
19997,2,27,1,2021-08-24,2021-08-23,9999-99-99,97,2,71,1,...,2,97,1,2,7,99,México,97,97,0.0
19998,1,26,1,2022-01-25,2022-01-20,9999-99-99,97,2,48,1,...,2,97,1,1,3,99,México,97,97,0.0


In [26]:
# pick columns(features and targets) for ml_model
ml_col = [
    'SEX',
    'Target',
    'PNEUMONIA',
     'AGE',
     'DIABETES',
     'COPD',
     'ASTHMA',
     'CARDIOVASCULAR',
     'OBESITY',
     'RENAL_CHRONIC',
     'SMOKING',
     'FINAL_CLASSIFICATION'
]
ml_df = sample_df.loc[:,ml_col]

In [27]:
ml_df.groupby(['FINAL_CLASSIFICATION']).count()

Unnamed: 0_level_0,SEX,Target,PNEUMONIA,AGE,DIABETES,COPD,ASTHMA,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,SMOKING
FINAL_CLASSIFICATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,371,371,371,371,371,371,371,371,371,371,371
2,3,3,3,3,3,3,3,3,3,3,3
3,7147,7147,7147,7147,7147,7147,7147,7147,7147,7147,7147
6,17,17,17,17,17,17,17,17,17,17,17
7,12462,12462,12462,12462,12462,12462,12462,12462,12462,12462,12462


In [28]:
# drop those whose result are not clear; Add one column "COVID_RESULT": Negative as 0, positive as 1
ml_clean_df = ml_df.loc[ml_df['FINAL_CLASSIFICATION'] != 6]
ml_clean_df.loc[ml_clean_df['FINAL_CLASSIFICATION'] == 7, 'COVID_RESULT'] = int(0)
ml_clean_df.loc[ml_clean_df['FINAL_CLASSIFICATION'] <= 3, 'COVID_RESULT'] = int(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [29]:
ml_clean_df = ml_clean_df.drop(columns = 'FINAL_CLASSIFICATION')

In [30]:
ml_clean_df['Target'] = ml_clean_df['Target'].astype(int)
ml_clean_df['COVID_RESULT'] = ml_clean_df['COVID_RESULT'].astype(int)

In [43]:
ml_clean_df

Unnamed: 0,SEX,Target,PNEUMONIA,AGE,DIABETES,COPD,ASTHMA,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,SMOKING,COVID_RESULT
0,1,1,1,50,2,2,2,2,2,2,2,0
1,1,0,2,9,2,2,1,2,2,2,2,0
2,2,0,2,47,2,2,2,2,1,2,2,0
3,1,0,2,59,2,2,2,2,2,2,2,0
4,1,0,2,21,2,2,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,0,2,36,2,2,2,2,2,2,2,1
19996,1,0,2,29,2,2,2,2,2,2,2,1
19997,2,0,2,71,2,2,2,1,2,2,2,0
19998,1,0,2,48,2,2,2,2,1,2,2,1


In [31]:
ml_clean_df.dtypes

SEX               int64
Target            int32
PNEUMONIA         int64
AGE               int64
DIABETES          int64
COPD              int64
ASTHMA            int64
CARDIOVASCULAR    int64
OBESITY           int64
RENAL_CHRONIC     int64
SMOKING           int64
COVID_RESULT      int32
dtype: object

In [41]:
y = ml_clean_df['Target']
X = ml_clean_df.drop(columns = 'Target')

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1)

# # Creating StandardScaler instance
# scaler = StandardScaler()

# # Fitting Standard Scaler
# X_scaler = scaler.fit(X_train)

# # Scaling data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [45]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1,n_estimators = 100)
eec.fit(X_train,y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [48]:
# Calculated the balanced accuracy score
y_pred_eec = eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred_eec)

0.9112952232899943

In [49]:
# Display the confusion matrix
confusion_matrix(y_test,y_pred_eec)

array([[4477,  403],
       [  11,  105]], dtype=int64)

In [50]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.92      0.91      0.96      0.91      0.83      4880
          1       0.21      0.91      0.92      0.34      0.91      0.83       116

avg / total       0.98      0.92      0.91      0.94      0.91      0.83      4996



## Logistic Regression Model

In [51]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [61]:
# Undersample the data using `RandomUnderSampler`
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 412, 1: 412})

In [62]:
classifier.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [71]:
y_pred_log = classifier.predict(X_test)
confusion_matrix(y_test, y_pred_log)

array([[4519,  361],
       [  17,   99]], dtype=int64)

In [72]:
balanced_accuracy_score(y_test, y_pred_log)

0.8897364330130018

In [73]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_log))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.93      0.85      0.96      0.89      0.80      4880
          1       0.22      0.85      0.93      0.34      0.89      0.78       116

avg / total       0.98      0.92      0.86      0.95      0.89      0.80      4996

