In [1]:
# Import libraries
# SQLAlchemy
from sqlalchemy import create_engine
from sqlalchemy import inspect
from config import db_password

# Pandas
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report

from sklearn.svm import SVC

In [2]:
# Create an engine that can talk to the database.
# The password is percent-encoded before it is embedded in the URL: characters
# such as "@", "/", ":" or "%" would otherwise be read as URL delimiters and
# the connection string would be split in the wrong place.
from urllib.parse import quote

db_string = (
    f"postgresql://postgres:{quote(str(db_password), safe='')}"
    "@prodsamplecovidpatients.cqbgcjbaetrj.us-west-1.rds.amazonaws.com:5432/sample_covid_patients"
)
engine = create_engine(db_string)

In [3]:
# List the tables exposed by the connected database.
inspector = inspect(engine)
inspector.get_table_names()

['patient']

In [4]:
# Open a connection to the SQL database through the engine.
# NOTE(review): no conn.close() is visible in this part of the notebook —
# confirm the connection is released once the data has been loaded.
conn = engine.connect()

In [5]:
# Load 20,000 records from the patient table into a DataFrame.
prov_df = pd.read_sql(sql="SELECT * FROM patient LIMIT 20000", con=conn)

In [6]:
# Preview the first five rows of the loaded data.
prov_df.head(n=5)

Unnamed: 0,index,data_file_updated,id_patient,resp_monitoring,type_institution,state_medical_unit,gender,state_patient_birth,state_residence,city_patient_birth,...,closed_contanct,lab_sample,lab_result,antigen_sample,antigen_result,final_class,migrant,country_nationality,country_patient_birth,icu
0,0,2022-06-16,z3bf80,2,12,8,2,8,8,37,...,2,1,1,2,97,3,99,México,97,97
1,1,2022-06-16,z1e370,1,12,14,1,14,14,85,...,2,1,2,2,97,7,99,México,97,97
2,2,2022-06-16,zze974,1,6,24,1,24,24,35,...,1,1,2,2,97,7,99,México,97,97
3,3,2022-06-16,zz7067,1,12,9,2,9,9,7,...,2,1,2,2,97,7,99,México,97,97
4,4,2022-06-16,z1da1e,1,12,1,2,1,1,1,...,1,1,2,2,97,7,99,México,97,97


In [7]:
# Summarise the columns: non-null counts and dtypes.
prov_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   index                  20000 non-null  int64 
 1   data_file_updated      20000 non-null  object
 2   id_patient             20000 non-null  object
 3   resp_monitoring        20000 non-null  int64 
 4   type_institution       20000 non-null  int64 
 5   state_medical_unit     20000 non-null  int64 
 6   gender                 20000 non-null  int64 
 7   state_patient_birth    20000 non-null  int64 
 8   state_residence        20000 non-null  int64 
 9   city_patient_birth     20000 non-null  int64 
 10  type_patient           20000 non-null  int64 
 11  date_admitted          20000 non-null  object
 12  date_patient_symp      20000 non-null  object
 13  date_patient_death     20000 non-null  object
 14  intubated              20000 non-null  int64 
 15  pneumonia          

In [8]:
# Columns retained as model features.
keep_col = [
    'gender',
    'intubated',
    'pneumonia',
    'age',
    'pregnant',
    'diabetes',
    'copd',
    'asthma',
    'immunosup',
    'hypertension',
    'cardiovascular',
    'obesity',
    'renal_chronic',
    'tobacco',
]

In [9]:
# Encode the target: high_risk is 1 when date_patient_death holds anything
# other than '9999-99-99', and 0 otherwise.
# NOTE(review): '9999-99-99' is presumably the "no death recorded" placeholder — confirm.
prov_df['high_risk'] = prov_df['date_patient_death'].ne('9999-99-99').astype(int)
prov_df.head()

Unnamed: 0,index,data_file_updated,id_patient,resp_monitoring,type_institution,state_medical_unit,gender,state_patient_birth,state_residence,city_patient_birth,...,lab_sample,lab_result,antigen_sample,antigen_result,final_class,migrant,country_nationality,country_patient_birth,icu,high_risk
0,0,2022-06-16,z3bf80,2,12,8,2,8,8,37,...,1,1,2,97,3,99,México,97,97,0
1,1,2022-06-16,z1e370,1,12,14,1,14,14,85,...,1,2,2,97,7,99,México,97,97,0
2,2,2022-06-16,zze974,1,6,24,1,24,24,35,...,1,2,2,97,7,99,México,97,97,0
3,3,2022-06-16,zz7067,1,12,9,2,9,9,7,...,1,2,2,97,7,99,México,97,97,0
4,4,2022-06-16,z1da1e,1,12,1,2,1,1,1,...,1,2,2,97,7,99,México,97,97,0


In [10]:
# Separate the data into the feature matrix X (the columns listed in keep_col)
# and the target vector y (the high_risk flag).
X = prov_df.loc[:, keep_col]
y = prov_df['high_risk']

In [11]:
# Check the dtype of every feature column.
X.dtypes

gender            int64
intubated         int64
pneumonia         int64
age               int64
pregnant          int64
diabetes          int64
copd              int64
asthma            int64
immunosup         int64
hypertension      int64
cardiovascular    int64
obesity           int64
renal_chronic     int64
tobacco           int64
dtype: object

In [12]:
# Split features and target into training and test sets, then show the
# class counts of the training target.
# NOTE(review): the split is not stratified on y; with a target this
# imbalanced, consider stratify=y — doing so would change the counts below.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
Counter(y_train)

Counter({0: 14385, 1: 615})

# Combination Sampling for imbalanced data

In [13]:
# Combination sampling with SMOTEENN, fitted on the training split only.
# The final expression counts the samples per class after resampling.
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(
    X_train,
    y_train,
)
Counter(y_resampled)

Counter({0: 12915, 1: 11132})