In [12]:
from sqlalchemy import create_engine
from config import db_password
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import pandas as pd

In [2]:
engine = create_engine(f"postgresql://postgres:{db_password}@127.0.0.1:5432/capston_project_db")

In [3]:
df = pd.read_sql('SELECT * FROM covid_data', engine)
df.head()

Unnamed: 0,age_group,condition_group,condition,deaths,risk,id
0,0-24,Respiratory diseases,Influenza and pneumonia,1430,high,1
1,25-34,Respiratory diseases,Influenza and pneumonia,5647,highest,2
2,35-44,Respiratory diseases,Influenza and pneumonia,14738,highest,3
3,45-54,Respiratory diseases,Influenza and pneumonia,36674,highest,4
4,55-64,Respiratory diseases,Influenza and pneumonia,80438,highest,5


In [4]:
df = df.drop(['id'], axis=1)
df.head()

Unnamed: 0,age_group,condition_group,condition,deaths,risk
0,0-24,Respiratory diseases,Influenza and pneumonia,1430,high
1,25-34,Respiratory diseases,Influenza and pneumonia,5647,highest
2,35-44,Respiratory diseases,Influenza and pneumonia,14738,highest
3,45-54,Respiratory diseases,Influenza and pneumonia,36674,highest
4,55-64,Respiratory diseases,Influenza and pneumonia,80438,highest


In [5]:
sample_df = df.sample(frac=0.5, replace=True, random_state=1)

In [6]:
X = pd.get_dummies(sample_df, columns=['age_group', 'condition_group', 'condition', 'deaths']).drop('risk', axis=1)

y = sample_df['risk']
X.head()

Unnamed: 0,age_group_0-24,age_group_25-34,age_group_35-44,age_group_45-54,age_group_55-64,age_group_65-74,age_group_75-84,age_group_85+,condition_group_All other conditions and causes (residual),condition_group_Alzheimer disease,...,deaths_62882,deaths_68876,deaths_79412,deaths_95253,deaths_110753,deaths_110981,deaths_112535,deaths_122785,deaths_123714,deaths_152680
128037,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229611,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208780,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229119,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
y.head()

128037    no risk
229611    no risk
208780    no risk
5192       medium
229119    no risk
Name: risk, dtype: object

In [8]:
X.describe()

Unnamed: 0,age_group_0-24,age_group_25-34,age_group_35-44,age_group_45-54,age_group_55-64,age_group_65-74,age_group_75-84,age_group_85+,condition_group_All other conditions and causes (residual),condition_group_Alzheimer disease,...,deaths_62882,deaths_68876,deaths_79412,deaths_95253,deaths_110753,deaths_110981,deaths_112535,deaths_122785,deaths_123714,deaths_152680
count,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,...,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0
mean,0.156998,0.140953,0.125457,0.115582,0.111996,0.115701,0.116544,0.116767,0.043515,0.052015,...,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,1.6e-05,8e-06
std,0.363801,0.347975,0.331238,0.319724,0.315363,0.319868,0.320878,0.321144,0.204015,0.222058,...,0.00282,0.00282,0.00282,0.00282,0.00282,0.00282,0.00282,0.00282,0.003988,0.00282
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
y.value_counts()

no risk    78876
low        35621
medium      8107
high        2090
highest     1078
Name: risk, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(94329, 2037)

In [13]:
print(Counter(y_train))
print(Counter(y_test))

Counter({'no risk': 59137, 'low': 26717, 'medium': 6074, 'high': 1572, 'highest': 829})
Counter({'no risk': 19739, 'low': 8904, 'medium': 2033, 'high': 518, 'highest': 249})


In [14]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# Instantiate
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [None]:
# Fit
brf.fit(X_train, y_train)