In [1]:
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Importing csv into a DataFrame

file_path = 'clean_covid_data.csv'
cleaned_df = pd.read_csv(file_path)
cleaned_df.head()

Unnamed: 0,Age_Group,Condition_Group,Condition,Deaths,Risk
0,0-24,Respiratory diseases,Influenza and pneumonia,1430,high
1,25-34,Respiratory diseases,Influenza and pneumonia,5647,highest
2,35-44,Respiratory diseases,Influenza and pneumonia,14738,highest
3,45-54,Respiratory diseases,Influenza and pneumonia,36674,highest
4,55-64,Respiratory diseases,Influenza and pneumonia,80438,highest


In [3]:
# Creating features
X = pd.get_dummies(cleaned_df, columns=['Age_Group', 'Condition_Group', 'Condition', 'Deaths']).drop('Risk', axis=1)

# Creating target
y = cleaned_df['Risk']
X.head()

Unnamed: 0,Age_Group_0-24,Age_Group_25-34,Age_Group_35-44,Age_Group_45-54,Age_Group_55-64,Age_Group_65-74,Age_Group_75-84,Age_Group_85+,Condition_Group_All other conditions and causes (residual),Condition_Group_Alzheimer disease,...,Deaths_110981,Deaths_111687,Deaths_112535,Deaths_122785,Deaths_123714,Deaths_130142,Deaths_152680,Deaths_240927,Deaths_274327,Deaths_278148
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X.describe()

Unnamed: 0,Age_Group_0-24,Age_Group_25-34,Age_Group_35-44,Age_Group_45-54,Age_Group_55-64,Age_Group_65-74,Age_Group_75-84,Age_Group_85+,Condition_Group_All other conditions and causes (residual),Condition_Group_Alzheimer disease,...,Deaths_110981,Deaths_111687,Deaths_112535,Deaths_122785,Deaths_123714,Deaths_130142,Deaths_152680,Deaths_240927,Deaths_274327,Deaths_278148
count,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,...,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0,251543.0
mean,0.156661,0.140183,0.125645,0.11544,0.113166,0.114819,0.116938,0.117149,0.043766,0.05246,...,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06
std,0.363482,0.347177,0.331449,0.319552,0.316796,0.318804,0.321347,0.321598,0.204574,0.222954,...,0.001994,0.001994,0.001994,0.001994,0.001994,0.001994,0.001994,0.001994,0.001994,0.001994
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
y.value_counts()

no risk    157359
low         71054
medium      16649
high         4251
highest      2230
Name: Risk, dtype: int64

In [7]:
# Splitting dataset into training data and testing data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(188657, 3246)

In [8]:
print(Counter(y_train))
print(Counter(y_test))

Counter({'no risk': 118167, 'low': 53152, 'medium': 12498, 'high': 3187, 'highest': 1653})
Counter({'no risk': 39192, 'low': 17902, 'medium': 4151, 'high': 1064, 'highest': 577})


In [9]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# Instantiate
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [10]:
# Fit
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [11]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7152477819644788

In [12]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[  517,   429,    27,    91,     0],
       [  167,   358,    10,    42,     0],
       [  443,  1146, 16123,   179,    11],
       [  534,  1176,    78,  2363,     0],
       [    0,     0,     0,     0, 39192]], dtype=int64)

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.31      0.49      0.98      0.38      0.69      0.45      1064
    highest       0.12      0.62      0.96      0.19      0.77      0.57       577
        low       0.99      0.90      1.00      0.94      0.95      0.89     17902
     medium       0.88      0.57      0.99      0.69      0.75      0.54      4151
    no risk       1.00      1.00      1.00      1.00      1.00      1.00     39192

avg / total       0.97      0.93      1.00      0.95      0.96      0.92     62886

