In [2]:
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Importing csv into a DataFrame

file_path = 'clean_covid_data.csv'
cleaned_df = pd.read_csv(file_path)
cleaned_df.head()

Unnamed: 0,Age_Group,Condition_Group,Condition,Deaths,Risk
0,0-24,Respiratory diseases,Influenza and pneumonia,1430,high
1,25-34,Respiratory diseases,Influenza and pneumonia,5647,highest
2,35-44,Respiratory diseases,Influenza and pneumonia,14738,highest
3,45-54,Respiratory diseases,Influenza and pneumonia,36674,highest
4,55-64,Respiratory diseases,Influenza and pneumonia,80438,highest


In [4]:
# Creating a sample dataframe for training

sample_df = cleaned_df.sample(frac=0.5, replace=True, random_state=1)

In [5]:
# Creating features
X = pd.get_dummies(sample_df, columns=['Age_Group', 'Condition_Group', 'Condition', 'Deaths']).drop('Risk', axis=1)

# Creating target
y = sample_df['Risk']
X.head()

Unnamed: 0,Age_Group_0-24,Age_Group_25-34,Age_Group_35-44,Age_Group_45-54,Age_Group_55-64,Age_Group_65-74,Age_Group_75-84,Age_Group_85+,Condition_Group_All other conditions and causes (residual),Condition_Group_Alzheimer disease,...,Deaths_68876,Deaths_79412,Deaths_95253,Deaths_106897,Deaths_110753,Deaths_110981,Deaths_122785,Deaths_123714,Deaths_152680,Deaths_240927
128037,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229611,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208780,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229119,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X.describe()

Unnamed: 0,Age_Group_0-24,Age_Group_25-34,Age_Group_35-44,Age_Group_45-54,Age_Group_55-64,Age_Group_65-74,Age_Group_75-84,Age_Group_85+,Condition_Group_All other conditions and causes (residual),Condition_Group_Alzheimer disease,...,Deaths_68876,Deaths_79412,Deaths_95253,Deaths_106897,Deaths_110753,Deaths_110981,Deaths_122785,Deaths_123714,Deaths_152680,Deaths_240927
count,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,...,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0,125772.0
mean,0.156927,0.140635,0.125831,0.115646,0.111758,0.115582,0.117125,0.116497,0.043213,0.052134,...,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,8e-06,1.6e-05,8e-06,8e-06
std,0.363733,0.347646,0.33166,0.319801,0.315069,0.319724,0.32157,0.320821,0.203337,0.222298,...,0.00282,0.00282,0.00282,0.00282,0.00282,0.00282,0.00282,0.003988,0.00282,0.00282
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
y.value_counts()

no risk    78804
low        35590
medium      8221
high        2083
highest     1074
Name: Risk, dtype: int64

In [9]:
# Splitting dataset into training data and testing data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(94329, 2062)

In [10]:
print(Counter(y_train))
print(Counter(y_test))

Counter({'no risk': 59166, 'low': 26655, 'medium': 6121, 'high': 1555, 'highest': 832})
Counter({'no risk': 19638, 'low': 8935, 'medium': 2100, 'high': 528, 'highest': 242})


In [11]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# Instantiate
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [12]:
# Fit
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [13]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7378198423306019

In [14]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[  312,   148,    10,    58,     0],
       [   57,   164,     4,    17,     0],
       [  424,   694,  7595,   206,    16],
       [  387,   468,    47,  1198,     0],
       [    0,     0,     0,     0, 19638]], dtype=int64)

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.26      0.59      0.97      0.37      0.76      0.55       528
    highest       0.11      0.68      0.96      0.19      0.81      0.63       242
        low       0.99      0.85      1.00      0.92      0.92      0.84      8935
     medium       0.81      0.57      0.99      0.67      0.75      0.54      2100
    no risk       1.00      1.00      1.00      1.00      1.00      1.00     19638

avg / total       0.97      0.92      1.00      0.94      0.95      0.91     31443

