In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# perform basic data cleaning
data = pd.read_csv('machine_learning.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               50 non-null     int64  
 1   Unnamed: 0.1             50 non-null     int64  
 2   state                    50 non-null     object 
 3   mask_willingness         50 non-null     int64  
 4   vaccine_unwilling        50 non-null     int64  
 5   supply_percent_used      50 non-null     float64
 6   % population passed      50 non-null     float64
 7   election_result          50 non-null     int64  
 8   postvax_deaths           50 non-null     int64  
 9   prevax_deaths            50 non-null     int64  
 10  % population vaccinated  50 non-null     float64
dtypes: float64(3), int64(7), object(1)
memory usage: 4.4+ KB


In [7]:
# split for train and test data
X = pd.get_dummies(data, columns = ['state', 'mask_willingness','vaccine_unwilling', 'supply_percent_used', '% population passed', 'postvax_deaths', 'prevax_deaths', '% population vaccinated']).drop('election_result',axis=1)
y = data['election_result'] 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [8]:
#create model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg

LogisticRegression(random_state=1)

In [10]:
# Train the data
logreg.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [12]:
# Predict outcomes for test data set
predictions = logreg.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
36,1,1
19,0,1
24,0,0
2,0,1
25,1,0
28,1,1
49,0,0
7,1,1
17,0,0
13,0,0


In [13]:
#validate the test
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.6153846153846154

In [15]:
from sklearn.metrics import balanced_accuracy_score
y_pred= logreg.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6309523809523809

In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.56      0.83      0.43      0.67      0.60      0.37         6
          1       0.75      0.43      0.83      0.55      0.60      0.34         7

avg / total       0.66      0.62      0.65      0.60      0.60      0.36        13



In [19]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
#create instance
forrest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

forrest.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred = forrest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6904761904761905

In [21]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      0.67      0.71      0.67      0.69      0.47         6
          1       0.71      0.71      0.67      0.71      0.69      0.48         7

avg / total       0.69      0.69      0.69      0.69      0.69      0.48        13

