In [1]:
#importing possible libraries and dependencies
%matplotlib inline

import warnings
from urllib.parse import quote
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from path import Path
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced

from config import db_password

In [2]:
# Legacy CSV load of the Arizona election data, kept for reference only.
# The file path has been replaced by the table created in our SQL database.
# file_path = "/Users/lesleyrodriguez/Desktop/Arizona_Election_Project/Machine_learning/dummy data.csv"
# Arizona_Election_df = pd.read_csv(file_path)
# Arizona_Election_df.head(10)
# host= "localhost"
# database = "Arizona_Elections"
# user= "postgres"
# password = "db_password"

In [3]:
# Build the PostgreSQL connection URL.
# The password is percent-encoded so that characters such as '@', ':', '/' or
# '%' cannot be mistaken for URL delimiters when the string is parsed.
# NOTE(review): assumes db_password in config.py holds the raw (not already
# URL-encoded) password -- confirm.
db_string = f"postgresql://postgres:{quote(str(db_password), safe='')}@127.0.0.1:5432/Arizona_Elections"

In [4]:
# Create the SQLAlchemy engine from the connection string built in the previous cell
engine = create_engine(db_string)


In [5]:
# Read the whole `machinelearning` table into a DataFrame through the engine
Arizona_Election_df = pd.read_sql(
    sql='SELECT * from machinelearning',
    con=engine,
)

In [6]:
# Preview the first five rows of the loaded table
Arizona_Election_df.head(n=5)

Unnamed: 0,Voter ID,Party,Sex,Age,Ethnicity,Voter Score,Turnout Score,Kids in HH,Liberal Ideology,2020 marriage,Zip,Pscore
0,15777,Republican,M,72,Caucasian,8.15,99.22,6.75,4.14,80.49,85224,False
1,22507,Democrat,M,56,Hispanic,91.74,99.13,34.25,70.85,66.12,85286,False
2,24594,Democrat,F,70,Caucasian,98.02,98.35,13.59,94.71,83.72,85248,False
3,40503,Democrat,M,70,Caucasian,93.63,97.76,13.28,79.04,96.15,85225,False
4,48534,Other,F,66,Caucasian,91.58,98.97,7.58,78.62,61.7,85249,False


In [7]:
## Preprocessing the data. This step might not be needed once the data is clean
## and the database tables are set up and filled with the information needed.



In [8]:
# Inspect the data type of every column
Arizona_Election_df.dtypes

Voter ID              int64
Party                object
Sex                  object
Age                   int64
Ethnicity            object
Voter Score         float64
Turnout Score       float64
Kids in HH          float64
Liberal Ideology    float64
2020 marriage       float64
Zip                   int64
Pscore               object
dtype: object

In [9]:
# Drop the columns that are not used as model features.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError because the columns were already removed.
Arizona_Election_df = Arizona_Election_df.drop(
    columns=['Voter Score', 'Voter ID'],
    errors='ignore',
)
Arizona_Election_df


Unnamed: 0,Party,Sex,Age,Ethnicity,Turnout Score,Kids in HH,Liberal Ideology,2020 marriage,Zip,Pscore
0,Republican,M,72,Caucasian,99.22,6.75,4.14,80.49,85224,False
1,Democrat,M,56,Hispanic,99.13,34.25,70.85,66.12,85286,False
2,Democrat,F,70,Caucasian,98.35,13.59,94.71,83.72,85248,False
3,Democrat,M,70,Caucasian,97.76,13.28,79.04,96.15,85225,False
4,Other,F,66,Caucasian,98.97,7.58,78.62,61.70,85249,False
...,...,...,...,...,...,...,...,...,...,...
86672,Democrat,M,22,Caucasian,90.56,4.31,57.35,3.04,85225,False
86673,Other,M,27,Caucasian,30.00,0.98,58.02,1.30,85224,False
86674,Other,F,56,Caucasian,89.69,27.86,26.66,95.50,85248,False
86675,Democrat,F,76,Caucasian,95.07,3.56,76.52,86.19,85248,False


In [10]:
# #drop any null or nan in the data frame 
# Arizona_Election_df.dropna(axis = 0, how = "any", thresh = None, subset = None, inplace=False)
# Arizona_Election_df  (this cell is no longer needed)



In [11]:
# Convert the target column values to Low_chance / High_Chance.
# The mapping is applied to the Pscore column only: a frame-wide replace would
# also rewrite any other column that happened to contain the strings
# 'True' or 'False'. Values that match neither key are left unchanged.
pscore_labels = {'False': 'Low_chance', 'True': 'High_Chance'}
Arizona_Election_df = Arizona_Election_df.assign(
    Pscore=Arizona_Election_df['Pscore'].replace(pscore_labels)
).reset_index(drop=True)

Arizona_Election_df

Unnamed: 0,Party,Sex,Age,Ethnicity,Turnout Score,Kids in HH,Liberal Ideology,2020 marriage,Zip,Pscore
0,Republican,M,72,Caucasian,99.22,6.75,4.14,80.49,85224,Low_chance
1,Democrat,M,56,Hispanic,99.13,34.25,70.85,66.12,85286,Low_chance
2,Democrat,F,70,Caucasian,98.35,13.59,94.71,83.72,85248,Low_chance
3,Democrat,M,70,Caucasian,97.76,13.28,79.04,96.15,85225,Low_chance
4,Other,F,66,Caucasian,98.97,7.58,78.62,61.70,85249,Low_chance
...,...,...,...,...,...,...,...,...,...,...
86672,Democrat,M,22,Caucasian,90.56,4.31,57.35,3.04,85225,Low_chance
86673,Other,M,27,Caucasian,30.00,0.98,58.02,1.30,85224,Low_chance
86674,Other,F,56,Caucasian,89.69,27.86,26.66,95.50,85248,Low_chance
86675,Democrat,F,76,Caucasian,95.07,3.56,76.52,86.19,85248,Low_chance


In [12]:
# One-hot encode the text/label columns: the model can only work with
# numerical values. The target column is left out of the feature matrix.
categorical_columns = ["Sex", "Party", "Ethnicity", "Zip"]
feature_frame = Arizona_Election_df.drop(columns="Pscore")
X = pd.get_dummies(feature_frame, columns=categorical_columns)

# Target labels
y = Arizona_Election_df["Pscore"]
X.head()

Unnamed: 0,Age,Turnout Score,Kids in HH,Liberal Ideology,2020 marriage,Sex_F,Sex_M,Party_Democrat,Party_Other,Party_Republican,...,Zip_85224,Zip_85225,Zip_85226,Zip_85233,Zip_85234,Zip_85248,Zip_85249,Zip_85286,Zip_85296,Zip_85297
0,72,99.22,6.75,4.14,80.49,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,56,99.13,34.25,70.85,66.12,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,70,98.35,13.59,94.71,83.72,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,70,97.76,13.28,79.04,96.15,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,66,98.97,7.58,78.62,61.7,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Summary statistics for every column of the encoded feature matrix
X.describe()

Unnamed: 0,Age,Turnout Score,Kids in HH,Liberal Ideology,2020 marriage,Sex_F,Sex_M,Party_Democrat,Party_Other,Party_Republican,...,Zip_85224,Zip_85225,Zip_85226,Zip_85233,Zip_85234,Zip_85248,Zip_85249,Zip_85286,Zip_85296,Zip_85297
count,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,...,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0,86677.0
mean,55.159973,80.031703,38.353084,38.090964,69.290473,0.525226,0.474774,0.289419,0.298511,0.41207,...,0.059924,0.17654,0.005895,0.164588,0.000196,0.204229,0.177948,0.199499,5.8e-05,0.001904
std,16.973011,25.741421,33.875442,29.772311,33.471216,0.499366,0.499366,0.453495,0.457607,0.49221,...,0.237347,0.381282,0.076556,0.37081,0.014003,0.40314,0.382471,0.399626,0.007595,0.043589
min,22.0,0.42,0.06,0.5,0.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.84,5.64,10.16,43.29,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,55.0,91.75,27.22,30.52,86.11,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,68.0,98.9,72.84,64.6,97.42,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,100.0,99.84,99.53,99.42,99.99,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Check the class balance of the target -- very important; a good amount of
# time was spent figuring this out.
y.value_counts()

Low_chance     80369
High_Chance     6308
Name: Pscore, dtype: int64

In [15]:
# Split features and target into train and test sets.
# stratify=y keeps the class proportions of the target the same in both
# splits; random_state is fixed so the split is reproducible.
# (train_test_split is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [16]:
# Train the model with a Balanced Random Forest Classifier.
# random_state is fixed so the fit is reproducible.
# (BalancedRandomForestClassifier is imported in the notebook's top import cell.)
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [17]:
# Calculate the balanced accuracy score of the predictions on the test split.
# balanced_accuracy_score is already imported in the top import cell, so the
# duplicate import that used to live here has been removed.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7501312776376154

In [18]:
# Display the raw confusion matrix for the test-split predictions.
# confusion_matrix is already imported in the top import cell, so the
# duplicate import that used to live here has been removed.
confusion_matrix(y_test, y_pred)

array([[ 1230,   347],
       [ 5620, 14473]])

In [19]:
# Print the imbalanced classification report for the test-split predictions.
# classification_report_imbalanced is already imported in the top import cell,
# so the duplicate import that used to live here has been removed.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

High_Chance       0.18      0.78      0.72      0.29      0.75      0.57      1577
 Low_chance       0.98      0.72      0.78      0.83      0.75      0.56     20093

avg / total       0.92      0.72      0.78      0.79      0.75      0.56     21670



In [20]:
# Rank the features from most to least important according to the fitted model
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
important_features = importance_by_feature.sort_values(ascending=False)

# Temporarily raise the row limit so the printed ranking is not truncated early
with pd.option_context('display.max_rows', 100):
    print(important_features)

Liberal Ideology              0.166444
Turnout Score                 0.165865
Party_Other                   0.130845
2020 marriage                 0.123673
Kids in HH                    0.118988
Age                           0.102338
Party_Republican              0.053483
Party_Democrat                0.031250
Ethnicity_Caucasian           0.011881
Zip_85286                     0.009893
Zip_85225                     0.009677
Zip_85249                     0.009488
Zip_85233                     0.009346
Zip_85248                     0.009182
Sex_M                         0.008397
Ethnicity_Hispanic            0.008242
Sex_F                         0.007857
Zip_85224                     0.006485
Ethnicity_Uncoded             0.005312
Ethnicity_Asian               0.003866
Zip_85210                     0.002684
Zip_85226                     0.001862
Ethnicity_African-American    0.001707
Zip_85297                     0.000589
Ethnicity_Native American     0.000505
Zip_85142                

In [21]:
# Build a labelled confusion matrix.
# Row/column labels are derived from the fitted model's classes and the same
# list is passed to confusion_matrix, so the labels always match the matrix
# order (the previous hand-typed labels were misspelled and assumed the order).
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted high_Chance,Predicted low_Chance
Actual High_chane,1230,347
Actual low_Chance,5620,14473
