In [1]:
# Import the libraries and dependencies used throughout the notebook
%matplotlib inline

import warnings
from collections import Counter
from urllib.parse import quote

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler
from path import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import db_password

In [2]:
# Build the PostgreSQL connection URL, create the engine and load the dataset.
# The password is percent-encoded before being placed in the URL: characters
# such as '@', ':' or '/' would otherwise be parsed as URL structure and the
# connection would fail or target the wrong host.
# NOTE(review): assumes config.db_password holds the raw (not pre-encoded) password.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/Arizona_Elections"
engine = create_engine(db_string)
df_voters = pd.read_sql('SELECT * from machinelearning', engine)

In [3]:
# Preview the first ten rows of the loaded table
df_voters.iloc[:10]

Unnamed: 0,voter_id,Party,Sex,Age,Ethnicity,Voter Score,Turnout Score,Kids in HH,Liberal Ideology,Zip,Swing Voter
0,15777,Republican,M,72,Caucasian,8.15,99.22,6.75,4.14,85224,False
1,22507,Democrat,M,56,Hispanic,91.74,99.13,34.25,70.85,85286,False
2,24594,Democrat,F,70,Caucasian,98.02,98.35,13.59,94.71,85248,False
3,40503,Democrat,M,70,Caucasian,93.63,97.76,13.28,79.04,85225,False
4,48534,Other,F,66,Caucasian,91.58,98.97,7.58,78.62,85249,False
5,58057,Republican,F,40,Caucasian,8.0,91.88,91.41,12.05,85286,False
6,65093,Republican,M,65,Uncoded,7.67,98.95,18.53,8.35,85225,False
7,118128,Republican,M,66,Caucasian,11.07,87.95,14.9,6.01,85225,False
8,118720,Democrat,M,51,Caucasian,97.2,98.53,57.13,77.11,85249,False
9,121001,Democrat,M,71,Caucasian,93.85,98.97,4.66,73.14,85248,False


In [4]:
# Drop the columns that are not used as model features.
# The result is reassigned (no inplace mutation) and errors='ignore' is set so
# the cell can be re-run: once the columns are gone a plain drop would raise
# KeyError.
df_voters = df_voters.drop(
    columns=['Voter Score', 'voter_id', 'Turnout Score', 'Kids in HH', 'Liberal Ideology'],
    errors='ignore',
)
df_voters


Unnamed: 0,Party,Sex,Age,Ethnicity,Zip,Swing Voter
0,Republican,M,72,Caucasian,85224,False
1,Democrat,M,56,Hispanic,85286,False
2,Democrat,F,70,Caucasian,85248,False
3,Democrat,M,70,Caucasian,85225,False
4,Other,F,66,Caucasian,85249,False
...,...,...,...,...,...,...
86672,Democrat,M,22,Caucasian,85225,False
86673,Other,M,27,Caucasian,85224,False
86674,Other,F,56,Caucasian,85248,False
86675,Democrat,F,76,Caucasian,85248,False


In [5]:
# Relabel the target: 'False' -> 'Low_Chance', 'True' -> 'High_Chance'.
# The mapping is applied to the "Swing Voter" column only, so an identical
# string in any other column is never rewritten by accident, and both labels
# are handled in a single pass.
swing_labels = {'False': 'Low_Chance', 'True': 'High_Chance'}
df_voters['Swing Voter'] = df_voters['Swing Voter'].replace(swing_labels)

# Renumber the rows 0..n-1, discarding the previous index
df_voters = df_voters.reset_index(drop=True)

df_voters

Unnamed: 0,Party,Sex,Age,Ethnicity,Zip,Swing Voter
0,Republican,M,72,Caucasian,85224,Low_Chance
1,Democrat,M,56,Hispanic,85286,Low_Chance
2,Democrat,F,70,Caucasian,85248,Low_Chance
3,Democrat,M,70,Caucasian,85225,Low_Chance
4,Other,F,66,Caucasian,85249,Low_Chance
...,...,...,...,...,...,...
86672,Democrat,M,22,Caucasian,85225,Low_Chance
86673,Other,M,27,Caucasian,85224,Low_Chance
86674,Other,F,56,Caucasian,85248,Low_Chance
86675,Democrat,F,76,Caucasian,85248,Low_Chance


In [6]:
# Bucket raw ages into ordinal age groups.
# Conditions are evaluated in order; a row takes the value of the first
# condition it satisfies.
conditions = [
    (df_voters['Age'] >= 18) & (df_voters['Age'] <= 24),
    (df_voters['Age'] >= 25) & (df_voters['Age'] <= 34),
    (df_voters['Age'] >= 35) & (df_voters['Age'] <= 44),
    (df_voters['Age'] >= 45) & (df_voters['Age'] <= 54),
    (df_voters['Age'] >= 55) & (df_voters['Age'] <= 64),
    (df_voters['Age'] >= 65),
]

# Integer group codes, one per condition. Integers (not strings) are used so
# the choices share a dtype with the default below -- string choices mixed
# with np.select's integer default have no common dtype, which recent NumPy
# versions reject -- and so the model receives a numeric column.
values = [1, 2, 3, 4, 5, 6]

# Rows matching no condition (e.g. Age below 18 or missing) get code 0.
# NOTE: this overwrites 'Age' with the group code, so running the cell a second
# time on already-coded data would map every row to 0.
df_voters['Age'] = np.select(conditions, values, default=0)

# Display updated DataFrame
df_voters

Unnamed: 0,Party,Sex,Age,Ethnicity,Zip,Swing Voter
0,Republican,M,6,Caucasian,85224,Low_Chance
1,Democrat,M,5,Hispanic,85286,Low_Chance
2,Democrat,F,6,Caucasian,85248,Low_Chance
3,Democrat,M,6,Caucasian,85225,Low_Chance
4,Other,F,6,Caucasian,85249,Low_Chance
...,...,...,...,...,...,...
86672,Democrat,M,1,Caucasian,85225,Low_Chance
86673,Other,M,2,Caucasian,85224,Low_Chance
86674,Other,F,5,Caucasian,85248,Low_Chance
86675,Democrat,F,6,Caucasian,85248,Low_Chance


In [7]:
# Sanity check on the age coding: number of voters that fell into each age group
age_group_counts = df_voters['Age'].value_counts()
print(age_group_counts)

6    26608
4    17753
5    17248
3    13065
2    10683
1     1320
Name: Age, dtype: int64


In [8]:
# One-hot encode the categorical columns so that every feature is numeric.
# The target column is removed first so it never ends up in the feature matrix.
categorical_columns = ["Party", "Sex", "Ethnicity", "Zip"]
feature_frame = df_voters.drop(columns="Swing Voter")
X = pd.get_dummies(feature_frame, columns=categorical_columns)

# Target labels
y = df_voters["Swing Voter"]

X.head()

Unnamed: 0,Age,Party_Democrat,Party_Other,Party_Republican,Sex_F,Sex_M,Ethnicity_African-American,Ethnicity_Asian,Ethnicity_Caucasian,Ethnicity_Hispanic,...,Zip_85224,Zip_85225,Zip_85226,Zip_85233,Zip_85234,Zip_85248,Zip_85249,Zip_85286,Zip_85296,Zip_85297
0,6,0,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,5,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,6,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,6,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,6,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Verify the target was selected correctly by counting rows per class
class_counts = y.value_counts()
class_counts

Low_Chance     76246
High_Chance    10431
Name: Swing Voter, dtype: int64

In [10]:
# Split the data into training and testing samples.
# stratify=y keeps the class proportions of the target the same in both
# splits; random_state makes the shuffle reproducible.
# (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

# Check the class balance of each split
print(Counter(y_train))
print(Counter(y_test))

Counter({'Low_Chance': 57184, 'High_Chance': 7823})
Counter({'Low_Chance': 19062, 'High_Chance': 2608})


## Random undersampling with logistic regression

In [11]:
# Randomly undersample the training split so that both classes are equally
# represented, then confirm the resulting class counts.
# Only the training data is resampled; the test split keeps its original
# class distribution.
# (RandomUnderSampler is imported in the notebook's import cell.)
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'High_Chance': 7823, 'Low_Chance': 7823})

In [12]:
# Fit a logistic regression classifier on the undersampled training data.
# (LogisticRegression is imported in the notebook's import cell.)
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [13]:
# Predict on the test split and compute the confusion matrix
# (rows = actual class, columns = predicted class).
# confusion_matrix is already imported in the notebook's import cell, so it is
# not re-imported here.
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 2018,   590],
       [ 4518, 14544]])

In [14]:
# Balanced accuracy of the test-set predictions.
# balanced_accuracy_score is already imported in the notebook's import cell,
# so it is not re-imported here.
balanced_accuracy_score(y_test, y_pred)

0.7683784766274468

In [15]:
# Print the imbalanced-classification report for the test-set predictions.
# classification_report_imbalanced is already imported in the notebook's
# import cell, so it is not re-imported here.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

High_Chance       0.31      0.77      0.76      0.44      0.77      0.59      2608
 Low_Chance       0.96      0.76      0.77      0.85      0.77      0.59     19062

avg / total       0.88      0.76      0.77      0.80      0.77      0.59     21670



In [16]:
# Display the confusion matrix as a labelled DataFrame.
# The row/column labels are derived from model.classes_ and the same class
# order is passed to confusion_matrix, so the labels always match the matrix
# layout instead of relying on hand-typed strings.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted high_Chance,Predicted low_Chance
Actual High_chane,2018,590
Actual low_Chance,4518,14544


In [18]:
# Alternative view of the results: the model's prediction next to the true
# label for every row of the test split (indexed like y_test).
predictions = model.predict(X_test)
results_view = y_test.to_frame(name="Actual")
results_view.insert(0, "Prediction", predictions)
results_view

Unnamed: 0,Prediction,Actual
57563,High_Chance,High_Chance
74145,Low_Chance,Low_Chance
7373,Low_Chance,Low_Chance
55763,Low_Chance,Low_Chance
16687,Low_Chance,Low_Chance
...,...,...
2622,High_Chance,Low_Chance
61884,Low_Chance,Low_Chance
5173,Low_Chance,Low_Chance
6507,Low_Chance,Low_Chance
