In [1]:
#importing possible libraries and dependencies
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# import os
import pandas as pd
from path import Path
from config import db_password
from sqlalchemy import create_engine
from sqlalchemy import inspect
import psycopg2


In [2]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Arizona_Elections"

In [3]:
engine = create_engine(db_string)

In [4]:
Arizona_Elections_df = pd.read_sql('SELECT * from district_9', engine)

In [5]:
Arizona_Elections_df.head()

Unnamed: 0,Voter File VANID,PartyName,Sex,Age,Ethnicity,PartisanScore,NatTO,Partisanship,2020:CivKidsinHH,2020:CivLibIdeo,2020:CivMarriage,Zip5
0,14282,Democrat,F,65,Caucasian,95.0,98.85,98.2,10.6,62.81,38.42,85224
1,54358,Republican,M,48,Caucasian,8.0,74.6,3.3,74.27,17.17,81.75,85204
2,54724,Democrat,M,62,Hispanic,86.78,21.81,80.5,66.87,36.31,80.53,85203
3,69453,Other,M,55,Caucasian,76.36,87.51,29.3,84.56,27.37,99.13,85204
4,69454,Democrat,F,53,Caucasian,79.28,77.65,97.1,95.67,52.75,98.01,85204


In [6]:
# seeing all data types
Arizona_Elections_df.dtypes

Voter File VANID      int64
PartyName            object
Sex                  object
Age                   int64
Ethnicity            object
PartisanScore       float64
NatTO               float64
Partisanship        float64
2020:CivKidsinHH    float64
2020:CivLibIdeo     float64
2020:CivMarriage    float64
Zip5                  int64
dtype: object

In [7]:
#drop any null or nan in the data frame 
Arizona_Elections_df.dropna(axis = 0, how = "any", thresh = None, subset = None, inplace=False)
Arizona_Elections_df

Unnamed: 0,Voter File VANID,PartyName,Sex,Age,Ethnicity,PartisanScore,NatTO,Partisanship,2020:CivKidsinHH,2020:CivLibIdeo,2020:CivMarriage,Zip5
0,14282,Democrat,F,65,Caucasian,95.00,98.85,98.2,10.60,62.81,38.42,85224
1,54358,Republican,M,48,Caucasian,8.00,74.60,3.3,74.27,17.17,81.75,85204
2,54724,Democrat,M,62,Hispanic,86.78,21.81,80.5,66.87,36.31,80.53,85203
3,69453,Other,M,55,Caucasian,76.36,87.51,29.3,84.56,27.37,99.13,85204
4,69454,Democrat,F,53,Caucasian,79.28,77.65,97.1,95.67,52.75,98.01,85204
...,...,...,...,...,...,...,...,...,...,...,...,...
73054,8942692,Other,F,39,Caucasian,40.66,80.30,7.8,71.10,37.48,96.63,85225
73055,8943183,Other,M,32,Hispanic,63.18,41.69,62.4,4.15,49.98,4.53,85204
73056,8943401,Republican,M,24,Caucasian,13.15,34.99,44.9,1.77,43.65,2.60,85202
73057,8943588,Other,M,30,Caucasian,39.17,34.78,24.1,5.10,46.04,1.74,85204


In [8]:
#converting label columns from txt to numerical model can only work with numberical numbers
X = pd.get_dummies(Arizona_Elections_df, columns=["Sex","PartyName",]).drop('PartisanScore', axis=1)

# Create our target
y = Arizona_Elections_df['PartisanScore']
X.head()

Unnamed: 0,Voter File VANID,Age,Ethnicity,NatTO,Partisanship,2020:CivKidsinHH,2020:CivLibIdeo,2020:CivMarriage,Zip5,Sex_F,Sex_M,PartyName_Democrat,PartyName_Other,PartyName_Republican
0,14282,65,Caucasian,98.85,98.2,10.6,62.81,38.42,85224,1,0,1,0,0
1,54358,48,Caucasian,74.6,3.3,74.27,17.17,81.75,85204,0,1,0,0,1
2,54724,62,Hispanic,21.81,80.5,66.87,36.31,80.53,85203,0,1,1,0,0
3,69453,55,Caucasian,87.51,29.3,84.56,27.37,99.13,85204,0,1,0,1,0
4,69454,53,Caucasian,77.65,97.1,95.67,52.75,98.01,85204,1,0,1,0,0


In [9]:
X.describe()

Unnamed: 0,Voter File VANID,Age,NatTO,Partisanship,2020:CivKidsinHH,2020:CivLibIdeo,2020:CivMarriage,Zip5,Sex_F,Sex_M,PartyName_Democrat,PartyName_Other,PartyName_Republican
count,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0,73059.0
mean,3875704.0,52.131264,73.451643,45.913013,30.592992,39.82396,52.777277,85210.656962,0.524932,0.475068,0.31428,0.312583,0.373137
std,3006246.0,17.740461,29.070357,41.90181,29.524888,27.964183,35.184707,17.966252,0.499381,0.499381,0.464232,0.463549,0.483641
min,32.0,22.0,0.32,0.2,0.08,0.7,0.1,85201.0,0.0,0.0,0.0,0.0,0.0
25%,625319.5,37.0,49.36,2.0,5.04,13.03,18.31,85202.0,0.0,0.0,0.0,0.0,0.0
50%,3074600.0,52.0,86.62,38.7,18.03,36.46,52.34,85204.0,1.0,0.0,0.0,0.0,0.0
75%,6088556.0,66.0,98.53,94.6,54.17,63.61,90.48,85210.0,1.0,1.0,1.0,1.0,1.0
max,8975425.0,100.0,99.79,99.6,99.54,99.28,99.97,85296.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# check the balance of our taget very important can be hindering
X.value_counts()

Voter File VANID  Age  Ethnicity         NatTO  Partisanship  2020:CivKidsinHH  2020:CivLibIdeo  2020:CivMarriage  Zip5   Sex_F  Sex_M  PartyName_Democrat  PartyName_Other  PartyName_Republican
32                78   Caucasian         71.41  0.5           1.88              15.97            27.29             85203  0      1      0                   0                1                       1
5562765           55   Hispanic          98.75  65.9          87.06             8.60             96.53             85204  1      0      0                   1                0                       1
5563015           40   Hispanic          63.89  1.4           76.25             12.83            82.54             85204  0      1      0                   0                1                       1
5562910           34   African-American  30.06  55.8          14.02             50.91            15.02             85210  0      1      0                   1                0                       1
5562857   

In [11]:
# check the balance of our target very important spent a good time trying to figurere out 
y.value_counts()

4.87     57
4.98     56
5.74     53
5.50     51
5.18     51
         ..
58.10     1
52.37     1
98.32     1
98.39     1
49.54     1
Name: PartisanScore, Length: 9350, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [None]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42) 

In [None]:
# importing module
from sklearn.linear_model import LinearRegression
# creating our model and assigning its function
model = LinearRegression()
# fitting the training data
model .fit(X,y)

In [None]:
y_prediction =  model.predict(X_test)
y_prediction

In [None]:
# importing r2_score module
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fitting our model with all our features in X
model.fit(X, y)

score = model.score(X, y)
print(f"R2 Score: {score}")

In [None]:
#importing possible libraries and dependencies
import pandas as pd
from path import Path
from sqlalchemy import create_engine
from config import db_password