In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
voter16 = pd.read_excel("voter16.xlsx")
inter16 = pd.read_excel("inter16.xlsx")

In [9]:
voter16.head()

Unnamed: 0,State,Percent registered\n(Total),Percent voted\n(Total),Turnout,Label (>mean 0.872)
0,ALABAMA,67.9617,56.3712,0.829455,0
1,ALASKA,69.0653,59.4061,0.860144,0
2,ARIZONA,60.5332,53.288,0.88031,1
3,ARKANSAS,65.6858,56.0106,0.852705,0
4,CALIFORNIA,53.8432,48.2247,0.895651,1


In [10]:
inter16.head()


Unnamed: 0,State,Households with computer,internet access.,Poverty,Winning Margin,"BG(Y=1,N=0)"
0,Alabama .........................................,83.909992,75.28483,16.3,27.7,0
1,Alaska ..........................................,94.433488,86.695671,10.9,14.7,1
2,Arizona .........................................,91.044885,83.804423,16.6,3.5,0
3,Arkansas ........................................,84.479373,71.134348,16.1,26.9,0
4,California ......................................,92.120089,85.606221,13.9,30.1,1


In [11]:
# dummy variables for State
inter16 = pd.get_dummies(inter16, columns = ['State'])
voter16 = pd.get_dummies(voter16, columns = ['State'])

In [12]:
#Original Model
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod1 = LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)

# majority class classifier
print(y_test.mean())
print(mod1.score(X_test, y_test))

0.625
0.6875


In [13]:
#Refining measures
inter = inter16.drop(labels = ['Poverty', 'Winning Margin', 'BG(Y=1,N=0)'], axis = 1)

y = voter16['Label (>mean 0.872)']
X = inter.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod2= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod2.score(X_test, y_test))

0.625
0.375


In [15]:
pov = inter16.drop(labels = ['Households with computer', 'internet access.'], axis = 1)

y = voter16['Label (>mean 0.872)']
X = pov.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod3= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod3.score(X_test, y_test))

0.625
0.375


In [16]:
dict(zip(mod2.feature_names_in_, mod2.coef_[0]))

{'internet access.': -0.002268938499832133,
 'State_Alabama ...................................................': -0.311685413783106,
 'State_Alaska ...................................................': -0.3060141747924848,
 'State_Arizona ....................................................': 0.0,
 'State_Arkansas ..................................................': 0.0,
 'State_California ........................................': 0.5013907273173555,
 'State_Colorado ..................................................': 0.5021603311265305,
 'State_Connecticut ............................................': 0.5008817179101951,
 'State_Delaware ............................................................................': -0.30742929581050943,
 'State_District of Columbia ................................................................................. . .': 0.49801911014175554,
 'State_Florida ...........................................................': 0.4991515677535497,
 'State_Geor

In [17]:
#Feature Importance W/ Logistic Regression

#with computer only
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,2:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.625
0.5625


In [18]:
#with internet access only
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,3:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.625
0.375


In [19]:
#with Poverty(2-year average) only
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,4:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.625
0.375


In [21]:
#with Winning Margin only
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,5:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.625
0.375


In [22]:
#with Battleground State only
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,6:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.625
0.375


In [23]:
#Feature importance with Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [24]:
# Full model
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

fullforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

fullforest.score(X_test, y_test)


0.5

In [27]:
#without poverty & MR & BG data 
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,1:3]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

interforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

interforest.score(X_test, y_test)

0.625

In [26]:
#without internet data
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,4:6]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

povforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

povforest.score(X_test, y_test)

0.4375

In [28]:
#without MR and BG data
y = voter16['Label (>mean 0.872)']
X = inter16.iloc[:,1:4]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

povforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

povforest.score(X_test, y_test)

0.625