In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
voter20 = pd.read_excel("voter20.xlsx")
inter20 = pd.read_excel("inter20.xlsx")

In [3]:
voter20.head()

Unnamed: 0,State,Percent registered\n(Total),Percent voted\n(Total),Turnout,Label (>mean 0.915)
0,ALABAMA,67.0,59.6,0.889552,0
1,ALASKA,72.6,62.4,0.859504,0
2,ARIZONA,68.8,64.7,0.940407,1
3,ARKANSAS,59.6,51.9,0.870805,0
4,CALIFORNIA,59.3,55.7,0.939292,1


In [4]:
inter20.head()

Unnamed: 0,State,internet access,Poverty,Margin,"BG(Y=1, N=0)"
0,ALABAMA,78.3,14.6,25.5,0
1,ALASKA,82.3,12.2,10.1,0
2,ARIZONA,78.9,11.2,0.3,1
3,ARKANSAS,81.8,14.7,27.6,0
4,CALIFORNIA,81.2,11.0,29.2,0


In [5]:
# dummy variables for State
inter20 = pd.get_dummies(inter20, columns = ['State'])
voter20 = pd.get_dummies(voter20, columns = ['State'])

In [6]:
#Original Model
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod1 = LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)

# majority class classifier
print(y_test.mean())
print(mod1.score(X_test, y_test))

0.75
0.6875


In [7]:
#Refining measures
inter = inter20.drop(labels = ['Poverty', 'Margin', 'BG(Y=1, N=0)'], axis = 1)

y = voter20['Label (>mean 0.915)']
X = inter.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod2= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod2.score(X_test, y_test))

0.75
0.75


In [8]:
pov = inter20.drop(labels = ['internet access'], axis = 1)

y = voter20['Label (>mean 0.915)']
X = pov.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod3= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod3.score(X_test, y_test))

0.75
0.4375


In [9]:
dict(zip(mod2.feature_names_in_, mod2.coef_[0]))

{'State_ALABAMA': -0.4110776362392723,
 'State_ALASKA': -0.4110776362392723,
 'State_ARIZONA': 0.0,
 'State_ARKANSAS': 0.0,
 'State_CALIFORNIA': 0.3911047334103042,
 'State_COLORADO': 0.3911047334103042,
 'State_CONNECTICUT': -0.4110776362392723,
 'State_DELAWARE': -0.4110776362392723,
 'State_DISTRICT OF COLUMBIA': 0.3911047334103042,
 'State_FLORIDA': 0.3911047334103042,
 'State_GEORGIA': 0.3911047334103042,
 'State_HAWAII': 0.3911047334103042,
 'State_IDAHO': 0.3911047334103042,
 'State_ILLINOIS': 0.3911047334103042,
 'State_INDIANA': -0.4110776362392723,
 'State_IOWA': 0.3911047334103042,
 'State_KANSAS': 0.3911047334103042,
 'State_KENTUCKY': -0.4110776362392723,
 'State_LOUISIANA': -0.4110776362392723,
 'State_MAINE': 0.3911047334103042,
 'State_MARYLAND': 0.3911047334103042,
 'State_MASSACHUSETTS': 0.0,
 'State_MICHIGAN': 0.0,
 'State_MINNESOTA': 0.3911047334103042,
 'State_MISSISSIPPI': -0.4110776362392723,
 'State_MISSOURI': -0.4110776362392723,
 'State_MONTANA': 0.0,
 'State_

In [10]:
#Feature Importance W/ Logistic Regression

#with internet access only
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,2:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.75
0.4375


In [11]:
#with poverty only
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,3:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.75
0.375


In [12]:
#with winning margin only
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,4:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.75
0.75


In [13]:
#with battle ground data only
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,5:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

mod4= LogisticRegression(random_state = 1, solver = 'liblinear').fit(X_train, y_train)
print(y_test.mean())
print(mod4.score(X_test, y_test))

0.75
0.75


In [14]:
#Feature importance with Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [15]:
# Full model
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

fullforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

fullforest.score(X_test, y_test)

0.75

In [17]:
#without poverty & MR & BG data 
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,1:2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

interforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

interforest.score(X_test, y_test)

0.8125

In [18]:
#without internet data
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,3:6]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

povforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

povforest.score(X_test, y_test)

0.375

In [19]:
#without MR and BG data
y = voter20['Label (>mean 0.915)']
X = inter20.iloc[:,1:3]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

povforest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

povforest.score(X_test, y_test)

0.625