In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [2]:
def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold, random_state = 42).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y.iloc[train]) # fit
        result += score_func(clf.predict(x[test]), y.iloc[test]) # evaluate score function on held-out data
    return result / nfold # average

In [3]:
df = pd.read_csv('clustered.csv')

In [4]:
variables = [col for col in df.columns if (col != 'four_cluster_label' and col!= "five_cluster_label" and col != 'Unnamed: 0')]
             
print(variables)

['gender', 'legalstatus', 'borderpatrol', 'legalstatusHS', 'deport', 'minwage12', 'faminc', 'mandatorymin', 'bodycamera', 'threestrikes', 'pew_bornagain', 'increasepolice', 'fuelefficiency', 'EPACO2', 'abortionchoice', 'abortioncoverage', 'abortion20wks', 'repealACA', 'banmostabortion', 'child18', 'TPP', 'NCLB', 'primary', 'Iransanctions', 'infraspending', 'backgroundcheck', 'medicarereform', 'concealedcarry', 'gunregistry', 'banassault', 'gaymarriage', 'hispanic', 'investor', 'trans', 'votereg_post', 'militaryoil', 'militaryterror', 'militarycivilwar', 'militarydemocracy', 'militaryally', 'militaryUN', 'polmeeting', 'polsign', 'campaignwork', 'campaigndonate', 'donateblood', 'edloan', 'runoffice', 'Obama', 'Romney', 'age', 'whiteadvantage_Disagree', 'whiteadvantage_Neutral', 'angryracism_Disagree', 'angryracism_Neutral', 'racismrare_Disagree', 'racismrare_Neutral', 'fearrace_Disagree', 'fearrace_Neutral', 'statewelfare_Increase', 'statewelfare_Maintain', 'stateedu_Increase', 'stateedu

In [5]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df[variables].values, df['five_cluster_label'], random_state = 42, test_size = 0.2)

In [6]:
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
cv_score(rf_clf, Xtrain, ytrain)

0.83590808775801639

In [7]:
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)

rf_clf.fit(Xtrain, ytrain)
print(classification_report(ytrain, rf_clf.predict(Xtrain)))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00      6967
          1       0.99      1.00      1.00      8095
          2       1.00      1.00      1.00      7561
          3       1.00      1.00      1.00     10671
          4       1.00      0.99      1.00      5221

avg / total       1.00      1.00      1.00     38515



this is pretty overfit

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
param_grid = {'max_depth':[10,20,100,500], 'min_impurity_decrease':[1e-7,1e-6,1e-5, 1e-4, 1e-3, 1e-2]}
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
rf_clf_cv = GridSearchCV(rf_clf, param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 20, 'min_impurity_decrease': 0.0001}


In [10]:
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42, max_depth = 20, min_impurity_decrease = 0.0001)

cv_score(rf_clf, Xtrain, ytrain)

0.8445281059327534

CV score is slightly better after tuning. Narrowing down the grid search:

In [11]:
param_grid = {'max_depth':[10,20,25,30,50,100], 'min_impurity_decrease':[1e-6,5e-5,1e-4,5e-3,1e-3]}
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
rf_clf_cv = GridSearchCV(rf_clf, param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 20, 'min_impurity_decrease': 0.0001}


In [19]:
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42, max_depth = 20, min_impurity_decrease = 0.0001)

rf_clf.fit(Xtrain, ytrain)
print(classification_report(ytrain, rf_clf.predict(Xtrain)))

             precision    recall  f1-score   support

          0       0.84      0.85      0.84      6967
          1       0.87      0.83      0.85      8095
          2       0.93      0.95      0.94      7561
          3       0.95      0.90      0.93     10671
          4       0.78      0.89      0.83      5221

avg / total       0.89      0.88      0.88     38515



Recall is between 0.83 and 0.95 for all four classes. Precision between 0.78 and 0.95. 

In [20]:
feature_importances = list(rf_clf.feature_importances_)

ranked = []
for var in zip(variables, feature_importances):
    ranked.append(var)
    
ranked = sorted(ranked, key = lambda x: x[1], reverse = True)
ranked

[('pres_Trump', 0.080483152008056558),
 ('repealACA', 0.062965601868580426),
 ('Obama', 0.059972521987492131),
 ('EPACO2', 0.05909675655173835),
 ('Party_Republican', 0.034514679342161181),
 ('religimp_Very important', 0.034406689489763401),
 ('whiteadvantage_Disagree', 0.031751399814640688),
 ('minwage12', 0.029711710814074789),
 ('primary', 0.029326002987890738),
 ('pew_bornagain', 0.028590703828110658),
 ('legalstatusHS', 0.02656161928276991),
 ('concealedcarry', 0.025676903616368825),
 ('abortionchoice', 0.025564543390386701),
 ('Romney', 0.025323456702262652),
 ('abortioncoverage', 0.025260245308892072),
 ('banmostabortion', 0.022323112592790462),
 ('fuelefficiency', 0.020690255873077764),
 ('pres_None', 0.020380286041267216),
 ('TPP', 0.02018830898400762),
 ('gaymarriage', 0.019192718361823603),
 ('banassault', 0.018292302720850563),
 ('race_White', 0.018106377183282878),
 ('abortion20wks', 0.015952118614262937),
 ('campaigndonate', 0.01584349188262666),
 ('race_Black', 0.0152976

2012 vote is a major factor, interestingly. Among issue polling, EPACO2, repealACA, and minwage12 were the most important. Religion importance and whether they were born again were also fairly important.

In [15]:
log_clf = LogisticRegression(class_weight = 'balanced')

cv_score(log_clf, Xtrain, ytrain)

0.90032454887706082

Quite good out of the box.

In [16]:
coefficients = log_clf.coef_

print 10 highest value coefficients for each cluster:

In [22]:
for i in range(0,5):
    cluster_coefs = []
    for item, item2 in zip(variables, coefficients[i]):
        cluster_coefs.append((item, item2))
    cluster_coefs = sorted(cluster_coefs, key = lambda x: abs(x[1]), reverse = True)
    print("Cluster",i)
    for j in range(15):
        print(cluster_coefs[j])

Cluster 0
('pew_bornagain', -1.2124189510890406)
('gaymarriage', 1.1819284387995026)
('Romney', -1.1631166142745784)
('ideo_Very liberal', -1.1256420694441285)
('religimp_Very important', -1.1059893211637961)
('campaigndonate', -1.1055431367141773)
('primary', -1.0589939980134271)
('pres_None', 1.0576775345633316)
('pres_McMullin', 0.98970382761133835)
('pres_Trump', -0.9591267048395048)
('pres_Johnson', 0.90069988269253654)
('abortionchoice', 0.82575703536006628)
('age', -0.81955714560231052)
('race_Black', -0.73094096859164104)
('ideo_Moderate or Not sure', 0.72187514242141004)
Cluster 1
('pres_McMullin', 2.0109643600139688)
('pres_Trump', 1.9653523292674162)
('pres_Other', 1.7443398627464297)
('pres_Johnson', 1.7188624605730916)
('Party_Republican', 1.0388775278809155)
('campaigndonate', -0.92593528652082602)
('Obama', -0.92293662062941373)
('backgroundcheck', 0.88982118516084352)
('repealACA', 0.82298233012653688)
('ideo_Very conservative', -0.8132351569075954)
('race_Black', -0.76