In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [2]:
def cv_score(clf, x, y, score_func=accuracy_score, nfold=5):
    """Return the mean of ``score_func`` over ``nfold`` cross-validation folds.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit on every
        fold, so it is left fitted on the last training split.
    x : array-like
        Feature matrix; must support positional indexing with an integer
        index array (``x[train]``), e.g. a numpy array.
    y : pandas object
        Labels; must support ``.iloc`` positional indexing (e.g. a Series).
    score_func : callable, default ``accuracy_score``
        Metric called as ``score_func(y_true, y_pred)``.
    nfold : int, default 5
        Number of folds.

    Returns
    -------
    float
        Sum of the per-fold scores divided by ``nfold``.
    """
    # Folds are contiguous and unshuffled. No random_state is passed: it has
    # no effect without shuffle=True, and recent scikit-learn versions raise
    # a ValueError for that combination.
    folds = KFold(n_splits=nfold)
    total = 0
    for train, test in folds.split(x):
        clf.fit(x[train], y.iloc[train])  # fit on the training part of the fold
        # scikit-learn metrics expect (y_true, y_pred) in that order; the
        # order matters for asymmetric metrics such as precision or recall.
        total += score_func(y.iloc[test], clf.predict(x[test]))
    return total / nfold  # average over folds

In [3]:
# Read the clustered dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='clustered.csv')

In [4]:
# Feature columns: every column except the cluster label (used as the target
# below) and the 'Unnamed: 0' column.
variables = [
    col for col in df.columns
    if col not in ('four_cluster_label', 'Unnamed: 0')
]

print(variables)

['gender', 'CC16_331_1', 'CC16_331_2', 'CC16_331_3', 'CC16_331_7', 'milstat_1', 'milstat_2', 'milstat_3', 'milstat_4', 'CC16_351K', 'faminc', 'CC16_334a', 'CC16_334b', 'CC16_334d', 'pew_bornagain', 'CC16_334c', 'CC16_333d', 'CC16_333b', 'CC16_333a', 'CC16_332a', 'CC16_332d', 'CC16_332e', 'CC16_333c', 'CC16_332c', 'CC16_332f', 'CC16_351I', 'CC16_332b', 'child18', 'CC16_351B', 'CC16_351E', 'CC16_327', 'CC16_351G', 'CC16_351F', 'CC16_330a', 'CC16_351H', 'CC16_330e', 'CC16_330b', 'CC16_330d', 'CC16_335', 'hispanic', 'investor', 'trans', 'votereg_post', 'CC16_414_1', 'CC16_414_2', 'CC16_414_3', 'CC16_414_4', 'CC16_414_5', 'CC16_414_6', 'CC16_417a_1', 'CC16_417a_2', 'CC16_417a_3', 'CC16_417a_4', 'CC16_417a_5', 'edloan', 'CC16_418a', 'voted', 'age', 'CC16_422d_Disagree', 'CC16_422d_Neutral', 'CC16_422c_Disagree', 'CC16_422c_Neutral', 'CC16_422f_Disagree', 'CC16_422f_Neutral', 'CC16_422e_Disagree', 'CC16_422e_Neutral', 'CC16_426_1_Increase', 'CC16_426_1_Maintain', 'CC16_426_2_Increase', 'CC16_

In [5]:
# Hold out 20% of the rows as a test set. Features are passed as a numpy
# array; labels stay a pandas Series.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df[variables].values,
    df['four_cluster_label'],
    test_size = 0.2,
    random_state = 42,
)

In [6]:
# Baseline: random forest with balanced class weights, otherwise default
# hyperparameters, scored by 5-fold cross-validation on the training split.
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
cv_score(clf = rf_clf, x = Xtrain, y = ytrain)

0.88834921084628926

In [7]:
# Fit the baseline forest on the full training split and report per-class
# metrics on that same training data.
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
rf_clf.fit(Xtrain, ytrain)

print(classification_report(y_true = ytrain, y_pred = rf_clf.predict(Xtrain)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6789
          1       1.00      1.00      1.00     11323
          2       1.00      1.00      1.00      9349
          3       1.00      1.00      1.00     11043

avg / total       1.00      1.00      1.00     38504



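Damn, overfitting! The forest classifies the training set perfectly (every metric is 1.00), while its cross-validated accuracy is only about 0.89.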

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
# Coarse grid search over tree depth and the impurity-decrease threshold,
# using 5-fold cross-validation on the training split.
param_grid = {
    'max_depth': [10, 20, 100, 500],
    'min_impurity_decrease': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
}
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
rf_clf_cv = GridSearchCV(estimator = rf_clf, param_grid = param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 100, 'min_impurity_decrease': 0.0001}


In [13]:
# Re-score with the best parameters found by the coarse grid search.
rf_clf = RandomForestClassifier(
    max_depth = 100,
    min_impurity_decrease = 0.0001,
    class_weight = 'balanced',
    random_state = 42,
)

cv_score(clf = rf_clf, x = Xtrain, y = ytrain)

0.89040106108668637

CV score is slightly better after tuning. Narrowing down the grid search:

In [15]:
# Finer grid search around the best parameters from the coarse search.
# NOTE(review): the min_impurity_decrease candidates are not in ascending
# order (1e-5, 5e-4, 1e-4, 5e-3, 1e-3) -- confirm these are the intended values.
param_grid = {
    'max_depth': [50, 100, 150, 200],
    'min_impurity_decrease': [1e-5, 5e-4, 1e-4, 5e-3, 1e-3],
}
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
rf_clf_cv = GridSearchCV(estimator = rf_clf, param_grid = param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 50, 'min_impurity_decrease': 0.0001}


In [16]:
# Re-score with the best parameters found by the finer grid search.
rf_clf = RandomForestClassifier(
    max_depth = 50,
    min_impurity_decrease = 0.0001,
    class_weight = 'balanced',
    random_state = 42,
)

cv_score(clf = rf_clf, x = Xtrain, y = ytrain)

0.89040106108668637

Looks like max depth 50 and max depth 100 are tied: both give the same CV score (0.8904).

In [14]:
# Fit the tuned forest (max_depth = 100) on the full training split and report
# per-class metrics on that same training data.
rf_clf = RandomForestClassifier(
    max_depth = 100,
    min_impurity_decrease = 0.0001,
    class_weight = 'balanced',
    random_state = 42,
)
rf_clf.fit(Xtrain, ytrain)

print(classification_report(y_true = ytrain, y_pred = rf_clf.predict(Xtrain)))

             precision    recall  f1-score   support

          0       0.85      0.93      0.89      6789
          1       0.96      0.93      0.95     11323
          2       0.88      0.90      0.89      9349
          3       0.96      0.92      0.94     11043

avg / total       0.92      0.92      0.92     38504



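Recall is in the 90s for all four classes (0.90 to 0.93), and precision is between 0.85 and 0.96. Note that this report is computed on the training set; the cross-validated accuracy is about 0.89. Overall the classifier is pretty good.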

In [17]:
# Pair each feature name with its importance from the fitted forest and rank
# the pairs from most to least important. sorted() is stable, so features with
# equal importance keep their order from `variables`.
feature_importances = list(rf_clf.feature_importances_)

ranked = sorted(
    zip(variables, feature_importances),
    key = lambda pair: pair[1],
    reverse = True,
)
ranked

[('pres_Trump', 0.096115933978131188),
 ('CC16_426_4_Increase', 0.062585514638449219),
 ('CC16_351I', 0.059975141613776352),
 ('CC16_426_3_Increase', 0.049960428933316564),
 ('CC16_426_5_Increase', 0.047319607826018933),
 ('CC16_333a', 0.042830284616385841),
 ('CC16_426_2_Maintain', 0.03831421756056607),
 ('CC16_333d', 0.033359403576747174),
 ('CC16_426_4_Maintain', 0.032103157221335166),
 ('CC16_332c', 0.029438673036061451),
 ('CC16_351K', 0.025064578823635865),
 ('CC16_426_2_Increase', 0.024296039754639894),
 ('CC16_332d', 0.023860372254666435),
 ('CC16_426_3_Maintain', 0.023002863574235304),
 ('CC16_426_5_Maintain', 0.022966501094107476),
 ('CC16_422d_Disagree', 0.02237878852590278),
 ('CC16_332e', 0.021835659581422757),
 ('CC16_332b', 0.02036824531908572),
 ('CC16_330d', 0.019423866594703222),
 ('CC16_333c', 0.018774839327500591),
 ('CC16_333b', 0.014265906181209121),
 ('pres_None', 0.013887778327782993),
 ('CC16_335', 0.013812551215843637),
 ('voted', 0.013229528343572414),
 ('CC1

CC16_351I is the question on repealing the ACA (TODO: add this to the EDA)

CC16_426_4 is spending on law enforcement

CC16_426_3 is spending on education (for state legislatures)

CC16_426_5 is spending on infrastructure

CC16_333a is power to regulate CO2