In [9]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [2]:
def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold, random_state = 42).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y.iloc[train]) # fit
        result += score_func(clf.predict(x[test]), y.iloc[test]) # evaluate score function on held-out data
    return result / nfold # average

In [4]:
df = pd.read_csv('clustered.csv')

In [8]:
variables = [col for col in df.columns if (col != 'four_cluster_label' and col != 'Unnamed: 0')]
             
print(variables)

['gender', 'CC16_331_1', 'CC16_331_2', 'CC16_331_3', 'CC16_331_7', 'milstat_1', 'milstat_2', 'milstat_3', 'milstat_4', 'CC16_351K', 'faminc', 'CC16_334a', 'CC16_334b', 'CC16_334d', 'pew_bornagain', 'CC16_334c', 'CC16_333d', 'CC16_333b', 'CC16_333a', 'CC16_332a', 'CC16_332d', 'CC16_332e', 'CC16_333c', 'CC16_332c', 'CC16_332f', 'CC16_351I', 'CC16_332b', 'child18', 'CC16_351B', 'CC16_351E', 'CC16_327', 'CC16_351G', 'CC16_351F', 'CC16_330a', 'CC16_351H', 'CC16_330e', 'CC16_330b', 'CC16_330d', 'CC16_335', 'hispanic', 'investor', 'trans', 'votereg_post', 'CC16_414_1', 'CC16_414_2', 'CC16_414_3', 'CC16_414_4', 'CC16_414_5', 'CC16_414_6', 'CC16_417a_1', 'CC16_417a_2', 'CC16_417a_3', 'CC16_417a_4', 'CC16_417a_5', 'edloan', 'CC16_418a', 'voted', 'age', 'CC16_422d_Disagree', 'CC16_422d_Neutral', 'CC16_422c_Disagree', 'CC16_422c_Neutral', 'CC16_422f_Disagree', 'CC16_422f_Neutral', 'CC16_422e_Disagree', 'CC16_422e_Neutral', 'CC16_426_1_Increase', 'CC16_426_1_Maintain', 'CC16_426_2_Increase', 'CC16_

In [10]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df[variables].values, df['four_cluster_label'], random_state = 42, test_size = 0.2)

In [11]:
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
cv_score(rf_clf, Xtrain, ytrain)

0.88834921084628926

In [12]:
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)

rf_clf.fit(Xtrain, ytrain)
print(classification_report(ytrain, rf_clf.predict(Xtrain)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6789
          1       1.00      1.00      1.00     11323
          2       1.00      1.00      1.00      9349
          3       1.00      1.00      1.00     11043

avg / total       1.00      1.00      1.00     38504



damn overfitting!

In [14]:
feature_importances = list(rf_clf.feature_importances_)

ranked = []
for var in zip(variables, feature_importances):
    ranked.append(var)
    
ranked = sorted(ranked, key = lambda x: x[1], reverse = True)
ranked

[('pres_Trump', 0.060176709664620687),
 ('CC16_351I', 0.05784089690354359),
 ('CC16_426_4_Increase', 0.055287113917492324),
 ('CC16_426_5_Increase', 0.046933198773469367),
 ('CC16_426_3_Increase', 0.035106716257637546),
 ('CC16_333a', 0.031600996615693114),
 ('CC16_332c', 0.028180691571850519),
 ('CC16_333d', 0.027263229037199133),
 ('CC16_426_2_Increase', 0.02584564960978401),
 ('CC16_426_4_Maintain', 0.025736640877372086),
 ('CC16_426_2_Maintain', 0.02060175156917014),
 ('CC16_426_3_Maintain', 0.020537828627936322),
 ('CC16_330d', 0.020090015168719517),
 ('CC16_332e', 0.019488369778445192),
 ('CC16_332d', 0.018534443584809848),
 ('CC16_351K', 0.018443363343905162),
 ('age', 0.018436027354864958),
 ('CC16_422d_Disagree', 0.017811307753732832),
 ('CC16_426_5_Maintain', 0.016041232024652026),
 ('CC16_333b', 0.015374843731018847),
 ('CC16_335', 0.014370402598153743),
 ('CC16_333c', 0.014237345974437363),
 ('voted', 0.013952261832539514),
 ('CC16_331_1', 0.013042176802694363),
 ('CC16_331