In [1]:
import csv
import math
import numpy as np
import pandas as pd
import string

# Classification utils
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import grid_search
from sklearn import metrics

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

task = pd.read_csv('data.csv')
quiz = pd.read_csv('quiz.csv')

In [2]:
# Name Columns (53 total)
alphabet = list(string.ascii_lowercase)
alphabet2 = alphabet + [l+l for l in alphabet] + ['aaa']

task.columns = alphabet2
# Leave out label column for test data
quiz.columns = alphabet2[:-1]

# Designate Boolean Columns (15 total)
boolean_cols = [
    'g', 'p', 'q', 's',
    'v', 'w', 'y', 'z',
    'oo', 'pp', 'qq', 'rr',
    'xx', 'yy', 'zz'
]

zero_one_two_cols = ['aa','bb','cc','dd','ee','ff','gg','hh','ii','jj','kk','ll','mm','nn']

# Designate Categorical Columns (16 total)
cols = task.columns
num_cols = task._get_numeric_data().columns
list(set(cols) - set(num_cols))

categorical_cols = ['a', 'c', 'd', 'e', 'f', 'h', 'i', 'j', 'k',
 'l', 'm', 'n', 'o', 
   'ss', 'tt', 'uu'
 ]

for col in categorical_cols:
    task[col] = task[col].astype('category')
    quiz[col] = quiz[col].astype('category')

# Designate Numeric Columns (37 total)
numeric_cols = ['b', 'g', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
       'z', 'aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii',
       'jj', 'kk', 'll', 'mm', 'nn', 'oo', 'pp', 'qq', 'rr', 'vv',
       'ww', 'xx', 'yy', 'zz']

numeric_indices = []
for i, letter in enumerate(alphabet2):
    if letter in numeric_cols:
        numeric_indices.append(i)
        
#Designate continuous columns
continuous_cols = [
   'vv', 'ww'
]
    
# [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
# 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51, 52]

# Designate final columns
final_cols = categorical_cols + zero_one_two_cols + boolean_cols + continuous_cols

train_labels = np.array(task['aaa']).astype(int)

In [3]:
# One-hot encoded features for categorical vars
X_dummies = pd.get_dummies(task[final_cols])
# X_dummies = pd.get_dummies(task[categorical_cols + zero_one_two_cols + boolean_cols])
X_quiz_dummies = pd.get_dummies(quiz[categorical_cols + zero_one_two_cols + boolean_cols])

X_train_dummies = X_dummies[[col for col in X_dummies.columns if col in X_quiz_dummies.columns]]
X_quiz_dummies = X_quiz_dummies[[col for col in X_quiz_dummies.columns if col in X_train_dummies.columns]]

In [4]:
# Exploring different parameter settings with grid_search
# Features reduced with select k best
# Training size reduced with train_test_split

# X_train_k_best = SelectKBest(chi2, k=1000).fit_transform(X_train_dummies, task.ix[:,-1])

param_grid = [
    {'max_depth': [1, 10], 'max_features': [1, 5], 'n_estimators': [1, 10]}
]


rf = RandomForestClassifier(n_jobs=1)
clf = grid_search.GridSearchCV(rf, param_grid)

# x_train, x_test, y_train, y_test = train_test_split(X_train_k_best, task.ix[:,-1], train_size=0.1, test_size=0.1)
x_train, x_test, y_train, y_test = train_test_split(X_train_dummies, task.ix[:,-1], train_size=0.1, test_size=0.1)
clf_trained = clf.fit(x_train, y_train)

scores = cross_val_score(clf_trained, x_test, y_test, cv=2)

print(scores)
print('best params: ', clf_trained.best_params_)

[ 0.59366131  0.6311889 ]
('best params: ', {'max_features': 5, 'n_estimators': 10, 'max_depth': 10})


In [5]:
#Feature importance
import matplotlib.pyplot as plt

# fit an Extra Trees model to the data
et = ExtraTreesClassifier(n_estimators=250)
et.fit(x_train, y_train)

# compute feature importances
importances = et.feature_importances_
std = np.std([tree.feature_importances_ for tree in et.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [6]:
# Print the feature ranking
print("Feature ranking:")

#for f in range(x_train.shape[1]):
#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    
#best_features = list(pd.Series(importances.sum(axis=0)).order(ascending=False).iloc[:50].index)
#best_cols = X_train_dummies.columns[importances]

for f in range(x_train.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, X_train_dummies.columns[indices[f]], importances[indices[f]]))
    
best_features = []
for f in range(0, 50):
    best_features.append(X_train_dummies.columns[indices[f]])

# display the relative importance of each attribute
# print(model.feature_importances_)

# Plot the feature importances of the forest
#plt.figure()
#plt.title("Feature importances")
#plt.bar(range(x_train.shape[1]), importances[indices],
#       color="r", yerr=std[indices], align="center")
#plt.xticks(range(x_train.shape[1]), indices)
#plt.xlim([-1, x_train.shape[1]])
#plt.show()

Feature ranking:
1. feature e_query_yn (0.032200)
2. feature a_indef (0.030156)
3. feature c_dobj (0.028711)
4. feature h_prep_at (0.028227)
5. feature a_pro (0.020945)
6. feature tt_root (0.020427)
7. feature ss_ccomp (0.016109)
8. feature i_f (0.012890)
9. feature i_g (0.012403)
10. feature a_null (0.010327)
11. feature c_prep_of (0.010208)
12. feature k_pro_def (0.009847)
13. feature h_dobj (0.009745)
14. feature j_query_yn (0.009298)
15. feature k_indef_def (0.009180)
16. feature j_clarify (0.009101)
17. feature f_pro (0.008914)
18. feature aa (0.008849)
19. feature e_clarify (0.008424)
20. feature bb (0.008412)
21. feature j_check (0.008233)
22. feature qq (0.007742)
23. feature j_instruct (0.007651)
24. feature f_def (0.007616)
25. feature j_align (0.007251)
26. feature a_def (0.007236)
27. feature cc (0.007227)
28. feature o_query_yn_instruct (0.006683)
29. feature n_g_g (0.006643)
30. feature a_dctc (0.006423)
31. feature j_explain (0.006374)
32. feature p (0.006302)
33. featur

In [25]:
# Test with best estimators

# test with 2500 if time
param_grid = [
    {'max_depth': [250], 'max_features': [50], 'n_estimators': [1000]}
]

rf2 = RandomForestClassifier(n_jobs=1)
clf2 = grid_search.GridSearchCV(rf2, param_grid)

X = X_train_dummies[best_features[:50]].as_matrix()

# x_train, x_test, y_train, y_test = train_test_split(X_train_k_best, task.ix[:,-1], train_size=0.1, test_size=0.1)
x_train, x_test, y_train, y_test = train_test_split(X, task.ix[:,-1], train_size=0.1, test_size=0.1)
clf_trained2 = clf2.fit(x_train, y_train)

scores = cross_val_score(clf_trained2, x_test, y_test, cv=2)

print(scores)
print('best params: ', clf_trained2.best_params_)

[ 0.86914709  0.86153604]
('best params: ', {'max_features': 50, 'n_estimators': 1000, 'max_depth': 250})


In [18]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE

# create the RFE model and select 3 attributes
rfe = RFE(rf2, 3)
rfe = rfe.fit(x_train, y_train)

In [23]:
# summarize the selection of the attributes
print("N of features ", rfe.n_features_)
print("Mask of selected features ", rfe.support_)
print(rfe.ranking_)

# calculate and print scores
#scores = cross_val_score(rfe, x_test, y_test, cv=1)

('N of features ', 3)
('Mask of selected features ', array([ True, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False], dtype=bool))
[ 1  3  1  1  2  5 26 31  4 37 16 43  7  6 47 25 35 40 34 13 38 18 21  9 24
 32 19 48 17 36 11 28 23 42 22  8 15 39 14 10 46 29 44 45 12 20 27 30 33 41]


In [None]:
# PCA
