In [1]:
import csv
import math
import numpy as np
import pandas as pd
import string

# Classification utils
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import grid_search
from sklearn import metrics
from sklearn import svm
from sklearn.svm import SVC

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Load the labelled training table and the quiz table from the working directory.
task, quiz = pd.read_csv('data.csv'), pd.read_csv('quiz.csv')

In [3]:
def write_results(preds, path='test_predictions.csv'):
    """Write predicted labels to a Kaggle-style submission CSV.

    The file starts with an ``id,Prediction`` header, followed by one row per
    prediction. Ids are the 1-based positions of the predictions in ``preds``.

    Parameters
    ----------
    preds : iterable
        Predicted labels, in the order they should be numbered.
    path : str, optional
        Destination file. Defaults to 'test_predictions.csv' in the current
        working directory.
    """
    import sys

    # csv.writer needs a binary file on Python 2 but a text file opened with
    # newline='' on Python 3; a 'wb' handle raises TypeError there.
    if sys.version_info[0] >= 3:
        csvfile = open(path, 'w', newline='')
    else:
        csvfile = open(path, 'wb')

    with csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'Prediction'])
        writer.writerows([row_id, pred] for row_id, pred in enumerate(preds, 1))

In [4]:
# Build the 53 column names: 'a'..'z', then 'aa'..'zz', then 'aaa' last.
alphabet = list(string.ascii_lowercase)
alphabet2 = list(alphabet)
alphabet2.extend(ch * 2 for ch in alphabet)
alphabet2.append('aaa')

# The training frame takes every name; the quiz frame has no label column,
# so it takes every name except the last one.
task.columns, quiz.columns = alphabet2, alphabet2[:-1]

# Columns treated as boolean flags (15 total).
boolean_cols = 'g p q s v w y z oo pp qq rr xx yy zz'.split()

# Columns 'aa' through 'nn' (14 total); presumably valued 0/1/2, going by the
# variable name -- not verified here.
zero_one_two_cols = list(ch * 2 for ch in 'abcdefghijklmn')

# Exploratory check: which columns pandas does not consider numeric.
cols = task.columns
num_cols = task._get_numeric_data().columns
list(set(cols) - set(num_cols))  # value is discarded; the list below is hard-coded

# Categorical columns (16 total): 13 single-letter names plus 'ss', 'tt', 'uu'.
categorical_cols = list('acdefhijklmno') + ['ss', 'tt', 'uu']

# Cast those columns to the pandas 'category' dtype in both frames (in place).
for frame in (task, quiz):
    for col in categorical_cols:
        frame[col] = frame[col].astype('category')

# Numeric feature columns (36 total; the label column 'aaa' is not listed).
numeric_cols = [
    'b', 'g', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj', 'kk', 'll', 'mm', 'nn',
    'oo', 'pp', 'qq', 'rr',
    'vv', 'ww', 'xx', 'yy', 'zz',
]

# Positions of the numeric columns within the full list of column names.
numeric_indices = [pos for pos, name in enumerate(alphabet2) if name in numeric_cols]

# Continuous (non-flag, non-0/1/2) columns.
continuous_cols = ['vv', 'ww']

# Reference positions noted during exploration; this list also contains 52,
# the position of the 'aaa' label, which numeric_indices above leaves out:
# [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
# 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51, 52]

# Feature columns fed to the encoder: categorical, then 0/1/2, then boolean,
# then continuous.
final_cols = []
for group in (categorical_cols, zero_one_two_cols, boolean_cols, continuous_cols):
    final_cols.extend(group)

# Integer labels taken from the training frame's 'aaa' column.
train_labels = task['aaa'].values.astype(int)
# NOTE(review): this is a copy of the *training* labels -- the quiz frame has
# no label column -- so the name is misleading. Confirm before using it.
quiz_labels = train_labels.copy()

In [7]:
# One-hot encode the categorical variables of the training frame.
X_dummies = pd.get_dummies(task[final_cols])
# The quiz frame is encoded without continuous_cols. Because only columns
# present in BOTH encodings are kept below, the continuous columns end up
# excluded from the training matrix as well.
# NOTE(review): confirm that dropping them is intended.
X_quiz_dummies = pd.get_dummies(quiz[categorical_cols + zero_one_two_cols + boolean_cols])

# Keep the encoded columns common to both frames. Both frames are indexed with
# the SAME ordered list, so train and quiz features are guaranteed to line up
# column-for-column when a fitted model later predicts on the quiz matrix.
shared_cols = [col for col in X_dummies.columns if col in X_quiz_dummies.columns]
X_train_dummies = X_dummies[shared_cols]
X_quiz_dummies = X_quiz_dummies[shared_cols]

In [5]:
# Small split for the feature-importance pass: 10% to train on, 10% held out.
# The target is the last column of the training frame, selected by position.
x_train, x_test, y_train, y_test = train_test_split(
    X_train_dummies,
    task.iloc[:, -1],
    train_size=0.1,
    test_size=0.1,
)

In [6]:
# Feature importance from an Extra Trees ensemble (250 trees) fitted on the
# current training split.
et = ExtraTreesClassifier(n_estimators=250)
et.fit(x_train, y_train)

# Ensemble-level importances, plus their spread across the individual trees.
importances = et.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in et.estimators_])
std = per_tree_importances.std(axis=0)

# Feature positions ordered from most to least important.
indices = np.flipud(np.argsort(importances))

In [7]:
# Print every encoded feature, most important first.
print("Feature ranking:")

ranked_columns = X_train_dummies.columns[indices]
for rank, (name, score) in enumerate(zip(ranked_columns, importances[indices]), 1):
    print("%d. feature %s (%f)" % (rank, name, score))

# Keep the names of the (up to) 100 most important features. Slicing rather
# than indexing positions 0..99 avoids an IndexError when fewer than 100
# encoded columns exist. Later `best_features[:1000]` slices therefore select
# at most these 100 names.
best_features = list(ranked_columns[:100])

# display the relative importance of each attribute
# print(model.feature_importances_)

# Plot the feature importances of the forest
#plt.figure()
#plt.title("Feature importances")
#plt.bar(range(x_train.shape[1]), importances[indices],
#       color="r", yerr=std[indices], align="center")
#plt.xticks(range(x_train.shape[1]), indices)
#plt.xlim([-1, x_train.shape[1]])
#plt.show()

Feature ranking:
1. feature e_query_yn (0.035561)
2. feature c_dobj (0.030039)
3. feature h_prep_at (0.025889)
4. feature tt_root (0.022571)
5. feature a_indef (0.022438)
6. feature a_pro (0.020399)
7. feature i_g (0.013773)
8. feature i_f (0.013540)
9. feature ss_ccomp (0.013379)
10. feature j_query_yn (0.011519)
11. feature c_prep_of (0.010548)
12. feature f_pro (0.009996)
13. feature k_pro_def (0.009910)
14. feature h_dobj (0.009901)
15. feature k_indef_def (0.009570)
16. feature aa (0.008949)
17. feature j_clarify (0.008235)
18. feature qq (0.007854)
19. feature a_null (0.007758)
20. feature cc (0.007670)
21. feature j_check (0.007664)
22. feature j_instruct (0.007524)
23. feature f_def (0.007138)
24. feature bb (0.007117)
25. feature a_def (0.007050)
26. feature p (0.006867)
27. feature j_align (0.006797)
28. feature o_query_yn_instruct (0.006682)
29. feature n_g_g (0.006608)
30. feature j_explain (0.006559)
31. feature a_dctc (0.006435)
32. feature e_instruct (0.006426)
33. featu

In [8]:
# Random forest restricted to the top-ranked features, wrapped in a grid
# search. The grid currently holds a single point (2500 trees, max depth 500).
param_grid2 = [dict(max_depth=[500], n_estimators=[2500])]

rf2 = RandomForestClassifier(n_jobs=-1)
clf2 = grid_search.GridSearchCV(rf2, param_grid2)

# best_features was built from the top 100 columns, so this slice keeps them all.
top_features = best_features[:1000]
X = X_train_dummies[top_features].values

# 50/50 split over the selected features; the target is the frame's last column.
x_train, x_test, y_train, y_test = train_test_split(
    X, task.iloc[:, -1], train_size=0.5, test_size=0.5)
clf_trained2 = clf2.fit(x_train, y_train)

# NOTE(review): cross_val_score fits fresh clones on folds of the held-out
# half; it does not score the model fitted just above. Confirm this is the
# evaluation that was intended.
scores = cross_val_score(clf_trained2, x_test, y_test, cv=2)

print(scores)
print('best params: ', clf_trained2.best_params_)

# Quiz predictions from the grid-searched model, on the same feature subset.
preds1 = clf_trained2.predict(X_quiz_dummies[top_features].values)

[ 0.90996531  0.91456684]
('best params: ', {'n_estimators': 2500, 'max_depth': 500})


In [14]:
# Baseline: a default random forest on every aligned one-hot feature.
# The earlier SelectKBest(chi2, k=1000) feature reduction and the GridSearchCV
# over max_depth / max_features / n_estimators are disabled for this cell.
rf = RandomForestClassifier(n_jobs=1)

# 50/50 split of the full aligned training matrix; target is the last column.
x_train, x_test, y_train, y_test = train_test_split(
    X_train_dummies,
    task.iloc[:, -1],
    train_size=0.5,
    test_size=0.5,
)
clf_trained = rf.fit(x_train, y_train)

# Two-fold cross-validation run on the held-out half.
scores = cross_val_score(clf_trained, x_test, y_test, cv=2)
print(scores)

# Quiz predictions from the baseline forest; these are the ones written out.
preds2 = clf_trained.predict(X_quiz_dummies)

[ 0.91415957  0.91497682]


In [22]:
# Recursive Feature Elimination around the rf2 forest, keeping 1000 features.
# This cell reuses whichever rf2 and x_train / y_train / x_test / y_test are
# currently bound in the kernel.
from sklearn.feature_selection import RFE

rfe = RFE(rf2, n_features_to_select=1000).fit(x_train, y_train)

# Two-fold cross-validation of the RFE pipeline on the held-out split.
scores2 = cross_val_score(rfe, x_test, y_test, cv=2)
print(scores2)

[ 0.90586566  0.90911098]


In [21]:
# Report how many features RFE kept, the boolean keep-mask, and the per-feature
# ranking (kept features carry rank 1).
for caption, value in (("N of features ", rfe.n_features_),
                       ("Mask of selected features ", rfe.support_)):
    print(caption, value)
print(rfe.ranking_)

('N of features ', 3)
('Mask of selected features ', array([ True, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False], dtype=bool))
[ 1  2  1  1  5  3 30  4 23 21  7  8 41 44  9 29  6 45 14 33 27 40 17 36 28
 37 35 32 16 18 24 43 10 20 11 15 19 47 22 25 26 46 13 48 39 34 12 42 38 31]


In [23]:
# Support-vector classifier with default settings, fitted on the current split.
sv = SVC()
sv.fit(x_train, y_train)
clf3 = sv  # fit() returns the estimator itself, so both names share one model

# Two-fold cross-validation on the held-out split.
scores3 = cross_val_score(clf3, x_test, y_test, cv=2)
print(scores3)

[ 0.87426679  0.87807878]


In [15]:
# Write the baseline random-forest quiz predictions (preds2) to the submission file.
write_results(preds=preds2)