# Modeling

In [1]:
from __future__ import division
import cPickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
import warnings
import xgboost as xgb

# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from the imported libraries.
warnings.filterwarnings("ignore")

Grab the engineered data

In [2]:
def read_pickle(file_name):
    """Load and return the object pickled in [file_name].

    The file is opened in binary mode inside a `with` block, so the handle is
    closed even when unpickling raises.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only point this at
    # files produced by this project.
    with open(file_name, 'rb') as f:
        return cPickle.load(f)


# The two feature sets are unpickled and converted to numpy arrays.
train, test = (
    np.array(read_pickle(path))
    for path in ('data/train.engineered', 'data/test.engineered'))

# The outcome objects are used exactly as they were pickled.
outcomes, outcomes_le = (
    read_pickle(path)
    for path in ('data/outcomes.engineered', 'data/outcomes_le.engineered'))

Do a quick check to see how many trees a random forest would need for the test error to settle down.

In [3]:
%%time

# Grid-search a random forest over 11 evenly spaced forest sizes between 100
# and 3000 trees, scored by log loss.
rf = GridSearchCV(
    RandomForestClassifier(random_state = 25),
    {'n_estimators': np.linspace(100, 3000, 11).astype(int).tolist()},
    scoring = 'log_loss',
    n_jobs = -1
).fit(train, outcomes)

CPU times: user 2min 18s, sys: 1.56 s, total: 2min 20s
Wall time: 9min


In [4]:
# Per-candidate cross-validation results. The means in the output are negative
# because the 'log_loss' scorer reports the loss negated (closer to 0 is better).
rf.grid_scores_

[mean: -1.01710, std: 0.02760, params: {'n_estimators': 100},
 mean: -0.89021, std: 0.01241, params: {'n_estimators': 390},
 mean: -0.87188, std: 0.01040, params: {'n_estimators': 680},
 mean: -0.86587, std: 0.01177, params: {'n_estimators': 970},
 mean: -0.86333, std: 0.01397, params: {'n_estimators': 1260},
 mean: -0.86109, std: 0.01484, params: {'n_estimators': 1550},
 mean: -0.85698, std: 0.01294, params: {'n_estimators': 1840},
 mean: -0.85459, std: 0.01143, params: {'n_estimators': 2130},
 mean: -0.85363, std: 0.01038, params: {'n_estimators': 2420},
 mean: -0.85172, std: 0.00872, params: {'n_estimators': 2710},
 mean: -0.85168, std: 0.00869, params: {'n_estimators': 3000}]

Estimate test error of various models w/o doing hyperparameter optimization to get a sense of which models work better than others.

In [5]:
def try_models(X, y, estimators, scoring = 'log_loss', cv = 5, n_jobs = -1,
              verbose = True):
    """Estimate test error of models using kfold-cv w/ default hyperparameters,
    no hyperparameter tuning at all.

    [estimators] maps a display name to an unfitted estimator. Each one is
    scored with cross_val_score, and the absolute value of its mean [scoring]
    over the [cv] folds is recorded. Returns a DataFrame with columns 'Score'
    and 'Model', sorted by ascending score.

    A model whose scoring raises is reported together with the error and left
    out of the result. KeyboardInterrupt and SystemExit are not caught, so a
    long run can still be aborted.
    """
    separator = '--------------------------------------------------------------'
    scores = []
    for name, estimator in estimators.items():
        start = time()
        try:
            score = np.absolute(np.mean(cross_val_score(
                        estimator, X, y, cv = cv, n_jobs = n_jobs, scoring = scoring)))
        except Exception as error:
            # Keep going with the remaining models, but say what went wrong.
            print('Failure to run model: %s' % (name,))
            print('Error: %r' % (error,))
            print(separator)
            continue
        scores.append([score, name])
        time_elapsed = round((time() - start) / 60, 2)
        if verbose:
            # The 'Log loss:' label is fixed even if [scoring] is overridden.
            print('Model: %s' % (name,))
            print('Log loss: %s' % (score,))
            print('Time elapsed (min): %s' % (time_elapsed,))
            print(separator)
    return pd.DataFrame(sorted(scores), columns = ['Score', 'Model'])


# Candidate models; only the seeds, the forest sizes and the SVM settings
# shown here are set explicitly.
baseline_estimators = {
    'Logistic regression' : LogisticRegression(random_state = 50),
    'SVM (RBF kernal)' : SVC(random_state = 25, probability = True, kernel = 'rbf'),
    'Random forest' : RandomForestClassifier(random_state = 25, n_estimators = 3000),
    'Extra trees' : ExtraTreesClassifier(random_state = 25, n_estimators = 3000),
    'Gradient boosted trees' : xgb.XGBClassifier(seed = 25),
}

try_models(train, outcomes, baseline_estimators)

Model: Gradient boosted trees
Log loss: 0.831070681851
Time elapsed (min): 1.13
--------------------------------------------------------------
Model: Extra trees
Log loss: 1.0903370461
Time elapsed (min): 4.3
--------------------------------------------------------------
Model: SVM (RBF kernal)
Log loss: 0.909677745361
Time elapsed (min): 13.62
--------------------------------------------------------------
Model: Logistic regression
Log loss: 0.89151937115
Time elapsed (min): 0.06
--------------------------------------------------------------
Model: Random forest
Log loss: 0.842249054112
Time elapsed (min): 2.41
--------------------------------------------------------------


Unnamed: 0,Score,Model
0,0.831071,Gradient boosted trees
1,0.842249,Random forest
2,0.891519,Logistic regression
3,0.909678,SVM (RBF kernal)
4,1.090337,Extra trees


Let's take the best-performing models and get their test errors after optimizing their hyperparameters.

In [8]:
# Seeded hold-out split: test_size = 0.3 reserves 30% of the data for testing.
(X_train, X_test,
 y_train, y_test) = train_test_split(train, outcomes,
                                     random_state = 1, test_size = 0.3)


def hold_out_cv(model, hyperparameters, X_train, X_test, y_train, y_test,
                folds = 5):
    """Given a [model] and a set of possible [hyperparameters], along with
    matrices corresponding to hold-out cross-validation, grid-searches the
    hyperparameters w/ [folds]-fold cv on the training split (scored by log
    loss), then prints the best hyperparameters and the log loss of the
    optimized model on the test split.

    Nothing is returned (None); the results are only printed.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds,
                                   n_jobs = -1, scoring = 'log_loss')
    optimized_model.fit(X_train, y_train)
    print('Optimized parameters: %s' % (optimized_model.best_params_,))
    # score() follows the 'log_loss' scoring chosen above, which is reported
    # negated -- hence the absolute value. The metric is log loss, not MAE.
    print('Log loss: %s' % (
        np.absolute(optimized_model.score(X_test, y_test)),))
    return None

In [9]:
%%time
# Logistic regression: search C over 7 log-spaced values from 1e-3 to 1e3.
hold_out_cv(
    model = LogisticRegression(random_state = 50),
    hyperparameters = {'C': np.logspace(-3, 3, 7)},
    X_train = X_train, X_test = X_test,
    y_train = y_train, y_test = y_test)

Optimized parameters: {'C': 0.10000000000000001}
Mean absolute error: 0.897359484785
CPU times: user 2 s, sys: 548 ms, total: 2.54 s
Wall time: 17.2 s


In [10]:
%%time
# Random forest: search the split/leaf size limits and the tree depth.
hold_out_cv(
    RandomForestClassifier(n_estimators = 3000, random_state = 25), {
        # A node needs at least 2 samples to be split, so 2 is the smallest
        # meaningful value; newer scikit-learn releases reject an integer 1.
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'max_depth': [3, None]}, 
    X_train, X_test, y_train, y_test)

Optimized parameters: {'min_samples_split': 10, 'max_depth': None, 'min_samples_leaf': 1}
Mean absolute error: 0.837451136547
CPU times: user 1min 8s, sys: 1.44 s, total: 1min 9s
Wall time: 14min 47s


In [11]:
%%time
# Gradient boosted trees: 4 x 3 x 5 x 7 = 420 hyperparameter combinations.
hold_out_cv(
    xgb.XGBClassifier(learning_rate = 0.05, n_estimators = 200, seed = 25),
    dict(
        max_depth = [3, 5, 7, 9],
        min_child_weight = [1, 3, 5],
        gamma = [0.0, 0.1, 0.2, 0.3, 0.4],
        reg_alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
    X_train, X_test, y_train, y_test)

KeyboardInterrupt: 