# Modeling

In [35]:
from __future__ import division
import cPickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from time import time

Grab the engineered data

In [2]:
def read_pickle(file_name):
    """Load and return the object pickled in the file at file_name.

    The file is opened in binary mode inside a ``with`` block so the handle
    is released even when unpickling raises.

    NOTE: unpickling can execute arbitrary code. Only call this on files
    that come from a trusted source.
    """
    with open(file_name, 'rb') as f:
        return cPickle.load(f)


# Load the engineered artifacts from the data/ directory. train and test are
# wrapped in np.array; outcomes and outcomes_le are kept exactly as unpickled.
# NOTE(review): outcomes_le is presumably the label encoder fitted on the
# outcomes -- confirm against the feature-engineering notebook.
train = np.array(read_pickle('data/train.engineered'))
test = np.array(read_pickle('data/test.engineered'))
outcomes = read_pickle('data/outcomes.engineered')
outcomes_le = read_pickle('data/outcomes_le.engineered')

Functions for nested cross-validation, model selection:

In [3]:
def nested_cv(X, y, estimator, params, scoring = 'log_loss', cv = 5, 
              n_jobs = -1, verbose = True):
    """Performs 5-fold nested cross-validation ([cv] folds in each loop) on an 
    estimator given a parameter grid of hyperparamaters to optimize over using 
    grid search.
    """
    start_time = time()
    inner_loop = GridSearchCV(estimator, params, cv = cv, n_jobs = n_jobs, 
    scoring = scoring)
    score = np.absolute(np.mean(cross_val_score(inner_loop, X, y, cv = cv, 
        n_jobs = 1)))
    if verbose:
        time_elapsed = time() - start_time
        print 'Model:', estimator
        print 'Score:', score
        print 'Time elapsed:', round(time_elapsed / 60, 1), '\n'
    return score


def model_selection(X, y, estimators_params, scoring = 'log_loss', cv = 5, 
                    n_jobs = -1, refit = True, higher_is_better = False, 
                    verbose = True):
    """Evalute multiple estimators using nested cross-validation. If refit is 
    True, the best scoring estimator is returned as part of a [cv]-fold 
    GridSearchCV estimator,such that fitting that model with X, y will find the 
    optimal hyperparameters and return a final model that can be used to make 
    predictions. [estimators_parms] is a dictionary where the key is the 
    estimator and the value is the hyperparameter grid for that estimator.
    """
    scores = []
    for estimator, params in estimators_params.items():
        try:
            score = nested_cv(X, y, estimator, params, scoring = scoring, 
                              cv = cv, n_jobs = n_jobs, verbose = verbose)
            scores.append([score, estimator])
        except:
            if verbose:
                print 'The following model failed to produced a nested-cv result:'
                print estimator, '\n'
    scores = sorted(scores, reverse = higher_is_better)
    best_model = scores[0][1]
    if refit:
        return GridSearchCV(best_model, estimators_params[best_model], 
                            cv = cv, n_jobs = n_jobs, scoring = scoring)
    else:
        return best_model

Get a sense of how long different classifiers take to run one iteration. In nested CV with 5 outer and 5 inner folds, each model/parameter combination is fit 5 × 5 = 25 times in the inner loops, and the model is then fit once more per outer fold (5 times).

In [43]:
def estimate_nested_cv(X, y, estimator, params = {}, 
                       outer_folds = 5, inner_folds = 5):
    """Estimate how long nested cross-validation will take, assuming the fits
    run one after another (no parallelization).

    The estimator is fit once on a training set the size of an inner fold and
    once on one the size of an outer fold; the two timings are then scaled by
    the number of fits nested cross-validation performs.

    Parameters:
        X, y: training features and targets; the first rows are used for the
            timing fits.
        estimator: object with a fit(X, y) method. It is fit in place, so it
            is left fitted on the outer-fold-sized subset.
        params: hyperparameter grid; only its number of combinations is used.
            The default dict is never mutated.
        outer_folds, inner_folds: number of folds in the outer / inner loop.

    Returns:
        The estimated duration in minutes.
    """
    # Fold sizes are used as slice bounds and therefore must be integers.
    # Floor division keeps them integral even under true division.
    outer_fold_samples = (X.shape[0] * (outer_folds - 1)) // outer_folds
    inner_fold_samples = (outer_fold_samples * (inner_folds - 1)) // inner_folds
    num_params = len(ParameterGrid(params))

    # Time an inner fold: one fit per (outer fold, inner fold, grid point)
    inner_start = time()
    estimator.fit(X[:inner_fold_samples, :], y[:inner_fold_samples])
    inner_time = time() - inner_start
    inner_times = inner_time * outer_folds * inner_folds * num_params

    # Time an outer fold: one fit per outer fold
    outer_start = time()
    estimator.fit(X[:outer_fold_samples, :], y[:outer_fold_samples])
    outer_time = time() - outer_start
    outer_times = outer_time * outer_folds

    # Seconds -> minutes
    return (inner_times + outer_times) / 60.0

In [45]:
# Estimated minutes of nested CV for logistic regression over the
# 2 x 2 x 7 grid (penalty x class_weight x C).
estimate_nested_cv(train, outcomes, LogisticRegression(random_state = 50), params = {
            'penalty': ['l1', 'l2'], 
            'class_weight': [None, 'balanced'],
            'C': np.logspace(-3, 3, 7)
    })



7.366394241650899

Try a bunch of different classifiers, from a baseline model like logistic regression all the way up to a model we expect to perform well like random forests.

In [39]:
# Sanity check: an empty grid still counts as one parameter combination.
len(ParameterGrid({}))

1

In [40]:
# Number of combinations in the logistic regression grid: 2 * 2 * 7.
len(ParameterGrid({
            'penalty': ['l1', 'l2'], 
            'class_weight': [None, 'balanced'],
            'C': np.logspace(-3, 3, 7)
    }))

28

In [34]:
%%time

# Baseline run with a single candidate. Keys are estimator instances, values
# are the hyperparameter grids that model_selection searches for them.
estimators_params = {
    
    # Logistic regression, good baseline model
    LogisticRegression(random_state = 50) :  {
            'penalty': ['l1', 'l2'], 
            'class_weight': [None, 'balanced'],
            'C': np.logspace(-3, 3, 7)
    }
}

# The returned (unfitted) GridSearchCV is not assigned to a name here; the
# cell is run for the printed nested-cv score and timing.
model_selection(train, outcomes, estimators_params)



Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=50, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Score: 0.890816563039
Time elapsed: 23.9 

CPU times: user 40.3 s, sys: 852 ms, total: 41.2 s
Wall time: 23min 53s


In [None]:
# Full candidate set. Keys are estimator instances, values are the
# hyperparameter grids that model_selection searches for them. This rebinds
# the estimators_params name used by the baseline run.
estimators_params = {
    
    # Logistic regression, good baseline model
    LogisticRegression(random_state = 50) :  {
            'penalty': ['l1', 'l2'], 
            'class_weight': [None, 'balanced'],
            'C': np.logspace(-3, 3, 7)
    },
    
    # SGD logistic regression w/ elastic net regularization - likely won't perform
    # as well as LR b/c it's using SGD, but I want to see if elasticnet helps
    SGDClassifier(random_state = 50, loss = 'log', penalty = 'elasticnet') : {
            'alpha': np.logspace(-3, 3, 7)
    },
    
    # SVM w/ a linear kernel - LinearSVC is faster than SVC for this kernel
    # NOTE(review): model_selection scores with 'log_loss' by default, which
    # needs class probabilities. LinearSVC presumably does not provide them,
    # so this entry would likely be reported as a failed model -- confirm.
    # NOTE(review): penalty = 'l1' may also be rejected with LinearSVC's
    # default loss/dual settings -- confirm.
    LinearSVC(random_state = 50) : {
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'C': np.logspace(-3, 3, 7)
    },
    
    # SVM w/ RBF kernel (7 * 10 * 2 = 140 grid points)
    SVC(random_state = 25, probability = True, kernel = 'rbf') : {
        'C': np.logspace(-3, 3, 7), 
        'gamma': np.logspace(-6, 3, 10), 
        'class_weight' : [None, 'balanced']
    },
    
    # SVM w/ polynomial kernel (7 * 4 * 2 * 2 = 112 grid points)
    SVC(random_state = 25, probability = True, kernel = 'poly') : {
        'C': np.logspace(-3, 3, 7), 
        'degree': [2, 3, 4, 5], 
        'coef0': [0, 1],
        'class_weight' : [None, 'balanced']
    },
    
    # Random forest, others in this challenge have had success w/ this algorithm
    # NOTE(review): min_samples_split = 1 is, as far as I know, rejected by
    # newer scikit-learn releases (minimum of 2) -- confirm against the
    # installed version.
    RandomForestClassifier(random_state = 25, n_estimators = 1000) : {
        'max_features' : ['sqrt', 'log2'],
        'max_depth' : [3, None],
        'min_samples_split': [1, 2, 3, 7],
        'min_samples_leaf': [1, 3, 7],
        'class_weight' : [None, 'balanced']
    },
    
    # ExtraTreesClassifier - using same params as the random forest
    ExtraTreesClassifier(random_state = 25, n_estimators = 1000) : {
        'max_features' : ['sqrt', 'log2'],
        'max_depth' : [3, None],
        'min_samples_split': [1, 2, 3, 7],
        'min_samples_leaf': [1, 3, 7],
        'class_weight' : [None, 'balanced']
    }
}

# With the default refit = True this is an unfitted GridSearchCV wrapped
# around the best-scoring estimator; it still has to be fit on the data.
best_performing_model = model_selection(train, outcomes, estimators_params)