# Modeling

In [5]:
from __future__ import division
import cPickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from time import time
import warnings
import xgboost as xgb

warnings.filterwarnings("ignore")

Grab the engineered data

In [2]:
def read_pickle(file_name):
    f = open(file_name, 'rb')
    p = cPickle.load(f)
    f.close()
    return p


train = np.array(read_pickle('data/train.engineered'))
test = np.array(read_pickle('data/test.engineered'))
outcomes = read_pickle('data/outcomes.engineered')
outcomes_le = read_pickle('data/outcomes_le.engineered')

Estimate test error of various models w/o doing hyperparameter optimization to get a sense of which models work better than others.

In [3]:
def try_models(X, y, estimators, scoring = 'log_loss', cv = 5, n_jobs = -1,
              verbose = True):
    """Estimate test error of models using kfold-cv w/ default hyperparameters,
    no hyperparameter tuning at all.
    """
    scores = []
    for estimator in estimators:
        try:
            start = time()
            score = np.absolute(np.mean(cross_val_score(
                        estimator, X, y, cv = cv, n_jobs = n_jobs, scoring = scoring)))
            scores.append([score, estimator])
            time_elapsed = round((time() - start) / 60, 2)
            if verbose:
                print 'Model:', estimator
                print 'Log loss:', score
                print 'Time elapsed (min):', time_elapsed
                print '--------------------------------------------------------------'
        except:
            print 'Failure to run model:', model
            print '--------------------------------------------------------------'
    return pd.DataFrame(sorted(scores), columns = ['Score', 'Model'])


try_models(train, outcomes, [
        LogisticRegression(random_state = 50), 
        SGDClassifier(random_state = 50, loss = 'log', penalty = 'elasticnet'),
        SVC(random_state = 25, probability = True, kernel = 'linear'),
        SVC(random_state = 25, probability = True, kernel = 'rbf'),
        SVC(random_state = 25, probability = True, kernel = 'poly'),
        RandomForestClassifier(random_state = 25, n_estimators = 1000),
        ExtraTreesClassifier(random_state = 25, n_estimators = 1000)
    ])

Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=50, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Log loss: 0.89151937115
Time elapsed (min): 1.15
--------------------------------------------------------------
Model: SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=50, shuffle=True,
       verbose=0, warm_start=False)
Log loss: 0.940998245507
Time elapsed (min): 0.05
--------------------------------------------------------------
Model: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=25, shrinking=T

Unnamed: 0,Score,Model
0,0.856702,()
1,0.891519,"LogisticRegression(C=1.0, class_weight=None, d..."
2,0.909678,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
3,0.915336,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
4,0.940998,"SGDClassifier(alpha=0.0001, average=False, cla..."
5,0.987149,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
6,1.109537,()


The Random Forest model performed best, followed by the Logistic Regression and SVM w/ a Gaussian kernal.SVMs with linear and polynomial kernals didn't fare as well, and neither did a Logistic Regression whose parameters were optimized with stochastic gradient using an elasticnet regularization term. The Extra Trees Classifier performed the worst.

Hold on, I forgot to try out gradient boosted trees from `xgboost`:

In [6]:
try_models(train, outcomes, [
        xgb.XGBClassifier(learning_rate = 0.05, n_estimators = 500, seed = 25)
    ])

Process PoolWorker-64:
Process PoolWorker-63:
Process PoolWorker-62:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    task = get()
    self._target(*self._args, **self._kwargs)
  File "/home/jake/miniconda2/envs/shelter-animals/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line

Failure to run model:

NameError: global name 'model' is not defined