# Modeling

In [1]:
import cPickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

Grab the engineered data

In [2]:
def read_pickle(file_name):
    """
    Loads and returns the object pickled in [file_name].

    The file is opened in binary mode inside a `with` block, so the handle is
    closed even when unpickling raises. Any exception from opening or
    unpickling propagates to the caller.

    NOTE: unpickling can execute arbitrary code; only use this on files you
    produced yourself.
    """
    with open(file_name, 'rb') as f:
        return cPickle.load(f)


# Load the four pickled artifacts in one pass, in the same order as before.
# `outcomes` is later passed as the label vector when splitting/fitting, and
# `outcomes_le` is later used to map encoded labels back to outcome names.
train, test, outcomes, outcomes_le = map(read_pickle, [
    'data/train.engineered',
    'data/test.engineered',
    'data/outcomes.engineered',
    'data/outcomes_le.engineered',
])

Split the `train` data into training and test sets using the hold-out method. Although there is a DataFrame named `test`, it is the set we ultimately want to make predictions for; it has no labels, so it cannot be used for evaluation.

In [3]:
# Hold out 20% of the labeled rows for evaluation; the fixed random_state
# keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(train),
    outcomes,
    test_size=0.2,
    random_state=10)

## Baseline model

Though I suspect other models will make more accurate predictions, let me quickly try a logistic regression model with different regularization hyperparameters.

In [4]:
def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test):
    """
    Given a [model] and a set of possible [hyperparameters], along with 
    matricies corresponding to hold-out cross-validation, returns a model w/ 
    optimized hyperparameters using log-loss scoring and 5-fold cross-validation.
    """
    optimized_model = GridSearchCV(
        model, hyperparameters, cv = 5, n_jobs = -1, scoring = 'log_loss')
    optimized_model.fit(X_train, y_train)
    print 'Optimized parameters:', optimized_model.best_params_
    print 'Log loss:', np.absolute(optimized_model.score(X_test, y_test))
    return optimized_model

In [5]:
%%time
logit_model = train_test_model(
    LogisticRegression(), 
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}, 
    X_train, X_test, y_train, y_test)

Optimized parameters: {'penalty': 'l1', 'C': 1}
Log loss: 0.888370971268
CPU times: user 10.6 s, sys: 96 ms, total: 10.7 s
Wall time: 1min 1s




In [24]:
# Display the estimator selected by the grid search, with its full parameter set.
logit_model.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# IDs for the rows we make predictions for. They must come from the unlabeled
# test file (not train.csv), otherwise they cannot line up with predictions
# made on `test`.
# NOTE(review): assumes data/test.csv names its identifier column `ID` (as in
# the Kaggle Shelter Animal Outcomes data) and that the engineered `test`
# frame preserves the CSV's row order -- confirm both.
test_ids = pd.read_csv('data/test.csv')[['ID']]

In [8]:
# Refit the winning configuration on *all* labeled data before predicting.
# clone() returns a fresh, unfitted copy with the same hyperparameters, so the
# estimator held by `logit_model` (the one scored on the hold-out set) is not
# overwritten by this fit.
clf = clone(logit_model.best_estimator_)
clf.fit(np.array(train), outcomes)
# Class probabilities for every row of the unlabeled `test` set.
probs = clf.predict_proba(np.array(test))

In [14]:
# Wrap the predicted class probabilities in a DataFrame. No column names are
# given here, so the columns carry pandas' default labels.
a = pd.DataFrame(probs)

In [16]:
# Translate the column labels of `a` back to outcome names via the label
# encoder. NOTE(review): this relies on column i of `probs` corresponding to
# encoded label i -- confirm against clf.classes_.
list(outcomes_le.inverse_transform(a.columns.tolist()))

['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']