In [1]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import sys
import xgboost as xgb
import os
from utils import encode_numeric_zscore_list, encode_numeric_zscore_all, to_xy, encode_text_index_list, encode_numeric_log_list



In [2]:
def load_train():
    """Load the training set and split it into features and labels.

    Reads ``train.csv.zip`` from ``./data/allstate``, shuffles the rows with a
    fixed seed, separates the ``loss`` column as the target, drops ``id`` and
    ``loss`` from the features, and passes the columns ``cat1`` .. ``cat116``
    to ``encode_text_index_list``.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is the remaining
        DataFrame and ``labels`` is the ``loss`` column cast to ``float32``.

    Side effects:
        Seeds NumPy's global random generator with 42.
    """
    csv_path = os.path.join("./data/allstate", "train.csv.zip")
    raw = pd.read_csv(csv_path, compression="zip", header=0, na_values=['NULL'])

    # Shuffle the rows with a fixed seed so the order is reproducible, then
    # renumber them 0..n-1. (The original author noted that shuffling may be
    # unnecessary for this dataset.)
    np.random.seed(42)
    shuffled_order = np.random.permutation(raw.index)
    shuffled = raw.reindex(shuffled_order).reset_index(drop=True)

    labels = shuffled["loss"]
    features = shuffled.drop(['id', 'loss'], axis=1)

    # Column names 'cat1' through 'cat116'.
    categorical_columns = ['cat{0}'.format(number) for number in range(1, 117)]
    # Project helper from utils; presumably encodes these columns in place,
    # since its return value is not used -- TODO confirm against utils.
    encode_text_index_list(features, categorical_columns)

    return features, labels.astype('float32')

def load_test():
    """Load the test set as a plain array of feature values.

    Reads ``../data/test.csv``, removes the ``id`` column and returns the
    remaining cells via ``DataFrame.values``.

    NOTE(review): unlike ``load_train`` no categorical encoding is applied
    here, and the path is relative to the parent directory rather than
    ``./data/allstate`` -- confirm this is intended before using it.
    """
    return pd.read_csv('../data/test.csv').drop('id', axis=1).values


def write_submission(preds, output):
    """Write predictions to ``output`` in the sample-submission layout.

    The sample submission at ``../data/sampleSubmission.csv`` supplies the row
    ids (its ``id`` column) and the value column names (every column after the
    first). ``preds`` is wrapped in a DataFrame with those labels and written
    as CSV with the index column labelled ``id``.

    Args:
        preds: prediction values, one row per row of the sample submission.
        output: path or file-like object handed to ``DataFrame.to_csv``.
    """
    template = pd.read_csv('../data/sampleSubmission.csv')
    row_ids = template.id.values
    value_columns = template.columns[1:]

    submission = pd.DataFrame(preds, index=row_ids, columns=value_columns)
    submission.to_csv(output, index_label='id')


def score(params):
    """Objective function for hyperopt: train XGBoost and return validation MAE.

    Trains a booster on the module-level ``X_train`` / ``y_train`` for
    ``params['n_estimators']`` boosting rounds, predicts on the module-level
    ``X_test`` and scores the predictions against ``y_test`` with
    ``mean_absolute_error``.

    Args:
        params: dict of XGBoost parameters plus an ``'n_estimators'`` entry
            giving the number of boosting rounds (converted with ``int``).
            The dict is NOT modified, so the same dict can be scored again.

    Returns:
        dict: ``{'loss': <mean absolute error>, 'status': STATUS_OK}``.

    Raises:
        KeyError: if ``params`` has no ``'n_estimators'`` entry.
    """
    print("Training with params : ")
    print(params)

    # Work on a shallow copy: 'n_estimators' is not a booster parameter and
    # must be removed before training, but deleting it from the caller's dict
    # would make a second call with the same dict fail with KeyError.
    booster_params = dict(params)
    num_round = int(booster_params.pop('n_estimators'))

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(booster_params, dtrain, num_round)

    predictions = model.predict(dvalid)
    print(predictions)

    # Named 'mae' rather than 'score' so the function name is not shadowed.
    mae = mean_absolute_error(y_test, predictions)
    print("\tScore {0}\n\n".format(mae))
    return {'loss': mae, 'status': STATUS_OK}


def optimize(trials, max_evals=250):
    """Run a random hyperparameter search over XGBoost settings.

    Builds the search space and calls hyperopt's ``fmin`` with ``score`` as the
    objective and ``rand.suggest`` (random search) as the algorithm.

    Args:
        trials: hyperopt ``Trials`` object in which the search history is
            stored.
        max_evals: number of parameter sets to evaluate. Defaults to 250.

    Returns:
        The value returned by ``fmin`` (the best parameter assignment found).
        It is also printed. Per the hyperopt documentation, parameters defined
        with ``hp.choice`` (here ``max_depth``) are reported as the index of
        the chosen option rather than the option itself.
    """
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             # hp.choice is used instead of hp.randint, which the original
             # author reported as crashing.
             'max_depth' : hp.choice('max_depth', [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             # Fixed (non-searched) settings passed straight through to XGBoost.
             'eval_metric': 'mae',
             'objective': 'reg:linear',
             'nthread' : 6,
             'silent' : 0
             }

    best = fmin(score, space, algo=rand.suggest, trials=trials, max_evals=max_evals)

    print(best)
    # Return the result so callers can use it, not just read it from the log.
    return best


# Driver: load the data, hold out a validation split, and run the search.
# X holds the feature frame and y the float32 'loss' labels from load_train().
X, y = load_train()
print("Splitting data into train and valid ...\n\n")
# Hold out 20% of the rows with a fixed random_state. These four names are
# read as module-level globals by score(), so they must exist before
# optimize() is called. Note that X_test / y_test act as the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Trials object in which hyperopt stores the history of the search.
trials = Trials()

# Runs the hyperparameter search; progress is printed by score() for every trial.
optimize(trials)

Splitting data into train and valid ...


Training with params : 
{'eta': 0.325, 'objective': 'reg:linear', 'max_depth': 6, 'gamma': 0.6000000000000001, 'n_estimators': 889.0, 'subsample': 0.7000000000000001, 'min_child_weight': 3.0, 'nthread': 6, 'eval_metric': 'mae', 'silent': 0, 'colsample_bytree': 0.9500000000000001}
[ 4260.13232422  6007.32910156  2062.97338867 ...,  1755.7298584
  4027.84570312  4747.23583984]
	Score 1293.1312255859375


Training with params : 
{'eta': 0.35000000000000003, 'objective': 'reg:linear', 'max_depth': 3, 'gamma': 0.8500000000000001, 'n_estimators': 808.0, 'subsample': 0.5, 'min_child_weight': 6.0, 'nthread': 6, 'eval_metric': 'mae', 'silent': 0, 'colsample_bytree': 0.9}
[ 4271.69189453  4529.23828125  2383.32788086 ...,  1528.9888916
  3911.03051758  5191.28857422]
	Score 1228.1441650390625


Training with params : 
{'eta': 0.275, 'objective': 'reg:linear', 'max_depth': 1, 'gamma': 0.7000000000000001, 'n_estimators': 147.0, 'subsample': 0.8, 'min_child