In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand

train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')

test['loss'] = np.nan
joined = pd.concat([train, test])

def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))


cat_feature = [n for n in joined.columns if n.startswith('cat')]    
cont_feature = [n for n in joined.columns if n.startswith('cont')] 

for column in cat_feature:
        joined[column] = pd.factorize(joined[column].values, sort=True)[0]

    
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)


def score(params):           
    print("Training with params : ")
    print(params)

    final_fold_prediction= []
    final_fold_real = []

    prediction = np.zeros(X.shape[0])
    
    n_folds  = params["n_folds"]
    del(params["n_folds"])
    
    shift = params["shift"]
    del(params["shift"])
    y = np.log(train['loss'] + shift)
    xgfull = xgb.DMatrix(X, label=y)
    
    kf = KFold(X.shape[0], n_folds=n_folds)
    
    for i, (train_index, test_index) in enumerate(kf):        
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]    
          
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        xgtrain_2 = xgb.DMatrix(X_val, label=y_val)
        
        watchlist = [(xgtrain, 'train'), (xgtrain_2, 'eval')]                    

        model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror, 
        early_stopping_rounds=300, verbose_eval=200)        

        X_val = xgb.DMatrix(X_val) 
        temp_serises = pd.Series(np.exp(model.predict(X_val))-shift)
        final_fold_prediction.append( temp_serises )
        temp_serises = np.exp(y_val) -shift
        final_fold_real.append(temp_serises )
        
        temp_cv_score = mean_absolute_error(np.exp(model.predict(X_val))-shift, np.exp(y_val) -shift)
        print("Fold {} score: {}".format(i + 1, temp_cv_score))
        
        prediction += np.exp(model.predict(xgfull)) - shift

    prediction = prediction/n_folds
    score =  mean_absolute_error(y, prediction)
        
    print("\tMAE {0}\n\n".format(score))    
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    RANDOM_STATE = 2016
    space = {
             'min_child_weight' : hp.quniform('min_child_weight', 0.01, 1.0, 0.01),
             'eta' : hp.quniform('eta', 0.001, 1.0, 0.01),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.01, 1.0, 0.01),
             'max_depth' : hp.choice('batch_size', np.arange(1, 20, dtype=int)),   
             'subsample' : hp.quniform('subsample', 0.6, 1.0, 0.01),
             'alpha' : hp.quniform('alpha', 0.01, 1.0, 0.01),
             'gamma' : hp.quniform('gamma', 0.01, 1.0, 0.01),                        
             'seed' : RANDOM_STATE,
             'n_folds' : hp.choice('n_folds', np.arange(2, 10, dtype=int)), 
             'shift' : hp.choice('shift', np.arange(150, 5000, dtype=int)),
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print("Best params predicted by hyperopt are:")    
    print(best)

#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)



Training with params : 
{'n_folds': 4, 'colsample_bytree': 0.9400000000000001, 'subsample': 0.9500000000000001, 'seed': 2016, 'min_child_weight': 0.05, 'gamma': 0.55, 'alpha': 0.14, 'eta': 0.48, 'max_depth': 16, 'shift': 426}
[0]	train-mae:1.04112e+08	eval-mae:1.04864e+08
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.




ValueError: Input contains NaN, infinity or a value too large for dtype('float32').