In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold,KFold,train_test_split
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import LabelKFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt

import datetime
import random
from operator import itemgetter
import time
import copy

from scipy.io import mmread

np.random.seed(333)

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import os
import sys
#sys.stdout = open('tune_hyperopt_try1.txt', 'w', 1)
import xgboost as xgb

In [2]:
# Pre-built xgboost matrices read from svmlight-format files on disk.
# NOTE(review): `dtrain` is re-assigned from a different directory in a later
# cell, so this binding is only live until that cell runs.
dtrain = xgb.DMatrix('svmlight_try2/dtrain.data')
dtest = xgb.DMatrix('svmlight_try2/dtest.data')

# Raw activity / people tables. The id columns are forced to strings and the
# `date` column is parsed as datetime. The builtin `str` replaces the
# deprecated `np.str` alias (same dtype, works on every NumPy version).
act_train_data = pd.read_csv("redhat_data_new/act_train_new_try2.csv",dtype={'people_id': str, 'activity_id': str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("redhat_data_new/act_test_new_try2.csv", dtype={'people_id': str, 'activity_id': str}, parse_dates=['date'])
people_data    = pd.read_csv("redhat_data_new/people.csv", dtype={'people_id': str, 'activity_id': str, 'char_38': np.int32}, parse_dates=['date'])

In [None]:
# 30-fold cross-validation splitter built from the `people_id` column, used as
# a module-level global by the classifier-scoring helper defined in this cell.
# LabelKFold groups rows by label, so (presumably) no person appears on both
# sides of a split -- confirm against the installed scikit-learn version.
lkf = LabelKFold(act_train_data['people_id'], n_folds=30)
def score(clf, random_state = 444):
    """Produce out-of-fold class probabilities for ``clf`` over the ``lkf`` folds.

    Reads the notebook-level globals ``Xtrain``, ``y``, ``nclasses`` and
    ``lkf``. For every fold the classifier is refit on the training rows and
    its ``predict_proba`` output is stored for the held-out rows. The per-fold
    AUC and the overall AUC are printed on one line.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    random_state : unused by the body; kept so existing calls still work.

    Returns
    -------
    numpy.ndarray of shape ``(y.shape[0], nclasses)`` holding the
    out-of-fold predictions.
    """
    oof_pred = np.zeros((y.shape[0], nclasses))
    for train_idx, holdout_idx in lkf:
        X_fit, X_holdout = Xtrain[train_idx, :], Xtrain[holdout_idx, :]
        y_fit, y_holdout = y[train_idx], y[holdout_idx]
        clf.fit(X_fit, y_fit)
        oof_pred[holdout_idx, :] = clf.predict_proba(X_holdout)
        # AUC is computed on every column after the first (the `1:` slice).
        fold_auc = roc_auc_score(y_holdout, oof_pred[holdout_idx, 1:])
        print("{:.5f}".format(fold_auc), end=' ')

    overall_auc = roc_auc_score(y, oof_pred[:, 1:])
    print("score : {:.5f}".format(overall_auc), end=' ')
    return oof_pred

In [14]:
def score1(params):
    """Hyperopt objective: train with early stopping, return 1 - validation AUC.

    Reads the notebook-level globals ``dtrain``, ``dval`` and ``yval``.

    Parameters
    ----------
    params : dict
        xgboost training parameters plus an ``'n_estimators'`` entry. That
        entry is used as the number of boosting rounds and is not forwarded
        to ``xgb.train``. The dict passed in is left unmodified.

    Returns
    -------
    dict
        ``{'loss': 1 - auc_val, 'status': STATUS_OK}``.

    Raises
    ------
    KeyError
        If ``params`` has no ``'n_estimators'`` key.
    """
    print("Training with params : ")
    print(params)
    # Work on a copy so the caller's dict keeps its 'n_estimators' entry and
    # the function can be called again with the same dict.
    train_params = dict(params)
    num_round = int(train_params.pop('n_estimators'))
    # Early stopping is evaluated on the last entry of the watchlist.
    watchlist = [(dtrain, 'train'),(dval, 'eval')]
    model = xgb.train(train_params, dtrain, num_round, early_stopping_rounds=30, evals=watchlist, verbose_eval=10)
    pred_val = model.predict(dval, ntree_limit = model.best_ntree_limit)
    score_val = roc_auc_score(yval, pred_val)
    # The loss is 1 - AUC so that a lower loss corresponds to a higher AUC.
    # (The original read `1 - score`, an unassigned local, and raised
    # UnboundLocalError.)
    loss = 1 - score_val
    # "\a" in the original literal was a BEL control character, not a letter.
    print("\tAuc_val: {0}\n\n".format(score_val))
    return {'loss': loss, 'status': STATUS_OK}

def score(params):
    """Hyperopt objective: train for a fixed number of rounds, return 1 - AUC.

    Reads the notebook-level globals ``dtrain``, ``dval`` and ``yval``.

    Parameters
    ----------
    params : dict
        xgboost training parameters plus an ``'n_estimators'`` entry. That
        entry is used as the number of boosting rounds and is not forwarded
        to ``xgb.train``. The dict passed in is left unmodified.

    Returns
    -------
    dict
        ``{'loss': 1 - auc_val, 'status': STATUS_OK}``.

    Raises
    ------
    KeyError
        If ``params`` has no ``'n_estimators'`` key.
    """
    print("Training with params : ")
    print(params)
    # Copy first: deleting the key from the caller's dict made a second call
    # with the same dict fail with KeyError.
    train_params = dict(params)
    num_round = int(train_params.pop('n_estimators'))
    model = xgb.train(train_params, dtrain, num_round)
    pred_val = model.predict(dval)
    score_val = roc_auc_score(yval, pred_val)
    # The loss is 1 - AUC so that a lower loss corresponds to a higher AUC.
    loss = 1 - score_val
    # "\a" in the original literal was a BEL control character, not a letter.
    print("\tAuc_val: {0}\n\n".format(score_val))
    return {'loss': loss, 'status': STATUS_OK}

def optimize(random_state=5, max_evals=250, trials=None):
    """Run a TPE search over the gblinear regularisation parameters.

    The objective is the notebook-level ``score(params)`` function.

    Parameters
    ----------
    random_state : int
        Forwarded to xgboost as ``'seed'``. It is not passed to ``fmin``, so
        it does not seed the search itself.
    max_evals : int
        Number of objective evaluations (previously hard-coded to 250).
    trials : hyperopt.Trials or None
        Optional object passed through to ``fmin`` to record the search
        history. ``None`` keeps the previous behaviour of not supplying one.

    Returns
    -------
    dict
        The best parameter assignment reported by ``fmin``; it is also
        printed, as before.
    """
    space = {
             # Fixed settings: number of rounds and learning rate are not searched.
             # (A searched alternative was hp.quniform('n_estimators', 10, 1000, 1).)
             'n_estimators' : 200,
             'eta' : 0.1,
             # Searched settings: each drawn uniformly from [0, 5].
             'lambda': hp.uniform('lambda', 0, 5),
             'alpha': hp.uniform('alpha', 0, 5),
             'lambda_bias' : hp.uniform('lambda_bias', 0, 5),
             'eval_metric': 'auc',
             'objective': 'binary:logistic',
             'booster': 'gblinear',
             'nthread' : 4,
             'silent' : 1,
             'seed' : random_state
             }

    best = fmin(score, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    print(best)
    # Return the result so callers can use it instead of only reading stdout.
    return best

In [15]:
# Re-bind the globals the hyperopt objective functions read: training matrix,
# validation matrix and validation labels. This replaces the `dtrain` loaded
# in the earlier data-loading cell.
dtrain = xgb.DMatrix('to_r_n_back/dtrain.data')
# NOTE(review): the validation matrix comes from a file named `dtest.data`
# while its labels come from `val1_target.csv` -- confirm the rows line up.
dval = xgb.DMatrix('to_r_n_back/dtest.data')
yval = (pd.read_csv('to_r_n_back/val1_target.csv')).outcome.values
dval.set_label(yval)

# Trials object intended to hold the search history.
# NOTE(review): it is created here but not passed to the optimize() call
# below, so nothing is recorded in it by this cell.
trials = Trials()

# Launch the hyper-parameter search with its default arguments.
optimize()

Training with params : 
{'objective': 'binary:logistic', 'n_estimators': 200, 'alpha': 2.49661904880883, 'silent': 1, 'lambda': 0.38400074436670595, 'nthread': 4, 'seed': 5, 'lambda_bias': 1.3923183019424918, 'eval_metric': 'auc', 'booster': 'gblinear', 'eta': 0.1}
	Auc_val: 0.6664703982970666


Training with params : 
{'objective': 'binary:logistic', 'n_estimators': 200, 'alpha': 0.08119008636677472, 'silent': 1, 'lambda': 1.4064943322305297, 'nthread': 4, 'seed': 5, 'lambda_bias': 3.7369339881757617, 'eval_metric': 'auc', 'booster': 'gblinear', 'eta': 0.1}
	Auc_val: 0.6656938945183298


Training with params : 
{'objective': 'binary:logistic', 'n_estimators': 200, 'alpha': 4.4773322739405765, 'silent': 1, 'lambda': 1.405098042619356, 'nthread': 4, 'seed': 5, 'lambda_bias': 3.2616441696339997, 'eval_metric': 'auc', 'booster': 'gblinear', 'eta': 0.1}
	Auc_val: 0.6683088474086012


Training with params : 
{'objective': 'binary:logistic', 'n_estimators': 200, 'alpha': 1.783530783700318

KeyboardInterrupt: 

In [None]:
# Persist the collected scores as a two-column CSV.
# NOTE(review): `data` is not defined in the visible part of this notebook;
# on a fresh kernel this line raises NameError -- confirm where it comes from.
df_score = pd.DataFrame(data,columns=['test','val'])
df_score.to_csv('tunescores_hyperopt_try10.csv',index=False)

# Upload the score file and the text log through the external `drive` CLI,
# then power the machine off. The return codes of os.system are ignored, so
# the poweroff runs even when an upload fails.
# NOTE(review): the log file name uploaded here is not written anywhere in
# the visible code (the stdout redirect in the import cell is commented out
# and uses a different name) -- verify the file exists before relying on it.
os.system('drive upload --file tunescores_hyperopt_try10.csv')
os.system('drive upload --file tune_hyperopt_try10.txt')
os.system('sudo poweroff')