In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
import gc
from sklearn.metrics import accuracy_score,r2_score
import matplotlib.pyplot as plt
import time
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Default `random_state` for optimize(), which forwards it to XGBoost as 'seed'.
SEED = 314159265
# NOTE(review): not referenced by any cell visible here (the split below is date-based) — confirm before relying on it.
VALID_SIZE = 0.2
# NOTE(review): not referenced by any cell visible here; the label column actually used is 'ztargetMedian5' — confirm.
TARGET = 'outcome'

In [2]:
starttime = time.time()

# Read the raw panel, key it by (entityID, date) and sort on that index in one
# chained expression instead of mutating the frame in place.
mainFrame = (
    pd.read_csv('20062007Small.csv', parse_dates=['date'])
    .set_index(['entityID', 'date'])
    .sort_index()
)

# The trailing seven columns are taken as targets; every column before them
# is taken as a feature.
features = mainFrame.iloc[:, :-7]
targets = mainFrame.iloc[:, -7:]

gc.collect()
endtime = time.time()
print("It takes {}s to load data".format(endtime - starttime))

It takes 51.66190028190613s to load data


In [3]:
# Date-based split on the second index level: rows dated from '2006' up to
# (not including) '2006-10-01' train the model, rows from '2006-10-01' up to
# (not including) '2007' are held out for evaluation.
indexDates = mainFrame.index.get_level_values(1)
maskTrain = (indexDates >= '2006') & (indexDates < '2006-10-01')
maskTest = (indexDates >= '2006-10-01') & (indexDates < '2007')

# Finite magnitude substituted for infinite feature values.
INF_CLIP = 100000000


def _extract_xy(mask, label='ztargetMedian5', clip=INF_CLIP):
    """Build the (features, labels) arrays for the rows selected by `mask`.

    Parameters
    ----------
    mask : boolean array
        Row selector aligned with `features` / `targets`.
    label : str
        Column of `targets` used as the label.
    clip : number
        Magnitude that replaces infinite feature values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature matrix with infinities clipped, and the labels cast to int.
    """
    x = np.array(features[mask])
    y = np.array(targets[label][mask])

    # Replace infinities while keeping their sign: +inf -> +clip, -inf -> -clip.
    # (A plain `x[np.isinf(x)] = clip` would turn -inf into a large positive
    # value.) NaNs are deliberately left untouched.
    infinite = np.isinf(x)
    x[infinite & (x > 0)] = clip
    x[infinite & (x < 0)] = -clip

    # `* 1` followed by astype(int) coerces the label column to integers.
    y = (y * 1).astype(int)
    return x, y


x_train, y_train = _extract_xy(maskTrain)
x_test, y_test = _extract_xy(maskTest)

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_test, label=y_test)

# specify parameters via map
# NOTE(review): `param` and `num_round` are not read by any cell visible here
# (score() builds its own values); kept in case other cells rely on them.
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2

In [4]:
def score(params):
    """Hyperopt objective: train one XGBoost model and report its validation loss.

    Parameters
    ----------
    params : dict
        XGBoost training parameters plus an ``'n_estimators'`` entry that gives
        the number of boosting rounds. The dict is left unmodified, so the same
        dict can be scored more than once.

    Returns
    -------
    dict
        ``{'loss': 1 - accuracy, 'status': STATUS_OK}``, where accuracy is
        computed against ``y_test`` after thresholding the predictions on
        ``dvalid`` at 0.5.
    """
    print("Training with params: ")
    print(params)

    # Work on a copy: 'n_estimators' is not passed to xgb.train in the params
    # dict, and removing it from the caller's dict would make a second call
    # with the same dict fail with KeyError.
    train_params = dict(params)
    num_round = int(train_params.pop('n_estimators'))

    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(train_params, dtrain, num_round,
                          evals=watchlist,
                          verbose_eval=True)
    print(train_params)
    probabilities = gbm_model.predict(dvalid,
                                      ntree_limit=gbm_model.best_iteration + 1)

    # Hard 0/1 labels: strictly above 0.5 is class 1, everything else class 0.
    predicted_labels = (probabilities > 0.5).astype(int)

    accuracy = accuracy_score(y_test, predicted_labels)
    # TODO: Add the importance for the selected features
    print("\tScore {0}\n\n".format(accuracy))
    # The score function should return the loss (1-score)
    # since the optimize function looks for the minimum
    loss = 1 - accuracy
    return {'loss': loss, 'status': STATUS_OK}

In [5]:
def optimize(trials=None,
             random_state=SEED,
             max_evals=100):
    """Search the XGBoost hyperparameter space with Hyperopt's TPE algorithm.

    Parameters
    ----------
    trials : hyperopt.Trials, optional
        Trials object that records every evaluation. A fresh one is created
        when omitted.
    random_state : int
        Passed to XGBoost as its ``'seed'`` parameter. It is NOT passed to
        ``fmin``, so the sequence of sampled hyperparameters is not seeded by
        this value.
    max_evals : int
        Number of hyperparameter settings to evaluate (default 100).

    Returns
    -------
    dict
        The raw result of ``fmin``: one entry per searched hyperparameter.
        Parameters declared with ``hp.choice`` (here ``'max_depth'``) are
        reported as the index of the chosen option, not the option itself.
    """
    # To learn more about XGBoost parameters, head to this page:
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform yields floats, so max_depth is drawn with hp.choice
        # from an integer grid instead.
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will default
        # to the maximum number.
        'nthread': 3,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }
    if trials is None:
        trials = Trials()
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)
    return best

In [None]:
# Run the search with optimize()'s defaults and keep the winning setting.
best_hyperparams = optimize()

# Report the result. Parameters searched with hp.choice (max_depth) show up
# here as the index of the chosen option rather than the option itself.
headline = "The best hyperparameters are: "
print(headline, "\n")
print(best_hyperparams)

# Reclaim whatever the trials left behind.
gc.collect()

Training with params:                                                                                                  
{'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.2, 'eval_metric': 'auc', 'gamma': 0.9, 'max_depth': 13, 'min_child_weight': 5.0, 'n_estimators': 519.0, 'nthread': 3, 'objective': 'binary:logistic', 'seed': 314159265, 'silent': 1, 'subsample': 0.65, 'tree_method': 'exact'}
[0]	eval-auc:0.511105	train-auc:0.695293                                                                               

[1]	eval-auc:0.501703	train-auc:0.743413                                                                               

[2]	eval-auc:0.501469	train-auc:0.765414                                                                               

[3]	eval-auc:0.500203	train-auc:0.784266                                                                               

[4]	eval-auc:0.501327	train-auc:0.796476                                                            

[131]	eval-auc:0.509	train-auc:0.990796                                                                                

[132]	eval-auc:0.508937	train-auc:0.990982                                                                             

[133]	eval-auc:0.509004	train-auc:0.99116                                                                              

[134]	eval-auc:0.508968	train-auc:0.991539                                                                             

[135]	eval-auc:0.509109	train-auc:0.991748                                                                             

[136]	eval-auc:0.509105	train-auc:0.991858                                                                             

[137]	eval-auc:0.509409	train-auc:0.992245                                                                             

[138]	eval-auc:0.50948	train-auc:0.99241                                                                               

[139]	eval-auc:0.509175	train-au

[265]	eval-auc:0.51198	train-auc:0.999795                                                                              

[266]	eval-auc:0.511927	train-auc:0.999802                                                                             

[267]	eval-auc:0.511968	train-auc:0.999813                                                                             

[268]	eval-auc:0.511905	train-auc:0.999817                                                                             

[269]	eval-auc:0.511907	train-auc:0.999818                                                                             

[270]	eval-auc:0.511784	train-auc:0.999822                                                                             

[271]	eval-auc:0.511737	train-auc:0.999826                                                                             

[272]	eval-auc:0.511754	train-auc:0.99983                                                                              

[273]	eval-auc:0.51165	train-auc

[399]	eval-auc:0.513794	train-auc:0.999997                                                                             

[400]	eval-auc:0.513786	train-auc:0.999997                                                                             

[401]	eval-auc:0.513729	train-auc:0.999997                                                                             

[402]	eval-auc:0.513693	train-auc:0.999997                                                                             

[403]	eval-auc:0.513698	train-auc:0.999998                                                                             

[404]	eval-auc:0.513743	train-auc:0.999998                                                                             

[405]	eval-auc:0.513735	train-auc:0.999998                                                                             

[406]	eval-auc:0.513636	train-auc:0.999998                                                                             

[407]	eval-auc:0.513981	train-au

[8]	eval-auc:0.48523	train-auc:0.672521                                                                                

[9]	eval-auc:0.486935	train-auc:0.67576                                                                                

[10]	eval-auc:0.489717	train-auc:0.677876                                                                              

[11]	eval-auc:0.490772	train-auc:0.680613                                                                              

[12]	eval-auc:0.49177	train-auc:0.682838                                                                               

[13]	eval-auc:0.492514	train-auc:0.684703                                                                              

[14]	eval-auc:0.492905	train-auc:0.686546                                                                              

[15]	eval-auc:0.493358	train-auc:0.688869                                                                              

[16]	eval-auc:0.494493	train-auc

[142]	eval-auc:0.49638	train-auc:0.792665                                                                              

[143]	eval-auc:0.49652	train-auc:0.793343                                                                              

[144]	eval-auc:0.496596	train-auc:0.793733                                                                             

[145]	eval-auc:0.496596	train-auc:0.794029                                                                             

[146]	eval-auc:0.496642	train-auc:0.794135                                                                             

[147]	eval-auc:0.496675	train-auc:0.794237                                                                             

[148]	eval-auc:0.496657	train-auc:0.794453                                                                             

[149]	eval-auc:0.496671	train-auc:0.79538                                                                              

[150]	eval-auc:0.496934	train-au

[276]	eval-auc:0.501199	train-auc:0.842518                                                                             

[277]	eval-auc:0.501134	train-auc:0.842762                                                                             

[278]	eval-auc:0.501178	train-auc:0.843237                                                                             

[279]	eval-auc:0.501076	train-auc:0.843504                                                                             

[280]	eval-auc:0.501197	train-auc:0.843898                                                                             

[281]	eval-auc:0.501237	train-auc:0.844569                                                                             

[282]	eval-auc:0.501275	train-auc:0.844737                                                                             

[283]	eval-auc:0.501286	train-auc:0.844935                                                                             

[284]	eval-auc:0.501292	train-au

[410]	eval-auc:0.503675	train-auc:0.879137                                                                             

[411]	eval-auc:0.503633	train-auc:0.879329                                                                             

[412]	eval-auc:0.503651	train-auc:0.879687                                                                             

[413]	eval-auc:0.503677	train-auc:0.879663                                                                             

[414]	eval-auc:0.503622	train-auc:0.879936                                                                             

[415]	eval-auc:0.503592	train-auc:0.880018                                                                             

[416]	eval-auc:0.503609	train-auc:0.880362                                                                             

[417]	eval-auc:0.503552	train-auc:0.880646                                                                             

[418]	eval-auc:0.503519	train-au

[20]	eval-auc:0.509892	train-auc:0.883087                                                                              

[21]	eval-auc:0.510099	train-auc:0.883891                                                                              

[22]	eval-auc:0.510345	train-auc:0.885068                                                                              

[23]	eval-auc:0.511039	train-auc:0.887952                                                                              

[24]	eval-auc:0.512463	train-auc:0.893179                                                                              

[25]	eval-auc:0.512656	train-auc:0.896154                                                                              

[26]	eval-auc:0.512877	train-auc:0.897362                                                                              

[27]	eval-auc:0.512919	train-auc:0.899789                                                                              

[28]	eval-auc:0.513046	train-auc

[154]	eval-auc:0.515647	train-auc:0.996002                                                                             

[155]	eval-auc:0.515814	train-auc:0.996168                                                                             

[156]	eval-auc:0.515951	train-auc:0.996284                                                                             

[157]	eval-auc:0.516131	train-auc:0.996398                                                                             

[158]	eval-auc:0.516205	train-auc:0.996481                                                                             

[159]	eval-auc:0.516331	train-auc:0.996539                                                                             

[160]	eval-auc:0.51624	train-auc:0.996618                                                                              

[161]	eval-auc:0.51639	train-auc:0.996698                                                                              

[162]	eval-auc:0.516528	train-au

[288]	eval-auc:0.517215	train-auc:0.999943                                                                             

[289]	eval-auc:0.517248	train-auc:0.999945                                                                             

[290]	eval-auc:0.517268	train-auc:0.999946                                                                             

[291]	eval-auc:0.517445	train-auc:0.99995                                                                              

[292]	eval-auc:0.517361	train-auc:0.999952                                                                             

[293]	eval-auc:0.517341	train-auc:0.999952                                                                             

[294]	eval-auc:0.517301	train-auc:0.999955                                                                             

[295]	eval-auc:0.517493	train-auc:0.999958                                                                             

[296]	eval-auc:0.517475	train-au

[422]	eval-auc:0.518597	train-auc:1                                                                                    

[423]	eval-auc:0.518745	train-auc:1                                                                                    

[424]	eval-auc:0.518811	train-auc:1                                                                                    

[425]	eval-auc:0.518782	train-auc:1                                                                                    

[426]	eval-auc:0.518635	train-auc:1                                                                                    

[427]	eval-auc:0.518533	train-auc:1                                                                                    

[428]	eval-auc:0.518614	train-auc:1                                                                                    

[429]	eval-auc:0.518651	train-auc:1                                                                                    

[430]	eval-auc:0.5186	train-auc:

[556]	eval-auc:0.517749	train-auc:1                                                                                    

[557]	eval-auc:0.517784	train-auc:1                                                                                    

[558]	eval-auc:0.517732	train-auc:1                                                                                    

[559]	eval-auc:0.517772	train-auc:1                                                                                    

[560]	eval-auc:0.517895	train-auc:1                                                                                    

[561]	eval-auc:0.517866	train-auc:1                                                                                    

[562]	eval-auc:0.517874	train-auc:1                                                                                    

[563]	eval-auc:0.517888	train-auc:1                                                                                    

[564]	eval-auc:0.517766	train-au

[690]	eval-auc:0.518177	train-auc:1                                                                                    

[691]	eval-auc:0.518187	train-auc:1                                                                                    

[692]	eval-auc:0.5182	train-auc:1                                                                                      

[693]	eval-auc:0.518184	train-auc:1                                                                                    

[694]	eval-auc:0.518321	train-auc:1                                                                                    

[695]	eval-auc:0.518366	train-auc:1                                                                                    

[696]	eval-auc:0.518345	train-auc:1                                                                                    

[697]	eval-auc:0.518277	train-auc:1                                                                                    

[698]	eval-auc:0.518248	train-au

[113]	eval-auc:0.506523	train-auc:0.828866                                                                             

[114]	eval-auc:0.506486	train-auc:0.83005                                                                              

[115]	eval-auc:0.506537	train-auc:0.830711                                                                             

[116]	eval-auc:0.506598	train-auc:0.831144                                                                             

[117]	eval-auc:0.506495	train-auc:0.831408                                                                             

[118]	eval-auc:0.506476	train-auc:0.832063                                                                             

[119]	eval-auc:0.506571	train-auc:0.832566                                                                             

[120]	eval-auc:0.506602	train-auc:0.832777                                                                             

[121]	eval-auc:0.506541	train-au

[247]	eval-auc:0.510478	train-auc:0.89613                                                                              

[248]	eval-auc:0.510447	train-auc:0.896299                                                                             

[249]	eval-auc:0.510516	train-auc:0.896601                                                                             

[250]	eval-auc:0.510656	train-auc:0.896949                                                                             

[251]	eval-auc:0.510765	train-auc:0.897499                                                                             

[252]	eval-auc:0.510732	train-auc:0.898048                                                                             

[253]	eval-auc:0.510792	train-auc:0.898235                                                                             

[254]	eval-auc:0.510816	train-auc:0.898332                                                                             

[255]	eval-auc:0.510643	train-au

[381]	eval-auc:0.512971	train-auc:0.935992                                                                             

[382]	eval-auc:0.512969	train-auc:0.936329                                                                             

[383]	eval-auc:0.512968	train-auc:0.936472                                                                             

[384]	eval-auc:0.513226	train-auc:0.936721                                                                             

[385]	eval-auc:0.513351	train-auc:0.936901                                                                             

[386]	eval-auc:0.513396	train-auc:0.937043                                                                             

[387]	eval-auc:0.513372	train-auc:0.937142                                                                             

[388]	eval-auc:0.513399	train-auc:0.93722                                                                              

[389]	eval-auc:0.513357	train-au

In [None]:
# `gbm_model` only ever exists as a local inside score(), so there is no
# trained model at notebook scope to save. Refit a single model with the
# winning settings and save that one.
final_params = {
    # Fixed (non-searched) settings, mirroring the space used by optimize().
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'nthread': 3,
    'booster': 'gbtree',
    'tree_method': 'exact',
    'silent': 1,
    'seed': SEED,
}
final_params.update(best_hyperparams)

# fmin() reports hp.choice parameters by index; map the index back onto the
# same integer grid optimize() searched for max_depth.
max_depth_grid = np.arange(1, 14, dtype=int)
final_params['max_depth'] = int(max_depth_grid[int(best_hyperparams['max_depth'])])

# The number of boosting rounds is an argument of xgb.train, not a booster parameter.
final_rounds = int(final_params.pop('n_estimators'))

gbm_model = xgb.train(final_params, dtrain, final_rounds)
gbm_model.save_model('xgboost_imple.model')