# HiggsML
## XGBoost

In [1]:
# using XGBoost official demo for kaggle-higgs
# https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/

In [2]:
# import
import numpy as np
import xgboost as xgb

## Training

In [14]:
# XGBoost test code below
test_size = 550000

# path to where the data lies
dpath = '../../data/'

# load in training data, directly use numpy
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
print ('finish loading from csv ')

label  = dtrain[0:225000,32]
data   = dtrain[0:225000,1:31]
# rescale weight to make it same as test set
weight = dtrain[0:225000,31] * float(test_size) / len(label)

sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0  )

# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )

# setup parameters for xgboost
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 16

# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())+[('eval_metric', 'ams@0.15')]

watchlist = [ (xgmat,'train') ]
# boost 120 tres
num_round = 120
print ('loading data end, start to boost trees')
bst = xgb.train( plst, xgmat, num_round, watchlist );
# save out model
bst.save_model('higgs.model')

print ('finish training')

finish loading from csv 
weight statistics: wpos=1522.38, wneg=905021, ratio=594.478
loading data end, start to boost trees
[0]	train-auc:0.911423	train-ams@0.15:3.82749
[1]	train-auc:0.915852	train-ams@0.15:3.84539
[2]	train-auc:0.918686	train-ams@0.15:4.13342
[3]	train-auc:0.919729	train-ams@0.15:4.04069
[4]	train-auc:0.920828	train-ams@0.15:4.2107
[5]	train-auc:0.921371	train-ams@0.15:4.22123
[6]	train-auc:0.921817	train-ams@0.15:4.22617
[7]	train-auc:0.923193	train-ams@0.15:4.32661
[8]	train-auc:0.923809	train-ams@0.15:4.33708
[9]	train-auc:0.924416	train-ams@0.15:4.34483
[10]	train-auc:0.924974	train-ams@0.15:4.37694
[11]	train-auc:0.925546	train-ams@0.15:4.40831
[12]	train-auc:0.926095	train-ams@0.15:4.42525
[13]	train-auc:0.926513	train-ams@0.15:4.44384
[14]	train-auc:0.926951	train-ams@0.15:4.48167
[15]	train-auc:0.927612	train-ams@0.15:4.53498
[16]	train-auc:0.928051	train-ams@0.15:4.5426
[17]	train-auc:0.928689	train-ams@0.15:4.58298
[18]	train-auc:0.929227	train-ams@0.15:4.6

## Evaluation

In [17]:
# construct test data
tLabel  = dtrain[225001:250000,32]
tData   = dtrain[225001:250000,1:31]
# rescale weight to make it same as test set
tWeight = dtrain[225001:250000,31] * float(test_size) / len(label)

# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( tData)

In [18]:
# predict classes
ypred = bst.predict(xgmat)

In [21]:
# make top 15% positive
idx = dtrain[225001:250000,0]
threshold_ratio = 0.15
res  = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]

rorder = {}
for k, v in sorted( res, key = lambda x:-x[1] ):
    rorder[ k ] = len(rorder) + 1
ntop = int( threshold_ratio * len(rorder ) )

In [27]:
len(rorder)

24999

In [28]:
len(res)

24999

In [50]:
myPred = []
nhit = 0
for k, v in res:
    # if rorder[k] <= ntop:
    if v > 0:
        lb = 's'
        nhit += 1
    else:
        lb = 'b'
    # change output rank order to follow Kaggle convention
    myPred.append(lb)

In [32]:
# compute AMS
def ams(s, b):
    from math import sqrt,log
    if b==0:
        return 0

    return sqrt(2*((s+b+10)*log(1+float(s)/(b+10))-s))

In [33]:
# compute all measures
def validate(predicted, real, weights):
    sumsig = 0.
    sumbkg = 0.
    tp = 0.
    tn = 0.
    fp = 0.
    fn = 0.
    precision = 0.
    recall = 0.
    acc = 0.
    
    if (predicted.shape[0] != real.shape[0]):
        raise Exception
    
    for i in range(predicted.shape[0]):
        if predicted[i] == "s":
            if real[i] == "s":
                sumsig += weights[i]
                tp += 1
            else:
                sumbkg += weights[i]
                fp += 1
        else:
            if real[i] == "s":
                fn += 1
            else:
                tn += 1
    
    print(tp, fp, fn, tn)
    
    # calculate scores
    amsscore = ams(sumsig * 10, sumbkg * 10)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    acc = (tp + tn) / (tp + fp + tn + fn)
    f1score = (2 * precision * recall)/(precision + recall)

    printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore)
    
    return amsscore

In [34]:
def printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore):
    all = tp + tn + fp + fn
    print("TP: ", tp/all)
    print("TN: ", tn/all)
    print("FP: ", fp/all)
    print("FN: ", fn/all)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Acc: ", acc)
    print("F1: ", f1score)
    print("AMS: ", amsscore)

In [48]:
import pandas as pd

In [51]:
validate(np.array(myPred), np.array(pd.Series(tLabel).map({1: 's', 0: 'b'})), np.array(tWeight))

6939.0 3495.0 1670.0 12895.0
TP:  0.27757110284411374
TN:  0.515820632825313
FP:  0.13980559222368893
FN:  0.06680267210688427
Precision:  0.6650373778033353
Recall:  0.8060169589963991
Acc:  0.7933917356694268
F1:  0.7287717271438323
AMS:  3.7349642532420426


3.7349642532420426