# Imports

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import evaluation

# Seed NumPy's global random generator so later np.random.* calls are repeatable.
np.random.seed(42)

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from hep_ml.gradientboosting import UGradientBoostingClassifier,LogLossFunction
from sklearn.metrics import roc_curve, auc

# Read training data

In [3]:
# Load the training sample from disk, using the 'id' column as the row index.
train = pd.read_csv(
    filepath_or_buffer='training.csv',
    index_col='id',
)

In [4]:
# Preview the first five rows (the default for head()) as the cell's rich output.
train.head(n=5)

Unnamed: 0_level_0,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,DOCAtwo,...,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,production,signal,mass,min_ANNmuon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18453471,0.001578,0.999999,14.033335,0.681401,0.016039,0.451886,1.900433,1482.037476,0.066667,0.060602,...,12290.760742,39264.398438,3.076006,4.0038,4.031514,458,-99,0,1866.300049,0.277559
5364094,0.000988,0.999705,5.536157,0.302341,0.142163,9.564503,0.865666,3050.720703,0.024022,0.019245,...,16562.667969,7341.257812,3.228553,2.786543,2.975564,406,-99,0,1727.095947,0.225924
11130990,0.000877,0.999984,6.117302,0.276463,0.034746,1.970751,10.975849,3895.908691,0.055044,0.047947,...,22695.388672,10225.30957,3.536903,2.865686,3.05281,196,-99,0,1898.588013,0.36863
15173787,0.000854,0.999903,5.228067,0.220739,0.076389,4.271331,3.276358,4010.781738,0.053779,0.006417,...,16909.515625,9141.426758,3.087461,3.218034,2.375592,137,-99,0,1840.410034,0.246045
1102544,0.001129,0.999995,39.069534,1.898197,0.120936,4.984982,0.468348,4144.546875,0.004491,0.037326,...,97612.804688,47118.785156,4.632295,4.711155,4.296878,477,-99,0,1899.793945,0.22206


In [11]:
# Randomize the row order of the training sample.
#
# A dedicated RandomState is used instead of the global np.random state so the
# resulting order depends only on the seed written here, not on how many
# np.random.* calls happened earlier in the kernel session (cells in this
# notebook have been executed out of order). Seed 42 yields the same
# permutation as the first draw from a global generator seeded with 42.
#
# Note: re-running this cell permutes the already-shuffled frame again.
shuffle_rng = np.random.RandomState(42)
train = train.iloc[shuffle_rng.permutation(len(train))]

# Training features

In [6]:
print("Eliminate SPDhits, which makes the agreement check fail")

# Columns fed to the classifiers. The order is significant and kept exactly
# as originally listed; SPDhits is deliberately not part of the list.
features = [
    # LifeTime ... pt
    'LifeTime',
    'dira',
    'FlightDistance',
    'FlightDistanceError',
    'IP',
    'IPSig',
    'VertexChi2',
    'pt',
    # DOCA* and pairwise IP columns
    'DOCAone',
    'DOCAtwo',
    'DOCAthree',
    'IP_p0p2',
    'IP_p1p2',
    # isolation* / iso / CDF* columns
    'isolationa',
    'isolationb',
    'isolationc',
    'isolationd',
    'isolatione',
    'isolationf',
    'iso',
    'CDF1',
    'CDF2',
    # p0 / p1 / p2 columns
    'p0_track_Chi2Dof',
    'p1_track_Chi2Dof',
    'p2_track_Chi2Dof',
    'p0_IP',
    'p1_IP',
    'p2_IP',
    'p0_IPSig',
    'p1_IPSig',
    'p2_IPSig',
    'p0_eta',
    'p1_eta',
    'p2_eta',
]

Eliminate SPDhits, which makes the agreement check fail


# Baseline training

In [12]:
# Settings passed to xgboost; the keys are xgboost parameter names.
# NOTE(review): newer xgboost releases replace "silent" with "verbosity" --
# confirm against the installed version.
params = dict(
    objective="binary:logistic",
    eta=0.17,
    max_depth=5,
    min_child_weight=3,
    silent=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=1,
)

# Number of boosting rounds.
num_trees = 100

# Fit the booster on the selected feature columns with 'signal' as the label.
gbm = xgb.train(
    params,
    xgb.DMatrix(train[features], label=train["signal"]),
    num_boost_round=num_trees,
)

In [13]:
print("train a UBoost classifier")

# hep_ml gradient boosting driven by a log-loss objective.
loss_funct = LogLossFunction()
ub = UGradientBoostingClassifier(
    loss=loss_funct,
    n_estimators=100,
    learning_rate=0.25,
    subsample=0.7,
    random_state=3,
)

# fit() stays the final expression so the notebook displays the fitted estimator.
ub.fit(train[features], train["signal"])

train a UBoost classifier


UGradientBoostingClassifier(learning_rate=0.25,
              loss=LogLossFunction(regularization=5.0), max_depth=3,
              max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
              min_samples_split=2, n_estimators=100,
              random_state=<mtrand.RandomState object at 0x1136d4b90>,
              splitter='best', subsample=0.7, train_features=None,
              update_tree=True)

# Check agreement test

In [14]:
# Load the agreement-check sample, indexed by 'id'.
check_agreement = pd.read_csv('check_agreement.csv', index_col='id')

# Equal-weight blend of the UBoost class-1 probability and the xgboost prediction.
agreement_probs = (
    0.5 * ub.predict_proba(check_agreement[features])[:, 1]
    + 0.5 * gbm.predict(xgb.DMatrix(check_agreement[features]))
)

# Split predictions and per-row weights by the value of the 'signal' column.
signal_is_0 = check_agreement['signal'].values == 0
signal_is_1 = check_agreement['signal'].values == 1
agreement_weights = check_agreement['weight'].values

ks = evaluation.compute_ks(
    agreement_probs[signal_is_0],
    agreement_probs[signal_is_1],
    agreement_weights[signal_is_0],
    agreement_weights[signal_is_1])

# str.format keeps the output identical on Python 2 and Python 3
# (the former bare `print a, b` statement is a SyntaxError on Python 3).
# The trailing boolean reports whether the metric is below 0.09.
print('KS metric {} {}'.format(ks, ks < 0.09))

KS metric 0.0877989594456 True


# Check correlation test

In [15]:
# Load the correlation-check sample, indexed by 'id'.
check_correlation = pd.read_csv('check_correlation.csv', index_col='id')

# Equal-weight blend of the UBoost class-1 probability and the xgboost prediction.
correlation_probs = (
    0.5 * ub.predict_proba(check_correlation[features])[:, 1]
    + 0.5 * gbm.predict(xgb.DMatrix(check_correlation[features]))
)

# Metric computed from the blended predictions and the 'mass' column.
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])

# str.format keeps the output identical on Python 2 and Python 3
# (the former bare `print a, b` statement is a SyntaxError on Python 3).
# The trailing boolean reports whether the metric is below 0.002.
print('CvM metric {} {}'.format(cvm, cvm < 0.002))

CvM metric 0.000937115939426 True


# Compute weighted AUC on the training data with min_ANNmuon > 0.4

In [16]:
# Keep only the training rows whose min_ANNmuon exceeds 0.4.
train_eval = train.loc[train['min_ANNmuon'] > 0.4]

print("calculating train probs having min_annmuon>0.4")

# Equal-weight blend of the UBoost class-1 probability and the xgboost prediction.
train_probs = (
    0.5 * ub.predict_proba(train_eval[features])[:, 1]
    + 0.5 * gbm.predict(xgb.DMatrix(train_eval[features]))
)

calculating train probs having min_annmuon>0.4


In [18]:
# Truncated ROC AUC of the blended predictions on the filtered training rows.
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)

# Format explicitly: `print ("AUC metric", AUC)` prints a tuple repr on
# Python 2 but plain text on Python 3; this form prints the same on both.
print("AUC metric {}".format(AUC))

('AUC metric', 0.98771517498657757)


# Predict test, create file for kaggle

In [19]:
# Load the sample to be scored, using the 'id' column as the row index.
test = pd.read_csv(
    filepath_or_buffer='test.csv',
    index_col='id',
)

In [21]:
# Equal-weight blend of the UBoost class-1 probability and the xgboost prediction.
test_probs = (
    0.5 * ub.predict_proba(test[features])[:, 1]
    + 0.5 * gbm.predict(xgb.DMatrix(test[features]))
)

# Two-column table: the row id taken from the index, and the blended score.
result = pd.DataFrame({
    'id': test.index,
    "prediction": test_probs,
})

In [22]:
# Write the table as comma-separated text without the DataFrame's own index.
result.to_csv(
    'UBoost_XGB_classifier.csv',
    sep=',',
    index=False,
)