In [107]:
# General
import os, sys
# Handling ROOT files
import numpy as np
import uproot
import pandas as pd
# Plotting
import matplotlib
from matplottery import Hist1D, Hist2D, plot_stack
import matplotlib.pyplot as plt
# Machine Learning
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
import xgboost as xgb #BDT
import ast
import json
# Custom Tools
from data import GetData
# Jupyter Display Settings
%matplotlib inline
pd.set_option('display.max_rows', 20)

# Global Variables

In [108]:
# Manualy set signal name
signal = "WH_HtoRhoGammaPhiGamma"

# Get Data

In [109]:
# Get dataframes
dataframes = GetData("outputs", verbose=True)

Loaded Dataframes:
    TTJets_SingleLeptFromT
    TTGamma_SingleLeptFromTbar
    WGToLNuG
    TTJets_SingleLeptFromTbar
    WJetsToLNu
    WH_HtoRhoGammaPhiGamma
    TTGamma_SingleLeptFromT


In [110]:
# Print branch names
branches = dataframes[signal].keys()
print("Branch Names:\n"+", ".join(branches))

Branch Names:
run, lumi, event, scale1fb, genRecoGamma_dR, genRecoPhi_dR, genRecoRho_dR, genW_pt, genW_eta, genW_phi, genW_mass, genWLepton_id, genWLepton_pt, genWLepton_eta, genWLepton_phi, genHiggs_pt, genHiggs_eta, genHiggs_phi, genHiggs_mass, genHiggsMeson_id, genHiggsMeson_pt, genHiggsMeson_eta, genHiggsMeson_phi, genHiggsMeson_mass, genHiggsMesonGamma_dR, genKm_pt, genKm_phi, genKm_eta, genKp_pt, genKp_phi, genKp_eta, genKpKm_dR, genGamma_pt, genGamma_phi, genGamma_eta, recoHiggs_mass, recoMeson_nCands, recoPhi_mass, recoPhi_pt, recoPhi_eta, recoPhi_phi, recoPhi_iso, recoKm_pt, recoKm_eta, recoKm_phi, recoKm_iso, recoKp_pt, recoKp_eta, recoKp_phi, recoKp_iso, recoKpKm_dR, recoRho_mass, recoRho_pt, recoRho_eta, recoRho_phi, recoRho_iso, recoPim_pt, recoPim_eta, recoPim_phi, recoPim_iso, recoPip_pt, recoPip_eta, recoPip_phi, recoPip_iso, recoPipPim_dR, recoGamma_pt, recoGamma_phi, recoGamma_eta, recoGamma_iso, genRecoGamma_isMatch, minGammaParton_dR, recoWLepton_id, recoWLepton_pt,

In [111]:
# Add signal bool and dataset name columns
for name, df in dataframes.iteritems():
    if name == signal:
        df["stype"] = np.ones_like(df["lumi"])
    else:
        df["stype"] = np.zeros_like(df["lumi"])
    df["signal"] = (df["stype"] == 0)

# Concatenate dataframes
data = pd.concat(dataframes.values())
        
# Get bookkeeping columns to exclude in training
exclude = ["signal", "stype", "run", "lumi", "event", "scale1fb",
           "recoMeson_nCands", "recoWLepton_nLep", "minGammaParton_dR"]
for bname in branches:
    if "gen" in bname: exclude.append(bname)
    
# Collect selected features
features = sorted(list(set(df.columns.values)-set(exclude)))

In [112]:
# Display selected features
print("Selected Feature Names:\n"+", ".join(features))

Selected Feature Names:
recoGamma_eta, recoGamma_iso, recoGamma_phi, recoGamma_pt, recoHiggs_mass, recoKm_eta, recoKm_iso, recoKm_phi, recoKm_pt, recoKpKm_dR, recoKp_eta, recoKp_iso, recoKp_phi, recoKp_pt, recoPhi_eta, recoPhi_iso, recoPhi_mass, recoPhi_phi, recoPhi_pt, recoPim_eta, recoPim_iso, recoPim_phi, recoPim_pt, recoPipPim_dR, recoPip_eta, recoPip_iso, recoPip_phi, recoPip_pt, recoRho_eta, recoRho_iso, recoRho_mass, recoRho_phi, recoRho_pt, recoWLepton_eta, recoWLepton_id, recoWLepton_phi, recoWLepton_pt


In [113]:
# Split dataframe for training and testing
dummyWeights = np.ones(len(data["signal"])) # in place of data["weights"]
x_train, x_test, y_train, y_test, stype_train, stype_test, weights_train, weights_test = train_test_split(
                data[features],
                data["signal"],
                data["stype"],
                dummyWeights,
                test_size=0.25, random_state=42,
                )

# Train BDT

In [114]:
sel_test = stype_test != 1.
sel_train = stype_train != 1.
dump_name = "rpg.h"

dtrain = xgb.DMatrix( x_train[sel_train], label=y_train[sel_train], weight=np.abs(weights_train[sel_train]))
dtest = xgb.DMatrix( x_test[sel_test], label=y_test[sel_test], weight=np.abs(weights_test[sel_test]))
evallist  = [(dtrain,'train'), (dtest,'eval')]
param = {}
param['objective'] = 'binary:logistic'

# BDT Knobs
num_round = 20
param['eta'] = 0.07
param['max_depth'] = 4
# param['max_depth'] = 5
param['silent'] = 1
param['nthread'] = 4
param['eval_metric'] = "auc"
param['subsample'] = 0.6
param['alpha'] = 8.0
param['gamma'] = 2.0
param['lambda'] = 1.0
param['min_child_weight'] = 1.0
param['colsample_bytree'] = 1.0

In [115]:
# Get pos/neg weights
sumw_pos = np.abs(dtrain.get_weight()[dtrain.get_label()==1]).sum()
sumw_neg = np.abs(dtrain.get_weight()[dtrain.get_label()==0]).sum()
param["scale_pos_weight"] = sumw_neg/sumw_pos

In [116]:
# Train
bst = xgb.train( param.items(), dtrain, num_round, evallist, early_stopping_rounds=15 )

XGBoostError: [20:16:29] src/metric/rank_metric.cc:135: Check failed: !auc_error AUC: the dataset only contains pos or neg samples

Stack trace returned 10 entries:
[bt] (0) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/py2-xgboost/0.72/lib/python2.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::StackTrace[abi:cxx11]()+0x3b) [0x7f923d958c7b]
[bt] (1) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/py2-xgboost/0.72/lib/python2.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x29) [0x7f923d9593d9]
[bt] (2) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/py2-xgboost/0.72/lib/python2.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::metric::EvalAuc::Eval(std::vector<float, std::allocator<float> > const&, xgboost::MetaInfo const&, bool) const+0xf46) [0x7f923da27fe6]
[bt] (3) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/py2-xgboost/0.72/lib/python2.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::EvalOneIter(int, std::vector<xgboost::DMatrix*, std::allocator<xgboost::DMatrix*> > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&)+0x248) [0x7f923d9620a8]
[bt] (4) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/py2-xgboost/0.72/lib/python2.7/site-packages/xgboost/./lib/libxgboost.so(XGBoosterEvalOneIter+0x364) [0x7f923d975a74]
[bt] (5) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/cms/cmssw/CMSSW_10_2_5/external/slc6_amd64_gcc700/lib/libffi.so.6(ffi_call_unix64+0x4c) [0x7f926a958242]
[bt] (6) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/cms/cmssw/CMSSW_10_2_5/external/slc6_amd64_gcc700/lib/libffi.so.6(ffi_call+0x1a1) [0x7f926a9572a1]
[bt] (7) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/python/2.7.14-omkpbe4/lib/python2.7/lib-dynload/_ctypes.so(_ctypes_callproc+0x485) [0x7f926a96e115]
[bt] (8) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/external/python/2.7.14-omkpbe4/lib/python2.7/lib-dynload/_ctypes.so(+0x8b0f) [0x7f926a964b0f]
[bt] (9) /cvmfs/cms.cern.ch/slc6_amd64_gcc700/cms/cmssw/CMSSW_10_2_5/external/slc6_amd64_gcc700/lib/libpython2.7.so.1.0(PyObject_Call+0x43) [0x7f926d027ef3]

