# Train classifier to estimate analysis efficiency vs gen variable

- Actual training code in train.py
- Classifiers are based on sklearn. The default is GradientBoostingClassifier,
    but a different classifier can be specified at run time.

## Load libraries

In [1]:
# Project-local training code (see train.py).
import train as tn
# reload(tn)  # uncomment to pick up edits to train.py without restarting the kernel

# Project-local plotting helpers; reloaded so that edits to the module are
# picked up on re-running this cell.
# NOTE(review): bare reload() is a builtin only on Python 2; on Python 3 this
# would have to be importlib.reload().
import plotting
reload(plotting)

# Plotting setup: large-font style sheet, figures rendered inline in the notebook.
import matplotlib.pyplot as plt
plt.style.use('seaborn-poster')
%matplotlib inline

import numpy as np

from pprint import pprint

import os
import json
import importlib

# Project-local helper module holding the parameter dictionary (ut.params) and
# the load/train/evaluate entry points used in the cells below.
# reload(ut) is the last expression of the cell, so the module repr is shown
# as the cell output.
import util as ut
reload(ut)

Welcome to ROOTaaS 6.06/08


<module 'util' from 'util.pyc'>

## Instantiate helper class

Data are read from ROOT trees and converted into pandas data frames.  
The loading function makes sure that all the needed columns have been read from the trees; otherwise it rebuilds the data frame.


#### In the following cell the parameters are set up. In this case a new classifier will be instantiated with the name effGenVarClass and stored in the directory "classifiers". The training data will be read from ROOT trees located in dataDir. The branches of gen and reco events are the default ones (see the help function for detailed information).

In [2]:
# Register the default configuration for this training session.
ut.defaultParameters(
    # Location of the input ROOT files listed in dataFiles.
    dataDir="/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/data",
    # Names of the classifiers handled in this notebook.
    classifiers=["class", "recoPt", "recoNjets2p5"],
    load=False,
    inputName="2clfs_GenIsoAdded",
    outName="3clfs_GenIsoAdded_out",
    outDir="./classifiers",
    inputDir="/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/classifiers",
    # Bin boundaries per reco variable; overflow=False means: do not add an
    # overflow bin automatically.
    defineBins={
        "recoPt": {
            "boundaries": [0., 15., 30., 45., 85., 125., 200., 350., 10000.],
            "overflow": False,
        },
        "recoNjets2p5": {
            "boundaries": [-0.5, 0.5, 1.5, 2.5, 3.5, 100.],
            "overflow": False,
        },
        # Gen-level binning, currently disabled:
        # 'genPt' : dict(boundaries=[0.,15.,30.,45.,85.,125.,200.,350.,10000.],overflow=False),
        # 'genNjets2p5' : dict(boundaries=[-0.5,0.5,1.5,2.5,3.5,100.],overflow=False)
    },
    # (integer, file name) pairs, one per production process.
    # NOTE(review): the integer is presumably the process label used by the
    # "class" classifier - confirm against util.py.
    dataFiles=[
        (0, "output_InsideAcceptance_ggF_125.root"),
        (1, "output_InsideAcceptance_ttH_125.root"),
        (2, "output_InsideAcceptance_VBF_125.root"),
        (3, "output_InsideAcceptance_VH_125.root"),
    ]
)



In case you need help or further explanation you can execute the help function:

#### We want to use machine learning techniques for classification. For that we need to specify the classifier we want to use, here the GradientBoostingClassifier from sklearn.ensemble. Furthermore, the parameters of the chosen classifier have to be set, e.g. the number of training events, the maximum tree depth, the learning rate and so forth.

In [7]:
# "class" classifier: [fully qualified classifier name, options].
# NOTE(review): trainevts = -1 presumably means "use all events" - confirm in train.py.
ut.params["class"] = [
    "sklearn.ensemble.GradientBoostingClassifier",
    {
        "trainevts": -1,
        "max_depth": 5,
        "learning_rate": 0.2,
        "n_estimators": 200,
        "min_weight_fraction_leaf": 1e-3,
    },
]


# "recoPt" classifier: [fully qualified classifier name, options].
ut.params['recoPt'] = [
    "xgboost.sklearn.XGBClassifier",
    {
        # Gen-level input columns; the gen isolation columns
        # ('genLeadGenIso', 'genSubleadGenIso') are currently left out.
        "Xbr": ["genPt", "absGenRapidity"],
        "trainevts": -1,
        "max_depth": 5,
        "learning_rate": 0.1,
        "n_estimators": 500,
        "min_child_weight": 1e-5,
        "nthread": 4,
    },
]

# Cross-validation options parked inside a bare string literal: the statement
# has no runtime effect. NOTE(review): the leading comma suggests the fragment
# was cut out of one of the classifier option dicts and is meant to be pasted
# back in to enable the hyper-parameter search - confirm before re-enabling.
""",
                        "cvoptimize" : True,"cv_params_grid" : { "max_depth" : [5, 7, 10],
                                                                 "learning_rate" : [0.05, 0.1, 0.2], 
                                                                 "n_estimators" : [250,500,700,1000],
                                                                 "min_child_weight" : [1e-4, 5e-4, 1e-3],
                                                                 "subsample" : [0.1, 0.2, 0.5, 1.]}, 
                        "cv_nfolds" : 5, "cv_niter" : 5,      
"""

# "recoNjets2p5" classifier: [fully qualified classifier name, options].
ut.params['recoNjets2p5'] = [
    "xgboost.sklearn.XGBClassifier",
    {
        # Input columns: (pT, |rapidity|) of the five gen jets, followed by
        # the remaining gen-level quantities.
        "Xbr": [
            "genJet2p5Pt0", "absGenJet2p5Rapidity0",
            "genJet2p5Pt1", "absGenJet2p5Rapidity1",
            "genJet2p5Pt2", "absGenJet2p5Rapidity2",
            "genJet2p5Pt3", "absGenJet2p5Rapidity3",
            "genJet2p5Pt4", "absGenJet2p5Rapidity4",
            "genPt", "absGenRapidity", "genNjets2p5",
        ],
        "trainevts": -1,
        "max_depth": 7,
        "learning_rate": 0.1,
        "n_estimators": 500,
        "min_child_weight": 1e-5,
        "nthread": 4,
    },
]

In [8]:
#ut.params

$\color{red}{\text{Changes in class also become active without ut.setParams().}}$

In [4]:
# Apply the parameters (including the "class" entry) defined in the cells above.
# The bare string below is an open question kept as a note; it has no runtime effect.
"""
? discuss the function setParams() with Pasquale
"""
ut.setParams()
# Manual fix: re-assign inputDir AFTER setParams() so that the value coming
# from the json file does not overwrite it. The order of these two statements matters.
ut.params['inputDir'] = "/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/classifiers"

entered config files named my_train_config
hi
None


In [5]:
# Obtain the helper object used by all following cells; %time reports how long it takes.
# NOTE(review): going by its name, loadOrMake() either loads a stored object or
# builds a new one from the input files - confirm in util.py.
%time effFitter = ut.loadOrMake()

Create object with the name 2clfs_GenIsoAdded and the following paramters 
The number of selected events are 572356
The number of selected events are 766839
The number of selected events are 1448533
The number of selected events are 1583887
shuffling dataset
defining bins
CPU times: user 130 ms, sys: 11.9 s, total: 12 s
Wall time: 35.8 s


## Train classifiers

# $\color{red}{\text{What happens in the 2 cells below?}}$

In [7]:
#if ut.params.get("setupJoblib",True):
#   ut.setupJoblib(ipp_profile=ut.params.get('ippProfile','short'))

### Make sure that the trained classifiers have been evaluated

In [7]:
# Evaluate the classifiers on the loaded data.
# NOTE(review): the recorded output lists *_prob_* columns, so the predictions
# are presumably stored as extra data-frame columns - confirm in util.py.
ut.runEvaluation(effFitter)

Index([u'absweight', u'class', u'genJet2p5Pt0', u'genJet2p5Pt1',
       u'genJet2p5Pt2', u'genJet2p5Pt3', u'genJet2p5Pt4',
       u'genJet2p5Rapidity0', u'genJet2p5Rapidity1', u'genJet2p5Rapidity2',
       u'genJet2p5Rapidity3', u'genJet2p5Rapidity4', u'genLeadGenIso',
       u'genNjets2p5', u'genPt', u'genRapidity', u'genSubleadGenIso',
       u'recoNjets2p5', u'recoPt', u'recoRapidity', u'weight', u'proc',
       u'absGenRapidity', u'absGenJet2p5Rapidity0', u'absGenJet2p5Rapidity1',
       u'absGenJet2p5Rapidity2', u'absGenJet2p5Rapidity3', u'recoPtBin',
       u'recoPtCat', u'recoNjets2p5Bin', u'recoNjets2p5Cat', u'class_prob_0',
       u'class_prob_1', u'class_prob_2', u'class_prob_3', u'recoPtCat_prob_0',
       u'recoPtCat_prob_1', u'recoPtCat_prob_2', u'recoPtCat_prob_3',
       u'recoPtCat_prob_4', u'recoPtCat_prob_5', u'recoPtCat_prob_6',
       u'recoPtCat_prob_7', u'recoPtCat_prob_8', u'recoPtCat_prob_9',
       u'recoPtCat_prob_10', u'recoPtCat_prob_11', u'recoPtCat_prob_12

### Run the actual training

In [15]:
# reload(tn)  # uncomment to pick up edits to train.py without restarting the kernel
# TODO: try negative weights (this run passes useAbsWeight=True;
# presumably useAbsWeight=False keeps the signed weights - confirm in util.py).
# %time reports how long the training takes.
%time ut.runTraining(effFitter,useAbsWeight=True)

We need to train the following classifiers recoNjets2p5
Fitting recoNjets2p5
<class 'xgboost.sklearn.XGBClassifier'>
{'nthread': 2, 'learning_rate': 0.1, 'trainevts': 20, 'min_child_weight': 1e-05, 'Xbr': ['genJet2p5Pt0', 'genJet2p5Rapidity0', 'genJet2p5Pt1', 'genJet2p5Rapidity1', 'genJet2p5Pt2', 'genJet2p5Rapidity2', 'genJet2p5Pt3', 'genJet2p5Rapidity3', 'genJet2p5Pt4', 'genJet2p5Rapidity4', 'genPt', 'absGenRapidity', 'genNjets2p5'], 'n_estimators': 500, 'max_depth': 7}
['genJet2p5Pt0', 'genJet2p5Rapidity0', 'genJet2p5Pt1', 'genJet2p5Rapidity1', 'genJet2p5Pt2', 'genJet2p5Rapidity2', 'genJet2p5Pt3', 'genJet2p5Rapidity3', 'genJet2p5Pt4', 'genJet2p5Rapidity4', 'genPt', 'absGenRapidity', 'genNjets2p5', 'class_prob_0', 'class_prob_1', 'class_prob_2', 'class_prob_3']
recoNjets2p5Cat
cvoptimize False
((20, 17), 340)
((20,), 20)
((20,), 20)
[[  6.04053230e+01  -1.58305466e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000

## Save the output

In [10]:
# reload(tn)  # uncomment to pick up edits to train.py without restarting the kernel
# Persist the helper object with the trained classifiers; the recorded run
# printed the path of a .pkl.gz file. %time reports how long the save takes.
%time tn.IO.save(effFitter)

./Train_Configurable_Refactor_higgs_model_dep_2255084.pkl.gz
CPU times: user 9.95 s, sys: 237 ms, total: 10.2 s
Wall time: 10.9 s
