# Train classifier to estimate analysis efficiency vs gen variable

- Actual training code in train.py
- Classifier based on sklearn. Default is GradientBoostingClassifier, 
    but a different classifier can be specified at run time.

## Load libraries

In [1]:
# Project-local module holding the actual training code (train.py).
import train as tn
#reload(tn)  # enable to pick up edits to train.py without restarting the kernel

# Project-local plotting helpers; reloaded so edits made while the kernel is
# running take effect.
import plotting
reload(plotting)

import matplotlib.pyplot as plt
# Apply the 'seaborn-poster' matplotlib style sheet and render figures inline.
plt.style.use('seaborn-poster')
%matplotlib inline

import numpy as np

from pprint import pprint

import os
import json
import importlib

# Project-local utilities (parameter handling, loading, training drivers).
# Because reload(ut) is the last expression of the cell, its return value
# (the module object) is echoed as the cell output.
import util as ut
reload(ut)

Welcome to ROOTaaS 6.06/05


<module 'util' from 'util.pyc'>

## Instantiate helper class

Data are read from ROOT trees and converted into pandas data frames.  
The loading function makes sure that all the needed columns have been read from the trees; otherwise it rebuilds the data frame.


#### In the following cell the parameters are set up. In this case the previously trained object "2clfs" is loaded from inputDir (load = True), the classifiers "class", "recoPt" and "recoNjets2p5" are requested, and the output is configured with the name "3clfs" in the directory "classifiers". The training data will be read from ROOT trees located in dataDir. The branches of gen and reco events are the default ones (look at the help function for detailed information).

In [2]:
# Directory holding the ROOT input files listed in ``signal_files``.
data_dir = "/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/data"

# Directory from which the previously trained object (``inputName``) is read.
classifier_input_dir = '/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers'

# Bin boundaries for the reco-level observables.
reco_bin_definitions = {
    'recoPt': dict(
        boundaries=[0., 15., 30., 45., 85., 125., 200., 350., 10000.],
        overflow=False,  # do not add overflow automatically
    ),
    'recoNjets2p5': dict(
        boundaries=[-0.5, 0.5, 1.5, 2.5, 3.5, 100.],
        overflow=False,
    )
    # Gen-level binning, currently disabled:
    #   'genPt' : dict(boundaries=[0.,15.,30.,45.,85.,125.,200.,350.,10000.],overflow=False), # do not add overflow automatically
    #   'genNjets2p5' : dict(boundaries=[-0.5,0.5,1.5,2.5,3.5,100.],overflow=False)
}

# (integer label, file name) pairs, one per input sample.
# NOTE(review): the integer is presumably the process index -- confirm in util.py.
signal_files = [
    (0, 'output_GluGluHToGG_M125_IA.root'),
    (1, 'output_ttHToGG_M125_IA.root'),
    (2, 'output_VBFHToGG_M125_IA.root'),
    (3, 'output_VHToGG_M125_IA.root'),
]

ut.defaultParameters(
    dataDir=data_dir,
    classifiers=['class', 'recoPt', 'recoNjets2p5'],
    load=True,
    inputName="2clfs",
    outName="3clfs",
    outDir='./classifiers',
    inputDir=classifier_input_dir,
    defineBins=reco_bin_definitions,
    dataFiles=signal_files
)




In case you need help or further explanation you can execute the help function:

#### We want to use machine learning techniques for classification. For that we need to specify the classifier we want to use, such as the GradientBoostingClassifier from sklearn.ensemble used here. Furthermore, the parameters of the chosen classifier have to be set, e.g. the number of training events, the maximum tree depth, the learning rate and so forth.

In [3]:
def _jet_pair_features(prefix, n_jets=6):
    """Build the pairwise feature names ``<prefix><i><j>`` for all ``i < j``.

    Parameters
    ----------
    prefix : str
        Common stem of the branch names, e.g. ``'absCosDeltaAlpha'``.
    n_jets : int
        Number of jet indices to combine (indices ``0 .. n_jets-1``).

    Returns
    -------
    list of str
        For the default ``n_jets=6`` the 15 names ``<prefix>01``,
        ``<prefix>02``, ..., ``<prefix>45``, ordered by first then second index.
    """
    return ["%s%d%d" % (prefix, first, second)
            for first in range(n_jets)
            for second in range(first + 1, n_jets)]


# Each entry is [fully qualified classifier class name, keyword settings].
# NOTE(review): "trainevts" = -1 presumably means "use all events" -- confirm in train.py.
ut.params["class"] = [
    "sklearn.ensemble.GradientBoostingClassifier",
    {
        "trainevts": -1,
        "max_depth": 5,
        "learning_rate": 0.2,
        "n_estimators": 200,
        "min_weight_fraction_leaf": 1e-3,
    },
]

# "Xbr" lists the input branches used as features for this classifier.
ut.params['recoPt'] = [
    "xgboost.sklearn.XGBClassifier",
    {
        "Xbr": ["genPt", "absGenRapidity"],  # optional extras: 'genLeadGenIso', 'genSubleadGenIso'
        "trainevts": -1,
        "max_depth": 5,
        "learning_rate": 0.1,
        "n_estimators": 500,
        "min_child_weight": 1e-5,
        "nthread": 4,
    },
]

# Disabled cross-validation settings for the 'recoPt' dictionary above.
# They used to live in a bare string literal; kept here as a real comment so
# they can never be echoed as cell output:
#     "cvoptimize" : True,"cv_params_grid" : { "max_depth" : [5, 7, 10],
#                                              "learning_rate" : [0.05, 0.1, 0.2],
#                                              "n_estimators" : [250,500,700,1000],
#                                              "min_child_weight" : [1e-4, 5e-4, 1e-3],
#                                              "subsample" : [0.1, 0.2, 0.5, 1.]},
#     "cv_nfolds" : 5, "cv_niter" : 5,

ut.params['recoNjets2p5'] = [
    "xgboost.sklearn.XGBClassifier",
    {
        # Per-jet kinematics, di-photon kinematics, jet multiplicity, then the
        # pairwise angular variables (15 Alpha names followed by 15 Phi names).
        "Xbr": [
            "genJet2p5Pt0", "absGenJet2p5Rapidity0",
            "genJet2p5Pt1", "absGenJet2p5Rapidity1",
            "genJet2p5Pt2", "absGenJet2p5Rapidity2",
            "genJet2p5Pt3", "absGenJet2p5Rapidity3",
            "genJet2p5Pt4", "absGenJet2p5Rapidity4",
            "genJet2p5Pt5", "absGenJet2p5Rapidity5",
            "genPt", "absGenRapidity",
            "genNjets2p5",
        ] + _jet_pair_features('absCosDeltaAlpha')
          + _jet_pair_features('absCosDeltaPhi'),
        "trainevts": -1,
        "max_depth": 7,
        "learning_rate": 0.1,
        "n_estimators": 500,
        "min_child_weight": 1e-5,
        "nthread": 16,
    },
]



"'absCosDeltaAlpha01',\n'absCosDeltaAlpha02',\n'absCosDeltaAlpha03',\n'absCosDeltaAlpha04',\n'absCosDeltaAlpha05',\n'absCosDeltaAlpha12',\n'absCosDeltaAlpha13',\n'absCosDeltaAlpha14',\n'absCosDeltaAlpha15',\n'absCosDeltaAlpha23',\n'absCosDeltaAlpha24',\n'absCosDeltaAlpha25',\n'absCosDeltaAlpha34',\n'absCosDeltaAlpha35',\n'absCosDeltaAlpha45',\n'absCosDeltaPhi01',\n'absCosDeltaPhi02',\n'absCosDeltaPhi03',\n'absCosDeltaPhi04',\n'absCosDeltaPhi05',\n'absCosDeltaPhi12',\n'absCosDeltaPhi13',\n'absCosDeltaPhi14',\n'absCosDeltaPhi15',\n'absCosDeltaPhi23',\n'absCosDeltaPhi24',\n'absCosDeltaPhi25',\n'absCosDeltaPhi34',\n'absCosDeltaPhi35',\n'absCosDeltaPhi45',\n"

$\color{red}{\text{Changes to the "class" parameters also take effect without calling ut.setParams().}}$

In [4]:
# Set the parameters "class" defined above.
# TODO: discuss the function setParams() with Pasquale.
ut.setParams()

# Manual fix so that the json file doesn't overwrite inputDir: the value is
# re-assigned after setParams() has run.
ut.params['inputDir'] = "/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers"

# Manual fixes because trees don't have rapidities and genJetRapidities.
#
# Earlier, shorter branch list (superseded by the assignment further down;
# previously kept in a bare string literal):
#   ut.params['genBranches'] = ['genPt','genRapidity','genJet2p5Pt0','genJet2p5Pt1','genJet2p5Pt2',
#   'genJet2p5Pt3','genJet2p5Pt4','genJet2p5Pt5','weight','genNjets2p5','genLeadGenIso','genSubleadGenIso']

# Tree path prefixes for the gen-level dumper ('genpfx') and the tags dumper ('pfx').
ut.params['genpfx'] = 'genDiphotonDumper/trees/InsideAcceptance_125_13TeV'
ut.params['pfx'] = 'tagsDumper/trees/InsideAcceptance_125_13TeV'

# Gen-level branches to read, including the per-jet rapidities and the
# pairwise angular variables.
ut.params['genBranches'] = [
    'genPt', 'genRapidity',
    'genJet2p5Pt0', 'genJet2p5Pt1', 'genJet2p5Pt2',
    'genJet2p5Pt3', 'genJet2p5Pt4', 'genJet2p5Pt5',
    'genJet2p5Rapidity0', 'genJet2p5Rapidity1', 'genJet2p5Rapidity2',
    'genJet2p5Rapidity3', 'genJet2p5Rapidity4', 'genJet2p5Rapidity5',
    'weight',
    'genNjets2p5', 'genLeadGenIso', 'genSubleadGenIso',
    'absCosDeltaAlpha01', 'absCosDeltaAlpha02', 'absCosDeltaAlpha03',
    'absCosDeltaAlpha04', 'absCosDeltaAlpha05', 'absCosDeltaAlpha12',
    'absCosDeltaAlpha13', 'absCosDeltaAlpha14', 'absCosDeltaAlpha15',
    'absCosDeltaAlpha23', 'absCosDeltaAlpha24', 'absCosDeltaAlpha25',
    'absCosDeltaAlpha34', 'absCosDeltaAlpha35', 'absCosDeltaAlpha45',
    'absCosDeltaPhi01', 'absCosDeltaPhi02', 'absCosDeltaPhi03',
    'absCosDeltaPhi04', 'absCosDeltaPhi05', 'absCosDeltaPhi12',
    'absCosDeltaPhi13', 'absCosDeltaPhi14', 'absCosDeltaPhi15',
    'absCosDeltaPhi23', 'absCosDeltaPhi24', 'absCosDeltaPhi25',
    'absCosDeltaPhi34', 'absCosDeltaPhi35', 'absCosDeltaPhi45',
]

# Reco-level branches to read.
ut.params['recoBranches'] = ['recoPt', 'recoRapidity', 'recoNjets2p5']

# Alternative without the jet multiplicity:
#ut.params['recoBranches'] = ['recoPt','recoRapidity']

entered config files named my_train_config
hi
None


In [5]:
# Echo the configured tree prefixes, one per line: first 'genpfx', then 'pfx'.
for prefix_key in ('genpfx', 'pfx'):
    print(ut.params[prefix_key])


genDiphotonDumper/trees/InsideAcceptance_125_13TeV
tagsDumper/trees/InsideAcceptance_125_13TeV


In [6]:
# Re-import train.py so that edits made since the first import are picked up.
reload(tn)
# Obtain the fitter object via ut.loadOrMake(); %time reports how long it takes.
# NOTE(review): per the captured log, with load=True this reads 2clfs.pkl.gz
# and 2clfs.root from inputDir rather than rebuilding -- confirm in util.py.
%time effFitter = ut.loadOrMake()

Load object with the name 2clfs and the following paramters 
loading
2clfs
/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers
/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers
/mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers/2clfs.pkl.gz
loading pickle /mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers/2clfs.pkl.gz
loading data /mnt/t3nfs01/data01/shome/jandrejk/higgs_model_dep/MoriondAnalysis/classifiers/2clfs.root
<train.EfficiencyFitter object at 0x7f4ec2576a90>
Index([u'absweight', u'class', u'genJet2p5Pt0', u'genJet2p5Pt1',
       u'genJet2p5Pt2', u'genJet2p5Pt3', u'genJet2p5Pt4', u'genJet2p5Pt5',
       u'genLeadGenIso', u'genNjets2p5', u'genPt', u'genRapidity',
       u'genSubleadGenIso', u'recoPt', u'recoRapidity', u'weight', u'proc',
       u'absGenRapidity', u'recoPtBin', u'recoPtCat', u'class_prob_0',
       u'class_prob_1', u'class_prob_2', u'class_prob_3', u'r

## Train classifiers

# $\color{red}{\text{What happens in the 2 cells below?}}$

In [7]:
#if ut.params.get("setupJoblib",True):
#   ut.setupJoblib(ipp_profile=ut.params.get('ippProfile','short'))

###  Make sure that the trained classifiers have been evaluated

In [8]:
# Evaluate the already-trained classifiers on the loaded data; the captured
# log shows predictions being run for 'class' and 'recoPt'.
ut.runEvaluation(effFitter)

['class', 'recoPt']
class
class
class_prob_0
Index([u'absweight', u'class', u'genJet2p5Pt0', u'genJet2p5Pt1',
       u'genJet2p5Pt2', u'genJet2p5Pt3', u'genJet2p5Pt4', u'genJet2p5Pt5',
       u'genJet2p5Rapidity0', u'genJet2p5Rapidity1', u'genJet2p5Rapidity2',
       u'genJet2p5Rapidity3', u'genJet2p5Rapidity4', u'genJet2p5Rapidity5',
       u'genLeadGenIso', u'genNjets2p5', u'genPt', u'genRapidity',
       u'genSubleadGenIso', u'recoNjets2p5', u'recoPt', u'recoRapidity',
       u'weight', u'proc', u'absGenRapidity', u'absGenJet2p5Rapidity0',
       u'absGenJet2p5Rapidity1', u'absGenJet2p5Rapidity2',
       u'absGenJet2p5Rapidity3', u'absGenJet2p5Rapidity4',
       u'absGenJet2p5Rapidity5', u'recoPtBin', u'recoPtCat',
       u'recoNjets2p5Bin', u'recoNjets2p5Cat'],
      dtype='object')
running prediction for class
recoPt
recoPtCat
recoPtCat_prob_0
Index([u'absweight', u'class', u'genJet2p5Pt0', u'genJet2p5Pt1',
       u'genJet2p5Pt2', u'genJet2p5Pt3', u'genJet2p5Pt4', u'genJet2p5Pt5',

### Run the actual training

In [9]:
#reload(tn)  # enable to pick up edits to train.py before training
# TODO: try training with negative weights (currently useAbsWeight=True).
# Train the classifiers that are still missing (per the captured log:
# recoNjets2p5); %time reports the duration.
# NOTE(review): useAbsWeight=True presumably selects the 'absweight' column
# as event weight -- confirm in util.py / train.py.
%time ut.runTraining(effFitter,useAbsWeight=True)

We need to train the following classifiers recoNjets2p5
Fitting recoNjets2p5
<class 'xgboost.sklearn.XGBClassifier'>
{'nthread': 2, 'learning_rate': 0.1, 'trainevts': 1000, 'min_child_weight': 1e-05, 'Xbr': ['genJet2p5Pt0', 'absGenJet2p5Rapidity0', 'genJet2p5Pt1', 'absGenJet2p5Rapidity1', 'genJet2p5Pt2', 'absGenJet2p5Rapidity2', 'genJet2p5Pt3', 'absGenJet2p5Rapidity3', 'genJet2p5Pt4', 'absGenJet2p5Rapidity4', 'genJet2p5Pt5', 'absGenJet2p5Rapidity5', 'genPt', 'absGenRapidity', 'genNjets2p5'], 'n_estimators': 500, 'max_depth': 7}
['genJet2p5Pt0', 'absGenJet2p5Rapidity0', 'genJet2p5Pt1', 'absGenJet2p5Rapidity1', 'genJet2p5Pt2', 'absGenJet2p5Rapidity2', 'genJet2p5Pt3', 'absGenJet2p5Rapidity3', 'genJet2p5Pt4', 'absGenJet2p5Rapidity4', 'genJet2p5Pt5', 'absGenJet2p5Rapidity5', 'genPt', 'absGenRapidity', 'genNjets2p5', 'class_prob_0', 'class_prob_1', 'class_prob_2', 'class_prob_3']
recoNjets2p5Cat
cvoptimize False
((1000, 19), 19000)
((1000,), 1000)
((1000,), 1000)
[[ -9.99000000e+02   9.99000

## Save the output

In [10]:
#reload(tn)  # enable to pick up edits to train.py before saving
# Persist the fitter through tn.IO.save(); the captured log shows a .pkl.gz
# file being written. %time reports the duration.
%time tn.IO.save(effFitter)

./Train_Configurable_Refactor_higgs_model_dep_2255084.pkl.gz
CPU times: user 9.95 s, sys: 237 ms, total: 10.2 s
Wall time: 10.9 s
