# Train classifier to estimate analysis efficiency vs gen variable

- Actual training code in train.py
- Classifier based on sklearn. Default is GradientBoostedClassifier, 
    but can be specified at run time.

## Load libraries

In [1]:
import train as tn
reload(tn)

import plotting
reload(plotting)

import matplotlib.pyplot as plt
plt.style.use('seaborn-poster')
%matplotlib inline

import numpy as np

from pprint import pprint

import os
import json
import importlib

import util as ut
reload(ut)

Welcome to JupyROOT 6.08/06


<module 'util' from 'util.pyc'>

In [9]:
import util as ut
reload(ut)

<module 'util' from 'util.py'>

## Instantiate helper class

Data are read from ROOT trees and converted into pandas data frames.  
The loading function makes sure that all the needed columns have been read from the trees, otherwise it rebilds the data frame.


#### In the following cell the parameters are set up. In this case a new classifier will be initiated with the name effGenVarClass and stored in the directory "classifiers". The training data will be read from a root tree located in dataDir. The branches of gen and reco events are the default ones (look at the help function for detailed information).

In [29]:
reload(ut)
#help(ut.defaultParameters)

<module 'util' from 'util.py'>

In [30]:
ut.defaultParameters(dataDir="./data", 
                     classifiers=['class'],
                          load = False,
                     inputName = "effGenVarClass_test",
                       outName = "effGenVarClass_test_out",
                        outDir = './classifiers',
                      InputDir = './classifiers')

In [31]:
ut.params

{'InputDir': './classifiers',
 'classifiers': ['class'],
 'clean': [],
 'dataDir': './data',
 'dataFname': 'output_InsideAcceptance_125.root',
 'defineBins': {},
 'forceMake': False,
 'genBranches': ['genPt',
  'genRapidity',
  'genJet2p5Pt0',
  'genJet2p5Rapidity0',
  'genJet2p5Pt1',
  'genJet2p5Rapidity1',
  'genJet2p5Pt2',
  'genJet2p5Rapidity2',
  'genJet2p5Pt3',
  'genJet2p5Rapidity3',
  'weight',
  'genNjets2p5'],
 'inputDir': '.',
 'inputName': 'effGenVarClass_test',
 'load': False,
 'ncats': 3,
 'outDir': './classifiers',
 'outName': 'effGenVarClass_test_out',
 'pfx': 'genDiphotonDumper/trees/InsideAcceptance_125_13TeV',
 'recoBranches': ['recoPt', 'recoRapidity', 'recoNjets2p5'],
 'rndseed': 9347865,
 'rndseed2': 2315645,
 'split_frac': 0.75}

In case you need help or further explanation you can execute the help function:

#### We want to use machine learning techniques for classification. For that we need to specify the classifier we want to use like here the GradientBoostingClassifier from sklearn.ensamble. Furthermore the parameters of the used classifier have to be set, e.g. the number of training events, max. tree depth, learning rate and so forth.

In [32]:
ut.params["class"] = [ "sklearn.ensemble.GradientBoostingClassifier", 
                      dict(trainevts=1000,
                           max_depth=5,learning_rate=0.2,n_estimators=200,
                        min_weight_fraction_leaf=1e-3)
]


In [33]:
ut.params

{'InputDir': './classifiers',
 'class': ['sklearn.ensemble.GradientBoostingClassifier',
  {'learning_rate': 0.2,
   'max_depth': 5,
   'min_weight_fraction_leaf': 0.001,
   'n_estimators': 200,
   'trainevts': 1000}],
 'classifiers': ['class'],
 'clean': [],
 'dataDir': './data',
 'dataFname': 'output_InsideAcceptance_125.root',
 'defineBins': {},
 'forceMake': False,
 'genBranches': ['genPt',
  'genRapidity',
  'genJet2p5Pt0',
  'genJet2p5Rapidity0',
  'genJet2p5Pt1',
  'genJet2p5Rapidity1',
  'genJet2p5Pt2',
  'genJet2p5Rapidity2',
  'genJet2p5Pt3',
  'genJet2p5Rapidity3',
  'weight',
  'genNjets2p5'],
 'inputDir': '.',
 'inputName': 'effGenVarClass_test',
 'load': False,
 'ncats': 3,
 'outDir': './classifiers',
 'outName': 'effGenVarClass_test_out',
 'pfx': 'genDiphotonDumper/trees/InsideAcceptance_125_13TeV',
 'recoBranches': ['recoPt', 'recoRapidity', 'recoNjets2p5'],
 'rndseed': 9347865,
 'rndseed2': 2315645,
 'split_frac': 0.75}

$\color{red}{\text{changes in class become also active without ut.setParams(). }}$

In [13]:
#set the parameters "class" defined above
"""
? discuss the function setParams() with Pasquale
"""
ut.setParams()

entered config files named my_train_config
None


In [34]:
reload(tn)
%time effFitter = ut.loadOrMake()

Create object with the name effGenVarClass_test and the following paramters 
{'InputDir': './classifiers',
 'class': ['sklearn.ensemble.GradientBoostingClassifier',
           {'learning_rate': 0.2,
            'max_depth': 5,
            'min_weight_fraction_leaf': 0.001,
            'n_estimators': 200,
            'trainevts': 1000}],
 'classifiers': ['class'],
 'clean': [],
 'dataDir': './data',
 'dataFname': 'output_InsideAcceptance_125.root',
 'defineBins': {},
 'forceMake': False,
 'genBranches': ['genPt',
                 'genRapidity',
                 'genJet2p5Pt0',
                 'genJet2p5Rapidity0',
                 'genJet2p5Pt1',
                 'genJet2p5Rapidity1',
                 'genJet2p5Pt2',
                 'genJet2p5Rapidity2',
                 'genJet2p5Pt3',
                 'genJet2p5Rapidity3',
                 'weight',
                 'genNjets2p5'],
 'inputDir': '.',
 'inputName': 'effGenVarClass_test',
 'load': False,
 'ncats': 3,
 'outDir': './cla

train.EfficiencyFitter

## Train classifiers

# $\color{red}{\text{What happens in the bellow 2 cells?}}$

In [35]:
print(ut.params.get("setupJoblib",False))
if ut.params.get("setupJoblib",False):
    print('hi')
    #ut.setupJoblib(ipp_profile=ut.params.get('ippProfile','gcc49'))

False


###  make sure that the trained classifers have been evaluated

In [36]:
ut.runEvaluation(effFitter)

Index([u'absweight', u'class', u'genJet2p5Pt0', u'genJet2p5Pt1',
       u'genJet2p5Pt2', u'genJet2p5Pt3', u'genJet2p5Rapidity0',
       u'genJet2p5Rapidity1', u'genJet2p5Rapidity2', u'genJet2p5Rapidity3',
       u'genNjets2p5', u'genPt', u'genRapidity', u'recoNjets2p5', u'recoPt',
       u'recoRapidity', u'weight', u'proc', u'absGenRapidity'],
      dtype='object')


### Run the actual training

In [37]:
reload(tn)
%time ut.runTraining(effFitter)

We need to train the following classifiers class
Fitting class
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
{'n_estimators': 200, 'learning_rate': 0.2, 'min_weight_fraction_leaf': 0.001, 'max_depth': 5, 'trainevts': 1000}
cvoptimize False
((1000, 2), 2000)
((1000,), 1000)
((1000,), 1000)
[[  93.78787231    0.80313104]
 [  52.05802155    0.17778152]
 [ 174.98707581    0.2449225 ]]
[ 1  2 -1]
[  3.46860725e-05   1.33893293e-04   4.42525379e-05]
CPU times: user 1min 14s, sys: 236 ms, total: 1min 14s
Wall time: 1min 14s


## Save the output

In [None]:
%time tn.IO.save(effFitter)

### Reload the above effieciency fitter and list some features

In [None]:
%time tn.IO.load(effFitter.name,effFitter.outdir)

"pretty" print the input features of the classifier

In [None]:
pprint(effFitter.clfs)

"pretty" print the columns of the classifier after being exported to a pandas dataframe

In [None]:
pprint(effFitter.df.columns)

In [None]:
df = effFitter.df

Display the first 3 rows of the dataframe

In [None]:
df[0:3]

# Additional code

In [None]:
# set default parameters
# ut.defaultParameters(clean=['class'],classifiers=['class'],setupJoblib=True)
# ut.params["class"]=["xgboost.sklearn.XGBClassifier",
#                     { "trainevts" :-1, "max_depth" : 5,
#                         "learning_rate" : 0.2,"n_estimators" : 250,
#                         "min_child_weight" : 5e-4,
#                         "nthread" : 16,
#                         "cvoptimize" : True,
#                         "cv_params_grid" : { 
#                             "max_depth" : [3, 5, 7, 10],
#                             "learning_rate" : [0.05, 0.1, 0.2], "n_estimators" : [100,250,500],
#                              "min_child_weight" : [1e-4, 5e-4, 1e-3],
#                             "subsample" : [0.1, 0.2, 0.5, 1.]
#                         },
#                     "cv_nfolds" : 5, "cv_niter" : 100, "cv_verbose" : 20
#                     }
#                     ]



# ut.params["recoPt"]= ["xgboost.sklearn.XGBClassifier",
#           { "Xbr" : ["genPt","absGenRapidity"],
#             "trainevts" :-1, "max_depth" : 5,
#             "learning_rate" : 0.1,"n_estimators" : 500,
#             "min_child_weight" : 1e-5,
#             "nthread" : 16,
#             "cvoptimize" : True,
#             "cv_params_grid" : { 
#                     "max_depth" : [5, 7, 10],
#                     "learning_rate" : [0.05, 0.1, 0.2], "n_estimators" : [250,500,700,1000],
#                     "min_child_weight" : [1e-4, 5e-4, 1e-3],
#                     "subsample" : [0.1, 0.2, 0.5, 1.]
#                  },
#             "cv_nfolds" : 4, "cv_niter" : 50, "cv_verbose" : 20      
#           }
#         ]
# ut.params["outName"] = "addRecoPt"

# load additional parameters 
# a list of json files to be loaded can be specified through the the environmental variable my_train_config 