# Train classifier to estimate analysis efficiency vs gen variable

- Actual training code in train.py (imported below as `tn`)
- Classifier based on sklearn. Default is GradientBoostingClassifier,
    but can be specified at run time.

## Load libraries

In [None]:
import train as tn
reload(tn)

import plotting
reload(plotting)

import matplotlib.pyplot as plt
#plt.style.use('seaborn-ticks')
plt.style.use('seaborn-poster')
%matplotlib inline
#%matplotlib notebook

import numpy as np

from pprint import pprint

import os
import json
import importlib

## Instantiate helper class

Data are read from ROOT trees and converted into pandas data frames.  
The loading function makes sure that all the needed columns have been read from the trees; otherwise it rebuilds the data frame.


In [None]:
# Run configuration. Every setting read by the cells below is stored in this
# single dict, which can be overridden from JSON files after this cell's defaults.
params= {}
# inputDir="/eos/user/m/musella/data/mod_dep_005"
# Input ROOT file: directory and file name are joined in loadOrMake().
# "pfx" is presumably the path of the tree inside the file -- TODO confirm.
params["dataDir"]="root://t3dcachedb03.psi.ch//pnfs/psi.ch/cms/trivcat/store/user/musella/mod_dep_005"
params["dataFname"] = "output_InsideAcceptance_125.root"
params["pfx"] = "genDiphotonDumper/trees/InsideAcceptance_125_13TeV"

# inputDir/inputName: where a previously saved fitter is looked up.
# outDir/outName: assigned to the fitter (outdir / name) at the end of loadOrMake().
params["inputDir"] = "."
params["inputName"] = "effFitter"
params["outDir"] = "."
params["outName"] = "effFitter_out"

# ncats, genBranches and recoBranches are compared with the values stored in
# the fitter loaded from disk; a mismatch triggers re-reading the data.
params["ncats"] = 3
params["genBranches"] = ["genPt","genRapidity",
            "genJet2p5Pt0","genJet2p5Rapidity0",
            "genJet2p5Pt1","genJet2p5Rapidity1",
            "genJet2p5Pt2","genJet2p5Rapidity2",
            "genJet2p5Pt3","genJet2p5Rapidity3",
            "weight",
            "genNjets2p5"
           ]
params["recoBranches"] = ['recoPt','recoRapidity',"recoNjets2p5"]
# Random seeds (rndseed2 is only referenced by the commented-out split_params below).
params["rndseed"] = 9347865
params["rndseed2"] = 2315645

params["split_frac"] = 0.75
#split_params = {"train_size" : 0.75, "test_size" : 0.25, "random_state" : rndseed2, "stratify" : False }

# load: try to read an existing fitter from disk.
# forceMake: re-read the data even if the stored fitter matches the configuration.
params["load"] = True
params["forceMake"] = False

# Classifier keys handed to cleanClfs() in loadOrMake() (reassigned further down).
params["clean"] = []

# "classifiers" lists the keys to train; each key maps to a two-element list
# [dotted path of the classifier class, dict of keyword arguments for the fit call].
params["classifiers"] = [ "class", "recoPt", "recoNjets2p5" ]
params["class"] = [  "sklearn.ensemble.GradientBoostingClassifier", 
                      dict(trainevts=100000,max_depth=5,learning_rate=0.2,n_estimators=100,
                          min_weight_fraction_leaf=1e-3)
                    ]

params["recoPt"] = [ "sklearn.ensemble.GradientBoostingClassifier",
                      dict(Xbr=['genPt','absGenRapidity'],
                          trainevts=100000,max_depth=7,learning_rate=0.1,n_estimators=500,
                          min_weight_fraction_leaf=1e-4)
                    ]

params["recoNjets2p5"] = [ "sklearn.ensemble.GradientBoostingClassifier",
                            dict(Xbr=['genJet2p5Pt0', 'genJet2p5Rapidity0',
                                    'genJet2p5Pt1', 'genJet2p5Rapidity1',
                                    'genJet2p5Pt2', 'genJet2p5Rapidity2',
                                    'genJet2p5Pt3', 'genJet2p5Rapidity3',
                                    'genPt','absGenRapidity'
                                ],#factorized=True,
                                trainevts=500000,max_depth=5,learning_rate=0.1,
                                n_estimators=100,min_weight_fraction_leaf=1e-4,
                                subsample=0.1,verbose=True)
                         ]
                      
# Bin boundaries per reco variable; each entry is expanded into keyword
# arguments of the fitter's defineBins() call.
params["defineBins"] = { 'recoPt' : dict(boundaries=[0,15,30,60,120,180,200]),
                         'recoNjets2p5' : dict(boundaries=[-0.5,1.5,2.5,3.5,4.5])
}


# NOTE: the assignments below replace the values set above for "clean",
# "class", "recoPt" and "classifiers"; statement order matters in this cell.
# The gradient-boosting settings for "class" and "recoPt" are therefore not
# used unless these overrides are removed.
params["clean"] = ["class", "recoPt", "recoNjets2p5" ]
params["class"] = [  "classify.BinnedFitter", 
                      dict(trainevts=-1,bins=30,ranges=[(0,300),(0,30)]) ]

params["recoPt"] = [  "classify.BinnedFitter", 
                      dict(Xbr=['genPt','absGenRapidity'],includeClassProbs=False,
                           trainevts=-1,bins=180,ranges=[(0,300),(0,30)],addprobs=True) ]


# Only these two classifiers are trained; "recoNjets2p5" stays configured but unused.
params["classifiers"] = ["class", "recoPt"]

# Optional overrides: the environment variable `my_train_config` may hold a
# comma-separated list of JSON files. Each file is merged into `params` in the
# order given, so later files win over earlier ones and over the defaults.
config_files = os.environ.get('my_train_config')
for cfg in (config_files.split(',') if config_files else []):
    print("reading %s" % cfg)
    with open(cfg) as cfg_file:
        params.update(json.load(cfg_file))


# Show the effective configuration.
pprint(params)

def runDefineBins(fitter,binsDef):
    """Define the binning of every configured variable on ``fitter``.

    Parameters
    ----------
    fitter : object
        Anything exposing ``defineBins(name, **kwargs)``.
    binsDef : dict
        Maps a variable name to the dict of keyword arguments forwarded to
        ``fitter.defineBins`` for that variable.
    """
    # Act on the fitter that was passed in (not on a global), and use
    # `.items()` so the loop runs on both Python 2 and Python 3. The loop
    # variable is not called `params` to avoid shadowing the global config.
    for name, binParams in binsDef.items():
        fitter.defineBins(name, **binParams)


def loadOrMake():
    """Return the efficiency fitter described by the global ``params``.

    If ``params["load"]`` is set, a stored fitter is loaded via ``tn.IO.load``.
    The data are (re-)read from the ROOT file when loading is disabled, when
    ``params["forceMake"]`` is set, or when the stored branch lists or number
    of categories differ from the configuration. A mismatch in ``ncats``
    additionally discards the stored fitter and starts from a new
    ``tn.EfficiencyFitter``.

    After reading, the data frame is shuffled reproducibly, the
    ``absGenRapidity`` column is added if missing and the bins in
    ``params["defineBins"]`` are defined. In all cases the classifiers listed
    in ``params["clean"]`` are passed to ``cleanClfs`` and the output
    directory and name are set on the fitter.

    Returns
    -------
    The loaded or newly built fitter.
    """
    name = params["inputName"]
    load = params["load"]
    forceMake = params["forceMake"]

    make = False
    if load:
        # With forceMake the data frame is going to be rebuilt anyway, so it
        # is not loaded (nodata=True) and its columns cannot be printed.
        onDisk = tn.IO.load(name, path=params["inputDir"], nodata=forceMake)
        pprint(onDisk)
        if not forceMake:
            pprint(onDisk.df.columns)
        pprint(onDisk.clfs)
        if (onDisk.genBranches != params["genBranches"]
                or onDisk.recoBranches != params["recoBranches"]):
            make = True
        if onDisk.ncats != params["ncats"]:
            # Different number of categories: start from a fresh fitter.
            make = True
            load = False
    else:
        make = True

    if make or forceMake:
        if not load:
            made = tn.EfficiencyFitter(name)
        else:
            made = onDisk

        # All settings come from `params`; the bare names used previously
        # (ncats, genBranches, recoBranches, pfx, rndseed, split_frac) were
        # never defined and raised NameError on this path.
        fileName = os.path.join(params["dataDir"], params["dataFname"])
        made.readData(params["ncats"], params["genBranches"],
                      params["recoBranches"],
                      [(fileName, None, params["pfx"])])

        print('shuffling dataset')
        # Fixed seed so that the shuffled order is reproducible.
        np.random.seed(params["rndseed"])
        made.df['random_index'] = np.random.permutation(range(made.df.index.size))
        made.df.sort_values(by='random_index',inplace=True)
        made.df.set_index('random_index',inplace=True)
        made.split_frac = params["split_frac"]

        print('defining bins')
        if 'absGenRapidity' not in made.df.columns:
            made.df['absGenRapidity'] = np.abs(made.df['genRapidity'])
        runDefineBins(made, params["defineBins"])

    else:
        made = onDisk

    made.cleanClfs(params["clean"])

    made.outdir = params["outDir"]
    made.name = params["outName"]
    return made

# Load or build the fitter; every following cell works on this object.
effFitter = loadOrMake()


In [None]:
# Bare expression: shows the fitter as the notebook cell output.
effFitter

## Train classifiers

In [None]:
#recoPts = filter(lambda x: "recoPt_prob" in x, effFitter.df.columns)
#rename = { x : x.replace("_prob","Cat_prob") for x in recoPts }
#effFitter.df.rename(columns=rename,inplace=True)

### Control plot functions

In [None]:
def naive_closure(df,column,first=0,logy=False):
    """Bar chart comparing true class counts with summed predicted probabilities.

    Parameters
    ----------
    df : data frame holding the target column and its ``<target>_prob_<i>``
        columns.
    column : classifier key; the target column name is ``target_name(column)``.
    first : index of the first class to draw (earlier ones are skipped).
    logy : if true, use a logarithmic y axis.

    The true counts are a histogram of the target with unit-width bins whose
    first edge is -1.5; the predicted counts are the column sums of the
    probability columns. Both arrays are printed before plotting.
    """
    target = target_name(column)
    nstats = np.unique(df[target]).size
    print(target,nstats)

    prob_columns = ["%s_prob_%d" % (target, icat) for icat in range(nstats)]

    bin_edges = np.arange(-1.5, nstats - 0.5)
    true_counts = np.histogram(df[target], bin_edges)[0].ravel()
    pred_counts = np.array(df[prob_columns].sum(axis=0)).ravel()
    print(true_counts,pred_counts)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # True bars sit on even x positions, predicted bars on the odd ones.
    true_x = np.arange(0, 2 * nstats, 2)
    pred_x = np.arange(1, 2 * nstats + 1, 2)
    true_bars = ax.bar(true_x[first:], true_counts[first:], color='black')
    pred_bars = ax.bar(pred_x[first:], pred_counts[first:], color='red')
    if logy:
        ax.set_yscale('log')

    plt.legend((true_bars, pred_bars), ("true", "predicted"))
    plt.show()

def control_plots(key,fitter):
    """Draw the control plots for the classifier stored under ``key``.

    Parameters
    ----------
    key : classifier key in ``fitter.clfs``.
    fitter : fitter holding the trained classifiers (``clfs``) and the data
        frame (``df``) with the ``<target>_prob_<i>`` columns.

    Produces box plots of each probability column grouped by the target, a
    scatter/histogram matrix of the probability columns, and two closure
    plots: all classes on a log scale, then all but the first on a linear one.
    """
    target = target_name(key)

    nclasses = len(fitter.clfs[key].classes_)
    # A real list is needed for the slicing below; `range` and a list
    # comprehension behave the same on Python 2 and Python 3.
    columns = ["%s_prob_%d" % (target, icls) for icls in range(nclasses)]

    # Rotate so that the first probability column is plotted last.
    columns = columns[1:] + columns[:1]

    # Explicit floor division keeps the row count an integer on Python 3.
    nrows = nclasses // 3 + 1
    ncols = 3
    fitter.df.boxplot(by=target,column=columns,figsize=(7*ncols,7*nrows),layout=(nrows,ncols))

    plotting.scatter_hist(fitter.df,columns,figsize=(28,28))

    naive_closure(fitter.df,key,logy=True)
    naive_closure(fitter.df,key,first=1,logy=False)

### Make sure that the trained classifiers have been evaluated

In [None]:
def target_name(key):
    """Return the name of the target column for classifier ``key``.

    ``'class'`` is returned unchanged. Any other key gets the suffix ``'Bin'``
    when its training options in ``params[key][1]`` set ``factorized`` to a
    true value, and ``'Cat'`` otherwise.
    """
    if key == 'class':
        return key
    factorized = params[key][1].get('factorized', False)
    return key + ('Bin' if factorized else 'Cat')

# Configured classifiers that are already present in the fitter.
clf_keys = [clf_name for clf_name in params["classifiers"]
            if clf_name in effFitter.clfs.keys()]

for key in clf_keys:
    target = target_name(key)
    # The presence of the first probability column is used as the marker that
    # predictions for this classifier are already in the data frame.
    if ('%s_prob_0' % (target)) in effFitter.df.columns:
        continue
    print('running prediction for %s' % key)
    effFitter.runPrediction(target,effFitter.clfs[key])


effFitter.df.columns

### Run the actual training

In [None]:
to_train = filter(lambda x: x not in effFitter.clfs.keys(), params["classifiers"])

for key in to_train:
    classifier,train_params = params[key]
    pack,cls = classifier.rsplit('.',1)
    classifier = getattr(importlib.import_module(pack),cls)
    print("Fitting %s" % key)
    print(classifier)
    print(train_params)
    if key == 'class':
        %time effFitter.fitClass(classifier=classifier,**train_params)
    else:
        %time effFitter.fitBins(key,classifier=classifier,**train_params)
    control_plots(key,effFitter)

## save output

In [None]:
# Final check before saving: the classifiers held by the fitter and the
# columns of its data frame.
pprint(effFitter.clfs)
pprint(effFitter.df.columns)

In [None]:
# Save the fitter through the project's IO helper.
# NOTE(review): presumably written under the outdir/name set in loadOrMake() -- confirm in train.IO.
tn.IO.save(effFitter)