# OPTIMIZATION OF THE FEATURES USED FOR THE ISOLATION

The starting point is the set of 14 variables that were studied in the Jupyter notebooks `towerIsoInspection` and `isoPloting`. These variables are:

![basic_features](img/basic_features.png)

This notebook aims to improve these variables and, if possible, to reduce their number.

In [1]:
import os
import time
import sys
import pandas as pd
import numpy as np
import ROOT
pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows', None)

import matplotlib
from matplotlib import pyplot as plt

Welcome to JupyROOT 6.14/04


In [2]:
def deltar2cluster ( df ):
    """Return the eta-phi distance between each cluster and its associated cluster.

    Reads the columns ``cl3d_eta``/``cl3d_phi`` (cluster) and
    ``cl3d_eta_ass``/``cl3d_phi_ass`` (associated cluster) of *df* and returns
    sqrt(d_eta**2 + d_phi**2) element-wise, with the phi difference folded
    across the +-pi boundary.
    """
    eta_gap = np.abs(df['cl3d_eta'] - df['cl3d_eta_ass'])
    phi_gap = np.abs(df['cl3d_phi'] - df['cl3d_phi_ass'])

    # A gap larger than pi is measured the short way round: subtracting a full
    # turn gives -(2*pi - gap), and the sign disappears when squaring below.
    crosses_boundary = phi_gap > np.pi
    phi_gap = phi_gap - crosses_boundary*(2*np.pi)

    return np.sqrt( eta_gap**2 + phi_gap**2 )

def L1Cl3dEtIso ( dfL1Candidates, dfL1associated2Candidates, dR ):
    """Add cluster-based isolation columns to *dfL1Candidates*, in place.

    Every candidate is paired with the clusters of *dfL1associated2Candidates*
    sharing the same ``event``; pairs with 0.0001 < deltaR <= dR are kept (the
    lower bound discards a cluster paired with itself). Two columns are added:

    * ``cl3d_etIso_dR<XX>``  : sum of ``cl3d_pt_c3`` of the kept clusters
    * ``cl3d_NclIso_dR<XX>`` : number of kept clusters

    where ``<XX>`` is ``dR*100`` rounded to the nearest integer.

    Parameters
    ----------
    dfL1Candidates : DataFrame with ``event`` as index (or column); modified in place.
    dfL1associated2Candidates : DataFrame indexed by ``event``.
    dR : radius of the isolation cone.

    Returns None. On exit *dfL1Candidates* is indexed by ``event`` and every
    NaN it contains (in any column) has been replaced by 0.0.
    """
    df_joined  = dfL1Candidates.join(dfL1associated2Candidates, on='event', how='left', rsuffix='_ass', sort=False)

    df_joined['deltar2cluster'] = deltar2cluster(df_joined)
    sel = (df_joined['deltar2cluster'] <= dR) & (df_joined['deltar2cluster'] > 0.0001)
    pt_in_cone = df_joined[sel].groupby(['event', 'cl3d_pt_c3'])['cl3d_pt_c3_ass']

    # round before truncating: e.g. 0.29*100 == 28.999999999999996 would otherwise be labelled 28
    dR_tag = int(round(dR*100))

    # candidates are identified by (event, cl3d_pt_c3) so that the grouped sums align on assignment
    dfL1Candidates.reset_index(inplace=True)
    dfL1Candidates.set_index(['event', 'cl3d_pt_c3'], inplace=True)

    dfL1Candidates['cl3d_etIso_dR{0}'.format(dR_tag)] = pt_in_cone.sum()
    dfL1Candidates['cl3d_NclIso_dR{0}'.format(dR_tag)] = pt_in_cone.size()

    dfL1Candidates.reset_index(inplace=True)
    dfL1Candidates.set_index('event',inplace=True)
    # candidates with nothing inside the cone got NaN from the alignment: they are 0
    dfL1Candidates.fillna(0.0,inplace=True)

def deltar2tower ( df ):
    """Return the eta-phi distance between each cluster and the tower on the same row.

    Reads the columns ``cl3d_eta``/``cl3d_phi`` (cluster) and
    ``tower_eta``/``tower_phi`` (tower) of *df* and returns
    sqrt(d_eta**2 + d_phi**2) element-wise, with the phi difference folded
    across the +-pi boundary.
    """
    eta_gap = np.abs(df['cl3d_eta'] - df['tower_eta'])
    phi_gap = np.abs(df['cl3d_phi'] - df['tower_phi'])

    # A gap larger than pi is measured the short way round: subtracting a full
    # turn gives -(2*pi - gap), and the sign disappears when squaring below.
    crosses_boundary = phi_gap > np.pi
    phi_gap = phi_gap - crosses_boundary*(2*np.pi)

    return np.sqrt( eta_gap**2 + phi_gap**2 )

def L1TowerEtIso ( dfL1Candidates, dfL1Towers, dRsgn, dRiso, dRisoEm, dRisoHad ):
    """Add tower-based signal and isolation columns to *dfL1Candidates*, in place.

    Every candidate is paired with the towers of *dfL1Towers* sharing the same
    ``event``. Towers with deltaR <= dRsgn form the signal cone; towers with
    dRsgn < deltaR <= R form an isolation annulus, where R is ``dRiso`` for
    the total sums, ``dRisoEm`` for the EM sum and ``dRisoHad`` for the
    hadronic sum. Columns added (radii are written as ``R*100`` rounded to the
    nearest integer):

    * ``tower_etSgn_dRsgn<S>``, ``tower_eSgn_dRsgn<S>``           : ``tower_pt`` / ``tower_energy`` in the signal cone
    * ``tower_etIso_dRsgn<S>_dRiso<I>``, ``tower_eIso_dRsgn<S>_dRiso<I>`` : same sums in the ``dRiso`` annulus
    * ``tower_etEmIso_dRsgn<S>_dRiso<E>``                          : ``tower_etEm`` in the ``dRisoEm`` annulus
    * ``tower_etHadIso_dRsgn<S>_dRiso<H>``                         : ``tower_etHad`` in the ``dRisoHad`` annulus

    Parameters
    ----------
    dfL1Candidates : DataFrame with ``event`` as index (or column); modified in place.
    dfL1Towers : DataFrame indexed by ``event``.
    dRsgn, dRiso, dRisoEm, dRisoHad : cone radii as described above.

    Returns None. On exit *dfL1Candidates* is indexed by ``event`` and every
    NaN it contains (in any column) has been replaced by 0.0.
    """
    df_joined  = dfL1Candidates.join(dfL1Towers, on='event', how='inner', rsuffix='_tow', sort=False) # use 'inner' so that only the events present in the candidates dataframe are actually joined

    df_joined['deltar2tower'] = deltar2tower(df_joined)
    outside_sgn = df_joined['deltar2tower'] > dRsgn

    def _cone_sum ( sel, column ):
        # per-candidate sum of `column` over the towers selected by `sel`
        return df_joined[sel].groupby(['event', 'cl3d_pt'])[column].sum()

    sel_sgn = (df_joined['deltar2tower'] <= dRsgn)
    sel_iso = (df_joined['deltar2tower'] <= dRiso) & outside_sgn
    # the EM and hadronic sums use their own outer radius, matching the radius written in their column name
    sel_iso_em = (df_joined['deltar2tower'] <= dRisoEm) & outside_sgn
    sel_iso_had = (df_joined['deltar2tower'] <= dRisoHad) & outside_sgn

    # round before truncating: e.g. 0.29*100 == 28.999999999999996 would otherwise be labelled 28
    sgn_tag = int(round(dRsgn*100))
    iso_tag = int(round(dRiso*100))
    em_tag = int(round(dRisoEm*100))
    had_tag = int(round(dRisoHad*100))

    # candidates are identified by (event, cl3d_pt) so that the grouped sums align on assignment
    dfL1Candidates.reset_index(inplace=True)
    dfL1Candidates.set_index(['event', 'cl3d_pt'], inplace=True)

    dfL1Candidates['tower_etSgn_dRsgn{0}'.format(sgn_tag)] = _cone_sum(sel_sgn, 'tower_pt')
    dfL1Candidates['tower_eSgn_dRsgn{0}'.format(sgn_tag)] = _cone_sum(sel_sgn, 'tower_energy')

    dfL1Candidates['tower_etIso_dRsgn{0}_dRiso{1}'.format(sgn_tag, iso_tag)] = _cone_sum(sel_iso, 'tower_pt')
    dfL1Candidates['tower_eIso_dRsgn{0}_dRiso{1}'.format(sgn_tag, iso_tag)] = _cone_sum(sel_iso, 'tower_energy')
    dfL1Candidates['tower_etEmIso_dRsgn{0}_dRiso{1}'.format(sgn_tag, em_tag)] = _cone_sum(sel_iso_em, 'tower_etEm')
    dfL1Candidates['tower_etHadIso_dRsgn{0}_dRiso{1}'.format(sgn_tag, had_tag)] = _cone_sum(sel_iso_had, 'tower_etHad')

    dfL1Candidates.reset_index(inplace=True)
    dfL1Candidates.set_index('event',inplace=True)
    # candidates with no tower inside a cone got NaN from the alignment: they are 0
    dfL1Candidates.fillna(0.0,inplace=True)

def IsoCalculation(dfTr, dfTowers, dRsgn, dRiso, dRisoEm, dRisoHad, dRisoCl, mode='candidate'):
    """Compute cluster and tower isolation for the L1 candidates of *dfTr*.

    The clusters of *dfTr* are split into candidates and remaining
    ("associated") clusters:

    * ``mode == 'Nu'``: the candidates are, for each event, the highest
      ``cl3d_pt_c3`` cluster among those with ``cl3d_pubdt_passWP99`` set;
      every other cluster is an associated one.
    * any other mode: candidates have ``cl3d_isbestmatch == True``, associated
      clusters have ``cl3d_isbestmatch == False``.

    Each endcap (``cl3d_eta >= 0`` / ``cl3d_eta < 0``) is then processed
    separately with ``L1Cl3dEtIso`` (cone ``dRisoCl``) and ``L1TowerEtIso``
    (cones ``dRsgn``, ``dRiso``, ``dRisoEm``, ``dRisoHad``).

    Parameters
    ----------
    dfTr : cluster DataFrame, expected to be indexed by ``event``; not modified.
    dfTowers : tower DataFrame, expected to be indexed by ``event``.
    dRsgn, dRiso, dRisoEm, dRisoHad, dRisoCl : cone radii forwarded to the helpers.
    mode : ``'Nu'`` for the neutrino-style selection, anything else for best-match.

    Returns
    -------
    DataFrame holding only the candidate clusters (positive endcap first, then
    negative) with the isolation columns added.
    """
    if mode == 'Nu':
        print('       doing nu special L1 candidate selection')
        # reset_index() returns a new frame, so the caller's dataframe is left untouched;
        # its fresh unique row labels are used below to remove exactly the candidates
        dfTr = dfTr.reset_index()
        dfL1CandidatesTr = dfTr.query('cl3d_pubdt_passWP99==True').copy(deep=True) # selecting WP 99 we select also 95 and 90
        dfL1CandidatesTr.sort_values('cl3d_pt_c3', inplace=True)
        dfL1CandidatesTr.drop_duplicates('event', keep='last', inplace=True) # keep only highest pt cluster
        # drop the candidates by row label: matching on the pt value could also remove
        # unrelated clusters that happen to have the same pt as some candidate
        dfL1ass2candTr = dfTr.drop(dfL1CandidatesTr.index)
        dfL1CandidatesTr.set_index('event', inplace=True)
        dfL1CandidatesTr.sort_values('event', inplace=True)
        dfL1ass2candTr.set_index('event', inplace=True)
        dfL1ass2candTr.sort_values('event', inplace=True)

    else:
        dfL1CandidatesTr = dfTr.query('cl3d_isbestmatch==True').copy(deep=True)
        dfL1ass2candTr = dfTr.query('cl3d_isbestmatch==False').copy(deep=True)

    print('    Dataset')

    # the two endcaps are processed separately so that each join handles fewer rows
    dfCandidatesPerEndcap = []
    for endcap, cl3d_cut, tower_cut in (('Positive', 'cl3d_eta>=0', 'tower_eta>=0'),
                                        ('Negative', 'cl3d_eta<0', 'tower_eta<0')):
        dfCand = dfL1CandidatesTr.query(cl3d_cut).copy(deep=True)
        dfAss = dfL1ass2candTr.query(cl3d_cut).copy(deep=True)
        dfTow = dfTowers.query(tower_cut).copy(deep=True)

        print('        {0} endcap'.format(endcap))
        # both helpers add their columns to dfCand in place
        L1Cl3dEtIso(dfCand, dfAss, dRisoCl)
        L1TowerEtIso(dfCand, dfTow, dRsgn, dRiso, dRisoEm, dRisoHad)
        dfCandidatesPerEndcap.append(dfCand)

    dfOut = pd.concat(dfCandidatesPerEndcap, sort=False)

    return dfOut

Open the required files and select the Tau, QCD and Nu information.

In [3]:
# Directories holding the input HDF5 dataframes
indir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1BDT/hdf5dataframes/PUrejected'
matchdir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1BDT/hdf5dataframes/matched'

# Key selecting which entry of the dictionaries below is used
# (it is also the key under which the dataframes are stored in the HDF5 files)
name = 'threshold'

# NOTE: the 'mixed' entries only contain the directory, no file name

# training / validation clusters
inFileTraining_dict = dict(
    threshold = indir+'/Training_PU200_th_PUrejected.hdf5',
    mixed     = indir+'/',
)

inFileValidation_dict = dict(
    threshold = indir+'/Validation_PU200_th_PUrejected.hdf5',
    mixed     = indir+'/',
)

# towers of the three samples
inFileSingleTauTowers_dict = dict(
    threshold = matchdir+'/RelValSingleTau_PU200_th_towers.hdf5',
    mixed     = matchdir+'/',
)

inFileQCDTowers_dict = dict(
    threshold = matchdir+'/QCD_PU200_th_towers.hdf5',
    mixed     = matchdir+'/',
)

inFileNuTowers_dict = dict(
    threshold = matchdir+'/RelValNu_PU200_th_towers.hdf5',
    mixed     = matchdir+'/',
)

In [4]:
def _load_frame(path, key):
    """Return the dataframe stored under *key* in the HDF5 file at *path*."""
    # the context manager closes the file even if the read raises
    with pd.HDFStore(path, mode='r') as store:
        return store[key]

# training and validation clusters are merged once, then split by the value of 'dataset'
dfAll = pd.concat([_load_frame(inFileTraining_dict[name], name),
                   _load_frame(inFileValidation_dict[name], name)], sort=False)
dfTau = dfAll.query('dataset==2').copy(deep=True)
dfQCD = dfAll.query('dataset==3').copy(deep=True)
dfNu  = dfAll.query('dataset==4').copy(deep=True)
del dfAll

# towers of the three samples
dfTauTowers = _load_frame(inFileSingleTauTowers_dict[name], name)
dfQCDTowers = _load_frame(inFileQCDTowers_dict[name], name)
dfNuTowers = _load_frame(inFileNuTowers_dict[name], name)

Calculate the isolation features.

In [None]:
# Radii scanned for the signal cone and for the outer edge of the isolation annulus
dRsgn_list = [0.1, 0.15, 0.2, 0.25]
dRiso_list = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

start = time.time()

# IsoCalculation returns only the candidate clusters (cl3d_isbestmatch==True).
# Feeding that output straight back into the next iteration would leave no
# cl3d_isbestmatch==False clusters to sum over, and every cluster-isolation
# column after the first iteration would silently be 0. The non-candidate
# clusters are therefore kept aside and handed back in at every call.
dfTauIso, dfTauAss = dfTau.query('cl3d_isbestmatch==True'), dfTau.query('cl3d_isbestmatch==False')
dfQCDIso, dfQCDAss = dfQCD.query('cl3d_isbestmatch==True'), dfQCD.query('cl3d_isbestmatch==False')
dfNuIso, dfNuAss = dfNu.query('cl3d_isbestmatch==True'), dfNu.query('cl3d_isbestmatch==False')
# NOTE(review): the Nu sample is processed with the default best-match selection;
# IsoCalculation also offers mode='Nu' - confirm which one is intended here.

for dRsgn in dRsgn_list:
    for dRiso in dRiso_list:
        # the isolation annulus must extend beyond the signal cone
        if dRiso <= dRsgn: continue

        print('dRsgn={0} ; dRiso={1}'.format(dRsgn,dRiso))

        # the same outer radius is used for the total, EM, hadronic and cluster isolation
        dfTauIso = IsoCalculation(pd.concat([dfTauIso, dfTauAss], sort=False), dfTauTowers, dRsgn, dRiso, dRiso, dRiso, dRiso)
        dfQCDIso = IsoCalculation(pd.concat([dfQCDIso, dfQCDAss], sort=False), dfQCDTowers, dRsgn, dRiso, dRiso, dRiso, dRiso)
        dfNuIso = IsoCalculation(pd.concat([dfNuIso, dfNuAss], sort=False), dfNuTowers, dRsgn, dRiso, dRiso, dRiso, dRiso)

del dfTauAss, dfQCDAss, dfNuAss

end = time.time()
print('\nRunning time = %02dh %02dm %02ds'%((end-start)/3600, ((end-start)%3600)/60, (end-start)% 60))

# save the candidates with their isolation columns; the context manager
# closes each output file even if the write fails
for outFile, dfOut in (('./dfTauIso.hdf5', dfTauIso),
                       ('./dfQCDIso.hdf5', dfQCDIso),
                       ('./dfNuIso.hdf5', dfNuIso)):
    with pd.HDFStore(outFile, mode='w') as store:
        store[name] = dfOut

dRsgn=0.1 ; dRiso=0.2
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
dRsgn=0.1 ; dRiso=0.25
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
dRsgn=0.1 ; dRiso=0.3
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
dRsgn=0.1 ; dRiso=0.35
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
dRsgn=0.1 ; dRiso=0.4
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Negative endcap
    Dataset
        Positive endcap
        Nega

In [None]:
def simple_plotter(variable, dfTauIso, dfQCDIso=None, dfNuIso=None):
    """Scatter-plot *variable* of the tau sample against cl3d_abseta and cl3d_pt_c3.

    The two panels are placed side by side in the top row of a 2x2 grid.
    ``dfQCDIso`` and ``dfNuIso`` are accepted but currently not drawn.
    """
    fig = plt.figure(figsize = (15,10))

    # (subplot position, column on the x axis) of each panel
    for position, x_column in ((221, 'cl3d_abseta'), (222, 'cl3d_pt_c3')):
        panel = fig.add_subplot(position)
        panel.scatter(dfTauIso[x_column], dfTauIso[variable], alpha=0.1, label='Tau')
        panel.set_xlabel(x_column)
        panel.set_ylabel(variable)

    fig.suptitle(variable)

In [None]:
def complex_plotter(variable, dfTauIso, dfQCDIso=None, dfNuIso=None):
    """Histogram *variable* separately for each ``gentau_bin_eta`` value from 1 to 14.

    One panel is drawn per eta bin, side by side (the figure is 7 inches wide
    per panel). The tau sample is always drawn; the QCD and Nu samples are
    overlaid only when ``dfQCDIso`` / ``dfNuIso`` are given.

    Parameters
    ----------
    variable : name of the column to histogram.
    dfTauIso : DataFrame of the tau sample.
    dfQCDIso, dfNuIso : optional DataFrames of the QCD and Nu samples.
    """
    n_eta_bins = 14
    samples = [('Tau', dfTauIso), ('QCD', dfQCDIso), ('Nu', dfNuIso)]

    plt.figure(figsize = (7*n_eta_bins,10))
    for eta_bin in range(1, n_eta_bins+1):
        # grid positions are 0-based: eta bin k goes to column k-1
        ax = plt.subplot2grid((1,n_eta_bins), (0,eta_bin-1))
        for label, df in samples:
            if df is None: continue
            ax.hist(df.query('gentau_bin_eta=={0}'.format(eta_bin))[variable], lw=2, histtype='step', label=label)
        ax.set_title('gentau_bin_eta = {0}'.format(eta_bin))
        if eta_bin == 1:
            ax.legend(loc = 'upper right')

    plt.suptitle(variable)
    plt.show()

In [None]:
def superimposed_plotter(variable, dfTauIso, dfQCDIso=None, dfNuIso=None):
    """Overlay histograms of *variable* for the tau sample in pairs of ``gentau_bin_eta`` values.

    Bins are merged two by two - (0,1), (2,3), ..., (12,13) - and each pair is
    labelled with its first bin. ``dfQCDIso`` and ``dfNuIso`` are accepted but
    not drawn.
    """
    plt.figure(figsize = (15,10))

    # NOTE(review): complex_plotter uses gentau_bin_eta values 1..14 while the
    # pairs here cover 0..13 - confirm which numbering the data uses.
    hist_edges = np.arange(4,100,5)
    for first_bin in range(0, 14, 2):
        pair_selection = 'gentau_bin_eta=={0} or gentau_bin_eta=={1}'.format(first_bin,first_bin+1)
        values = dfTauIso.query(pair_selection)[variable]
        plt.hist(values, lw=2, histtype='step', label=r'eta_bin {0}'.format(first_bin), bins=hist_edges, alpha=0.5)

    plt.legend(loc = 'upper right', fontsize=15)
    plt.show()

In [None]:
# Per-eta-bin histograms of the tower isolation sum (dRsgn=0.10, dRiso=0.50) for the tau sample
complex_plotter('tower_etIso_dRsgn10_dRiso50', dfTauIso)

In [None]:
# Overlaid histograms, in pairs of eta bins, of the tower isolation sum (dRsgn=0.10, dRiso=0.30) for the tau sample
superimposed_plotter('tower_etIso_dRsgn10_dRiso30', dfTauIso)

In [None]:
# Scatter plots of the tower isolation sum (dRsgn=0.10, dRiso=0.50) versus cl3d_abseta and cl3d_pt_c3 for the tau sample
simple_plotter('tower_etIso_dRsgn10_dRiso50', dfTauIso)

In [None]:
# Mean of every column of the tau sample for each gentau_bin_eta value (shown as the cell output)
dfTauIso.groupby('gentau_bin_eta').mean()