In [4]:
import os
import time

import pandas as pd
pd.set_option('display.max_columns',None)
import uproot
import numpy as np
import pickle

import torch
from torch.nn import Linear
from torch_geometric.nn import GravNetConv
from torch_geometric.data import Data

**_RUN THE FOLLOWING CELL ONLY IF THE HDF5 FILES HAVE NEVER BEEN CREATED, OTHERWISE SKIP THIS_**

In [2]:
indir = '/data_CMS_upgrade/motta/HGCAL_SKIMS/SKIM_18Feb2021'
outdir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1GNN/hdf5dataframes/skimLevel'
os.system('mkdir -p '+outdir)

FE = 'threshold'

######################### IN-FILES ######################### 

inFileTau_dict = {
    'threshold'    : indir+'/SKIM_RelValTenTau_PU200/mergedOutput.root',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileHH_dict = {
    'threshold'    : indir+'/SKIM_GluGluHHTo2b2Tau_PU200/mergedOutput.root',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileNu_dict = {
    'threshold'    : indir+'/SKIM_RelValNu_PU200/mergedOutput.root',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileQCD_dict = {
    'threshold'    : indir+'/SKIM_QCD_PU200/mergedOutput.root',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}


######################### OUT-FILES ######################### 

outFileTau_dict = {
    'threshold'    : outdir+'/RelValTenTau_PU200_th.hdf5',
    'supertrigger' : outdir+'/',
    'bestchoice'   : outdir+'/',
    'bestcoarse'   : outdir+'/',
    'mixed'        : outdir+'/'
}

outFileHH_dict = {
    'threshold'    : outdir+'/GluGluHHTo2b2Tau_PU200_th.hdf5',
    'supertrigger' : outdir+'/',
    'bestchoice'   : outdir+'/',
    'bestcoarse'   : outdir+'/',
    'mixed'        : outdir+'/'
}

outFileNu_dict = {
    'threshold'    : outdir+'/RelValNu_PU200_th.hdf5',
    'supertrigger' : outdir+'/',
    'bestchoice'   : outdir+'/',
    'bestcoarse'   : outdir+'/',
    'mixed'        : outdir+'/'
}

outFileQCD_dict = {
    'threshold'    : outdir+'/QCD_PU200_th.hdf5',
    'supertrigger' : outdir+'/',
    'bestchoice'   : outdir+'/',
    'bestcoarse'   : outdir+'/',
    'mixed'        : outdir+'/'
}


######################### CREATE HDF5 FILES ######################### 

# TTree to be read
treename = 'SkimmedTree'

branches_tc = ['event', 'tc_id', 'tc_subdet', 'tc_zside', 'tc_layer', 'tc_waferu', 'tc_waferv', 
               'tc_wafertype', 'tc_panel_number', 'tc_panel_sector', 'tc_cellu', 'tc_cellv', 'tc_data', 
               'tc_uncompressedCharge', 'tc_compressedCharge',  'tc_pt', 'tc_energy', 'tc_eta', 'tc_phi', 
               'tc_x', 'tc_y', 'tc_z', 'tc_mipPt', 'tc_cluster_id', 'tc_multiuclaster_id', 
               'tc_multicluster_pt']

branches_cl3d = ['event', 'cl3_id', 'cl3d_pt', 'cl3d_energy', 'cl3d_eta', 'cl3d_phi', 'cl3d_clusters_n',
                 'cl3d_showerlength', 'cl3d_coreshowerlength',  'cl3d_firstlayer','cl3d_maxlayer','cl3d_seetot',
                 'cl3d_spptot', 'cl3d_sppmax', 'cl3d_szz', 'cl3d_srrtot', 'cl3d_srrmax', 'cl3d_srrmean', 
                 'cled_emaxe', 'cl3d_hoe', 'cl3d_meanz', 'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90', 
                 'cl3d_ntc67', 'cl3d_ntc90', 'cl3d_bdteg', 'cl3d_uality']

branches_gentau = ['event', 'gentau_pt', 'gentau_eta', 'gentau_phi', 'gentau_energy', 'gentau_mass', 
                   'gentau_vis_pt', 'gentau_vis_eta', 'gentau_vis_phi', 'gentau_vis_energy', 'gentau_vis_mass', 
                   'gentau_decayMode']

branches_genjet = ['event', 'genjet_pt', 'genjet_eta', 'genjet_phi', 'genjet_energy', 'genjet_mass']


#dfTau_gentau = uproot.open(inFileTau_dict[FE])[treename].arrays()
dfTau_gentau = root_pandas.read_root(inFileTau_dict[FE], key=treename, columns=branches_event_cl3d, flatten=False)
#dfTau_cl3d = uproot.open(inFileTau_dict[FE])[treename].tree.pandas.df(branches_cl3d, flatten=False)
#dfTau_tc = uproot.open(inFileTau_dict[FE])[treename].tree.pandas.df(branches_tc, flatten=False)
print(dfTau_gentau)

'''
#TBranches to be stored containing the 3D clusters' info
branches_event_cl3d = ['event','cl3d_pt','cl3d_eta','cl3d_phi','cl3d_showerlength','cl3d_coreshowerlength', 'cl3d_firstlayer','cl3d_maxlayer','cl3d_seetot','cl3d_spptot','cl3d_szz', 'cl3d_srrtot', 'cl3d_srrmean', 'cl3d_hoe', 'cl3d_meanz', 'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90', 'cl3d_ntc67', 'cl3d_ntc90']
branches_cl3d       = ['cl3d_pt','cl3d_eta','cl3d_phi','cl3d_showerlength','cl3d_coreshowerlength', 'cl3d_firstlayer','cl3d_maxlayer','cl3d_seetot','cl3d_spptot','cl3d_szz', 'cl3d_srrtot', 'cl3d_srrmean', 'cl3d_hoe', 'cl3d_meanz', 'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90', 'cl3d_ntc67', 'cl3d_ntc90']
# TBranches to be stored containing the gen taus' info
branches_event_gentau = ['event', 'gentau_pt', 'gentau_eta', 'gentau_phi', 'gentau_energy', 'gentau_mass', 'gentau_vis_pt', 'gentau_vis_eta', 'gentau_vis_phi', 'gentau_vis_energy', 'gentau_vis_mass', 'gentau_decayMode']
branches_gentau       = ['gentau_pt', 'gentau_eta', 'gentau_phi', 'gentau_energy', 'gentau_mass', 'gentau_vis_pt', 'gentau_vis_eta', 'gentau_vis_phi', 'gentau_vis_energy', 'gentau_vis_mass', 'gentau_decayMode']
# TBranches to be stored containing the gen jets' info
branches_event_genjet = ['event', 'genjet_pt', 'genjet_eta', 'genjet_phi', 'genjet_energy', 'genjet_mass']
branches_genjet       = ['genjet_pt', 'genjet_eta', 'genjet_phi', 'genjet_energy', 'genjet_mass']

# fill tau dataframes and dictionaries -> training 
df_tau_cl3d = root_pandas.read_root(inFileTau_dict[FE], key=treename, columns=branches_event_cl3d, flatten=branches_cl3d)
df_tau_gentau = root_pandas.read_root(inFileTau_dict[FE], key=treename, columns=branches_event_gentau, flatten=branches_gentau)
dfTau = pd.concat([df_tau_cl3d,df_tau_gentau],sort=False)
store_tau = pd.HDFStore(outFileTau_dict[FE], mode='w')
store_tau[FE] = dfTau
store_tau.close()

# fill HH dataframes and dictionaries -> validation
df_hh_cl3d = root_pandas.read_root(inFileHH_dict[FE], key=treename, columns=branches_event_cl3d, flatten=branches_cl3d)
df_hh_gentau = root_pandas.read_root(inFileHH_dict[FE], key=treename, columns=branches_event_gentau, flatten=branches_gentau)
dfHH = pd.concat([df_hh_cl3d,df_hh_gentau],sort=False)
store_hh = pd.HDFStore(outFileHH_dict[FE], mode='w')
store_hh[FE] = dfHH
store_hh.close()

# fill nu pileup dataframes and dictionaries
df_nu_cl3d = root_pandas.read_root(inFileNu_dict[FE], key=treename, columns=branches_event_cl3d, flatten=branches_cl3d)
dfNu = pd.concat([df_nu_cl3d],sort=False)
store_nu = pd.HDFStore(outFileNu_dict[FE], mode='w')
store_nu[FE] = dfNu
store_nu.close()

# fill QCD dataframes and dictionaries -> 1/2 training + 1/2 validation 
df_qcd_cl3d = root_pandas.read_root(inFileQCD_dict[FE], key=treename, columns=branches_event_cl3d, flatten=branches_cl3d)
df_qcd_genjet = root_pandas.read_root(inFileQCD_dict[FE], key=treename, columns=branches_event_genjet, flatten=branches_genjet)
dfQCD = pd.concat([df_qcd_cl3d,df_qcd_genjet],sort=False)
dfQCD['gentau_decayMode'] = 4 # tag as QCD background
store_qcd = pd.HDFStore(outFileQCD_dict[FE], mode='w')
store_qcd[FE] = dfQCD
store_qcd.close()
'''

NameError: name 'root_pandas' is not defined

In [None]:
dfTau

**_RUN FROM THE FOLLOWING CELL IF THE HDF5 FILES HAVE ALREADY BEEN CREATED_**

In [None]:
indir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1GNN/hdf5dataframes/skimLevel'

FE = 'threshold'

######################### IN-FILES ######################### 

inFileTau_dict = {
    'threshold'    : indir+'/RelValTenTau_PU200_th.hdf5',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileHH_dict = {
    'threshold'    : indir+'/GluGluHHTo2b2Tau_PU200_th.hdf5',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileNu_dict = {
    'threshold'    : indir+'/RelValNu_PU200_th.hdf5',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

inFileQCD_dict = {
    'threshold'    : indir+'/QCD_PU200_th.hdf5',
    'supertrigger' : indir+'/',
    'bestchoice'   : indir+'/',
    'bestcoarse'   : indir+'/',
    'mixed'        : indir+'/'
}

# fill tau dataframes and dictionaries -> training 
store_tau = pd.HDFStore(inFileTau_dict[FE], mode='r')
dfTau = store_tau[FE]
store_tau.close()

# fill nu pileup dataframes and dictionaries
store_nu = pd.HDFStore(inFileNu_dict[FE], mode='r')
dfNu = store_nu[FE]
store_nu.close()   

# fill HH dataframes and dictionaries -> validation
store_hh = pd.HDFStore(inFileHH_dict[FE], mode='r')
dfHH = store_hh[FE]
store_hh.close()

# fill QCD dataframes and dictionaries -> 1/2 training + 1/2 validation 
store_qcd= pd.HDFStore(inFileQCD_dict[FE], mode='r')
dfQCD = store_qcd[FE]
store_qcd.close() 

In [None]:
bdtcut = 'cl3d_pubdt_passWP99'
# bdtcut = 'cl3d_pubdt_passWP95'
# bdtcut = 'cl3d_pubdt_passWP90'


######################### SELECT EVENTS FOR TRAINING #########################  

# SIGNAL
dfTauTraining = dfTau.query('gentau_vis_pt>20' and 'gentau_vis_eta>1.6' and 'gentau_vis_eta<2.9' 
                            and 'gentau_vis_eta>-2.9' and 'gentau_vis_eta<-1.6' and 'cl3d_isbestmatch==True' 
                            and 'cl3d_pt>4' and 'gentau_decayMode>=0' and bdtcut+'==True')

# BACKGROUND
dfQCDTraining = dfQCD.sample(frac=0.5,random_state=10)
dfQCDValidation = dfQCD.drop(dfQCDTraining.index)

dfQCDTraining = dfQCDTraining.query('genjet_pt>20' and 'genjet_eta>1.6' and 'genjet_eta<2.9' 
                    and 'genjet_eta>-2.9' and 'genjet_eta<-1.6' and 'cl3d_isbestmatch==True' 
                    and 'cl3d_pt>4' and bdtcut+'==True')

# VALIDATION
dfHHValidation = dfHH.query('cl3d_eta>1.6' and 'cl3d_eta<2.9' and 'cl3d_pt_c3>-2.9' and 'cl3d_eta<-1.6' 
                            and 'cl3d_pt_c3>4')

dfQCDValidation = dfQCDValidation.query('cl3d_eta>1.6' and 'cl3d_eta<2.9' and 'cl3d_pt_c3>-2.9' and 'cl3d_eta<-1.6' 
                            and 'cl3d_pt_c3>4')

# MERGE
dfMergedTraining = pd.concat([dfTauTraining,dfQCDTraining],sort=True)
dfMergedValidation = pd.concat([dfHHValidation,dfQCDValidation],sort=False)

dfMergedTraining = dfMergedTraining.astype('float64')
dfMergedValidation = dfMergedValidation.astype('float64')

In [None]:
######################### SELECT SEPARATE DMs #########################

dfTau_DM0 = dfTau.query('gentau_decayMode==0')
dfTau_DM1 = dfTau.query('gentau_decayMode==1')
dfTau_DM10 = dfTau.query('gentau_decayMode==10')
dfTau_DM11 = dfTau.query('gentau_decayMode==11')
dfTau_DM5 = dfTau.query('gentau_decayMode==5')
dfTau_DM2 = dfTau.query('gentau_decayMode==5' or 'gentau_decayMode==6')
dfTau_DM3 = dfTau.query('gentau_decayMode==10' or 'gentau_decayMode==11')
dfHH_DM0 = dfHH.query('gentau_decayMode==0')
dfHH_DM1 = dfHH.query('gentau_decayMode==1')
dfHH_DM10 = dfHH.query('gentau_decayMode==10')
dfHH_DM11 = dfHH.query('gentau_decayMode==11')
dfHH_DM5 = dfHH.query('gentau_decayMode==5')
dfHH_DM2 = dfHH.query('gentau_decayMode==5' or 'gentau_decayMode==6')
dfHH_DM3 = dfHH.query('gentau_decayMode==10' or 'gentau_decayMode==11')

# replace DM>=5 with category numbers 2/3 (REMEMBER: category number 4 is QCD)
dfMergedTraining['gentau_decayMode'].replace([5,6], 2, inplace=True)
dfMergedValidation['gentau_decayMode'].replace([5,6], 2, inplace=True)
dfMergedTraining['gentau_decayMode'].replace([10,11], 3, inplace=True)
dfMergedValidation['gentau_decayMode'].replace([10,11], 3, inplace=True)

In [None]:
features = ['cl3d_pt_c3', 'cl3d_abseta', 'cl3d_showerlength', 'cl3d_coreshowerlength', 'cl3d_firstlayer', 
            'cl3d_maxlayer', 'cl3d_szz', 'cl3d_seetot', 'cl3d_spptot', 'cl3d_srrtot', 'cl3d_srrmean', 'cl3d_hoe', 
            'cl3d_meanz', 'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90', 'cl3d_ntc67', 'cl3d_ntc90']

In [None]:
model = GravNetConv(18,6,4,3,4,aggr='mean')
print(model)

In [None]:
xTrainingTensor = torch.tensor(dfMergedTraining[features].values)
yTrainingTensor = torch.tensor(dfMergedTraining['gentau_decayMode'].values)
data = Data(x=xTrainingTensor,y=yTrainingTensor)

In [None]:
data.x

In [None]:
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Define optimizer.

def train(data):
    optimizer.zero_grad()  # Clear gradients.
    out, h = model(data.x, data.y)   # Perform a single forward pass.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()   # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss, h

In [None]:
train(data)

In [None]:
from torch_geometric.datasets import KarateClub

dataset = KarateClub()

In [None]:
print(dataset.num_features)

In [None]:
(dfMergedTraining[features].values)