In [1]:
import uproot
import pandas as pd
import numpy as np
import boost_histogram as bh
import matplotlib.pyplot as plt
import pickle
import atlasplots as ap
%matplotlib inline

In [2]:
folder = '/eos/atlas/atlascerngroupdisk/perf-egamma/InclusivePhotons'
branches = ['evtWeight', 'mcWeight', 'mcTotWeight', 'yWeight', 'y_passOQ', 'y_pt', 'y_eta', 
            'y_isTruthMatchedPhoton', 'y_convType', 'y_Rhad1', 'y_Rhad',       #do we need y_convType thats not truth?
            'y_Reta', 'y_weta2', 'y_Rphi', 'y_wtots1', 'y_weta1', 'y_fracs1', 'y_deltae', 'y_Eratio', 'y_f1']

This notebook could be turned into a python script that makes `.pickle` or `.npz` files for the training script. Would then have to adapt it to take all entries, not just first low-$p_T$ events. Might run into memory problems

In [3]:
# RIGHT NOW JUST TESTING WITH mc20a AND 250000 ENTRIES
# can change later
    
df_mc20a_gjfull = ap.fileloader(folder+'/mc20_gammajet_v09/PyPt8_inf_mc20a_p5536_Rel22_AB22.2.97_v09.root',branches,entry_stop=250000)
df_mc20a_jjfull = ap.fileloader(folder+'/mc20_jetjet_v09/Py8_jetjet_mc20a_p5536_Rel22_AB22.2.97_v09.root',branches,entry_stop=250000)
df_mc20d_gjfull = ap.fileloader(folder+'/mc20_gammajet_v09/PyPt8_inf_mc20d_p5536_Rel22_AB22.2.97_v09.root',branches) #just 1000 without specification
df_mc20d_jjfull = ap.fileloader(folder+'/mc20_jetjet_v09/Py8_jetjet_mc20d_p5536_Rel22_AB22.2.97_v09.root',branches)
df_mc20e_gjfull = ap.fileloader(folder+'/mc20_gammajet_v09/PyPt8_inf_mc20e_p5536_Rel22_AB22.2.97_v09.root',branches)
df_mc20e_jjfull = ap.fileloader(folder+'/mc20_jetjet_v09/Py8_jetjet_mc20e_p5536_Rel22_AB22.2.97_v09.root',branches)

In [4]:
# COMBINING a, d, e SLICES

df_mc20_gjfull = pd.concat([df_mc20a_gjfull, df_mc20d_gjfull, df_mc20e_gjfull])
df_mc20_jjfull = pd.concat([df_mc20a_jjfull, df_mc20d_jjfull, df_mc20e_jjfull])

In [5]:
# CREATING GOOD WEIGHTS WITHOUT PHOTON SFs

df_mc20_gjfull['goodWeight'] = df_mc20_gjfull['mcTotWeight']/df_mc20_gjfull['yWeight']
df_mc20_jjfull['goodWeight'] = df_mc20_jjfull['mcTotWeight']/df_mc20_jjfull['yWeight']

In [6]:
# CREATING HadLeakage VARIABLE:

df_mc20_gjfull['HadLeakage'] = ap.makehadlist(df_mc20_gjfull)
df_mc20_jjfull['HadLeakage'] = ap.makehadlist(df_mc20_jjfull)

In [7]:
# PASSING OBJECT QUALITY
df_mc20_gjfull = df_mc20_gjfull[df_mc20_gjfull.y_passOQ]
df_mc20_jjfull = df_mc20_jjfull[df_mc20_gjfull.y_passOQ]

# TRUTH MATCHING THE REAL AND FAKE photons

df_mc20_gj = df_mc20_gjfull[df_mc20_gjfull.y_isTruthMatchedPhoton]
# df_mc20_gj.index = list(range(len(df_mc20_gj)))   #resetting indices
df_mc20_jj = df_mc20_jjfull[~df_mc20_jjfull.y_isTruthMatchedPhoton]
# df_mc20_jj.index = list(range(len(df_mc20_gj)))   #resetting indices

In [8]:
# APPLYING ETA PRESELECTION
#1.37 ≤ |η| ≤ 1.52 & |η| < 2.37 

etapresel_gj = ((abs(df_mc20_gj.y_eta) <= 1.37) | (abs(df_mc20_gj.y_eta) >= 1.52)) & (abs(df_mc20_gj.y_eta) < 2.37)
etapresel_jj = ((abs(df_mc20_jj.y_eta) <= 1.37) | (abs(df_mc20_jj.y_eta) >= 1.52)) & (abs(df_mc20_jj.y_eta) < 2.37)


df_mc20_gj = df_mc20_gj[etapresel_gj]
df_mc20_jj = df_mc20_jj[etapresel_jj]

In [9]:
# APPLYING E_T PRESELECTION
# ET > 25 GeV ⇒ signal enriched sample

# ? is E_T the same as p_T? Do I even need to do this?

In [10]:
df_mc20_gj

Unnamed: 0,evtWeight,mcWeight,mcTotWeight,yWeight,y_passOQ,y_pt,y_eta,y_isTruthMatchedPhoton,y_convType,y_Rhad1,...,y_weta2,y_Rphi,y_wtots1,y_weta1,y_fracs1,y_deltae,y_Eratio,y_f1,goodWeight,HadLeakage
0,0.694366,1.0,1376.094295,1.012471,True,15.728769,0.401363,True,0,0.012368,...,0.011130,0.953994,2.065897,0.558274,0.218920,43.794846,0.956904,0.166216,1359.144458,0.012368
1,0.231644,1.0,459.071480,0.944613,True,16.643381,1.317189,True,0,0.007513,...,0.010982,0.814478,2.255660,0.637920,0.270443,1.123573,0.883094,0.306469,485.989206,0.008259
2,1.083721,1.0,2147.718576,0.955260,True,11.592933,2.353105,True,0,-0.012112,...,0.011877,0.940090,1.658180,0.543524,0.106485,276.999481,0.703117,0.106133,2248.308061,-0.012112
3,1.042794,1.0,2066.610245,0.944097,True,14.840798,1.032143,True,0,-0.000838,...,0.008600,0.985699,2.688000,0.604637,0.372434,24.996504,0.638554,0.032494,2188.981572,-0.004752
4,0.997500,1.0,1976.846219,1.000000,True,9.218862,-0.988313,True,0,0.000250,...,0.010375,0.972798,1.815609,0.573713,0.323983,13.994884,0.896193,0.153142,1976.846219,0.003601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.012271,1.0,1906.840090,0.972677,True,10.823894,0.642223,True,0,0.009294,...,0.011273,0.928582,1.845362,0.603263,0.254370,36.967400,0.936974,0.378895,1960.403629,0.009294
996,0.979401,1.0,1844.922717,1.000000,True,9.040781,-1.867435,True,0,-0.014750,...,0.011547,0.954865,-9066.329102,0.517254,0.023107,64.174339,0.952581,0.070673,1844.922717,-0.014750
997,0.657147,1.0,1237.884979,1.000000,True,9.350460,-1.556191,True,0,0.018199,...,0.009776,0.980276,2.320039,0.611157,0.209377,47.000000,0.928269,0.154816,1237.884979,0.018199
998,1.264885,1.0,2382.696457,1.071660,True,17.309237,0.520234,True,3,0.030799,...,0.009231,1.011031,1.930917,0.584344,0.166342,56.000389,0.949240,0.312131,2223.370681,0.030799


In [11]:
# COMBINING gj AND jj  (for correct standardization)

df_mc20_all = pd.concat([df_mc20_gj,df_mc20_jj])

In [12]:
# CREATING STANDARDIZED VARIABLES
# *fix this later to just apply to 'all' df. but then couldnt save gj and jj separately hmm.

branchlist = ap.branchlist[2:]
minmaxlist = ap.minmaxlist[2:]
labellist = ap.labellist[2:]

for i in range(len(branchlist)):
    branchname = branchlist[i]
    label = labellist[i]
    minmax = minmaxlist[i]
    datagj = np.array(df_mc20_gj[branchname])
    datajj = np.array(df_mc20_jj[branchname])
    data = np.array(df_mc20_all[branchname])
    standlistgj = (datagj - np.mean(data))/np.std(data)
    standlistjj = (datajj - np.mean(data))/np.std(data)
    df_mc20_gj[branchname+'_stand'] = standlistgj
    df_mc20_jj[branchname+'_stand'] = standlistjj

In [13]:
# do more weighting here (ET and eta matching?), also get rid of the columns that aren't needed.
#also, add in eta preselection from FLorian slides.
df_mc20_gj.keys()

Index(['evtWeight', 'mcWeight', 'mcTotWeight', 'yWeight', 'y_passOQ', 'y_pt',
       'y_eta', 'y_isTruthMatchedPhoton', 'y_convType', 'y_Rhad1', 'y_Rhad',
       'y_Reta', 'y_weta2', 'y_Rphi', 'y_wtots1', 'y_weta1', 'y_fracs1',
       'y_deltae', 'y_Eratio', 'y_f1', 'goodWeight', 'HadLeakage',
       'HadLeakage_stand', 'y_Reta_stand', 'y_Rphi_stand', 'y_weta2_stand',
       'y_wtots1_stand', 'y_weta1_stand', 'y_fracs1_stand', 'y_deltae_stand',
       'y_Eratio_stand', 'y_f1_stand'],
      dtype='object')

In [14]:
#TAKING ONLY WANTED COLUMNS

df_mc20_gj_clean = df_mc20_gj[['mcTotWeight','goodWeight','y_pt',
       'y_eta', 'y_isTruthMatchedPhoton', 'y_convType',
        'HadLeakage', 'y_Reta', 'y_weta2', 'y_Rphi', 'y_wtots1', 
        'y_weta1', 'y_fracs1', 'y_deltae', 'y_Eratio', 'y_f1', 
        'HadLeakage_stand', 'y_Reta_stand', 'y_Rphi_stand', 'y_weta2_stand',
       'y_wtots1_stand', 'y_weta1_stand', 'y_fracs1_stand', 'y_deltae_stand',
       'y_Eratio_stand', 'y_f1_stand']]
df_mc20_gj_clean.index = list(range(len(df_mc20_gj_clean)))   #resetting indices

df_mc20_jj_clean = df_mc20_jj[['mcTotWeight','goodWeight','y_pt',
       'y_eta', 'y_isTruthMatchedPhoton', 'y_convType',
        'HadLeakage', 'y_Reta', 'y_weta2', 'y_Rphi', 'y_wtots1', 
        'y_weta1', 'y_fracs1', 'y_deltae', 'y_Eratio', 'y_f1', 
        'HadLeakage_stand', 'y_Reta_stand', 'y_Rphi_stand', 'y_weta2_stand',
       'y_wtots1_stand', 'y_weta1_stand', 'y_fracs1_stand', 'y_deltae_stand',
       'y_Eratio_stand', 'y_f1_stand']]
df_mc20_jj_clean.index = list(range(len(df_mc20_jj_clean)))   #resetting indices

In [15]:
# WRITING FILES

ap.picklewrite(df_mc20_gj_clean,'df_mc20_gj_clean.pickle')
ap.picklewrite(df_mc20_jj_clean,'df_mc20_jj_clean.pickle')

#could also write together to one file, like .npz that john has
# should shuffle them, should i do this here or after?

df_mc20_clean_ordered = pd.concat([df_mc20_gj_clean,df_mc20_jj_clean])
df_mc20_clean = df_mc20_clean_ordered.sample(frac=1).reset_index(drop=True)    #shuffling and resetting indices
ap.picklewrite(df_mc20_clean,'df_mc20_clean.pickle')

In [16]:
# print(np.array(df_mc20_clean.y_convType)[:1000]) #converted:>0, unconverted:0

In [17]:
# SAVING MORE FILES, CONV/UNCONV and EVEN/ODD

In [18]:
#converted and unconverted
df_conv = df_mc20_clean[df_mc20_clean.y_convType > 0]
df_conv.index = list(range(len(df_conv))) #resetting indicies
df_unconv = df_mc20_clean[df_mc20_clean.y_convType == 0]
df_unconv.index = list(range(len(df_unconv))) #resetting indicies

In [19]:
#SEPARATING EVEN AND ODD

def evenodd(df_name):
    '''separates input dataframe df_name into even events and odd events, by index'''
    evenlist = []
    oddlist = []
    for i in range(len(df_name)):
#         print(i)
        if i%2 == 0:
            evenlist.append(i)
        else:
            oddlist.append(i)
            
    even = df_name.loc[evenlist]
    odd =  df_name.loc[oddlist]
    
    return even, odd

In [20]:
df_evenc,df_oddc = evenodd(df_conv)
df_evenu, df_oddu = evenodd(df_unconv)

In [21]:
ap.picklewrite(df_evenc,'df_mc20_clean_conv_even.pickle')
ap.picklewrite(df_oddc,'df_mc20_clean_conv_odd.pickle')
ap.picklewrite(df_evenu,'df_mc20_clean_unconv_even.pickle')
ap.picklewrite(df_oddu,'df_mc20_clean_unconv_odd.pickle')

In [24]:
df_evenc.shape,df_evenu.shape

((18811, 26), (226056, 26))

In [31]:
1 - sum(df_oddc.y_isTruthMatchedPhoton)/len(df_oddc)

0.849016480595428