In [6]:
# imports

import numpy as np

# import keras
# from keras.layers import Input, Dense, BatchNormalization
# from keras.callbacks import ModelCheckpoint, EarlyStopping

# from tqdm.auto import tqdm 

# from sklearn.metrics import roc_curve, auc
# from sklearn.preprocessing import StandardScaler
# from sklearn.dedef get_labels(tree,label):
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import matplotlib.pyplot as plt
import uproot
import awkward as ak

import pandas as pd
import seaborn as sns

In [7]:
def get_labels(tree,label):
    '''
    Function to return the labels array out of a root tree.
    This function is required because we use 2 sets of 2 lables each
    where each set is combined with a logical "and".
    For instance "fj_isQCD and sample_isQCD" is the final label to label a jet as
    originating from QCD.
    '''
    prods = label.split('*')
    facts = tree.arrays(prods,library='np')
    labels = np.multiply(facts[prods[0]],facts[prods[1]])
    return labels

In [8]:
def reshape(awk_ar,sorter,fet_lngth, ascending=False):                                                                                                 
    '''
    This function creates takes an awkward array of track information in an event, and formats it in a standard 2D array.
    The tracks of an event are sorted according to the 'sorter' argument in descending order (greatest value first).
    Events that have too many tracks are truncated, while those with too few are padded with zeros afterwards.
    The returned array is an array of 2D arrays containing  track information for each event.
    '''
    evt_ar = [[] for i in range(len(awk_ar[0]))]
    srt_idx = np.where(np.array(features)==sorter)[0][0]
    
    for fet in awk_ar:
        for evt_n in range(len(fet)):
            evt_ar[evt_n].append(np.array(fet[evt_n]))
            
    # Making 2D arrays for each event, where each each row of that event's 2D array is now a feature array
    evt_ar2d = []
    
    for evt in evt_ar:
        evt_ar2d.append(np.stack(evt))
        
    # Now sorting each 2d array's columns according to the 'sorter' feature
    evt_ar2d_srtd = []
    
    for evt in evt_ar2d:
        idcs = evt[srt_idx].argsort()[::-1]
        evt_ar2d_srtd.append(evt[:,idcs])
        
    # Now need to standardize feature array per event
    evt_std = []
    npad = 0
    
    for evt in evt_ar2d_srtd:
        if len(evt[0]) > fet_lngth:
            evt_std.append( np.swapaxes(evt[:,:fet_lngth],0,1) )
#             evt_std.append( evt[:,:fet_lngth].flatten() )
        else:
            npad += 1
            padded = np.pad(evt,((0,0),(0,fet_lngth - len(evt[0]))))
            evt_std.append( np.swapaxes(padded,0,1) )
#             evt_std.append( np.pad(evt,((0,0),(0,fet_lngth - len(evt[0])))).flatten() )

    print('Number of events padded: {}'.format(npad))
    
    return np.array(evt_std)

In [48]:
def get_features(file_name, FetLbls, var_sort, ntrx):
    '''
    Function that extracts our chosen feature and label arrays from a root file
    for the events that are labeled as QCD or Hbb and returns two 2D arrays.
    The first array is the features array and has the shape (nummber_of_events, number_of_features).
    The second array is the labels array ans has the shape (number_of_events, 2)
    '''
    with uproot.open(f"{file_name}:deepntuplizer/tree") as tree:

        lbl_data = []
        fet_data = []
        
        
        # LABELS
        for lbl in FetLbls[1]:
            lbl_data.append(get_labels(tree,lbl))
    
        msk_raw = lbl_data[0] + lbl_data[1]
        
        nsig = lbl_data[1].sum()
        
        if nsig < lbl_data[0].sum():
            bkg = np.where(lbl_data[0] == 1)[0]
            np.random.shuffle(bkg)
            msk_raw[bkg[lbl_data[1].sum():]] = 0
        
        msk = msk_raw == 1
        
        # This part organizes it by event, rather than feature. (features move from rows to columns)
        label_array = np.stack(lbl_data,axis=-1)[msk]
        
        
        
        # FEATURES
        for fet in FetLbls[0]:
            fet_data.append(np.array(tree[fet])[msk])
            
        feature_array = reshape(fet_data, var_sort, ntrx)

        
    #     return arrays
    return feature_array, label_array

In [10]:
features = ['trackBTag_DeltaR',
            'trackBTag_Eta',
            'trackBTag_EtaRel',
            'trackBTag_JetDistVal',
            'trackBTag_Momentum',
            'trackBTag_PPar',
            'trackBTag_PParRatio',
            'trackBTag_PtRatio',
            'trackBTag_PtRel',
            'trackBTag_Sip2dSig',
            'trackBTag_Sip2dVal',
            'trackBTag_Sip3dSig',
            'trackBTag_Sip3dVal',
            'track_charge',
            'track_deltaR',
            'track_drminsv',
            'track_drsubjet1',
            'track_drsubjet2',
            'track_dxy',
            'track_dxysig',
            'track_dz',
            'track_dzsig',
            'track_erel',
            'track_etarel',
            'track_mass',
            'track_phirel',
            'track_pt',
            'track_ptrel']
    
labels = ['fj_isQCD*sample_isQCD',
          'fj_isH*fj_isBB']

FetLbls = [features,labels]

In [49]:
feature_array, label_array = get_features('../root_files/ntuple_merged_0.root',FetLbls,'trackBTag_Momentum',10)

Number of events padded: 1317


In [12]:
feature_array = np.array(feature_array)

In [50]:
print(feature_array.shape)
print(label_array.shape)

(49560, 10, 28)
(49560, 2)


In [52]:
features.index('trackBTag_Momentum')

4

In [53]:
for x in feature_array[24][:,4]:
    print(x)

462.19162
100.69239
59.379467
46.633453
29.689539
23.133604
20.600712
17.354967
15.507888
8.719406


# Multi-File Test

In [54]:
fls = [0,10]  #,11]

data = np.array([])
lbl = np.array([])

for i in fls:
    feature_array, label_array = get_features(f'../root_files/ntuple_merged_{i}.root',FetLbls,'trackBTag_Momentum',10)
    
    if len(data) == 0:
        data = feature_array
        lbl = label_array
    else:
        data = np.concatenate((data,feature_array))
        lbl = np.concatenate((lbl,label_array))

Number of events padded: 1377
Number of events padded: 1332


In [56]:
data.shape

(98848, 10, 28)