In [1]:
# imports

import numpy as np

import keras
from keras.layers import Input, Dense, BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping

from tqdm.auto import tqdm 

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import matplotlib.pyplot as plt
import uproot
import awkward as ak

import pandas as pd
import seaborn as sns

2022-11-29 20:03:31.422634: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# latex settings block

plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Palatino"],
    "xtick.labelsize": 18,
    "ytick.labelsize": 18,
    "legend.numpoints": 1,
    "axes.linewidth": 2,
    "legend.title_fontsize": 18,
    "axes.labelsize": 18,
    "legend.fontsize": 16
})

In [3]:
def reshape(awk_ar,sorter,fet_lngth, ascending=False):                                                                                                 
    '''
    This function creates takes an awkward array of track information in an event, and formats it in a standard 2D array.
    The tracks of an event are sorted according to the 'sorter' argument in descending order (greatest value first).
    Events that have too many tracks are truncated, while those with too few are padded with zeros afterwards.
    The returned array is an array of 2D arrays containing  track information for each event.
    '''
    evt_ar = [[] for i in range(len(awk_ar[0]))]
    srt_idx = np.where(np.array(features)==sorter)[0][0]
    
    for fet in awk_ar:
        for evt_n in range(len(fet)):
            evt_ar[evt_n].append(np.array(fet[evt_n]))
            
    # Making 2D arrays for each event, where each each row of that event's 2D array is now a feature array
    evt_ar2d = []
    
    for evt in evt_ar:
        evt_ar2d.append(np.stack(evt))
        
    # Now sorting each 2d array's columns according to the 'sorter' feature
    evt_ar2d_srtd = []
    
    for evt in evt_ar2d:
        idcs = evt[srt_idx].argsort()[::-1]
        evt_ar2d_srtd.append(evt[:,idcs])
        
    # Now need to standardize feature array per event
    evt_std = []
    npad = 0
    
    for evt in evt_ar2d_srtd:
        if len(evt[0]) > fet_lngth:
            evt_std.append( np.swapaxes(evt[:,:fet_lngth],0,1) )
#             evt_std.append( evt[:,:fet_lngth].flatten() )
        else:
            npad += 1
            padded = np.pad(evt,((0,0),(0,fet_lngth - len(evt[0]))))
            evt_std.append( np.swapaxes(padded,0,1) )
#             evt_std.append( np.pad(evt,((0,0),(0,fet_lngth - len(evt[0])))).flatten() )

    print('Number of events padded: {}'.format(npad))
    
    return evt_std

In [8]:
def get_features(file_name, var_sort, ntrx):
    '''
    Function that extracts our chosen feature and label arrays from a root file
    for the events that are labeled as QCD or Hbb and returns two 2D arrays.
    The first array is the features array and has the shape (nummber_of_events, number_of_features).
    The second array is the labels array ans has the shape (number_of_events, 2)
    '''
    with uproot.open(f"{file_name}:deepntuplizer/tree") as tree:

        fet_data = []
        lbl_data = []
        
        for fet in features:
            fet_data.append(np.array(tree[fet]))
            
        for lbl in labels:
            lbl_data.append(np.array(tree[lbl]))

        feature_array = reshape(fet_data, var_sort, ntrx)

        # This part organizes it by event, rather than feature. (features move from rows to columns)
        label_array = np.stack(lbl_data,axis=-1)
        
#         feature_array = np.array(feature_array)[label_array.any(1)]
#         label_array = label_array[label_array.any(1)]
        
    #     return label_array
    return feature_array, label_array

In [9]:
with uproot.open(f"../root_files/ntuple_merged_0.root:deepntuplizer/tree") as tree:
    features = [x for x in tree.keys() if x[:5]=='track']
    
labels = ['sample_isQCD','label_H_bb']

In [26]:
feature_array, label_array = get_features('../root_files/ntuple_merged_0.root','trackBTag_Momentum',10)

Number of events padded: 8660


In [12]:
feature_array = np.array(feature_array)

In [13]:
print(feature_array.shape)
print(label_array.shape)

(200000, 10, 45)
(200000, 2)


In [24]:
features.index('track_dxy')

24

In [25]:
for x in feature_array[54][:,24]:
    print(x)

0.002010498
-0.0025805663
0.00091308594
0.0011486816
-0.0039697266
0.00015220643
-0.00083068846
0.0014111329
0.0020080567
-0.0011468506


In [46]:
np.isnan(label_array).any()

False

In [47]:
label_array

array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=int32)

In [52]:
def my_loss_fn(y_true, y_pred):
    squared_difference = np.square(y_true - y_pred)
    return keras.reduce_mean(squared_difference, axis=-1)  # Note the `axis=-1`

# Making Models

In [37]:
keras_model = keras.Sequential()
keras_model.add(keras.layers.Flatten())
keras_model.add(Dense(units=45,activation='relu'))
keras_model.add(Dense(units=45,activation='relu'))
keras_model.add(Dense(units=45,activation='relu'))
keras_model.add(Dense(units=2,activation='softmax'))
#keras_model.save_weights('random_weights.h5')
#print(keras_model.summary())

In [53]:
#keras_model.load_weights('random_weights.h5')
opt = keras.optimizers.SGD(learning_rate=1e-4)
#keras_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
keras_model.compile(optimizer=opt, loss=my_loss_fn, metrics=['accuracy'])
history = keras_model.fit(feature_array, label_array, batch_size=1024, epochs=100, validation_split=0.2)

Epoch 1/100


TypeError: in user code:

    File "/Users/collinarbour/anaconda3/envs/Work/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/var/folders/n1/q0yl5j2d5qvcnc78yw8rs7qw0000gn/T/ipykernel_10815/2810428338.py", line 2, in my_loss_fn  *
        squared_difference = np.square(y_true - y_pred)

    TypeError: Input 'y' of 'Sub' Op has type float32 that does not match type int32 of argument 'x'.
