In [1]:
import numpy as np
from random import shuffle
from functools import reduce

import csv
import pickle

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.utils import shuffle
# from sklearn.utils import compute_class_weight

import tensorflow as tf
# from keras import layers, regularizers
#layers = tf.keras.layers
from tensorflow.keras import layers, models
regularizers = tf.keras.regularizers

In [2]:
# Load the pickled event sample.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open data.pickle when it comes from a trusted source.
with open('data.pickle', 'rb') as f:
    jasmine_data = pickle.load(f)

# Per-event labels, stored as floats (an event counts as flagged when its label is 1).
flagged = np.array(jasmine_data['flagged'], dtype=np.float64)

# Per-event particle tables; the number of rows (particles) varies between events.
params = jasmine_data['params']
params_max_size = max(len(mat) for mat in params)

# Zero-pad every table along the particle axis up to the longest event so they
# can be stacked, then reshape to (events, particles, 1, 1, 4); the reshape
# assumes four feature columns per particle.
padded_events = [
    np.pad(
        np.array(mat, dtype=np.float64),
        ((0, params_max_size - len(mat)), (0, 0)),
    )
    for mat in params
]
params = np.array(padded_events).reshape((-1, params_max_size, 1, 1, 4))

The data has an extreme class imbalance. Subsample the majority class to obtain a balanced data set for training.

In [3]:
# Boolean mask picking out the events whose label equals 1.
is_flagged = (flagged == 1)

# Fraction of all events that are flagged. The value is a fraction in [0, 1],
# even though the printed label calls it a percentage.
percentage = is_flagged.sum() / len(flagged)
print(f'Percentage of flagged c-cbar events: {percentage}')

Percentage of flagged c-cbar events: 0.04361988806301638


In [4]:
# Split the events by label.
flagged_params = params[ is_flagged ]
unflagged_params = params[ ~is_flagged ]

# Downsample the unflagged (majority) class to be equal in length to the flagged (minority) class.
# A seeded generator is used so that the same subsample is drawn on every run; together with the
# fixed random_state of the shuffle below this makes the balanced data set reproducible.
rng = np.random.default_rng(0)
unflagged_indices = rng.choice(unflagged_params.shape[0], size=len(flagged_params), replace=False)
to_keep = unflagged_params[ unflagged_indices ]

# Re-append flagged and unflagged samples together, with labels 1 and 0 respectively.
keep_params = np.concatenate( (flagged_params, to_keep) )
keep_flagged = np.concatenate( (np.ones(len(flagged_params)), np.zeros(len(to_keep))) )

# Shuffle features and labels in unison so that flagged and unflagged classes are all mixed together.
X_orig, y_orig = shuffle( keep_params, keep_flagged, random_state=0 )

In [5]:
def category_encoding_layer(pdgs, max_tokens=None):
    """Build a callable that one-hot encodes integer category values.

    Args:
        pdgs: Integer-valued data used to learn the vocabulary of the lookup.
        max_tokens: Forwarded to ``layers.IntegerLookup`` as ``max_tokens``.

    Returns:
        A function that maps a feature through the adapted lookup and then
        through a one-hot ``CategoryEncoding`` sized to the learned vocabulary.
    """
    # Lookup from raw integer values to contiguous indices; adapt() learns
    # the set of values from the data passed in.
    lookup = layers.IntegerLookup(max_tokens=max_tokens)
    lookup.adapt(pdgs)

    # One-hot encoder with one slot per entry of the learned vocabulary.
    one_hot = layers.CategoryEncoding(
        num_tokens=lookup.vocabulary_size(),
        output_mode="one_hot",
    )

    def encode(feature):
        return one_hot(lookup(feature))

    return encode

def do_category_encoding(dataset, category_index):
    """Replace one categorical feature column by its one-hot encoding.

    Args:
        dataset: 5-D array whose last axis holds the per-particle features.
        category_index: Position on the last axis of the categorical feature.

    Returns:
        A new array holding the remaining feature columns followed by the
        one-hot encoded category along the last axis. The vocabulary is
        learned from ``dataset`` itself.

    NOTE(review): zero-padded rows carry category value 0, which the lookup
    treats like any other value, so padded rows may no longer be all-zero
    after encoding - confirm this is compatible with the model's Masking layer.
    """
    category_values = dataset[:, :, :, :, category_index]

    # Learn the vocabulary from this column and encode the very same column.
    encoder = category_encoding_layer(category_values)
    encoded_pdgs = np.asarray(encoder(category_values))

    # Drop the raw category column and append its one-hot representation.
    other_features = np.delete(dataset, category_index, axis=4)
    return np.concatenate((other_features, encoded_pdgs), axis=4)

The most useful piece of information for understanding whether there is a c-cbar splitting inside of a jet is whether there is a c-cbar pair inside of the jet, but this information is often not accessible experimentally. As a test, we remove this information (either replacing charms with gluons or light quarks) and retrain the model to see how this impacts the performance

In [6]:
def replace_charms(pdgs, replacement):
    """Overwrite charm-quark ids (4 and -4) in ``pdgs`` in place.

    Args:
        pdgs: NumPy array of particle ids; it is modified in place, so passing
            a view of a larger array edits that array.
        replacement: ``'gluons'`` writes id 21 over every charm entry;
            ``'light quarks'`` writes +1 or -1, keeping the sign of the
            original id.

    Returns:
        None.

    Raises:
        AssertionError: if ``replacement`` is neither of the two options.

    NOTE(review): in the PDG numbering scheme id 1 is the down quark (the up
    quark is 2), although the original comment here called it an up quark -
    confirm which light flavour is intended.
    """
    assert replacement in ('gluons', 'light quarks')

    is_charm = (pdgs == 4) | (pdgs == -4)

    if replacement == 'gluons':
        # Gluon id.
        pdgs[is_charm] = 21
    elif replacement == 'light quarks':
        # Sign of +-4 is +-1, so quark/antiquark information is preserved.
        pdgs[is_charm] = np.sign(pdgs[is_charm])

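Make a copy of the original data, and then create two modified versions for testing: one in which the charms are replaced by gluons, and one in which they are replaced by light quarks (id ±1, which is the down quark in the PDG numbering scheme, with the quark/antiquark sign preserved).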

In [7]:
category_index = 3  # the only category in this dataset is the particle id, which is at index 3

# Every variant is split with the same options and the same labels, so the
# three train/test partitions contain the same events.
split_options = dict(test_size=0.2, random_state=0)

# Baseline: particle ids left untouched.
X_orig_encoded = do_category_encoding(X_orig, category_index)
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_orig_encoded, y_orig, **split_options
)

# Variant 1: charms relabelled as light quarks. The slice passed to
# replace_charms is a view, so the copy X is edited in place.
X = X_orig.copy()
replace_charms(X[..., category_index], 'light quarks')
X_encoded = do_category_encoding(X, category_index)
x_train_light, x_test_light, y_train_light, y_test_light = train_test_split(
    X_encoded, y_orig, **split_options
)
test_light = X[..., category_index].flatten()

# Variant 2: charms relabelled as gluons.
X2 = X_orig.copy()
replace_charms(X2[..., category_index], 'gluons')
X_encoded = do_category_encoding(X2, category_index)
x_train_gluon, x_test_gluon, y_train_gluon, y_test_gluon = train_test_split(
    X_encoded, y_orig, **split_options
)
test_gluon = X2[..., category_index].flatten()

Make the model

In [8]:
class ReductionLayer(layers.Layer):
    """Layer that sums its input over axes 1, 2 and 3.

    Args:
        name: Optional layer name, forwarded to the base ``Layer``.
        **kwargs: Any further base-``Layer`` keyword arguments (for example
            ``trainable`` or ``dtype``). Accepting them lets the layer be
            re-created from its config, which passes those keys back to the
            constructor.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)

    def compute_mask(self, inputs, mask=None):
        # The incoming mask is handed on unchanged.
        # NOTE(review): call() removes axes 1-3, so a mask shaped like the
        # input no longer lines up with the output - confirm downstream
        # layers tolerate (or ignore) it.
        return mask

    def call(self, inputs):
        # Collapse axes 1, 2 and 3 by summation; the sum does not consult any mask.
        return tf.reduce_sum(inputs, axis=[1, 2, 3])


def _regularized_dense(units, activation, kernel_reg, bias_reg, activity_reg, name=None):
    """Create a Dense layer with L2 penalties on its kernel, bias and activity."""
    return layers.Dense(
        units,
        activation=activation,
        kernel_regularizer=regularizers.L2(kernel_reg),
        bias_regularizer=regularizers.L2(bias_reg),
        activity_regularizer=regularizers.L2(activity_reg),
        name=name,
    )


def get_clf(
    meta=None,
    hidden_layer_size=100,
    observable_num=128,
    activation='leaky_relu',
    dropout=0.2,
    kernel_reg=1e-5,
    bias_reg=1e-5,
    activity_reg=1e-5,
    shape=None,
):
    """Build the (uncompiled) binary classifier.

    The network applies three regularised Dense layers to the input through
    ``TimeDistributed`` wrappers, sums the result over axes 1-3 with
    ``ReductionLayer``, and feeds the sum through three further Dense layers
    into a single sigmoid unit. Dropout follows every hidden Dense stage.

    Args:
        meta: Unused; kept for interface compatibility. Defaults to None
            rather than a shared mutable dict.
        hidden_layer_size: Units in each of the layers dense_1 ... dense_5.
        observable_num: Units in the ``distributed_phi`` layer, i.e. the size
            of the summed representation.
        activation: Activation used by every hidden Dense layer.
        dropout: Rate used by every Dropout layer.
        kernel_reg: L2 factor for the Dense kernels.
        bias_reg: L2 factor for the Dense biases.
        activity_reg: L2 factor for the Dense activations.
        shape: Input shape without the batch axis. When None, the shape of
            the global ``x_train_orig`` is looked up at call time, so the
            default follows the data currently in the namespace instead of
            whatever existed when this function was defined.

    Returns:
        A ``tf.keras.Model`` mapping the input to one sigmoid output.
    """
    if shape is None:
        shape = x_train_orig.shape[1:]

    def dense(units, name=None):
        # All hidden Dense layers share the same activation and L2 factors.
        return _regularized_dense(
            units, activation, kernel_reg, bias_reg, activity_reg, name=name
        )

    inputs = layers.Input(shape=shape, name="input")
    x = layers.Masking(mask_value=0.0, name="masking")(inputs)

    # Stage applied through TimeDistributed wrappers, before the summation.
    for layer_name in ("dense_1", "dense_2"):
        x = layers.TimeDistributed(dense(hidden_layer_size), name=layer_name)(x)
        x = layers.Dropout(dropout)(x)
    x = layers.TimeDistributed(dense(observable_num), name="distributed_phi")(x)

    # Sum over axes 1-3, leaving one vector of size observable_num per sample.
    x = ReductionLayer(name="observables")(x)
    x = layers.Dropout(dropout)(x)

    # Stage applied to the summed representation.
    for layer_name in ("dense_3", "dense_4", "dense_5"):
        x = dense(hidden_layer_size, name=layer_name)(x)
        x = layers.Dropout(dropout)(x)

    output = layers.Dense(1, activation="sigmoid", name="output")(x)

    return tf.keras.models.Model(inputs=inputs, outputs=output)

In [9]:
# Build the classifier for the unmodified inputs; the input shape is left at
# get_clf's default, which is taken from x_train_orig.
clf = get_clf(dropout=0.2)

# The output layer already applies a sigmoid, so the loss receives
# probabilities rather than logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

clf.compile(
    loss=loss_fn,
    optimizer="adam",
    metrics=["Accuracy", "AUC", "Precision", "Recall"],
)

The most useful piece of information for understanding whether there is a c-cbar splitting inside of a jet is whether there is a c-cbar pair inside of the jet, but this information is often not accessible experimentally. As a test, we remove this information (either replacing charms with gluons or light quarks) and retrain the model to see how this impacts the performance

In [10]:
# Print the layer-by-layer summary of the compiled classifier.
clf.summary()

In [11]:
# Draw the layer graph of the classifier and write it to an image file.
dot_img_file = "./model_1.png"

# Show tensor shapes, layer names and activations, and expand nested layers.
plot_options = dict(
    show_shapes=True,
    show_layer_names=True,
    show_layer_activations=True,
    expand_nested=True,
)
tf.keras.utils.plot_model(clf, to_file=dot_img_file, **plot_options)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [12]:
# Early-stopping callback on the validation loss that restores the best
# weights seen. The same callback object is reused by the later fits.
# NOTE(review): with patience=10 and only 10 epochs in the fit below, the
# stopping criterion cannot be reached in this run - confirm that is intended.
early_stop = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor="val_loss",
)

# Train on the unmodified inputs, holding out 20% of the training split for validation.
clf.fit(
    x=x_train_orig,
    y=y_train_orig,
    batch_size=250,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 110ms/step - AUC: 0.6306 - Accuracy: 0.5937 - Precision: 0.5960 - Recall: 0.6249 - loss: 0.9646 - val_AUC: 0.9993 - val_Accuracy: 0.9915 - val_Precision: 0.9881 - val_Recall: 0.9950 - val_loss: 0.1385
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 100ms/step - AUC: 0.9984 - Accuracy: 0.9815 - Precision: 0.9692 - Recall: 0.9941 - loss: 0.1582 - val_AUC: 0.9997 - val_Accuracy: 0.9940 - val_Precision: 0.9993 - val_Recall: 0.9887 - val_loss: 0.0921
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 115ms/step - AUC: 0.9994 - Accuracy: 0.9910 - Precision: 0.9873 - Recall: 0.9950 - loss: 0.1051 - val_AUC: 1.0000 - val_Accuracy: 0.9968 - val_Precision: 0.9993 - val_Recall: 0.9943 - val_loss: 0.0701
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 111ms/step - AUC: 0.9996 - Accuracy: 0.9939 - Precision: 0.9918 - Recall: 0.9960 - loss: 0.0829 -

<keras.src.callbacks.history.History at 0x2f2dbc920>

In [13]:
# Score the trained network on the held-out test split of the unmodified data;
# the result lists the loss followed by the metrics given to compile().
clf.evaluate(x_test_orig,y_test_orig)

[1m  1/111[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 17ms/step - AUC: 1.0000 - Accuracy: 1.0000 - Precision: 1.0000 - Recall: 1.0000 - loss: 0.0147

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - AUC: 1.0000 - Accuracy: 0.9997 - Precision: 0.9993 - Recall: 1.0000 - loss: 0.0155


[0.016808347776532173,
 0.9999688863754272,
 0.9994344115257263,
 0.9988795518875122,
 1.0]

In [15]:
# Sigmoid scores for the test split, one per event.
preds = clf.predict(x_test_orig)

# Threshold the scores at 0.5 to obtain hard 0/1 labels (kept as a plain list).
binary_predictions = (np.asarray(preds).reshape(-1) > 0.5).astype(int).tolist()

# Number of test events whose hard label disagrees with the truth.
(binary_predictions != y_test_orig).sum()

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


2

Now try the harder case, in which the charm quarks are "mislabelled" as light quarks.

In [16]:
# Discard the previous network and build a fresh, untrained one sized for the
# light-quark variant (its one-hot width can differ from the baseline).
del clf
clf = get_clf(shape=x_train_light.shape[1:], dropout=0.2)
clf.compile(
    loss=loss_fn,
    optimizer="adam",
    metrics=["Accuracy", "AUC", "Precision", "Recall"],
)

# Train with up to 50 epochs, reusing the early-stopping callback defined earlier.
clf.fit(
    x=x_train_light,
    y=y_train_light,
    batch_size=250,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 104ms/step - AUC: 0.5940 - Accuracy: 0.5692 - Precision: 0.5677 - Recall: 0.5212 - loss: 1.0043 - val_AUC: 0.9352 - val_Accuracy: 0.9208 - val_Precision: 0.8681 - val_Recall: 0.9922 - val_loss: 0.3280
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 91ms/step - AUC: 0.9231 - Accuracy: 0.8855 - Precision: 0.8294 - Recall: 0.9656 - loss: 0.3771 - val_AUC: 0.9414 - val_Accuracy: 0.9144 - val_Precision: 0.8703 - val_Recall: 0.9738 - val_loss: 0.3017
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - AUC: 0.9276 - Accuracy: 0.9055 - Precision: 0.8537 - Recall: 0.9748 - loss: 0.3243 - val_AUC: 0.9435 - val_Accuracy: 0.9211 - val_Precision: 0.8677 - val_Recall: 0.9936 - val_loss: 0.2720
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 112ms/step - AUC: 0.9298 - Accuracy: 0.9071 - Precision: 0.8508 - Recall: 0.9844 - loss: 0.3096 - 

<keras.src.callbacks.history.History at 0x3408bf350>

In [17]:
# Score the retrained network on the test split of the light-quark variant.
clf.evaluate(x_test_light,y_test_light)

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.9443 - Accuracy: 0.9298 - Precision: 0.8793 - Recall: 0.9940 - loss: 0.2710


[0.28185391426086426,
 0.9392687082290649,
 0.9244909286499023,
 0.8759920597076416,
 0.990465521812439]

In [18]:
# Sigmoid scores for the light-quark test split, one per event.
preds = clf.predict(x_test_light)

# Threshold the scores at 0.5 to obtain hard 0/1 labels (kept as a plain list).
binary_predictions = (np.asarray(preds).reshape(-1) > 0.5).astype(int).tolist()

# Number of test events whose hard label disagrees with the truth.
(binary_predictions != y_test_light).sum()

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


267

In [19]:
# Discard the previous network and build a fresh, untrained one sized for the
# gluon variant.
del clf
clf = get_clf(shape=x_train_gluon.shape[1:], dropout=0.2)
clf.compile(
    loss=loss_fn,
    optimizer="adam",
    metrics=["Accuracy", "AUC", "Precision", "Recall"],
)

# Train with up to 50 epochs, reusing the early-stopping callback defined earlier.
clf.fit(
    x=x_train_gluon,
    y=y_train_gluon,
    batch_size=250,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 101ms/step - AUC: 0.5190 - Accuracy: 0.5158 - Precision: 0.5001 - Recall: 0.4562 - loss: 0.8567 - val_AUC: 0.5687 - val_Accuracy: 0.5435 - val_Precision: 0.5377 - val_Recall: 0.6164 - val_loss: 0.6968
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 90ms/step - AUC: 0.5670 - Accuracy: 0.5510 - Precision: 0.5370 - Recall: 0.5978 - loss: 0.6975 - val_AUC: 0.6309 - val_Accuracy: 0.5937 - val_Precision: 0.5714 - val_Recall: 0.7481 - val_loss: 0.6698
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 108ms/step - AUC: 0.6173 - Accuracy: 0.5875 - Precision: 0.5643 - Recall: 0.7312 - loss: 0.6815 - val_AUC: 0.6793 - val_Accuracy: 0.6195 - val_Precision: 0.6152 - val_Recall: 0.6369 - val_loss: 0.6443
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 114ms/step - AUC: 0.6632 - Accuracy: 0.6089 - Precision: 0.5901 - Recall: 0.6581 - loss: 0.6609 - 

<keras.src.callbacks.history.History at 0x3612ab7a0>

### Now take a look at boosted decision trees

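For now, try a boosted decision tree whose inputs are the per-particle features of all particles, flattened into a single vector per event (note that the reshape below keeps every feature column, not only the pt).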

In [28]:
# Flatten each event into a single feature row for the tree model: axis 1
# and axis 4 are merged into one axis of length shape[1] * shape[4]
# (the explicit product only works when axes 2 and 3 have length 1).
train_shape = x_train_orig.shape
test_shape = x_test_orig.shape

x_train_0_bdt = x_train_orig.reshape(train_shape[0], train_shape[1] * train_shape[4])
x_test_0_bdt = x_test_orig.reshape(test_shape[0], test_shape[1] * test_shape[4])

In [31]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with a logistic objective:
# 50 trees of depth up to 15, learning rate 0.1.
bst = XGBClassifier(
    n_estimators=50,
    max_depth=15,
    learning_rate=0.1,
    objective='binary:logistic',
)

# Fit on the flattened training events of the unmodified data.
bst.fit(x_train_0_bdt, y_train_orig)

# Hard class predictions for the flattened test events.
preds = bst.predict(x_test_0_bdt)

In [33]:
# True positives: test events labelled 1 that the tree model also predicts as 1.
((y_test_orig==1) & (preds==1)).sum()

1774

In [34]:
# True negatives: test events labelled 0 that the tree model also predicts as 0.
((y_test_orig==0) & (preds==0)).sum()

1725

In [37]:
# Accuracy of the tree model: fraction of test events whose prediction equals the label.
(y_test_orig==preds).sum() / len(y_test_orig)

0.9895361990950227