In [1]:
import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
import random
random.seed(42)
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import add_self_loops
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib # efficiently serializes python objects and is used in saving and loading machine learning models
import json # encode/decode JSON data

## Methods for the pairing datapoints notebook

In [2]:
def branches_from_root_file(filename):
    '''
    Read every branch of the first object stored in a ROOT file.

    Parameters
    ----------
    filename : path of the ROOT file, passed straight to ``uproot.open``.

    Returns
    -------
    The result of ``arrays()`` called on the object stored under the first key
    of the file (presumably a TTree -- TODO confirm for the files used here).

    Raises
    ------
    IndexError
        If the file contains no keys.
    '''
    # Use the file as a context manager so the handle is released even when
    # reading fails; the arrays are materialised before the file is closed.
    with uproot.open(filename) as file:
        first_key = file.keys()[0]
        tree = file[first_key]
        branches = tree.arrays()
    return branches

# Original Code:
# file = uproot.open("MyxAODAnalysis_all2D.outputs.root")
# file.keys()
# tree = file['analysis']
# branches = tree.arrays()

In [3]:
# variables = [cell_coordinate_x, cell_coordinate_y, cell_coordinate_z, cell_eta, cell_phi, cell_sampling, cell_noiseSigma, cell_e]
# generalized_range = len(cell_to_cluster_index)

def create_data_dict(range_value, variables):
    """
    Build one feature matrix per event by placing the variables side by side.

    Parameters
    ----------
    range_value : int
        Number of events to process; events ``0 .. range_value - 1`` are used.
    variables : sequence
        Per-event collections; ``var[i]`` is the data of variable ``var`` for
        event ``i`` (presumably a 1-D array of per-cell values -- TODO confirm).

    Returns
    -------
    dict
        Maps ``"data_<i>"`` to the array obtained by adding an axis at
        position 1 to every ``var[i]`` and concatenating along that axis, so
        each variable becomes one column in the order given by ``variables``.
    """
    # Use the range_value argument and a local dictionary: the previous
    # version looped over a global `generalized_range` and wrote into a global
    # `data`, neither of which is defined by this function.
    data = {}
    for i in range(range_value):
        columns = [np.expand_dims(var[i], axis=1) for var in variables]
        data[f"data_{i}"] = np.concatenate(columns, axis=1)
    return data
    
# Original Code:
# data = {}
# for i in range(generalized_range):
#     data[f"data_{i}"] = np.concatenate((np.expand_dims(cell_coordinate_x[i], axis=1), np.expand_dims(cell_coordinate_y[i], axis=1), np.expand_dims(cell_coordinate_z[i], axis=1),
#                         np.expand_dims(cell_eta[i], axis=1), np.expand_dims(cell_phi[i], axis=1),
#                         np.expand_dims(cell_sampling[i], axis=1),
#                         np.expand_dims(cell_noiseSigma[i], axis=1),
#                         np.expand_dims(cell_e[i], axis=1)), axis=1)

In [4]:
def save_dict_to_hdf5(dic, filename):
    """
    Write the (possibly nested) dictionary ``dic`` to the HDF5 file ``filename``.

    The file is opened in write mode and its root is handed to
    ``_save_dict_to_hdf5`` together with ``dic``.
    """
    with h5py.File(filename, mode='w') as h5_root:
        _save_dict_to_hdf5(h5_root, dic)
# Opens hdf5 file in write mode. Calls a helper function
# to recursively save the dictionary to the hdf5 file.

def _save_dict_to_hdf5(group, dic):
    """Save a dictionary to an HDF5 group"""
    for key, value in dic.items():
        if isinstance(value, dict):
            subgroup = group.create_group(key)
            _save_dict_to_hdf5(subgroup, value)
        else:
            if isinstance(value, list):
                """Convert list to numpy array before saving"""
                value = np.array(value)
            group[key] = value

In [5]:
# Helpers for removing permutation variants (tuples that differ only in the order of their elements)
def canonical_form(t):
    """Return the elements of ``t`` as a tuple arranged in ascending order."""
    ordered = list(t)
    ordered.sort()
    return tuple(ordered)

def remove_permutation_variants(tuple_list):
    """
    Remove permutation variants from a list of tuples.

    Every tuple is reduced to its canonical form with ``canonical_form``;
    collecting those in a set keeps a single representative per group of
    tuples that differ only in element order.

    Returns
    -------
    list
        The unique canonical tuples, in set-iteration order (no particular
        ordering of the list is guaranteed).
    """
    # The canonical tuples are returned as they are: sorting each of them a
    # second time, as was done before, cannot change them.
    unique_tuples = {canonical_form(t) for t in tuple_list}
    return list(unique_tuples)

In [6]:
def lone_bkg_neighbor_clusters_labels():
    """
    Label the neighbour pairs of one event, using 0 for lone-lone pairs.

    Prints the label legend, then appends one label per pair of
    ``neighbor_pairs_unique_sorted`` to ``true_neighbor_cluster_event``:
    1 when both cells have the same non-zero cluster index, 0 when both have
    cluster index 0, 3 when they have different non-zero indices and 2
    otherwise.

    NOTE(review): ``i``, ``cell_to_cluster_index``,
    ``neighbor_pairs_unique_sorted`` and ``true_neighbor_cluster_event`` are
    not parameters; they are looked up in the enclosing (module) scope and
    must exist there when this function is called.
    """
    for legend_line in ("Label for True: 1",
                        "Label for Lone-Lone: 0",
                        "Label for Cluster-Lone: 2",
                        "Label for Cluster-Cluster: 3"):
        print(legend_line)
    for pair in neighbor_pairs_unique_sorted:
        first_cluster = cell_to_cluster_index[i][pair[0]]
        second_cluster = cell_to_cluster_index[i][pair[1]]
        if first_cluster == second_cluster:
            # Same index on both sides: a true pair unless the index is 0.
            label = 1 if first_cluster != 0 else 0
        elif first_cluster != 0 and second_cluster != 0:
            label = 3  # cells from two different clusters
        else:
            label = 2  # one clustered cell, one cell with index 0
        true_neighbor_cluster_event.append(label)

def cluster_lone_bkg_neighbor_clusters_labels():
    """
    Label the neighbour pairs of one event, using 0 for cluster-lone pairs.

    Prints the label legend, then appends one label per pair of
    ``neighbor_pairs_unique_sorted`` to ``true_neighbor_cluster_event``:
    1 when both cells have the same non-zero cluster index, 2 when both have
    cluster index 0, 3 when they have different non-zero indices and 0
    otherwise.

    NOTE(review): ``i``, ``cell_to_cluster_index``,
    ``neighbor_pairs_unique_sorted`` and ``true_neighbor_cluster_event`` are
    not parameters; they are looked up in the enclosing (module) scope and
    must exist there when this function is called.
    """
    for legend_line in ("Label for True: 1",
                        "Label for Lone-Lone: 2",
                        "Label for Cluster-Lone: 0",
                        "Label for Cluster-Cluster: 3"):
        print(legend_line)
    for pair in neighbor_pairs_unique_sorted:
        first_cluster = cell_to_cluster_index[i][pair[0]]
        second_cluster = cell_to_cluster_index[i][pair[1]]
        if first_cluster == second_cluster:
            # Same index on both sides: a true pair unless the index is 0.
            label = 1 if first_cluster != 0 else 2
        elif first_cluster != 0 and second_cluster != 0:
            label = 3  # cells from two different clusters
        else:
            label = 0  # one clustered cell, one cell with index 0
        true_neighbor_cluster_event.append(label)

def cluster_cluster_bkg_neighbor_clusters_labels():
    """
    Label the neighbour pairs of one event, using 0 for cluster-cluster pairs.

    Prints the label legend, then appends one label per pair of
    ``neighbor_pairs_unique_sorted`` to ``true_neighbor_cluster_event``:
    1 when both cells have the same non-zero cluster index, 3 when both have
    cluster index 0, 0 when they have different non-zero indices and 2
    otherwise.

    NOTE(review): ``i``, ``cell_to_cluster_index``,
    ``neighbor_pairs_unique_sorted`` and ``true_neighbor_cluster_event`` are
    not parameters; they are looked up in the enclosing (module) scope and
    must exist there when this function is called.
    """
    for legend_line in ("Label for True: 1",
                        "Label for Lone-Lone: 3",
                        "Label for Cluster-Lone: 2",
                        "Label for Cluster-Cluster: 0"):
        print(legend_line)
    for pair in neighbor_pairs_unique_sorted:
        first_cluster = cell_to_cluster_index[i][pair[0]]
        second_cluster = cell_to_cluster_index[i][pair[1]]
        if first_cluster == second_cluster:
            # Same index on both sides: a true pair unless the index is 0.
            label = 1 if first_cluster != 0 else 3
        elif first_cluster != 0 and second_cluster != 0:
            label = 0  # cells from two different clusters
        else:
            label = 2  # one clustered cell, one cell with index 0
        true_neighbor_cluster_event.append(label)

In [7]:
def label_neighbor_clusters(cell_to_cluster_index, neighbor_pairs_unique_sorted, background_setting):
    """
    Label every neighbour pair of every event according to a background scheme.

    A pair whose two cells share the same non-zero cluster index is always
    labelled 1 ("True"). The three remaining categories are numbered so that
    the category selected by ``background_setting`` receives label 0:

    ==================  =========  ============  ===============
    background_setting  Lone-Lone  Cluster-Lone  Cluster-Cluster
    ==================  =========  ============  ===============
    'lone'                  0           2               3
    'cluster-lone'          2           0               3
    'cluster-cluster'       3           2               0
    ==================  =========  ============  ===============

    "Lone-Lone" means both cells have cluster index 0, "Cluster-Cluster" means
    they have different non-zero indices, and "Cluster-Lone" covers the rest.

    Parameters
    ----------
    cell_to_cluster_index : sequence
        ``cell_to_cluster_index[i][c]`` is the cluster index of cell ``c`` in
        event ``i``.
    neighbor_pairs_unique_sorted : iterable
        Pairs of cell indices; only ``pair[0]`` and ``pair[1]`` are read. The
        same pairs are applied to every event.
    background_setting : str
        One of ``'lone'``, ``'cluster-lone'`` or ``'cluster-cluster'``.

    Returns
    -------
    list of list of int
        One inner list per event holding one label per pair, in pair order.

    Raises
    ------
    ValueError
        If ``background_setting`` is not one of the supported values.
    """
    # (lone-lone, cluster-lone, cluster-cluster) labels for each setting.
    label_schemes = {
        'lone': (0, 2, 3),
        'cluster-lone': (2, 0, 3),
        'cluster-cluster': (3, 2, 0),
    }
    if background_setting not in label_schemes:
        raise ValueError(
            f"Unknown background_setting {background_setting!r}; "
            f"expected one of {sorted(label_schemes)}"
        )
    lone_lone, cluster_lone, cluster_cluster = label_schemes[background_setting]

    # Print the legend once per call instead of once per event.
    print("Label for True: 1")
    print(f"Label for Lone-Lone: {lone_lone}")
    print(f"Label for Cluster-Lone: {cluster_lone}")
    print(f"Label for Cluster-Cluster: {cluster_cluster}")

    true_neighbor_cluster = []
    for i in range(len(cell_to_cluster_index)):
        event_clusters = cell_to_cluster_index[i]
        true_neighbor_cluster_event = []
        for pair in neighbor_pairs_unique_sorted:
            first_cluster = event_clusters[pair[0]]
            second_cluster = event_clusters[pair[1]]
            if first_cluster == second_cluster:
                # Same index on both sides: true pair unless the index is 0.
                label = 1 if first_cluster != 0 else lone_lone
            elif first_cluster != 0 and second_cluster != 0:
                label = cluster_cluster
            else:
                label = cluster_lone
            true_neighbor_cluster_event.append(label)
        # Collect the per-event labels; previously this list was discarded and
        # the function always returned an empty result.
        true_neighbor_cluster.append(true_neighbor_cluster_event)
    return true_neighbor_cluster

## Methods for the data preparation

In [8]:
def hdf5_to_dict(hdf5_file):
    """
    Load the contents of ``hdf5_file`` into a plain Python dictionary.

    A fresh dictionary is created, filled by ``_hdf5_to_dict`` and returned.
    """
    contents = {}
    _hdf5_to_dict(hdf5_file, contents)
    return contents
# Initializes an empty dictionary and calls a function to recursively
# fill this dictionary with data from the hdf5 file.


def _hdf5_to_dict(group, dic):
    """
    Convert HDF5 group to dictionary recursively
    """
    for key, item in group.items():
        if isinstance(item, h5py.Group):
            subgroup = {}
            _hdf5_to_dict(item, subgroup)
            dic[key] = subgroup
        else:
            dic[key] = np.array(item)
# Iterates over items in the hdf5 group. If the item is a group, 
# it creates a new dictionary and calls itself recursively. If the item
# is a dataset, it converts it to a numpy array and stores it in the dictionary.

In [9]:
def _as_event_array(per_event_lists):
    """Convert a list of per-event lists to an array, tolerating ragged input."""
    try:
        return np.array(per_event_lists)
    except ValueError:
        # Events with different numbers of entries cannot form a rectangular
        # array; store one list per event in a 1-D object array instead.
        ragged = np.empty(len(per_event_lists), dtype=object)
        for position, entry in enumerate(per_event_lists):
            ragged[position] = entry
        return ragged


def indicies_of_edges(range_value, true_list, bkg_0, bkg_2, bkg_3, neighbor_truth=None):
    '''
    Collect, per event, the positions of the edges carrying each truth label.

    Parameters
    ----------
    range_value : int
        Number of events to process; events ``0 .. range_value - 1`` are used.
    true_list, bkg_0, bkg_2, bkg_3 : list
        Accumulators, extended in place with one list of positions per event:
        ``true_list`` for truth value 1, ``bkg_0`` for the background with a
        truth value of 0 and ``bkg_2`` / ``bkg_3`` for the backgrounds with a
        truth value of 2 / 3.
    neighbor_truth : sequence, optional
        ``neighbor_truth[i]`` holds the truth labels of event ``i``. When
        omitted, the notebook-level variable ``neigbor_truth_100evs`` is used,
        which is what this function always read before the parameter existed.

    Returns
    -------
    tuple of four numpy arrays built from ``true_list``, ``bkg_0``, ``bkg_2``
    and ``bkg_3``. When events hold different numbers of positions the arrays
    are 1-D object arrays with one list per event.
    '''
    if neighbor_truth is None:
        # Backward-compatible fallback to the notebook global.
        neighbor_truth = neigbor_truth_100evs
    buckets = ((1, true_list), (0, bkg_0), (2, bkg_2), (3, bkg_3))
    for i in range(range_value):
        event_truth = np.asarray(neighbor_truth[i])
        for label, bucket in buckets:
            bucket.append(list(np.where(event_truth == label)[0]))
    return (_as_event_array(true_list), _as_event_array(bkg_0),
            _as_event_array(bkg_2), _as_event_array(bkg_3))

def pairs_of_edges(range_value, true_list, bkg_0, bkg_2, bkg_3):
    '''
    Count, per event, how many edges fall into each truth category.

    Parameters
    ----------
    range_value : int
        Number of events to process; events ``0 .. range_value - 1`` are used.
    true_list, bkg_0, bkg_2, bkg_3 : sequence
        Per-event collections of edge positions: ``true_list`` for the true
        edges, ``bkg_0`` for the background with a truth value of 0 and
        ``bkg_2`` / ``bkg_3`` for the backgrounds with a truth value of 2 / 3.
        They are only read, never modified.

    Returns
    -------
    tuple of four integer numpy arrays of length ``range_value`` holding
    ``len(collection[i])`` for every event ``i``, in the same order as the
    arguments.
    '''
    # Build fresh count lists: the previous version read an undefined name
    # `true` and appended counts to the same lists it was measuring.
    def _event_counts(per_event_indices):
        return np.array([len(per_event_indices[i]) for i in range(range_value)], dtype=int)

    return (_event_counts(true_list), _event_counts(bkg_0),
            _event_counts(bkg_2), _event_counts(bkg_3))

In [10]:
def sorting_indicies(pair_number_list, true_list, bkg_0, bkg_2, bkg_3):
    """
    Reorder the four arrays by descending ``pair_number_list``.

    The ordering is obtained from ``np.argsort`` on the negated
    ``pair_number_list`` and applied to each array by index; the four
    reordered arrays are returned as a tuple in argument order.
    """
    descending_order = np.argsort(-pair_number_list)
    reordered = [per_event[descending_order]
                 for per_event in (true_list, bkg_0, bkg_2, bkg_3)]
    return tuple(reordered)

In [11]:
def create_train_set_test_set(split_num, true_list, bkg_0, bkg_2, bkg_3):
    """
    Split each of the four sequences at position ``split_num``.

    Returns an 8-tuple: the first ``split_num`` entries of ``true_list``,
    ``bkg_0``, ``bkg_2`` and ``bkg_3`` (in that order), followed by the
    remaining entries of the same four sequences.
    """
    groups = (true_list, bkg_0, bkg_2, bkg_3)
    leading_parts = tuple(group[:split_num] for group in groups)
    trailing_parts = tuple(group[split_num:] for group in groups)
    return leading_parts + trailing_parts

In [12]:
def scaling_of_features(dynamic_variables, split_num, sorted_indicies):
    keys = list(dynamic_variables.keys())
    values = list(dynamic_variables.values())
    #Gets the keys and variables from the dynamic variables array and stores them in arrays.
    
    rearranged_values = [values[i] for i in sorted_indicies]
    #Sorts the values with the sorted cluster indices.
    
    rearranged_dict = dict(zip(keys, rearranged_values))
    #Creates a rearranged dictionary out of the keys and sorted values.
    
    data_train = np.concatenate([value for key, value in list(rearranged_dict.items())[:split_num]])
    #Create a training data array.
    
    data_test = np.concatenate([value for key, value in list(dynamic_variables.items())[split_num:]])
    #Create a data testing array.
    
    scaler = MinMaxScaler()
    cellFeatures_trainS = scaler.fit_transform(data_train)
    scaler_filename = "./bscaler_neighbor_data_train_sorted.save"
    joblib.dump(scaler, scaler_filename)
    #Scales the training data with a minmaxscaler, put that scaled data into a training features array, and then save that scaler into a .save file.