# Data preparation


In [1]:
import uproot
import numpy as np
import h5py

#Imports needed python libraries

In [2]:
from ipynb.fs.full.GeneralFunctions import removeBrokenCells, writeH5File, readH5File

In [3]:
uproot.__version__
#Checks the version of uproot being used

'5.3.7'

In [4]:
file = uproot.open("MyxAODAnalysis_all2D.outputs.root")
#Opens root file using uproot

In [5]:
file.keys()
#Checks the keys of the root file

['analysis;1']

In [6]:
tree = file['analysis']
#Creates a tree object from putting the analysis key into the root file

In [7]:
branches = tree.arrays()
#Creates a branches object out of the arrays in the tree object

In [8]:
print(tree.keys()) # no. of variables per event
#Prints the keys of the tree object

['RunNumber', 'EventNumber', 'cell_eta', 'cell_phi', 'cell_x', 'cell_y', 'cell_z', 'cell_subCalo', 'cell_sampling', 'cell_size', 'cell_hashID', 'neighbor', 'seedCell_id', 'cell_e', 'cell_noiseSigma', 'cell_SNR', 'cell_time', 'cell_weight', 'cell_truth', 'cell_truth_indices', 'cell_shared_indices', 'cell_cluster_index', 'cluster_to_cell_indices', 'cluster_to_cell_weights', 'cell_to_cluster_e', 'cell_to_cluster_eta', 'cell_to_cluster_phi', 'cluster_eta', 'cluster_phi', 'cluster_e', 'cellsNo_cluster', 'clustersNo_event', 'jetEnergyWtdTimeAve', 'jetEta', 'jetPhi', 'jetE', 'jetPt', 'jetNumberPerEvent', 'cellIndices_per_jet']


In [9]:
print(branches)

[{RunNumber: 0, EventNumber: 0, cell_eta: [-2.56, ...], ...}, {...}, ..., {...}]


In [10]:
len(branches['cluster_to_cell_indices'])
#Gets the number of entries in the cluster_to_cell_indices branch, i.e. the number of events in the file (100)

100

# Preparing data with required features for 100 events

In [11]:
# Cell position branches (x, y, z, eta, phi).
cell_coordinate_x = branches["cell_x"]
cell_coordinate_y = branches["cell_y"]
cell_coordinate_z = branches["cell_z"]
cell_eta = branches["cell_eta"]
cell_phi = branches["cell_phi"]

# Calorimeter bookkeeping branches.
cell_subCalo = branches["cell_subCalo"]
cell_sampling = branches["cell_sampling"]

# Per-cell measurement branches.
cell_e = branches["cell_e"]
cell_noiseSigma = branches["cell_noiseSigma"]
cell_SNR = branches["cell_SNR"]
cell_time = branches["cell_time"]
cell_weight = branches["cell_weight"]
cell_truth = branches["cell_truth"]

# Cluster-related branches.
cell_to_cluster_e = branches["cell_to_cluster_e"]
cell_cluster_index = branches["cell_cluster_index"]
cellsNo_cluster = branches["cellsNo_cluster"]
clustersNo_event = branches["clustersNo_event"]
cluster_to_cell_indices = branches["cluster_to_cell_indices"]

# Neighbor lists; this name is re-bound to the first event's entry in the
# "preparing Neighbor Pairs" section further down.
neighbor = branches["neighbor"]
#Creates awkward arrays for all needed variables from the branches object

In [12]:
# Cell position arrays.
cell_coordinate_x = np.array(cell_coordinate_x)
cell_coordinate_y = np.array(cell_coordinate_y)
cell_coordinate_z = np.array(cell_coordinate_z)
cell_eta = np.array(cell_eta)
cell_phi = np.array(cell_phi)

# Calorimeter bookkeeping arrays.
cell_subCalo = np.array(cell_subCalo)
cell_sampling = np.array(cell_sampling)

# Per-cell measurement arrays.
cell_e = np.array(cell_e)
cell_noiseSigma = np.array(cell_noiseSigma)
cell_SNR = np.array(cell_SNR)
cell_time = np.array(cell_time)
cell_weight = np.array(cell_weight)
cell_truth = np.array(cell_truth)

# Cluster-related arrays. Note the new name: the converted cluster index is
# stored as cell_to_cluster_index, while cell_cluster_index keeps the
# awkward array.
cell_to_cluster_e = np.array(cell_to_cluster_e)
cell_to_cluster_index = np.array(cell_cluster_index)
clustersNo_event = np.array(clustersNo_event)

# cellsNo_cluster is deliberately NOT converted and stays an awkward array.
# NOTE(review): presumably its rows have different lengths per event, which
# a regular numpy array cannot hold - confirm before enabling a conversion.
#Converts the awkward arrays into numpy arrays

In [13]:
cell_to_cluster_index.shape
#Gets the shape of the cell_to_cluster_index array

(100, 187652)

In [14]:
len(cellsNo_cluster[1])
#Gets the length of the second element of the cellsNo_cluster array

757

In [15]:
print(cell_coordinate_y)

[[  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 ...
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]]


In [16]:
numberOfDatapoints = 100

# One feature matrix per event, stored under the key "data_<event index>".
# Columns, in order: x, y, z, eta, phi, sampling, noise sigma, energy.
data = {}
for event_idx in range(numberOfDatapoints):
    feature_columns = (
        cell_coordinate_x[event_idx],
        cell_coordinate_y[event_idx],
        cell_coordinate_z[event_idx],
        cell_eta[event_idx],
        cell_phi[event_idx],
        cell_sampling[event_idx],
        cell_noiseSigma[event_idx],
        cell_e[event_idx],
    )
    # Stacking the per-cell 1-D arrays along axis 1 turns each of them into
    # one column of a (number of cells, 8) matrix.
    data[f"data_{event_idx}"] = np.stack(feature_columns, axis=1)
#Creates a data dictionary. Goes through every event and saves the cartesian cell coordinates, eta, phi, sampling, sigma noise, and energy into a two
#dimensional array that has the eight variables saved to each cell coordinate. Every event is saved to a dictionary element named data_{event_number}.


In [17]:
data['data_2'].shape
#Checks the shape of one of the arrays in the data dictionary.

(187652, 8)

In [18]:
def save_dict_to_hdf5(dic, filename):
    """Write the dictionary ``dic`` to the HDF5 file ``filename``.

    The file is opened in ``'w'`` mode and closed again when the ``with``
    block exits. All writing is delegated to ``_save_dict_to_hdf5``, which
    receives the open file as the top-level group.

    Args:
        dic: Dictionary to store; nested dictionaries are allowed.
        filename: Path of the HDF5 file to write.
    """
    with h5py.File(filename, 'w') as h5_root:
        _save_dict_to_hdf5(h5_root, dic)

def _save_dict_to_hdf5(group, dic):
    """
    Save a dictionary to an HDF5 group
    """
    for key, value in dic.items():
        if isinstance(value, dict):
            subgroup = group.create_group(key)
            _save_dict_to_hdf5(subgroup, value)
        else:
            if isinstance(value, list):
                # Convert list to numpy array before saving
                value = np.array(value)
            group[key] = value
#Defines two functions, save_dict_to_hdf5 and _save_dict_to_hdf5. The first takes a dictionary and an output file name. It opens an hdf5 file with
#that name for writing and passes the open file and the dictionary to the second function. The second function takes an hdf5 file (or group) and a
#dictionary and loops over the dictionary entries. If a value is itself a dictionary, it creates a subgroup named after the key and calls itself
#on that subgroup (recursion). Any other value is stored under its key in the current group, with lists converted to numpy arrays first.

In [19]:
save_dict_to_hdf5(data,'./cellFeatures_100evsAll2D.hdf5')
#Calls save_dict_to_hdf5 (defined in cell 18) to save the data dictionary to an hdf5 file.

## preparing Neighbor Pairs

In [20]:
neighbor = branches['neighbor'][0]
len(neighbor)
#Creates a neighbor awkward array consisting of the first element of the neighbor branch. Prints the length of the neighbor array.

187652

In [21]:
neighbor[0]
#Prints the first element of the neighbor awkward array.

In [22]:
neighbor_pairs_set = removeBrokenCells(cell_noiseSigma, neighbor)
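#Calls removeBrokenCells (imported from GeneralFunctions) with the cell noise array and the neighbor lists and stores the returned neighbor pairs.
#Judging by its name, it presumably leaves out the cells with zero cell noise, which are the not working cells - see GeneralFunctions to confirm.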

In [23]:
len(neighbor_pairs_set)
#Prints the length of this new neighbor numpy array.

1483024

In [24]:
# Helpers that collapse permutation variants of a tuple, e.g. (3, 1) and (1, 3).
def canonical_form(t):
    """Return the canonical representation of ``t``.

    The canonical form is a tuple holding the elements of ``t`` in ascending
    order, so every permutation of the same elements maps to the same tuple.

    Args:
        t: Iterable of mutually comparable items, e.g. a pair of cell indices.

    Returns:
        A tuple with the elements of ``t`` sorted in ascending order.
    """
    return tuple(sorted(t))


def remove_permutation_variants(tuple_list):
    """Remove permutation variants from a collection of tuples.

    Tuples that contain the same elements in a different order, such as
    ``(1, 3)`` and ``(3, 1)``, are treated as duplicates and only their
    canonical form is kept.

    Args:
        tuple_list: Iterable of tuples (or other iterables of hashable,
            mutually comparable items).

    Returns:
        A list with one canonical tuple per distinct combination. The list
        follows set iteration order, so callers that need a particular order
        must sort the result themselves.
    """
    # Every element of the set is already canonical, so the set can be
    # turned into a list directly without sorting each tuple a second time.
    return list({canonical_form(t) for t in tuple_list})
#Defines two functions that remove permutation variants. canonical_form sorts the entries of a tuple, so (3, 1) and (1, 3) both become (1, 3).
#remove_permutation_variants puts the canonical form of every tuple into a set, which drops the duplicates, and returns the result as a list. This
#means the result cannot contain both (1, 3) and (3, 1), since both tuples say that cells 1 and 3 are adjacent.

In [25]:
#remove list_of_tuples = [(3, 1), (1, 3), (2, 4), (4, 2), (5, 6)]
neighbor_pairs_unique = remove_permutation_variants(neighbor_pairs_set)
#Calls remove_permutation_variants (defined in cell 24) to remove duplicate tuples, creating a new Python list of tuples without duplicates

In [26]:
len(neighbor_pairs_unique)
#Prints the length of the unique neighbor array

741512

In [27]:
neighbor_pairs_unique_sorted = sorted(neighbor_pairs_unique, key=lambda x: x[0])
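#Sorts the unique neighbor pairs by their first cell index only (key=lambda x: x[0]) and stores the result in a new list. Python's sort is stable,
#so pairs with the same first index keep their previous relative order, which is why the second indices in the output below are not ascending.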

In [28]:
neighbor_pairs_unique_sorted
#Prints the sorted array.

[(0, 26751),
 (0, 1),
 (0, 127),
 (0, 26497),
 (0, 63),
 (0, 26500),
 (0, 64),
 (0, 26496),
 (0, 26498),
 (0, 26499),
 (0, 65),
 (1, 26500),
 (1, 64),
 (1, 26501),
 (1, 65),
 (1, 26504),
 (1, 26499),
 (1, 26502),
 (1, 66),
 (1, 26503),
 (1, 2),
 (2, 26503),
 (2, 26505),
 (2, 26506),
 (2, 67),
 (2, 26504),
 (2, 3),
 (2, 26507),
 (2, 65),
 (2, 26508),
 (2, 66),
 (3, 26509),
 (3, 67),
 (3, 26510),
 (3, 26512),
 (3, 68),
 (3, 4),
 (3, 26507),
 (3, 26508),
 (3, 66),
 (3, 26511),
 (4, 67),
 (4, 26512),
 (4, 26513),
 (4, 26515),
 (4, 26516),
 (4, 68),
 (4, 26511),
 (4, 69),
 (4, 26514),
 (4, 5),
 (5, 26518),
 (5, 68),
 (5, 26516),
 (5, 26517),
 (5, 26519),
 (5, 69),
 (5, 26520),
 (5, 70),
 (5, 26515),
 (5, 6),
 (6, 71),
 (6, 26522),
 (6, 7),
 (6, 69),
 (6, 26520),
 (6, 70),
 (6, 26523),
 (6, 26524),
 (6, 26519),
 (6, 26521),
 (7, 71),
 (7, 26524),
 (7, 26525),
 (7, 26528),
 (7, 72),
 (7, 26523),
 (7, 8),
 (7, 26526),
 (7, 70),
 (7, 26527),
 (8, 26528),
 (8, 72),
 (8, 26529),
 (8, 26531),
 (8,

In [29]:
writeH5File('./neighbor_pairs_unique_sorted_all2D.hdf5', "neighbor_pair", neighbor_pairs_unique_sorted)
#Saves the sorted neighbor pair array to an hdf5 file.

## Loading neighbor cell pairs

In [30]:
neighbor_pairs_unique_sorted = readH5File("./neighbor_pairs_unique_sorted_all2D.hdf5", 'neighbor_pair')
#Retrieves the sorted neighbor pair data from the hdf5 file.

In [31]:
len(neighbor_pairs_unique_sorted)
#Prints the length of the retrieved data.

741512

In [32]:
neighbor_pairs_unique_sorted.shape
#Prints the shape of the retrieved data.

(741512, 2)

## Creating Labels for neighbor_pairs

In [33]:
cell_to_cluster_index.shape
#Gets the shape of the cell_to_cluster_index array.

(100, 187652)

In [34]:
# Label every neighbor pair in every event from the cluster index of its two
# cells. As in the original per-pair checks, cluster index 0 is treated as
# "non participating" (the cell belongs to no cluster):
#    1 - both cells have the same non-zero cluster index
#    9 - both cells have cluster index 0
#   10 - the cells have two different non-zero cluster indices
#    0 - one cell is in a cluster and the other one is non participating
#
# The labels are computed per event with whole-array numpy operations instead
# of a Python-level loop over every (event, pair) combination.
pair_cells = np.asarray(neighbor_pairs_unique_sorted, dtype=np.intp).reshape(-1, 2)
first_cells = pair_cells[:, 0]
second_cells = pair_cells[:, 1]

true_neighbor_cluster = []
for i in range(numberOfDatapoints):
    event_clusters = np.asarray(cell_to_cluster_index[i])
    first_cluster = event_clusters[first_cells]
    second_cluster = event_clusters[second_cells]

    same_cluster = first_cluster == second_cluster
    first_in_cluster = first_cluster != 0
    second_in_cluster = second_cluster != 0

    # np.select takes the first condition that holds for each pair, so the
    # order of the conditions mirrors the original if/else nesting.
    true_neighbor_cluster_event = np.select(
        [
            same_cluster & first_in_cluster,
            same_cluster,
            first_in_cluster & second_in_cluster,
        ],
        [1, 9, 10],
        default=0,
    )
    true_neighbor_cluster.append(true_neighbor_cluster_event)
#Goes through the sorted neighbor pairs for every event and labels each pair based on the cluster index of its two cells, where cluster index 0
#marks a non participating cell: 1 = both cells are in the same cluster, 9 = both cells are non participating, 10 = the cells are in two different
#clusters, 0 = one cell is in a cluster and the other is non participating. This is saved to a new array. I’m going to call it the neighbor truth array.

In [35]:
true_neighbor_cluster = np.array(true_neighbor_cluster)
true_neighbor_cluster.shape
#Converts the neighbor truth array into a numpy array. Prints the shape of the array.

(100, 741512)

In [36]:
true_neighbor_cluster[4]
#Prints the fifth element of the array.

array([1, 1, 1, ..., 9, 9, 9])

In [37]:
writeH5File('neigbor_truth_100evs_all2D', "neighbor_truth_100evs", true_neighbor_cluster)
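#Saves the neighbor truth array to an hdf5 file. Note that the file name passed here is spelled 'neigbor' and has no .hdf5 extension, unlike the
#other output files, so the same name has to be used when reading it back (unless writeH5File changes the name itself).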