# Data preparation


In [1]:
import numpy as np
import random
random.seed(42)
#Imports needed python libraries

In [2]:
from ipynb.fs.full.GeneralFunctions import removeBrokenCells, writeH5File, readH5File, branches_from_root_file, extractData, create_data_dict, save_dict_to_hdf5, remove_permutation_variants, neighbor_pairs_mapping, label_neighbor_pairs

In [9]:
def prepareData(inputFileName, mappingValues, outputDirectory, featuresList, COCOA, backgroundMean, suffix):
    """Run the full preparation for one input file.

    Loads the branches from ``inputFileName`` with ``branches_from_root_file``
    and hands them first to ``makeDictionary`` (cell features file) and then to
    ``makeNeighborPairs`` (neighbor pair and neighbor truth files). The
    remaining arguments are forwarded unchanged to those two functions.
    """
    branches = branches_from_root_file(inputFileName)
    makeDictionary(
        branches=branches,
        outputDirectory=outputDirectory,
        featuresList=featuresList,
        COCOA=COCOA,
        backgroundMean=backgroundMean,
        suffix=suffix,
    )
    makeNeighborPairs(
        branches=branches,
        mappingValues=mappingValues,
        outputDirectory=outputDirectory,
        COCOA=COCOA,
        suffix=suffix,
    )
#Reads the branches from the input ROOT file, then writes the cell features dictionary and the neighbor pair / neighbor truth files.

In [10]:
def makeNeighborPairs(branches, mappingValues, outputDirectory, COCOA, suffix):
    """Write the sorted unique neighbor pairs and their truth labels to hdf5.

    branches        -- branches object the pairs and cell data are read from
    mappingValues   -- indexable with at least four entries; entries 0..3 are
                       passed, in order, to neighbor_pairs_mapping
    outputDirectory -- directory prefix of the two output files
    COCOA           -- if true, pairs come from getCOCOANeighborPairs; otherwise
                       from removeBrokenCells applied to the first entry of the
                       'neighbor' branch and the 'cell_noiseSigma' data
    suffix          -- string inserted before '.hdf5' in both file names

    Prints one confirmation line per file written.
    """
    if not COCOA:
        first_entry_neighbors = branches['neighbor'][0]
        noise_sigma = extractData(branches, 'cell_noiseSigma')
        raw_pairs = removeBrokenCells(noise_sigma, first_entry_neighbors)
    else:
        raw_pairs = getCOCOANeighborPairs(branches)

    # Drop permutation variants, then order the pairs by their first element.
    deduplicated_pairs = remove_permutation_variants(raw_pairs)
    sorted_pairs = sorted(deduplicated_pairs, key=lambda pair: pair[0])

    pairs_path = outputDirectory + '/neighbor_pairs_unique_sorted' + suffix + '.hdf5'
    writeH5File(pairs_path, "neighbor_pair", sorted_pairs)
    print("Wrote sorted neighbor pairs to file: " + pairs_path)

    mapping = neighbor_pairs_mapping(*(mappingValues[position] for position in range(4)))
    rangeValue = len(extractData(branches, 'cell_x'))
    cluster_index = extractData(branches, 'cell_cluster_index')
    neighbor_truth = label_neighbor_pairs(rangeValue, cluster_index, sorted_pairs, mapping)

    truth_path = outputDirectory + '/neighbor_truth' + suffix + '.hdf5'
    writeH5File(truth_path, "neighbor_truth", neighbor_truth)
    print("Wrote neighbor truth to file: " + truth_path)

In [13]:
def getCOCOANeighborPairs(branches):
    """Return a flat list of (start, end) tuples for every cell-to-cell edge.

    Walks each entry of the 'cell_to_cell_edge_start' branch and pairs every
    value with the value at the same position in 'cell_to_cell_edge_end'.
    Pairs from all entries are concatenated in order into one list.
    """
    edge_starts = branches['cell_to_cell_edge_start']
    edge_ends = branches['cell_to_cell_edge_end']
    return [
        (edge_starts[entry][edge], edge_ends[entry][edge])
        for entry in range(len(edge_starts))
        for edge in range(len(edge_starts[entry]))
    ]

In [5]:
def makeDictionary(branches, outputDirectory, featuresList, COCOA, backgroundMean, suffix):
    """Extract the requested cell features and save them to an hdf5 file.

    Parameters
    ----------
    branches
        Branches object that ``extractData`` reads from.
    outputDirectory : str
        Directory prefix of the output file.
    featuresList : sequence of str
        Names of the features to extract, in the order they are stored.
        The first entry is also used to determine ``rangeValue``.
    COCOA : bool
        When true, Gaussian background noise is added to the "cell_e" feature.
        All other features are stored exactly as extracted.
    backgroundMean : float
        Mean (``loc``) of the Gaussian noise. The standard deviation is left
        at NumPy's default.
    suffix : str
        String inserted before ".hdf5" in the output file name.

    Side effects
    ------------
    Writes ``<outputDirectory>/cellFeatures<suffix>.hdf5`` through
    ``save_dict_to_hdf5`` and prints a confirmation line.
    """
    rangeValue = len(extractData(branches, featuresList[0]))
    variables = []
    for feature in featuresList:
        featureData = extractData(branches, feature)
        if COCOA and feature == "cell_e":
            # The stdlib ``random`` module has no ``normal``; the loc/size
            # signature is NumPy's. Note that random.seed() does not seed
            # NumPy's global generator: call np.random.seed(...) beforehand
            # if the noise must be reproducible.
            background = np.random.normal(loc=backgroundMean, size=featureData.shape)
            featureData = np.add(featureData, background)
        # Store the (possibly noise-augmented) feature array itself.
        variables.append(featureData)
    data = create_data_dict(rangeValue, variables, {})
    outputPath = outputDirectory + '/cellFeatures' + suffix + '.hdf5'
    save_dict_to_hdf5(data, outputPath)
    print("Made features dictionary in file: " + outputPath)

In [11]:
#Runs the whole preparation for the 100-event all-2D sample; no COCOA noise is added because COCOA is False.
prepareData(
    inputFileName="MyxAODAnalysis_all2D.outputs.root",
    mappingValues=(0, 2, 3, 4),
    outputDirectory=".",
    featuresList=('cell_x', 'cell_y', 'cell_z', 'cell_eta', 'cell_phi', 'cell_sampling', 'cell_noiseSigma', 'cell_e'),
    COCOA=False,
    backgroundMean=60,
    suffix="_100evsAll2D",
)

Made features dictionary in file: ./cellFeatures_100evsAll2D.hdf5
[[63, 1, 26496, 26497, 26498, 26499, 64, 26751, 127, 26500, 65], ..., [...]]
Wrote sorted neighbor pairs to file: ./neighbor_pairs_unique_sorted_100evsAll2D.hdf5
Wrote neighbor truth to file: ./neighbor_truth_100evsAll2D.hdf5


In [4]:
#Loads the branches at notebook level. prepareData only keeps them in a local variable, so without this line
#this cell and every later cell that uses branches fails with a NameError on a fresh kernel.
branches = branches_from_root_file("MyxAODAnalysis_all2D.outputs.root")
print(branches)

[{RunNumber: 0, EventNumber: 0, cell_eta: [-2.56, ...], ...}, {...}, ..., {...}]


In [5]:
len(branches['cluster_to_cell_indices'])
#Gets the number of entries in the cluster_to_cell_indices branch itself (not the length of its first element).

100

# Preparing data with required features for 100 events

In [6]:
cell_coordinate_x = extractData(branches, 'cell_x')
cell_coordinate_y = extractData(branches, 'cell_y')
cell_coordinate_z = extractData(branches, 'cell_z')
cell_eta = extractData(branches, 'cell_eta')
cell_phi = extractData(branches, 'cell_phi')
cell_subCalo = extractData(branches, 'cell_subCalo')
cell_sampling = extractData(branches, 'cell_sampling')
cell_weight = extractData(branches, 'cell_weight')
cell_truth = extractData(branches, 'cell_truth')
cell_to_cluster_e = extractData(branches, 'cell_to_cluster_e')
cell_noiseSigma = extractData(branches, 'cell_noiseSigma')
cell_SNR = extractData(branches, 'cell_SNR')
cell_time = extractData(branches, 'cell_time')
cellsNo_cluster = branches['cellsNo_cluster']
clustersNo_event = extractData(branches, 'clustersNo_event')
cell_to_cluster_index = extractData(branches, 'cell_cluster_index')
cell_e = extractData(branches, 'cell_e')
cluster_to_cell_indices = branches['cluster_to_cell_indices']
neighbor = branches['neighbor']
#Pulls the variables used below out of the branches object. Most go through extractData; cellsNo_cluster,
#cluster_to_cell_indices and neighbor are taken directly from branches. Note that cell_to_cluster_index is
#read from the branch named 'cell_cluster_index'.
#NOTE(review): the extractData results expose .shape and print like NumPy arrays, while the direct branch
#lookups are presumably awkward arrays -- confirm against GeneralFunctions.

In [7]:
cell_to_cluster_index.shape
#Gets the shape of the cell_to_cluster_index array (presumably events x cells -- confirm).

(100, 187652)

In [8]:
len(cellsNo_cluster[1])
#Gets the length of the second element (index 1) of the cellsNo_cluster array.

757

In [9]:
print(cell_coordinate_y)
#Prints the extracted cell_y array as a quick visual check.

[[  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 ...
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]
 [  33.279976   93.48492   152.79437  ... -121.638336  -92.85043
   -68.40991 ]]


In [10]:
data = {}
rangeValue = len(cell_coordinate_x)
variables = [cell_coordinate_x, cell_coordinate_y, cell_coordinate_z, cell_eta, cell_phi, cell_sampling, cell_noiseSigma, cell_e]
data = create_data_dict(rangeValue, variables, data)
#Builds the data dictionary with create_data_dict from eight variables: the Cartesian cell coordinates, eta, phi, sampling, noise sigma, and energy.
#rangeValue is the length of cell_coordinate_x. Entries are accessed by keys such as 'data_2' (presumably data_{event_number} -- confirm in GeneralFunctions).


In [11]:
data['data_2'].shape
#Checks the shape of the 'data_2' array in the data dictionary; the second dimension should match the eight variables passed in.

(187652, 8)

In [12]:
save_dict_to_hdf5(data,'./cellFeatures_100evsAll2D.hdf5')
#Saves the data dictionary to an hdf5 file using save_dict_to_hdf5, which is imported from GeneralFunctions.

## Preparing neighbor pairs

In [13]:
neighbor = branches['neighbor'][0]
len(neighbor)
#Takes the first element of the neighbor branch and displays its length.
#Note that this rebinds neighbor, which previously held the whole branch.

187652

In [14]:
neighbor[0]
#Displays the first element of the neighbor array.

In [15]:
neighbor_pairs_set = removeBrokenCells(cell_noiseSigma, neighbor)
#Builds the collection of neighbor pairs with removeBrokenCells, passing the cell noise values and the neighbor list.
#NOTE(review): presumably cells with zero noise are treated as non-working and dropped -- confirm in GeneralFunctions.

In [16]:
len(neighbor_pairs_set)
#Displays the number of neighbor pairs returned by removeBrokenCells.

1483024

In [17]:
#Example of permutation variants: in [(3, 1), (1, 3), (2, 4), (4, 2), (5, 6)], (3, 1)/(1, 3) and (2, 4)/(4, 2) are the same pair in a different order.
neighbor_pairs_unique = remove_permutation_variants(neighbor_pairs_set)
#Removes pairs that are permutations of one another using remove_permutation_variants (imported from GeneralFunctions).

In [18]:
len(neighbor_pairs_unique)
#Displays the number of unique neighbor pairs (half of the count before the permutation variants were removed).

741512

In [19]:
neighbor_pairs_unique_sorted = sorted(neighbor_pairs_unique, key=lambda x: x[0])
#Sorts the unique pairs by their first element only: the key function returns x[0] for each pair x.
#sorted() returns a new list and is stable, so pairs that share a first element keep their original relative order.

In [20]:
neighbor_pairs_unique_sorted
#Displays the sorted list of pairs.
#NOTE(review): this shows the entire list; a slice such as neighbor_pairs_unique_sorted[:10] would keep the output short.

[(0, 26751),
 (0, 1),
 (0, 127),
 (0, 26497),
 (0, 63),
 (0, 26500),
 (0, 64),
 (0, 26496),
 (0, 26498),
 (0, 26499),
 (0, 65),
 (1, 26500),
 (1, 64),
 (1, 26501),
 (1, 65),
 (1, 26504),
 (1, 26499),
 (1, 26502),
 (1, 66),
 (1, 26503),
 (1, 2),
 (2, 26503),
 (2, 26505),
 (2, 26506),
 (2, 67),
 (2, 26504),
 (2, 3),
 (2, 26507),
 (2, 65),
 (2, 26508),
 (2, 66),
 (3, 26509),
 (3, 67),
 (3, 26510),
 (3, 26512),
 (3, 68),
 (3, 4),
 (3, 26507),
 (3, 26508),
 (3, 66),
 (3, 26511),
 (4, 67),
 (4, 26512),
 (4, 26513),
 (4, 26515),
 (4, 26516),
 (4, 68),
 (4, 26511),
 (4, 69),
 (4, 26514),
 (4, 5),
 (5, 26518),
 (5, 68),
 (5, 26516),
 (5, 26517),
 (5, 26519),
 (5, 69),
 (5, 26520),
 (5, 70),
 (5, 26515),
 (5, 6),
 (6, 71),
 (6, 26522),
 (6, 7),
 (6, 69),
 (6, 26520),
 (6, 70),
 (6, 26523),
 (6, 26524),
 (6, 26519),
 (6, 26521),
 (7, 71),
 (7, 26524),
 (7, 26525),
 (7, 26528),
 (7, 72),
 (7, 26523),
 (7, 8),
 (7, 26526),
 (7, 70),
 (7, 26527),
 (8, 26528),
 (8, 72),
 (8, 26529),
 (8, 26531),
 (8,

In [21]:
writeH5File('./neighbor_pairs_unique_sorted_all2D.hdf5', "neighbor_pair", neighbor_pairs_unique_sorted)
#Saves the sorted neighbor pairs to an hdf5 file, passing "neighbor_pair" as the name that is later used to read them back.

## Loading neighbor cell pairs

In [22]:
neighbor_pairs_unique_sorted = readH5File("./neighbor_pairs_unique_sorted_all2D.hdf5", 'neighbor_pair')
#Reads the sorted neighbor pairs back from the hdf5 file, replacing the in-memory list with the loaded data.

In [23]:
len(neighbor_pairs_unique_sorted)
#Displays the number of pairs in the loaded data.

741512

In [24]:
neighbor_pairs_unique_sorted.shape
#Displays the shape of the loaded data (one row per pair, two values per row).

(741512, 2)

## Creating Labels for neighbor_pairs

In [25]:
cell_to_cluster_index.shape
#Gets the shape of the cell_to_cluster_index array again before it is used for labelling.

(100, 187652)

In [26]:
mapping = neighbor_pairs_mapping(0, 2, 3, 4)
#Builds the mapping with neighbor_pairs_mapping, using the same values (0, 2, 3, 4) that are passed to prepareData above.

In [27]:
true_neighbor_cluster = label_neighbor_pairs(rangeValue, cell_to_cluster_index, neighbor_pairs_unique_sorted, mapping)
#Labels the neighbor pairs with label_neighbor_pairs, using the cluster indices, the loaded pairs and the mapping.
#rangeValue comes from the earlier cell that built the data dictionary.

In [28]:
true_neighbor_cluster.shape
#Displays the shape of the neighbor truth array (no conversion happens in this cell).

(100, 741512)

In [29]:
true_neighbor_cluster[4]
#Displays the fifth element (index 4) of the neighbor truth array.

array([1, 1, 1, ..., 0, 0, 0])

In [30]:
writeH5File('neighbor_truth_100evs_all2D.hdf5', "neighbor_truth_100evs", true_neighbor_cluster)
#Saves the neighbor truth array to an hdf5 file; the path is relative, so it is written to the current working directory.