## Required Packages

In [149]:
import numpy as np
import joblib # efficiently serializes python objects and is used in saving and loading machine learning models
import random
random.seed(42)
import h5py
from sklearn.preprocessing import MinMaxScaler

# Reformatting the HDF5 file and preparing all the data for testing in a GNN
# and saving it as multiple HDF5 files

## Reading and arranging data 

In [150]:
def hdf5_to_dict(hdf5_file):
    """
    Load the full contents of an open HDF5 file (or group) into a nested dict.

    The heavy lifting is delegated to ``_hdf5_to_dict``, which fills the
    dictionary in place; this wrapper only creates the container and returns it.
    """
    contents = {}
    _hdf5_to_dict(hdf5_file, contents)
    return contents
# Initializes an empty dictionary and calls a function to recursively
# fill this dictionary with data from the hdf5 file.


def _hdf5_to_dict(group, dic):
    """
    Recursively copy every member of ``group`` into ``dic`` (modified in place).

    Sub-groups become nested dictionaries; anything else (a dataset) is
    converted with ``np.array`` and stored under the same key.
    """
    for name, node in group.items():
        if not isinstance(node, h5py.Group):
            # Leaf: materialise the dataset in memory.
            dic[name] = np.array(node)
            continue
        # Branch: descend into the sub-group with a fresh dictionary.
        nested = {}
        _hdf5_to_dict(node, nested)
        dic[name] = nested
# Iterates over items in the hdf5 group. If the item is a group, 
# it creates a new dictionary and calls itself recursively. If the item
# is a dataset, it converts it to a numpy array and stores it in the dictionary.

In [151]:
# Cell features of 100 events; concatenated later, they give an (18765200, 8) array.
with h5py.File('/storage/mxg1065/cellFeatures_100evs.hdf5', 'r') as f:
    # Copy the whole file into an in-memory nested dictionary while it is open
    data_dict = hdf5_to_dict(f)
# The `with` block closes the file on exit; data_dict holds NumPy copies, so it stays usable.

In [152]:
f.close()
# Redundant: the `with` block in the previous cell already closed this file on exit.
# NOTE(review): presumably a no-op on an already-closed h5py File (the cell ran without error);
# this cell can likely be deleted - confirm.

In [153]:
# Collect the 100 per-event entries "data_0" ... "data_99" of data_dict into a second
# in-memory dictionary, in event order (the order matters for the 70/30 split later on).
# NOTE(review): data_dict is already an in-memory dict, so `dynamic_variables = data_dict`
# may be sufficient if data_dict has no other keys - check before simplifying.
dynamic_variables = {
    f"data_{eventNumber}": data_dict[f"data_{eventNumber}"]
    for eventNumber in range(100)
}

In [154]:
# Aligning data feature-wise for scaling: stack every event's array along the first axis,
# giving one array whose rows are the cells of all events and whose columns are the features.
data_100 = np.concatenate(list(dynamic_variables.values()))

In [155]:
data_100.shape
#Checks the shape of data_100. There are 8 features, but 100 times
#the number of cells in the first column

(18765200, 8)

In [156]:
# Truth labels for each neighbour edge, one row per event.
# The file is opened in a `with` block so the handle is released even if the read fails,
# and the dataset is accessed by key so a missing dataset raises a clear KeyError
# (`.get()` would return None and fail later with a confusing TypeError).
with h5py.File("/storage/mxg1065/neighborLabels100Events.hdf5", 'r') as hf_neighbor_truth_100evs:
    # [:] copies the dataset into a NumPy array, so it remains valid after the file closes.
    neighbor_truth_100evs = hf_neighbor_truth_100evs["neighborLabels100Events"][:]

In [157]:
neighbor_truth_100evs.shape
#Checks the shape of the neighbor truth array. 100 events,
#each with 1.25 million unique neighbor pairs

(100, 1250242)

## Collecting indices of neighbour edges based on truth label (1, 0, 2, 3, 4)

In [158]:
# Number of events = number of rows in the neighbor truth array.
numberOfDatapoints = neighbor_truth_100evs.shape[0]
numberOfDatapoints

100

In [159]:
# There are 5 types of neighbor labeling:
#   1 -> true                 (same-cluster cell pair)
#   0 -> bkg_lone             (lone/non-participating cell pair)
#   2 -> bkg_cluster_lone     (cluster-lone cell pair)
#   3 -> bkg_lone_cluster     (lone-cluster cell pair)
#   4 -> bkg_cluster_cluster  (cluster-cluster cell pair)
# For every label we build one list per event holding the positions of the neighbor
# pairs carrying that label. np.where returns a 1-tuple for a 1-D condition, hence [0].
true, bkg_lone, bkg_cluster_lone, bkg_lone_cluster, bkg_cluster_cluster = [
    [list(np.where(eventLabels == label)[0]) for eventLabels in neighbor_truth_100evs]
    for label in (1, 0, 2, 3, 4)
]

In [160]:
len(true[0])
#True list and others have information for 100 events, each with a
#different number of neighbor pairs that qualify as true. For example,
#the first event has 95,782 true pairs.

95782

## Collecting the number of pairs of each edge type (true, bkg_lone, bkg_cluster_lone, bkg_lone_cluster, bkg_cluster_cluster)

In [161]:
# The per-event index lists are ragged (each event has a different number of pairs per
# label), so record the lengths: they bound how many pairs can be sampled from every
# event to obtain a rectangular (events x samples) matrix.
true_pairNumber = np.array([len(eventEdges) for eventEdges in true])
bkg_lone_pairNumber = np.array([len(eventEdges) for eventEdges in bkg_lone])
bkg_cluster_lone_pairNumber = np.array([len(eventEdges) for eventEdges in bkg_cluster_lone])
bkg_lone_cluster_pairNumber = np.array([len(eventEdges) for eventEdges in bkg_lone_cluster])
bkg_cluster_cluster_pairNumber = np.array([len(eventEdges) for eventEdges in bkg_cluster_cluster])

In [162]:
# 70%-30% training/testing split by event: the first 70 events form the training
# sample, the remaining events form the testing sample.

# Edge-index lists (train, test)
true_70, true_30 = true[:70], true[70:]
bkg_lone_70, bkg_lone_30 = bkg_lone[:70], bkg_lone[70:]
bkg_cluster_lone_70, bkg_cluster_lone_30 = bkg_cluster_lone[:70], bkg_cluster_lone[70:]
bkg_lone_cluster_70, bkg_lone_cluster_30 = bkg_lone_cluster[:70], bkg_lone_cluster[70:]
bkg_cluster_cluster_70, bkg_cluster_cluster_30 = bkg_cluster_cluster[:70], bkg_cluster_cluster[70:]

# Per-event pair counts (train, test)
true_pairNumber_70, true_pairNumber_30 = true_pairNumber[:70], true_pairNumber[70:]
bkg_lone_pairNumber_70, bkg_lone_pairNumber_30 = bkg_lone_pairNumber[:70], bkg_lone_pairNumber[70:]
bkg_cluster_lone_pairNumber_70, bkg_cluster_lone_pairNumber_30 = (
    bkg_cluster_lone_pairNumber[:70], bkg_cluster_lone_pairNumber[70:])
bkg_lone_cluster_pairNumber_70, bkg_lone_cluster_pairNumber_30 = (
    bkg_lone_cluster_pairNumber[:70], bkg_lone_cluster_pairNumber[70:])
bkg_cluster_cluster_pairNumber_70, bkg_cluster_cluster_pairNumber_30 = (
    bkg_cluster_cluster_pairNumber[:70], bkg_cluster_cluster_pairNumber[70:])

## Checking the minimum pair_number size of each type of edge (true, bkg_l, bkg_c_l, bkg_l_c, bkg_c_c)

In [163]:
# Smallest per-event pair count of each edge type in the training sample; the sample
# sizes drawn below must not exceed these values.
for trainingCounts in (true_pairNumber_70,
                       bkg_lone_pairNumber_70,
                       bkg_cluster_lone_pairNumber_70,
                       bkg_lone_cluster_pairNumber_70,
                       bkg_cluster_cluster_pairNumber_70):
    print(min(trainingCounts))

45600
926119
41654
45689
3334


### Based on minimum pairNumber, sampling per event: true: 33k, bkg_l: 10k, bkg_c_l: 10k, bkg_l_c: 10k, bkg_c_c: 3k

In [164]:
# Draw a fixed-size random subset of edge indices from every training event, without
# replacement (random.sample), so each edge type becomes a rectangular (70, n) array:
# 33000 true edges and 10000 + 10000 + 10000 + 3000 = 33000 background edges per event.
# The draws use the stdlib `random` stream seeded in the first cell, so the statement
# order below determines the result - do not reorder.
# random.sample raises ValueError if an event has fewer edges than requested; the
# training minima printed above are all larger than these sample sizes.
trainingSetTrueIndices = np.array([random.sample(row, 33000) for row in true_70])
trainingSetBkgLoneLoneIndices = np.array([random.sample(row, 10000) for row in bkg_lone_70])
trainingSetBkgClusterLoneIndices = np.array([random.sample(row, 10000) for row in bkg_cluster_lone_70])
trainingSetBkgLoneClusterIndices = np.array([random.sample(row, 10000) for row in bkg_lone_cluster_70])
# NOTE(review): this name has no "Bkg" prefix, unlike testingSetBkgClusterClusterIndices.
trainingSetClusterClusterIndices = np.array([random.sample(row, 3000) for row in bkg_cluster_cluster_70])

In [165]:
# Concatenate the sampled background index arrays into one (70, 33000) array.
# The block order MUST match the order in which the background labels are concatenated
# into trainingLabelsBkgTotalEdges: lone-lone (0), lone-cluster (3), cluster-lone (2),
# cluster-cluster (4). Previously the lone-lone block was included twice and the
# cluster-lone block was missing, so 20000 edges per event carried the wrong label.
trainingSetBkgTotalIndices = np.concatenate((trainingSetBkgLoneLoneIndices,
                                             trainingSetBkgLoneClusterIndices,
                                             trainingSetBkgClusterLoneIndices,
                                             trainingSetClusterClusterIndices),
                                            axis=1)

In [166]:
print(trainingSetTrueIndices.shape)
print(trainingSetBkgTotalIndices.shape)

(70, 33000)
(70, 33000)


In [167]:
# True edge indices first, then all background edge indices, side by side per event.
trainingSetTotalIndices = np.concatenate([trainingSetTrueIndices, trainingSetBkgTotalIndices],
                                         axis=1)
trainingSetTotalIndices.shape

(70, 66000)

### Creating the five labels (1, 0, 2, 3, 4) for our five situations

In [168]:
# Constant label arrays, one per edge type, shaped like the corresponding sampled
# index arrays (70 events x number of sampled edges).

# Label 1: both cells belong to the same cluster (True)
trainingLabelsTrueEdges = np.full((70, 33000), 1, dtype=int)

# Label 0: both cells are non-participating (Lone-Lone)
trainingLabelsBkgLoneEdges = np.full((70, 10000), 0, dtype=int)

# Label 2: first cell is from a cluster, second cell is non-participating (Cluster-Lone)
trainingLabelsBkgClusterLoneEdges = np.full((70, 10000), 2, dtype=int)

# Label 3: first cell is non-participating, second cell is from a cluster (Lone-Cluster)
trainingLabelsBkgLoneClusterEdges = np.full((70, 10000), 3, dtype=int)

# Label 4: cells from two different clusters (Cluster-Cluster)
trainingLabelsBkgClusterClusterEdges = np.full((70, 3000), 4, dtype=int)

# Background labels side by side, in the order 0, 3, 2, 4. The background index
# blocks have to be concatenated in this same order for labels and edges to line up.
trainingLabelsBkgTotalEdges = np.concatenate(
    [trainingLabelsBkgLoneEdges,
     trainingLabelsBkgLoneClusterEdges,
     trainingLabelsBkgClusterLoneEdges,
     trainingLabelsBkgClusterClusterEdges],
    axis=1)

In [169]:
# One label row per event: the true-edge labels followed by the background labels.
trainingLabelsTotal = np.concatenate([trainingLabelsTrueEdges, trainingLabelsBkgTotalEdges],
                                     axis=1)

In [170]:
trainingLabelsTotal

array([[1, 1, 1, ..., 4, 4, 4],
       [1, 1, 1, ..., 4, 4, 4],
       [1, 1, 1, ..., 4, 4, 4],
       ...,
       [1, 1, 1, ..., 4, 4, 4],
       [1, 1, 1, ..., 4, 4, 4],
       [1, 1, 1, ..., 4, 4, 4]])

In [171]:
trainingLabelsTotal.shape
# Check the shape of the truth array.

(70, 66000)

In [172]:
# One independent random permutation of the column positions per training event
# (the "random indices array"); it is applied to both the sampled edge indices and
# their labels so the two stay aligned.
# A locally seeded NumPy Generator is used because random.seed(42) in the first cell
# only seeds the stdlib `random` module: np.random.shuffle drew from NumPy's unseeded
# global state, so the shuffle (and the label file saved from it) changed on every run.
shuffleRngTraining = np.random.default_rng(42)
numberOfTrainingEvents, numberOfTrainingEdges = trainingLabelsTotal.shape
randomIndexTraining = np.array([shuffleRngTraining.permutation(numberOfTrainingEdges)
                                for _ in range(numberOfTrainingEvents)])

In [173]:
randomIndexTraining.shape

(70, 66000)

In [174]:
# Randomizing training sample: reorder each event's sampled edge indices with that
# event's permutation. Kept as a list here; the next cell converts it to an array.
randomTrainingIndicesTotal = [
    eventIndices[eventOrder]
    for eventIndices, eventOrder in zip(trainingSetTotalIndices, randomIndexTraining)
]

In [175]:
# Randomizing the truth labels with the SAME per-event permutation as the edge indices,
# so position k of a shuffled label row still describes position k of the shuffled indices.
randomTrainingLabelsTotal = np.array([
    eventLabels[eventOrder]
    for eventLabels, eventOrder in zip(trainingLabelsTotal, randomIndexTraining)
])
# The shuffled indices were collected as a list of rows; turn them into a 2-D array too.
randomTrainingIndicesTotal = np.array(randomTrainingIndicesTotal)

In [176]:
randomTrainingLabelsTotal.shape

(70, 66000)

In [177]:
# Persist the shuffled training labels as a single dataset in their own HDF5 file.
with h5py.File('/storage/mxg1065/MultiClassGNN/totalTrainingLabelsRandom.hdf5', 'w') as labelsFile:
    labelsFile.create_dataset("totalTrainingLabelsRandom", data=randomTrainingLabelsTotal)

In [178]:
randomTrainingIndicesTotal.shape
#Prints the shape of the randomized training array.

(70, 66000)

In [179]:
# Load the sorted, unique neighbor pairs (one row per edge, two cell indices per row).
# A `with` block guarantees the file handle is released even if the read fails, and
# key access raises a clear KeyError if the dataset is missing (`.get()` would return
# None and fail with a TypeError on the slice).
with h5py.File("/storage/mxg1065/neighborPairsUniqueSorted.hdf5", 'r') as hf_neighbor_pairs_unique_sorted:
    # [:] copies the dataset into memory so it stays usable after the file is closed.
    neighbor_pairs_unique_sorted = hf_neighbor_pairs_unique_sorted["neighbor_pair"][:]

In [180]:
# Arranging training edges with training indices: look up the (cell, cell) pair behind
# every shuffled edge index. Indexing with the 2-D index array performs the per-event
# lookup for all events at once and yields an (events, sampled edges, 2) array.
totalTrainingEdgesRandom = neighbor_pairs_unique_sorted[randomTrainingIndicesTotal]

In [181]:
totalTrainingEdgesRandom.shape
#70 events, each with 66,000 sampled neighbor pairs, each pair made of two neighboring cell indices

(70, 66000, 2)

In [182]:
# Full (un-sampled) edge list, repeated once per training event.
numberOfTrainingEvents = len(trainingLabelsTotal)
uniDirEdgeIndicesTrain = np.array([neighbor_pairs_unique_sorted
                                   for _ in range(numberOfTrainingEvents)])
print(uniDirEdgeIndicesTrain.shape)

# NOTE(review): this rebinds totalTrainingEdgesRandom to the same full edge list,
# discarding the sampled/shuffled (70, 66000, 2) array built two cells earlier, so the
# shuffled indices are NOT used here. The earlier variant was:
# totalTrainingEdgesRandom.append(neighbor_pairs_unique[total_training_indices_rand[i]])
# Confirm that the downstream "noBD" files are meant to contain every edge.
totalTrainingEdgesRandom = np.array([neighbor_pairs_unique_sorted] * numberOfTrainingEvents)

# 70 events, each with all neighbor pairs, each pair made of two cell indices
print(totalTrainingEdgesRandom.shape)

(70, 1250242, 2)
(70, 1250242, 2)


In [183]:
def createArray(inputData, numberOfData, source, BD):
    """
    Flatten per-event edge pairs into per-event lists of edge endpoints.

    Parameters
    ----------
    inputData : indexable of events, each an iterable of 2-element pairs.
    numberOfData : number of leading events of ``inputData`` to process.
    source : if truthy, emit the endpoints seen from the source side
        (``pair[0]`` first); otherwise from the destination side (``pair[1]`` first).
    BD : if truthy, build a bidirectional list: every pair contributes both of
        its endpoints (so each edge appears in both directions when the source
        and destination lists are read together). Otherwise every pair
        contributes only its first-listed endpoint.

    Returns
    -------
    list of lists, one inner list per processed event.
    """
    # Which column of a pair comes first, and which one follows in the BD case.
    if source:
        leading, trailing = 0, 1
    else:
        leading, trailing = 1, 0

    outputData = []
    for eventNumber in range(numberOfData):
        eventPairs = inputData[eventNumber]
        if BD:
            endpoints = [pair[column] for pair in eventPairs for column in (leading, trailing)]
        else:
            endpoints = [pair[leading] for pair in eventPairs]
        outputData.append(endpoints)
    return outputData

In [184]:
# Bidirectional (BD) source/destination endpoint arrays from the full edge list:
# every pair contributes both endpoints, doubling the row length (2 x number of pairs).
train_edge_source_BD = np.array(createArray(uniDirEdgeIndicesTrain, 70, True, True))
train_edge_dest_BD = np.array(createArray(uniDirEdgeIndicesTrain, 70, False, True))

# Unidirectional (noBD) arrays: one endpoint per pair.
train_edge_source_noBD = np.array(createArray(totalTrainingEdgesRandom, 70, True, False))
train_edge_dest_noBD = np.array(createArray(totalTrainingEdgesRandom, 70, False, False))

# Shapes, in the order: source BD, dest BD, source noBD, dest noBD.
for edgeEndpoints in (train_edge_source_BD, train_edge_dest_BD,
                      train_edge_source_noBD, train_edge_dest_noBD):
    print(edgeEndpoints.shape)

(70, 2500484)
(70, 2500484)
(70, 1250242)
(70, 1250242)


In [185]:
# Save each BD / noBD training endpoint array to its own HDF5 file. The file name is
# the dataset name followed by "_70evs.hdf5".
for datasetName, edgeEndpoints in (
        ("train_edge_source_BD", train_edge_source_BD),
        ("train_edge_dest_BD", train_edge_dest_BD),
        ("train_edge_source_noBD", train_edge_source_noBD),
        ("train_edge_dest_noBD", train_edge_dest_noBD)):
    outputPath = '/storage/mxg1065/MultiClassGNN/' + datasetName + '_70evs.hdf5'
    with h5py.File(outputPath, 'w') as outputFile:
        outputFile.create_dataset(datasetName, data=edgeEndpoints)

# Scaling of features

In [186]:
# Split dynamic_variables into parallel lists of event names and event arrays
# (dictionary insertion order, i.e. data_0 ... data_99; nothing is sorted here).
keys = list(dynamic_variables)
values = [dynamic_variables[eventName] for eventName in keys]

In [187]:
# Re-pair the names with their arrays; the result has the same keys, values and order
# as dynamic_variables.
dictDymanicVariables = {eventName: eventArray for eventName, eventArray in zip(keys, values)}

In [188]:
# Training features: the arrays of the first 70 events stacked along the first axis.
data_70 = np.concatenate(list(dictDymanicVariables.values())[:70])

In [189]:
# Testing features: the arrays of the remaining events (70 onwards) stacked the same way.
data_30 = np.concatenate(list(dynamic_variables.values())[70:])

In [190]:
# Fit the min-max scaler on the TRAINING features only, scale them, and persist the
# fitted scaler so the identical transformation can be applied to the test features.
scaler_filename = "/storage/mxg1065/MultiClassGNN/multi_scaler_neighbor_data_70.save"
scaler = MinMaxScaler().fit(data_70)
cellFeatures_trainS = scaler.transform(data_70)
joblib.dump(scaler, scaler_filename)

['/storage/mxg1065/MultiClassGNN/multi_scaler_neighbor_data_70.save']

In [191]:
# Back to one block per event: (70 events, 187652 cells per event, 8 features).
cellFeatures_trainS_reshape = np.reshape(cellFeatures_trainS, (70, 187652, 8))

In [192]:
# Reload the scaler fitted on the training data and apply it (transform only, no
# re-fitting) to the test features.
savedScalerPath = '/storage/mxg1065/MultiClassGNN/multi_scaler_neighbor_data_70.save'
scaler = joblib.load(savedScalerPath)
cellFeatures_testS = scaler.transform(data_30)

In [193]:
# Back to one block per event: (30 events, 187652 cells per event, 8 features).
cellFeatures_testS_reshape = np.reshape(cellFeatures_testS, (30, 187652, 8))

In [194]:
# creating Scaled Cell Feature file
with h5py.File('/storage/mxg1065/MultiClassGNN/multi_cellFeaturesScaled_train_70evs.hdf5', 'w') as f: 
    dset = f.create_dataset("multi_cellFeatures_trainS", data = cellFeatures_trainS_reshape)

In [195]:
# Write the scaled testing cell features to their own HDF5 file.
with h5py.File('/storage/mxg1065/MultiClassGNN/multi_cellFeaturesScaled_test_30evs.hdf5', 'w') as featuresFile:
    featuresFile.create_dataset("multi_cellFeatures_testS", data=cellFeatures_testS_reshape)

### for test sample

In [196]:
# Same per-event sub-sampling as for the training sample, applied to the test events:
# 33000 true edges and 10000 + 10000 + 10000 + 3000 background edges per event,
# drawn without replacement from the stdlib `random` stream (statement order matters).
# NOTE(review): random.sample raises ValueError if an event has fewer edges than the
# requested size; the minima printed earlier were computed for the training events
# only, so the test events are not verified here - confirm.
testingSetTrueIndices = np.array([random.sample(row, 33000) for row in true_30])
testingSetBkgLoneLoneIndices = np.array([random.sample(row, 10000) for row in bkg_lone_30])
testingSetBkgClusterLoneIndices = np.array([random.sample(row, 10000) for row in bkg_cluster_lone_30])
testingSetBkgLoneClusterIndices = np.array([random.sample(row, 10000) for row in bkg_lone_cluster_30])
testingSetBkgClusterClusterIndices = np.array([random.sample(row, 3000) for row in bkg_cluster_cluster_30])

In [197]:
# Concatenate the sampled background index arrays into one (30, 33000) array.
# Fixes:
#  * the cluster-cluster array is named testingSetBkgClusterClusterIndices; the old
#    reference to testingSetClusterClusterIndices is undefined on a fresh kernel;
#  * the lone-lone block was included twice and the cluster-lone block was missing.
# The block order MUST match the order in which the background labels are concatenated
# into testingLabelsBkgTotalEdges: lone-lone (0), lone-cluster (3), cluster-lone (2),
# cluster-cluster (4).
testingSetBkgTotalIndices = np.concatenate((testingSetBkgLoneLoneIndices,
                                            testingSetBkgLoneClusterIndices,
                                            testingSetBkgClusterLoneIndices,
                                            testingSetBkgClusterClusterIndices),
                                           axis=1)

In [198]:
# True edge indices first, then all background edge indices, side by side per event.
testingSetTotalIndices = np.concatenate([testingSetTrueIndices, testingSetBkgTotalIndices],
                                        axis=1)
testingSetTotalIndices.shape

(30, 66000)

In [199]:
# Constant label arrays, one per edge type, shaped like the corresponding sampled
# index arrays (30 events x number of sampled edges).

# Label 1: both cells belong to the same cluster (True)
testingLabelsTrueEdges = np.full((30, 33000), 1, dtype=int)

# Label 0: both cells are non-participating (Lone-Lone)
testingLabelsBkgLoneEdges = np.full((30, 10000), 0, dtype=int)

# Label 2: first cell is from a cluster, second cell is non-participating (Cluster-Lone)
testingLabelsBkgClusterLoneEdges = np.full((30, 10000), 2, dtype=int)

# Label 3: first cell is non-participating, second cell is from a cluster (Lone-Cluster)
testingLabelsBkgLoneClusterEdges = np.full((30, 10000), 3, dtype=int)

# Label 4: cells from two different clusters (Cluster-Cluster)
testingLabelsBkgClusterClusterEdges = np.full((30, 3000), 4, dtype=int)

# Background labels side by side, in the order 0, 3, 2, 4. The background index
# blocks have to be concatenated in this same order for labels and edges to line up.
testingLabelsBkgTotalEdges = np.concatenate(
    [testingLabelsBkgLoneEdges,
     testingLabelsBkgLoneClusterEdges,
     testingLabelsBkgClusterLoneEdges,
     testingLabelsBkgClusterClusterEdges],
    axis=1)

In [200]:
# One label row per event: the true-edge labels followed by the background labels.
testingLabelsTotal = np.concatenate([testingLabelsTrueEdges, testingLabelsBkgTotalEdges],
                                    axis=1)

In [201]:
testingLabelsTotal.shape

(30, 66000)

In [202]:
# One independent random permutation of the column positions per testing event
# (the "random indices array"); it is applied to both the sampled edge indices and
# their labels so the two stay aligned.
# A locally seeded NumPy Generator is used because random.seed(42) in the first cell
# only seeds the stdlib `random` module: np.random.shuffle drew from NumPy's unseeded
# global state, so the shuffle (and the label file saved from it) changed on every run.
# The seed differs from the training shuffle so the two sets of permutations differ.
shuffleRngTesting = np.random.default_rng(43)
numberOfTestingEvents, numberOfTestingEdges = testingLabelsTotal.shape
randomIndexTesting = np.array([shuffleRngTesting.permutation(numberOfTestingEdges)
                               for _ in range(numberOfTestingEvents)])

In [203]:
# Randomizing testing sample: reorder each event's sampled edge indices with that
# event's permutation. Kept as a list here; the next cell converts it to an array.
randomTestingIndicesTotal = [
    eventIndices[eventOrder]
    for eventIndices, eventOrder in zip(testingSetTotalIndices, randomIndexTesting)
]

In [204]:
# Randomizing the truth labels with the SAME per-event permutation as the edge indices,
# so position k of a shuffled label row still describes position k of the shuffled indices.
randomTestingLabelsTotal = np.array([
    eventLabels[eventOrder]
    for eventLabels, eventOrder in zip(testingLabelsTotal, randomIndexTesting)
])
# The shuffled indices were collected as a list of rows; turn them into a 2-D array too.
randomTestingIndicesTotal = np.array(randomTestingIndicesTotal)

In [205]:
# Persist the shuffled testing labels as a single dataset in their own HDF5 file.
with h5py.File('/storage/mxg1065/MultiClassGNN/totalTestingLabelsRandom.hdf5', 'w') as labelsFile:
    labelsFile.create_dataset("totalTestingLabelsRandom", data=randomTestingLabelsTotal)

In [206]:
# Arranging testing edges with testing indices: look up the (cell, cell) pair behind
# every shuffled edge index. Indexing with the 2-D index array performs the per-event
# lookup for all events at once and yields an (events, sampled edges, 2) array.
totalTestingEdgesRandom = neighbor_pairs_unique_sorted[randomTestingIndicesTotal]

In [207]:
# Full (un-sampled) edge list, repeated once per testing event.
numberOfTestingEvents = len(testingLabelsTotal)
uniDirEdgeIndicesTest = np.array([neighbor_pairs_unique_sorted
                                  for _ in range(numberOfTestingEvents)])
print(uniDirEdgeIndicesTest.shape)

# NOTE(review): this rebinds totalTestingEdgesRandom to the same full edge list,
# discarding the sampled/shuffled array built in the previous cell, so the shuffled
# indices are NOT used here. The earlier variant was:
# totalTestingEdgesRandom.append(neighbor_pairs_unique[total_testing_indices_rand[i]])
# Confirm that the downstream "noBD" files are meant to contain every edge.
totalTestingEdgesRandom = np.array([neighbor_pairs_unique_sorted] * numberOfTestingEvents)

# 30 events, each with all neighbor pairs, each pair made of two cell indices
print(totalTestingEdgesRandom.shape)

(30, 1250242, 2)
(30, 1250242, 2)


In [208]:
# Bidirectional (BD) source/destination endpoint arrays from the full edge list:
# every pair contributes both endpoints, doubling the row length (2 x number of pairs).
test_edge_source_BD = np.array(createArray(uniDirEdgeIndicesTest, 30, True, True))
test_edge_dest_BD = np.array(createArray(uniDirEdgeIndicesTest, 30, False, True))

# Unidirectional (noBD) arrays: one endpoint per pair.
test_edge_source_noBD = np.array(createArray(totalTestingEdgesRandom, 30, True, False))
test_edge_dest_noBD = np.array(createArray(totalTestingEdgesRandom, 30, False, False))

# Shapes, in the order: source BD, dest BD, source noBD, dest noBD.
for edgeEndpoints in (test_edge_source_BD, test_edge_dest_BD,
                      test_edge_source_noBD, test_edge_dest_noBD):
    print(edgeEndpoints.shape)

(30, 2500484)
(30, 2500484)
(30, 1250242)
(30, 1250242)


In [209]:
# Save each BD / noBD testing endpoint array to its own HDF5 file. The file name is
# the dataset name followed by "_30evs.hdf5".
for datasetName, edgeEndpoints in (
        ("test_edge_source_BD", test_edge_source_BD),
        ("test_edge_dest_BD", test_edge_dest_BD),
        ("test_edge_source_noBD", test_edge_source_noBD),
        ("test_edge_dest_noBD", test_edge_dest_noBD)):
    outputPath = '/storage/mxg1065/MultiClassGNN/' + datasetName + '_30evs.hdf5'
    with h5py.File(outputPath, 'w') as outputFile:
        outputFile.create_dataset(datasetName, data=edgeEndpoints)

In [210]:
# Per-class testing edges: for each of the 30 test events, look up the (cell, cell)
# pairs behind the sampled (unshuffled) edge indices of every edge type.
testingEdgesTrue = np.array(
    [neighbor_pairs_unique_sorted[testingSetTrueIndices[event]] for event in range(30)])
testingEdgesBkgLone = np.array(
    [neighbor_pairs_unique_sorted[testingSetBkgLoneLoneIndices[event]] for event in range(30)])
testingEdgesBkgClusterLone = np.array(
    [neighbor_pairs_unique_sorted[testingSetBkgClusterLoneIndices[event]] for event in range(30)])
testingEdgesBkgLoneCluster = np.array(
    [neighbor_pairs_unique_sorted[testingSetBkgLoneClusterIndices[event]] for event in range(30)])
testingEdgesBkgClusterCluster = np.array(
    [neighbor_pairs_unique_sorted[testingSetBkgClusterClusterIndices[event]] for event in range(30)])

In [211]:
# Source/destination endpoint lists for every edge class of the test sample, first
# bidirectional (BD=True), then unidirectional (BD=False). Each comprehension runs
# createArray twice: once from the source side (True) and once from the destination
# side (False), in that order.
endpointSides = (True, False)

# --- bidirectional ---
test_edge_source_true_BD, test_edge_dest_true_BD = [
    createArray(testingEdgesTrue, 30, isSource, True) for isSource in endpointSides]
test_edge_source_bkg_lone_BD, test_edge_dest_bkg_lone_BD = [
    createArray(testingEdgesBkgLone, 30, isSource, True) for isSource in endpointSides]
test_edge_source_bkg_cluster_lone_BD, test_edge_dest_bkg_cluster_lone_BD = [
    createArray(testingEdgesBkgClusterLone, 30, isSource, True) for isSource in endpointSides]
test_edge_source_bkg_lone_cluster_BD, test_edge_dest_bkg_lone_cluster_BD = [
    createArray(testingEdgesBkgLoneCluster, 30, isSource, True) for isSource in endpointSides]
test_edge_source_bkg_cluster_cluster_BD, test_edge_dest_bkg_cluster_cluster_BD = [
    createArray(testingEdgesBkgClusterCluster, 30, isSource, True) for isSource in endpointSides]

# --- unidirectional ---
test_edge_source_true_noBD, test_edge_dest_true_noBD = [
    createArray(testingEdgesTrue, 30, isSource, False) for isSource in endpointSides]
test_edge_source_bkg_lone_noBD, test_edge_dest_bkg_lone_noBD = [
    createArray(testingEdgesBkgLone, 30, isSource, False) for isSource in endpointSides]
test_edge_source_bkg_cluster_lone_noBD, test_edge_dest_bkg_cluster_lone_noBD = [
    createArray(testingEdgesBkgClusterLone, 30, isSource, False) for isSource in endpointSides]
test_edge_source_bkg_lone_cluster_noBD, test_edge_dest_bkg_lone_cluster_noBD = [
    createArray(testingEdgesBkgLoneCluster, 30, isSource, False) for isSource in endpointSides]
test_edge_source_bkg_cluster_cluster_noBD, test_edge_dest_bkg_cluster_cluster_noBD = [
    createArray(testingEdgesBkgClusterCluster, 30, isSource, False) for isSource in endpointSides]

In [212]:
# Save every per-class BD / noBD endpoint list of the test sample to its own HDF5
# file. Each file is named after the dataset it contains (dataset name + ".hdf5"),
# and the files are written in the order listed below.
perClassTestEdgeDatasets = {
    "multi_test_edge_source_true_BD": test_edge_source_true_BD,
    "multi_test_edge_dest_true_BD": test_edge_dest_true_BD,
    "multi_test_edge_source_true_noBD": test_edge_source_true_noBD,
    "multi_test_edge_dest_true_noBD": test_edge_dest_true_noBD,
    "multi_test_edge_source_bkg_lone_BD": test_edge_source_bkg_lone_BD,
    "multi_test_edge_dest_bkg_lone_BD": test_edge_dest_bkg_lone_BD,
    "multi_test_edge_source_bkg_lone_noBD": test_edge_source_bkg_lone_noBD,
    "multi_test_edge_dest_bkg_lone_noBD": test_edge_dest_bkg_lone_noBD,
    "multi_test_edge_source_bkg_cluster_lone_BD": test_edge_source_bkg_cluster_lone_BD,
    "multi_test_edge_dest_bkg_cluster_lone_BD": test_edge_dest_bkg_cluster_lone_BD,
    "multi_test_edge_source_bkg_cluster_lone_noBD": test_edge_source_bkg_cluster_lone_noBD,
    "multi_test_edge_dest_bkg_cluster_lone_noBD": test_edge_dest_bkg_cluster_lone_noBD,
    "multi_test_edge_source_bkg_lone_cluster_BD": test_edge_source_bkg_lone_cluster_BD,
    "multi_test_edge_dest_bkg_lone_cluster_BD": test_edge_dest_bkg_lone_cluster_BD,
    "multi_test_edge_source_bkg_lone_cluster_noBD": test_edge_source_bkg_lone_cluster_noBD,
    "multi_test_edge_dest_bkg_lone_cluster_noBD": test_edge_dest_bkg_lone_cluster_noBD,
    "multi_test_edge_source_bkg_cluster_cluster_BD": test_edge_source_bkg_cluster_cluster_BD,
    "multi_test_edge_dest_bkg_cluster_cluster_BD": test_edge_dest_bkg_cluster_cluster_BD,
    "multi_test_edge_source_bkg_cluster_cluster_noBD": test_edge_source_bkg_cluster_cluster_noBD,
    "multi_test_edge_dest_bkg_cluster_cluster_noBD": test_edge_dest_bkg_cluster_cluster_noBD,
}

for datasetName, edgeEndpoints in perClassTestEdgeDatasets.items():
    outputPath = '/storage/mxg1065/MultiClassGNN/' + datasetName + '.hdf5'
    with h5py.File(outputPath, 'w') as outputFile:
        outputFile.create_dataset(datasetName, data=edgeEndpoints)