## Required Packages

In [1]:
import numpy as np
import h5py
#Import python packages

In [2]:
from ipynb.fs.full.GeneralFunctions import writeH5File, readH5File, sampleDataTraining, createRandomIndices, randomize2DArray, randomizeEdges, createEdgeArrays, sampleDataTesting, hdf5_to_dict, indices_of_edges, pairs_of_edges, sorting_indices, sortList, create_train_set_test_set, scaling_of_features, reshapeFeatures

In [3]:
def scalingDataPreparation(rangeValue, splitNum, mapping, outputDirectory, suffix):
    total_training_indices, total_truth_training, edgeTestingDictionary, total_testing_indices, total_truth_testing, sorted_cluster_cluster_pairNumber_indices = makeTruthAndData(splitNum, mapping, outputDirectory, suffix)
    makeEdgeFiles(total_training_indices, total_testing_indices, total_truth_training, total_truth_testing, edgeTestingDictionary, outputDirectory, suffix)    
    makeScaledFeatures(rangeValue, splitNum, sorted_cluster_cluster_pairNumber_indices, outputDirectory, suffix)

In [4]:
def makeScaledFeatures(rangeValue, splitNum, sorted_cluster_cluster_pairNumber_indices, outputDirectory, suffix):
    dynamic_variables = makeDynamicVariables(suffix, outputDirectory, rangeValue)
    scaleFeatures(dynamic_variables, splitNum, sorted_cluster_cluster_pairNumber_indices, outputDirectory, suffix)

In [5]:
def makeEdgeFiles(total_training_indices, total_testing_indices, total_truth_training, total_truth_testing, edgeTestingDictionary, outputDirectory, suffix):
    total_training_indices_rand = randomizeData(total_training_indices, total_truth_training, "train", outputDirectory, suffix)
    total_testing_indices_rand = randomizeData(total_testing_indices, total_truth_testing, "test", outputDirectory, suffix)

    neighbor_pairs_unique_sorted= readH5File(outputDirectory+"/neighbor_pairs_unique_sorted"+suffix+".hdf5", "neighbor_pair")

    makeAndWriteEdges(neighbor_pairs_unique_sorted, total_training_indices_rand, "train", "total", outputDirectory, suffix)
    makeAndWriteTestingEdges(neighbor_pairs_unique_sorted, edgeTestingDictionary, total_testing_indices_rand, outputDirectory, suffix) 

In [6]:
def makeAndWriteTestingEdges(neighbor_pairs_unique_sorted, edgeTestingDictionary, total_testing_indices_rand, outputDirectory, suffix):
    edgeType = ["true", "bkg_lone", "bkg_cluster_lone", "bkg_lone_cluster", "bkg_cluster_cluster"]
    for i in range(len(edgeType)):
        makeAndWriteEdges(neighbor_pairs_unique_sorted, edgeTestingDictionary[edgeType[i]], "test", edgeType[i], outputDirectory, suffix)
    makeAndWriteEdges(neighbor_pairs_unique_sorted, total_testing_indices_rand, "test", "total", outputDirectory, suffix)

In [7]:
def scaleFeatures(dynamic_variables, splitNum, sorted_cluster_cluster_pairNumber_indices, outputDirectory, suffix):
    cellFeatures_trainS, cellFeatures_testS = scaling_of_features(dynamic_variables, splitNum, sorted_cluster_cluster_pairNumber_indices, outputDirectory+"/scaler_neighbor_data"+suffix+".save")
    cellFeatures_trainS_reshape = reshapeFeatures(splitNum, cellFeatures_trainS)
    cellFeatures_testS_reshape = reshapeFeatures(len(sorted_cluster_cluster_pairNumber_indices)-splitNum, cellFeatures_testS)

    writeH5File(outputDirectory+'/cellFeaturesScaled_train'+suffix+'.hdf5', "cellFeatures_trainS", cellFeatures_trainS_reshape)
    print("Wrote training scaled features to: "+outputDirectory+"/cellFeaturesScaled_train"+suffix+".hdf5")
    writeH5File(outputDirectory+'/cellFeaturesScaled_test'+suffix+'.hdf5', "cellFeatures_testS", cellFeatures_testS_reshape)
    print("Wrote testing scaled features to: "+outputDirectory+"/cellFeaturesScaled_test"+suffix+".hdf5")

In [8]:
def makeAndWriteEdges(neighbor_pairs_unique_sorted, indices, trainingOrTesting, edgeType, outputDirectory, suffix):
    edges = randomizeEdges(indices, neighbor_pairs_unique_sorted)

    edgeDictionary = createEdgeArrays(edges)

    writeEdges(edgeDictionary, trainingOrTesting, edgeType, outputDirectory, suffix)

In [9]:
def writeEdges(edgeDictionary, trainingOrTesting, edgeType, outputDirectory, suffix):
    data = [edgeDictionary["source_BD"], edgeDictionary["dest_BD"], edgeDictionary["source_noBD"], edgeDictionary["dest_noBD"]]
    sourceOrDest = ["source", "dest", "source", "dest"]
    BDOrnoBD = ["BD", "BD", "noBD", "noBD"]
    for i in range(len(data)):
        writeEdge(data[i], trainingOrTesting, sourceOrDest[i], BDOrnoBD[i], edgeType, outputDirectory, suffix)

In [10]:
def writeEdge(edge, trainingOrTesting, sourceOrDest, BDOrnoBD, edgeType, outputDirectory, suffix):
    writeH5File(outputDirectory+'/'+trainingOrTesting+'_edge_'+sourceOrDest+'_'+edgeType+'_'+BDOrnoBD+suffix+'.hdf5', trainingOrTesting+"_edge_" +sourceOrDest+"_"+BDOrnoBD, edge)
    print("Wrote "+trainingOrTesting+"ing "+sourceOrDest+" "+edgeType+" "+BDOrnoBD+" files to: "+outputDirectory+"/"+trainingOrTesting+"_edge_"+sourceOrDest+"_"+BDOrnoBD+suffix+".hdf5")

In [11]:
def randomizeData(total_indices, total_truth, trainingOrTesting, outputDirectory, suffix):
    rand_index = createRandomIndices(total_indices.shape)

    total_indices_rand = randomize2DArray(rand_index, total_indices)
    total_truth_rand = randomize2DArray(rand_index, total_truth)
    writeH5File(outputDirectory+'/truth_label_'+trainingOrTesting+suffix+'.hdf5', "truth_label_"+trainingOrTesting, total_truth_rand)
    print("Wrote "+trainingOrTesting+ " truth labels to file: "+outputDirectory+"/truth_label_"+trainingOrTesting+suffix+".hdf5")
    return total_indices_rand

In [12]:
def makeTruthAndData(splitNum, mapping, outputDirectory, suffix):
    edgeDictionary, pairNumberDictionary, sorted_cluster_cluster_pairNumber_indices = makePairNumberAndIndices(suffix, outputDirectory, mapping)
    pairNumberDictionaryTraining, pairNumberDictionaryTesting = create_train_set_test_set(splitNum, pairNumberDictionary)
    edgeDictionaryTraining, edgeDictionaryTesting = create_train_set_test_set(splitNum, edgeDictionary)
    
    edgeDictionarySampledTraining, total_training_indices, total_truth_training = makeDataSamples(edgeDictionaryTraining, True)
    edgeDictionarySampledTesting, total_testing_indices, total_truth_testing = makeDataSamples(edgeDictionaryTesting, False)
    return total_training_indices, total_truth_training, edgeDictionarySampledTesting, total_testing_indices, total_truth_testing, sorted_cluster_cluster_pairNumber_indices

In [13]:
def makeDataSamples(dictionary, training):
    sampledDictionary, bkg_total = makeTruthAndBackground(dictionary, training)
    total_indices, total_truth = makeIndicesAndTruth(sampledDictionary["true"], bkg_total)
    return sampledDictionary, total_indices, total_truth

In [14]:
def makePairNumberAndIndices(suffix, outputDirectory, mapping):
    neighbor_truth = readH5File(outputDirectory+"/neighbor_truth"+suffix+".hdf5", "neighbor_truth")
    neighborTruthRangeValue = len(neighbor_truth)
    edgeDictionary = indices_of_edges(neighborTruthRangeValue, mapping, neighbor_truth)
    pairNumberDictionary = pairs_of_edges(neighborTruthRangeValue, edgeDictionary)
    sortedPairNumberDictionary, sorted_cluster_cluster_pairNumber_indices = sorting_indices(pairNumberDictionary["bkg_cluster_cluster"], pairNumberDictionary)
    sortedEdgeDictionary = sortData(edgeDictionary, sorted_cluster_cluster_pairNumber_indices)
    return sortedEdgeDictionary, sortedPairNumberDictionary, sorted_cluster_cluster_pairNumber_indices

In [15]:
def makeTruthAndBackground(dictionary, training):
    if training:
        sampleDictionary = sampleDataTraining(dictionary)
    else:
        sampleDictionary = sampleDataTesting(dictionary)
    bkg_total = np.concatenate((sampleDictionary["bkg_lone"], sampleDictionary["bkg_cluster_lone"], sampleDictionary["bkg_lone_cluster"], sampleDictionary["bkg_cluster_cluster"]), axis =1)
    return sampleDictionary, bkg_total

In [16]:
def sortData(edgeDictionary, sorted_cluster_cluster_pairNumber_indices):
    keys = edgeDictionary.keys()
    sortedEdgeDictionary = {}
    for key in keys:
        sortedEdgeDictionary[key] = sortList(edgeDictionary[key], sorted_cluster_cluster_pairNumber_indices)
    return sortedEdgeDictionary

In [17]:
def makeIndicesAndTruth(true, bkg_total):
    bkg_truth = np.zeros(bkg_total.shape, dtype=int)
    true_truth = np.ones(true.shape, dtype=int)

    total_truth = np.concatenate((true_truth,bkg_truth), axis =1)

    total_indices = np.concatenate((true,bkg_total), axis =1)
    return total_indices, total_truth

In [18]:
def makeDynamicVariables(suffix, outputDirectory, rangeValue):
    data_dict = openDataDict(suffix, outputDirectory)
    dynamic_variables = {}

    # Generate variable names dynamically and assign values to them
    for i in range(rangeValue):
        var_name = f"data_{i}"
        dynamic_variables[var_name] = data_dict[var_name]
    return dynamic_variables

In [19]:
def openDataDict(suffix, outputDirectory):
    with h5py.File(outputDirectory+'/cellFeatures'+suffix+'.hdf5', 'r') as f:
        # Convert HDF5 to dictionary
        data_dict = hdf5_to_dict(f)
    f.close()
    return data_dict

In [20]:
scalingDataPreparation(100, 70, (0, 2, 3, 4), ".", "_100evsAll2D")

Wrote train truth labels to file: ./truth_label_train_100evsAll2D.hdf5
Wrote test truth labels to file: ./truth_label_test_100evsAll2D.hdf5
Wrote training source total BD files to: ./train_edge_source_BD_100evsAll2D.hdf5
Wrote training dest total BD files to: ./train_edge_dest_BD_100evsAll2D.hdf5
Wrote training source total noBD files to: ./train_edge_source_noBD_100evsAll2D.hdf5
Wrote training dest total noBD files to: ./train_edge_dest_noBD_100evsAll2D.hdf5
Wrote testing source true BD files to: ./test_edge_source_BD_100evsAll2D.hdf5
Wrote testing dest true BD files to: ./test_edge_dest_BD_100evsAll2D.hdf5
Wrote testing source true noBD files to: ./test_edge_source_noBD_100evsAll2D.hdf5
Wrote testing dest true noBD files to: ./test_edge_dest_noBD_100evsAll2D.hdf5
Wrote testing source bkg_lone BD files to: ./test_edge_source_BD_100evsAll2D.hdf5
Wrote testing dest bkg_lone BD files to: ./test_edge_dest_BD_100evsAll2D.hdf5
Wrote testing source bkg_lone noBD files to: ./test_edge_source_

## Reading and arranging data 

In [3]:
# cell features of 100 events (100 X 180k)
with h5py.File('./cellFeatures_100evsAll2D.hdf5', 'r') as f:
    # Convert HDF5 to dictionary
    data_dict = hdf5_to_dict(f)
#Calls the methods to open the hdf5 file and save it to a data dictionary.

In [4]:
f.close()
#Closes the hdf5 file.

In [5]:
# storing 100 events as dict data_0, data_1 ... data_599 not sure why I am doing this
dynamic_variables = {}

# Generate variable names dynamically and assign values to them
for i in range(100):
    var_name = f"data_{i}"
    dynamic_variables[var_name] = data_dict[var_name]
#Transfer the data to a dictionary called dynamic_variables. I’m not sure why he is doing this.

In [6]:
# alligning data feature wise for scaling
data_100 = np.concatenate([value for key, value in list(dynamic_variables.items())])
#Concatenates the data dictionary into a single array called data_100.

In [7]:
data_100.shape
#Checks the shape of data_100.

(18765200, 8)

In [8]:
# truth labels for each neighbour edges
neighbor_truth_100evs = readH5File("./neighbor_truth_100evs_all2D.hdf5", "neighbor_truth_100evs")
#Pulls the neighbor truth data from the hdf5 file.

In [9]:
neighbor_truth_100evs[99]
#Checks the shape of the neighbor truth array.

array([1, 1, 1, ..., 0, 0, 0])

## Collecting indices of neighbour edges based on truth (1,9,10,0)

In [10]:
rangeValue = len(neighbor_truth_100evs)
print(neighbor_truth_100evs[rangeValue-1])
true, bkg_lone, bkg_cluster_lone, bkg_lone_cluster, bkg_cluster_cluster = indices_of_edges(rangeValue, neighbor_truth_100evs)

#Creates arrays for true, bkg_lone, bkg_cluster_lone, bkg_cluster_cluster. The first is for pairs where both are part of a cluster. I’m not sure what
#the others are, but I believe they have to do with whether the pairs contain some combo of background cells and cluster cells, or false cells that are
#still part of a cluster. They then populate these arrays with data from neighbor_truth array.

[1 1 1 ... 0 0 0]


## Collecting length of each edge (true, bkg_lone, bkg_cluster_lone, bkg_cluster_cluster)

In [11]:
# checking length is important since its complicated to deal with awkward arrays
# try make sample as n x m matrix (so n: events, m: true/bkg samples)

true_pairNumber, bkg_lone_pairNumber, bkg_cluster_lone_pairNumber, bkg_lone_cluster_pairNumber, bkg_cluster_cluster_pairNumber = pairs_of_edges(rangeValue, true, bkg_lone, bkg_cluster_lone, bkg_lone_cluster, bkg_cluster_cluster)
#Makes arrays that store the length of the number of pairs stored in each of the events in the arrays from the previous cell. It then populates these
#arrays with the appropriate data.

## Sorting all len arrays in Dscennding order of cluster_cluster bkg sample

In [12]:
# Rearranging all edge/pair samples of 100 events based on cluster_cluster sample len
true_pairNumber_sorted, bkg_lone_pairNumber_sorted, bkg_cluster_lone_pairNumber_sorted, bkg_lone_cluster_pairNumber_sorted, bkg_cluster_cluster_pairNumber_sorted, sorted_cluster_cluster_pairNumber_indices = sorting_indices(bkg_cluster_cluster_pairNumber, true_pairNumber, bkg_lone_pairNumber, bkg_cluster_lone_pairNumber, bkg_lone_cluster_pairNumber, bkg_cluster_cluster_pairNumber)
#Sorts all of the length arrays using the indices found in cell 15 and saves them as new sorted arrays.

In [13]:
# Rearranging all edge/pair samples of 100 events based on cluster_cluster sample len
true_sorted = sortList(true, sorted_cluster_cluster_pairNumber_indices)
bkg_lone_sorted = sortList(bkg_lone, sorted_cluster_cluster_pairNumber_indices)
bkg_cluster_lone_sorted = sortList(bkg_cluster_lone, sorted_cluster_cluster_pairNumber_indices)
bkg_lone_cluster_sorted = sortList(bkg_lone_cluster, sorted_cluster_cluster_pairNumber_indices)
bkg_cluster_cluster_sorted = sortList(bkg_cluster_cluster, sorted_cluster_cluster_pairNumber_indices)
#Sorts all of the arrays created in cell 12 using the cell 15 indices and saves them as new sorted arrays.

In [14]:
splitNum = 70
true_pairNumber_sorted_70, bkg_lone_pairNumber_sorted_70, bkg_cluster_lone_pairNumber_sorted_70, bkg_lone_cluster_pairNumber_sorted_70, bkg_cluster_cluster_pairNumber_sorted_70, true_pairNumber_sorted_30, bkg_lone_pairNumber_sorted_30, bkg_cluster_lone_pairNumber_sorted_30, bkg_lone_cluster_pairNumber_sorted_30, bkg_cluster_cluster_pairNumber_sorted_30 = create_train_set_test_set(splitNum, true_pairNumber, bkg_lone_pairNumber, bkg_cluster_lone_pairNumber, bkg_lone_cluster_pairNumber, bkg_cluster_cluster_pairNumber)  
#Creates new arrays that represent training and testing length datasets made from the arrays made in cell 16. The training data is 70% of the data,
#and the testing set is 30% of the data.

In [15]:
true_sorted_70, bkg_lone_sorted_70, bkg_cluster_lone_sorted_70, bkg_lone_cluster_sorted_70, bkg_cluster_cluster_sorted_70, true_sorted_30, bkg_lone_sorted_30, bkg_cluster_lone_sorted_30, bkg_lone_cluster_sorted_30, bkg_cluster_cluster_sorted_30 = create_train_set_test_set(splitNum, true_sorted, bkg_lone_sorted, bkg_cluster_lone_sorted, bkg_lone_cluster_sorted, bkg_cluster_cluster_sorted) 
#Creates new arrays that represent training and testing datasets made from the arrays made in cell 17. The training data is 70% of the data, and the
#testing set is 30% of the data.

In [16]:
len(bkg_cluster_cluster_sorted_70)
#Checks the length of one of the training datasets.

70

### Checking the minimum length of each type of edges (true, bkg_l, bkg_c_l, bkg_c_c)

In [17]:
min(true_pairNumber_sorted_70)

28118

In [18]:
min(bkg_lone_pairNumber_sorted_70)

554288

In [19]:
min(bkg_cluster_lone_pairNumber_sorted_70)

24068

In [20]:
min(bkg_cluster_cluster_pairNumber_sorted_70)
#Prints minimum lengths from the length training datasets.

1983

In [21]:
min([len(row) for row in bkg_cluster_cluster_sorted_70])

5688

### Based on minimum lengths concluding true: 15k, bkg_l: 6_5k, bkg_c_l: 6_5k, bkg_c_c: 2k

In [22]:
true_training, bkg_lone_training, bkg_cluster_lone_training, bkg_lone_cluster_training, bkg_cluster_cluster_training = sampleDataTraining(true_sorted_70, bkg_lone_sorted_70, bkg_cluster_lone_sorted_70, bkg_lone_cluster_sorted_70, bkg_cluster_cluster_sorted_70)
#I’m not sure what this is doing. I believe that it is pulling a random sampling of data in each training sample so that there will be an equal amount
#of signal and background, and then saving these samples into new arrays.

In [23]:
true_training.shape
#Prints the shape of the sampled true array.

(70, 22400)

In [24]:
bkg_lone_training.shape

(70, 5600)

In [25]:
bkg_cluster_cluster_training.shape
#Prints the shape of two of the sampled background arrays.

(70, 5600)

In [26]:
bkg_total_training = np.concatenate((bkg_lone_training, bkg_cluster_lone_training, bkg_lone_cluster_training, bkg_cluster_cluster_training), axis =1)
#Concatenates all of the sampled background arrays into one sampled background array.

In [27]:
bkg_total_training.shape
#Checks the shape of the sampled background array.

(70, 22400)

### Labelling for true/signal and fake/bkg as 1, 0

In [28]:
bkg_truth_training = np.zeros(bkg_total_training.shape, dtype=int)
true_truth_training = np.ones(true_training.shape, dtype=int)
#Creates an array of zeros in the shape of the sampled background array. Creates an array of ones in the shape of the sampled true array. These are to
#create arrays to show the truth value of the sampled arrays. I am going to call these truth arrays.

In [29]:
total_truth_training = np.concatenate((true_truth_training,bkg_truth_training), axis =1)
#Concatenate the truth arrays together to create one truth array.

In [30]:
total_truth_training.shape
#Check the shape of the truth array.

(70, 44800)

In [31]:
total_training_indices = np.concatenate((true_training ,bkg_total_training), axis =1)
#Concatenate the sampled signal and background arrays into one array that I shall refer to as the training array.

In [32]:
total_training_indices.shape
#Check the shape of the training array.

(70, 44800)

In [33]:
rand_index_training = createRandomIndices(total_training_indices.shape)
#Creates an array that releases a random sequence of indices. I’m going to call this the random indices array.

In [34]:
rand_index_training[0]
#Prints the first element of the random indices array

array([22625,  8118, 33526, ..., 38044, 14738, 36545])

In [35]:
# Randomizing training sample
total_training_indices_rand = randomize2DArray(rand_index_training, total_training_indices)
#Uses the random indices array to randomize the training array and saves it into a randomized training array.

In [36]:
# Randomizing truth of training sample according to the sample
total_truth_rand_training = randomize2DArray(rand_index_training, total_truth_training)
#Uses the random indices array to randomize the truth array and saves it into a randomized truth array.

In [37]:
total_truth_rand_training[0]
#Prints the first element of the randomized truth array.

array([0, 1, 0, ..., 0, 1, 0])

In [38]:
writeH5File('./truth_label_train_70evs_all2D.hdf5', "truth_label_train", total_truth_rand_training)
#Saves the randomized truth array to an hdf5 file.

In [39]:
total_training_indices_rand.shape
#Prints the shape of the randomized training array.

(70, 44800)

In [40]:
neighbor_pairs_unique_sorted= readH5File("./neighbor_pairs_unique_sorted_all2D.hdf5", "neighbor_pair")
#Retrieves the sorted neighbor pairs from an hdf5 file.

In [41]:
neighbor_pairs_unique_sorted.shape

(741512, 2)

In [42]:
# Arranging training edges with training indices
total_training_edges_rand = randomizeEdges(total_training_indices_rand, neighbor_pairs_unique_sorted) 
#Creates a training edges array by using the randomized training indices on the sorted neighbor pairs array.

In [43]:
total_training_edges_rand.shape
#Print the shape of the training edges array.

(70, 44800, 2)

In [44]:
train_edge_source_BD, train_edge_dest_BD, train_edge_source_noBD, train_edge_dest_noBD = createEdgeArrays(total_training_edges_rand)
#Creates an array called train_edge_source_BD out of the randomized training edges array. I’m not sure what this is, to be honest.

In [45]:
train_edge_source_BD.shape
#Prints the shape of the train_edge_source_BD array.

(70, 89600)

In [46]:
train_edge_dest_BD.shape
#Prints the shape of the train_edge_dest_BD array.

(70, 89600)

In [47]:
train_edge_source_noBD.shape
#Prints the shape of the train_edge_source_noBD array.

(70, 44800)

In [48]:
train_edge_dest_noBD.shape
#Prints the shape of the train_edge_dest_noBD array.

(70, 44800)

In [49]:
writeH5File('./train_edge_source_BD_70evs_all2D.hdf5', "train_edge_source_BD", train_edge_source_BD)

writeH5File('./train_edge_dest_BD_70evs_all2D.hdf5', "train_edge_dest_BD", train_edge_dest_BD)
     
writeH5File('./train_edge_source_noBD_70evs_all2D.hdf5', "train_edge_source_noBD", train_edge_source_noBD)

writeH5File('./train_edge_dest_noBD_70evs_all2D.hdf5', "train_edge_dest_noBD", train_edge_dest_noBD)
#Saves the BD and noBD arrays into hdf5 files.

# Scaling of features

In [50]:
cellFeatures_trainS, cellFeatures_testS = scaling_of_features(dynamic_variables, splitNum, sorted_cluster_cluster_pairNumber_indices, "./scaler_neighbor_data_70_sorted_all2D.save")

In [51]:
cellFeatures_trainS.shape

(13135640, 8)

In [52]:
cellFeatures_trainS_reshape = reshapeFeatures(splitNum, cellFeatures_trainS)
#Reshape the cell feature training array.

In [53]:
cellFeatures_testS_reshape = reshapeFeatures(len(neighbor_truth_100evs)-splitNum, cellFeatures_testS)
#Reshape the cell feature testing array.

In [54]:
# creating Scaled Cell Feature file
writeH5File('./cellFeaturesScaled_train_70evs_all2D.hdf5', "cellFeatures_trainS", cellFeatures_trainS_reshape)

In [55]:
writeH5File('./cellFeaturesScaled_test_30evs_all2D.hdf5', "cellFeatures_testS", cellFeatures_testS_reshape)
#Save the cell features arrays into hdf5 files.

### for test sample

In [56]:
min(true_pairNumber_sorted_30)

31367

In [57]:
min(bkg_lone_pairNumber_sorted_30)

540266

In [58]:
min(bkg_cluster_lone_pairNumber_sorted_30)

25427

In [59]:
min(bkg_cluster_cluster_pairNumber_sorted_30)
#Print the minimum of the length test data arrays

2785

In [60]:
min([len(row) for row in bkg_cluster_cluster_sorted_30])

1983

In [61]:
true_testing_sample, bkg_lone_testing_sample, bkg_cluster_lone_testing_sample, bkg_lone_cluster_testing_sample, bkg_cluster_cluster_testing_sample = sampleDataTesting(true_sorted_30, bkg_lone_sorted_30, bkg_cluster_lone_sorted_30, bkg_lone_cluster_sorted_30, bkg_cluster_cluster_sorted_30)
#Create a random sampling out of the length test arrays and save them to new arrays.

In [62]:
bkg_total_testing = np.concatenate((bkg_lone_testing_sample, bkg_cluster_lone_testing_sample, bkg_lone_cluster_testing_sample, bkg_cluster_cluster_testing_sample), axis = 1) #(30, 1777000)

In [63]:
bkg_total_testing.shape

(30, 655800)

In [64]:
true_testing_sample.shape

(30, 28100)

In [65]:
bkg_truth_testing = np.zeros(bkg_total_testing.shape) #(30, 1777000)
true_truth_testing = np.ones(true_testing_sample.shape) #(30, 45000)

In [66]:
total_truth_testing = np.concatenate((true_truth_testing, bkg_truth_testing), axis = 1) #(30, 222000)

In [67]:
total_truth_testing.shape

(30, 683900)

In [68]:
total_testing_indices = np.concatenate((true_testing_sample, bkg_total_testing), axis=1) #(30, 222000)

In [69]:
total_testing_indices.shape

(30, 683900)

In [70]:
rand_index_testing = createRandomIndices(total_testing_indices.shape)

In [71]:
rand_index_testing.shape

(30, 683900)

In [72]:
rand_index_testing[0]

array([376534, 586909,  86837, ..., 605768,  75159, 669115])

In [73]:
total_testing_indices_rand = randomize2DArray(rand_index_testing, total_testing_indices)

In [74]:
total_truth_rand_testing = randomize2DArray(rand_index_testing, total_truth_testing)

In [75]:
total_truth_rand_testing.shape

(30, 683900)

In [76]:
total_truth_rand_testing[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [77]:
writeH5File('./truth_label_test_30evs_all2D.hdf5', "truth_label_test", total_truth_rand_testing)

In [92]:
true_test_edges = randomizeEdges(true_testing_sample, neighbor_pairs_unique_sorted)
bkg_lone_test_edges = randomizeEdges(bkg_lone_testing_sample, neighbor_pairs_unique_sorted)
bkg_cluster_lone_test_edges = randomizeEdges(bkg_cluster_lone_testing_sample, neighbor_pairs_unique_sorted)
bkg_lone_cluster_test_edges = randomizeEdges(bkg_lone_cluster_testing_sample, neighbor_pairs_unique_sorted)
bkg_cluster_cluster_test_edges = randomizeEdges(bkg_cluster_cluster_testing_sample, neighbor_pairs_unique_sorted)
#Create a test edges arrays by putting the sampled length indices into the sorted neighbor arrays.

In [79]:
true_test_edges.shape
#Print the shape of the true test edges array.

(30, 28100, 2)

In [80]:
total_testing_edges_rand = randomizeEdges(total_testing_indices_rand, neighbor_pairs_unique_sorted)

In [81]:
total_testing_edges_rand.shape

(30, 683900, 2)

In [82]:
test_edge_source_BD, test_edge_dest_BD, test_edge_source_noBD, test_edge_dest_noBD = createEdgeArrays(total_testing_edges_rand)

In [83]:
test_edge_source_BD.shape

(30, 1367800)

In [84]:
test_edge_dest_BD.shape

(30, 1367800)

In [85]:
test_edge_source_noBD.shape

(30, 683900)

In [86]:
test_edge_dest_noBD.shape

(30, 683900)

In [87]:
writeH5File('./test_edge_source_BD_30evs_all2D.hdf5', "test_edge_source_BD", test_edge_source_BD)

writeH5File('./test_edge_dest_BD_30evs_all2D.hdf5', "test_edge_dest_BD", test_edge_dest_BD)

writeH5File('./test_edge_source_noBD_30evs_all2D.hdf5', "test_edge_source_noBD", test_edge_source_noBD)

writeH5File('./test_edge_dest_noBD_30evs_all2D.hdf5', "test_edge_dest_noBD", test_edge_dest_noBD)

In [88]:
test_edge_source_true_BD, test_edge_dest_true_BD, test_edge_source_true_noBD, test_edge_dest_true_noBD = createEdgeArrays(true_test_edges)

In [89]:
test_edge_source_bkg_lone_BD, test_edge_dest_bkg_lone_BD, test_edge_source_bkg_lone_noBD, test_edge_dest_bkg_lone_noBD = createEdgeArrays(bkg_lone_test_edges)

In [90]:
test_edge_source_bkg_cluster_lone_BD, test_edge_dest_bkg_cluster_lone_BD, test_edge_source_bkg_cluster_lone_noBD, test_edge_dest_bkg_cluster_lone_noBD = createEdgeArrays(bkg_cluster_lone_test_edges)

In [93]:
test_edge_source_bkg_lone_cluster_BD, test_edge_dest_bkg_lone_cluster_BD, test_edge_source_bkg_lone_cluster_noBD, test_edge_dest_bkg_lone_cluster_noBD = createEdgeArrays(bkg_lone_cluster_test_edges)

In [94]:
test_edge_source_bkg_cluster_cluster_BD, test_edge_dest_bkg_cluster_cluster_BD, test_edge_source_bkg_cluster_cluster_noBD, test_edge_dest_bkg_cluster_cluster_noBD = createEdgeArrays(bkg_cluster_cluster_test_edges)

In [95]:
writeH5File('./test_edge_source_true_BD_all2D.hdf5', "test_edge_source_true_BD", test_edge_source_true_BD)

writeH5File('./test_edge_dest_true_BD_all2D.hdf5', "test_edge_dest_true_BD", test_edge_dest_true_BD)
    
writeH5File('./test_edge_source_true_noBD_all2D.hdf5', "test_edge_source_true_noBD", test_edge_source_true_noBD)

writeH5File('./test_edge_dest_true_noBD_all2D.hdf5', "test_edge_dest_true_noBD", test_edge_dest_true_noBD)

In [96]:
writeH5File('./test_edge_source_bkg_lone_BD_all2D.hdf5', "test_edge_source_bkg_lone_BD", test_edge_source_bkg_lone_BD)

writeH5File('./test_edge_dest_bkg_lone_BD_all2D.hdf5', "test_edge_dest_bkg_lone_BD", test_edge_dest_bkg_lone_BD)
    
writeH5File('./test_edge_source_bkg_lone_noBD_all2D.hdf5', "test_edge_source_bkg_lone_noBD", test_edge_source_bkg_lone_noBD)

writeH5File('./test_edge_dest_bkg_lone_noBD_all2D.hdf5', "test_edge_dest_bkg_lone_noBD", test_edge_dest_bkg_lone_noBD)

In [97]:
writeH5File('./test_edge_source_bkg_cluster_lone_BD_all2D.hdf5', "test_edge_source_bkg_cluster_lone_BD", test_edge_source_bkg_cluster_lone_BD)

writeH5File('./test_edge_dest_bkg_cluster_lone_BD_all2D.hdf5', "test_edge_dest_bkg_cluster_lone_BD", test_edge_dest_bkg_cluster_lone_BD)
    
writeH5File('./test_edge_source_bkg_cluster_lone_noBD_all2D.hdf5', "test_edge_source_bkg_cluster_lone_noBD", test_edge_source_bkg_cluster_lone_noBD)

writeH5File('./test_edge_dest_bkg_cluster_lone_noBD_all2D.hdf5', "test_edge_dest_bkg_cluster_lone_noBD", test_edge_dest_bkg_cluster_lone_noBD)

In [98]:
writeH5File('./test_edge_source_bkg_lone_cluster_BD_all2D.hdf5', "test_edge_source_bkg_lone_cluster_BD", test_edge_source_bkg_lone_cluster_BD)

writeH5File('./test_edge_dest_bkg_lone_cluster_BD_all2D.hdf5', "test_edge_dest_bkg_lone_cluster_BD", test_edge_dest_bkg_lone_cluster_BD)
    
writeH5File('./test_edge_source_bkg_lone_cluster_noBD_all2D.hdf5', "test_edge_source_bkg_lone_cluster_noBD", test_edge_source_bkg_lone_cluster_noBD)

writeH5File('./test_edge_dest_bkg_lone_cluster_noBD_all2D.hdf5', "test_edge_dest_bkg_lone_cluster_noBD", test_edge_dest_bkg_lone_cluster_noBD)

In [99]:
writeH5File('./test_edge_source_bkg_cluster_cluster_BD_all2D.hdf5', "test_edge_source_bkg_cluster_cluster_BD", test_edge_source_bkg_cluster_cluster_BD)

writeH5File('./test_edge_dest_bkg_cluster_cluster_BD_all2D.hdf5', "test_edge_dest_bkg_cluster_cluster_BD", test_edge_dest_bkg_cluster_cluster_BD)
    
writeH5File('./test_edge_source_bkg_cluster_cluster_noBD_all2D.hdf5', "test_edge_source_bkg_cluster_cluster_noBD", test_edge_source_bkg_cluster_cluster_noBD)

writeH5File('./test_edge_dest_bkg_cluster_cluster_noBD_all2D.hdf5', "test_edge_dest_bkg_cluster_cluster_noBD", test_edge_dest_bkg_cluster_cluster_noBD)
#Saves these BD and noBD arrays into hdf5 files.