## Notebook Merge_Datasets
This notebook takes the pi0, eta and bck data, which were previously split into individual train/test sets, and merges them into a single train dataset and a single test dataset.

In [1]:
import numpy as np

In [2]:
def load_into(arr, keyword, dataset, ClusN_max):
    """Fill ``arr`` in place with the ``keyword`` entry of the pi0, eta and bck files.

    The files ``Data-Split/pi0_<dataset>.npz``, ``Data-Split/eta_<dataset>.npz``
    and ``Data-Split/bck_<dataset>.npz`` are read. The pi0 rows are written to
    the start of ``arr``, the eta rows directly behind them and the bck rows to
    the end of ``arr``.

    For a 2-D ``arr`` every part is padded on the right of axis 1 up to
    ``ClusN_max`` columns: with NaN when ``arr`` is float32, with the value 100
    for any other dtype. 1-D entries are copied unchanged.

    Parameters
    ----------
    arr : numpy.ndarray
        Pre-allocated destination (1-D or 2-D); modified in place.
    keyword : str
        Name of the array to read from each ``.npz`` file.
    dataset : str
        Either ``'train'`` or ``'test'``.
    ClusN_max : int
        Number of columns every 2-D part is padded to.

    Raises
    ------
    ValueError
        If ``dataset`` is not ``'train'``/``'test'``, if a part has more than
        ``ClusN_max`` columns, or if a part does not fit into ``arr``.
    """
    print("Currently merging {}".format(keyword))

    if dataset not in ('train', 'test'):
        raise ValueError(
            "Give valid dataset type! Expected 'train' or 'test', got {!r}".format(dataset))

    # Work with a plain Python int: callers may hand in a small unsigned NumPy
    # scalar, and subtracting in that type could wrap around instead of going negative.
    n_columns = int(ClusN_max)
    # Padding marker: NaN for float32 targets, 100 for every other dtype.
    fill_value = np.nan if arr.dtype == np.float32 else 100

    total_rows = arr.shape[0]
    offset = 0
    for species in ('pi0', 'eta', 'bck'):
        path = 'Data-Split/{}_{}.npz'.format(species, dataset)
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code - only use this on trusted files.
        with np.load(path, allow_pickle=True) as data:
            n_rows = int(data['Size'].item())
            values = data[keyword]

        if arr.ndim == 2:
            missing = n_columns - values.shape[1]
            if missing < 0:
                raise ValueError(
                    "{!r} in {} has {} columns, more than ClusN_max={}".format(
                        keyword, path, values.shape[1], n_columns))
            values = np.pad(values, ((0, 0), (0, missing)),
                            'constant', constant_values=fill_value)

        # pi0 and eta are stacked from the top; bck is anchored to the end of
        # arr. An explicit start index avoids arr[-0:], which is the whole array.
        start = total_rows - n_rows if species == 'bck' else offset
        if start < 0 or start + n_rows > total_rows:
            raise ValueError(
                "{} rows from {} do not fit into an array with {} rows".format(
                    n_rows, path, total_rows))
        arr[start:start + n_rows] = values
        offset += n_rows

In [3]:
def merge_train(size_train, ClusN_max):
    """Merge the train splits into one dataset and store it with its value ranges.

    Allocates one array per entry, fills each of them through ``load_into``
    (dataset ``'train'``), computes the NaN-ignoring maximum and minimum of the
    floating-point entries for normalization in a later stage, and writes two
    compressed archives: ``../CNN/Data/data_train`` and
    ``../CNN/Data/normalization``.

    Parameters
    ----------
    size_train : int
        Number of rows of every merged array.
    ClusN_max : int
        Number of columns of the 2-D (per-cell) arrays.
    """
    per_cluster = size_train
    per_cell = (size_train, ClusN_max)

    # (keyword, shape, dtype) - listed in the order in which the entries are loaded
    layout = (
        ('ClusterN', per_cluster, np.ubyte),
        ('Cluster', per_cell, np.float32),
        ('ClusterTiming', per_cell, np.float32),
        ('ClusterType', per_cluster, np.ubyte),
        ('ClusterE', per_cluster, np.float32),
        ('ClusterPt', per_cluster, np.float32),
        ('ClusterModuleNumber', per_cell, np.ubyte),
        ('ClusterRow', per_cell, np.ubyte),
        ('ClusterCol', per_cell, np.ubyte),
        ('ClusterM20', per_cluster, np.float32),
        ('ClusterM02', per_cluster, np.float32),
        ('ClusterDistFromVert', per_cluster, np.float32),
        ('PartE', per_cluster, np.float32),
        ('PartPt', per_cluster, np.float32),
        ('PartEta', per_cluster, np.float32),
        ('PartPhi', per_cluster, np.float32),
        ('PartIsPrimary', per_cluster, bool),
        ('PartPID', per_cluster, np.short),
    )

    # allocate every target first, then fill them one after another
    merged = {key: np.zeros(shape, dtype=dtype) for key, shape, dtype in layout}
    for key, _shape, _dtype in layout:
        load_into(merged[key], key, 'train', ClusN_max)

    # (suffix of the stored name, entry it is computed from)
    ranged = (
        ('CellEnergy', 'Cluster'),
        ('CellTiming', 'ClusterTiming'),
        ('ClusterE', 'ClusterE'),
        ('ClusterPt', 'ClusterPt'),
        ('ClusterM20', 'ClusterM20'),
        ('ClusterM02', 'ClusterM02'),
        ('ClusterDistFromVert', 'ClusterDistFromVert'),
        ('PartE', 'PartE'),
        ('PartPt', 'PartPt'),
        ('PartEta', 'PartEta'),
        ('PartPhi', 'PartPhi'),
    )

    # all maxima first, then all minima; NaN padding is ignored by nanmax/nanmin
    normalization = {}
    for prefix, reduce in (('max', np.nanmax), ('min', np.nanmin)):
        for suffix, key in ranged:
            normalization[prefix + suffix] = reduce(merged[key])

    # order in which the entries are written to the archive
    stored = (
        'ClusterN', 'Cluster', 'ClusterTiming', 'ClusterE', 'ClusterPt',
        'ClusterModuleNumber', 'ClusterType', 'ClusterCol', 'ClusterRow',
        'ClusterM02', 'ClusterM20', 'ClusterDistFromVert',
        'PartE', 'PartPt', 'PartEta', 'PartPhi', 'PartIsPrimary', 'PartPID',
    )
    payload = {'Size': merged['ClusterN'].size}
    for key in stored:
        payload[key] = merged[key]

    print("Saving train-dataset")
    np.savez_compressed('../CNN/Data/data_train', **payload)

    print("Saving normalization data")
    np.savez_compressed('../CNN/Data/normalization', **normalization)

In [4]:
def merge_test(size_test, ClusN_max):
    """Merge the test splits into one dataset and store it.

    Allocates one array per entry, fills each of them through ``load_into``
    (dataset ``'test'``) and writes the result to the compressed archive
    ``../CNN/Data/data_test``.

    Parameters
    ----------
    size_test : int
        Number of rows of every merged array.
    ClusN_max : int
        Number of columns of the 2-D (per-cell) arrays.
    """
    flat = size_test
    padded = (size_test, ClusN_max)

    # (keyword, shape, dtype) - listed in the order in which the entries are loaded
    entries = [
        ('ClusterN', flat, np.ubyte),
        ('Cluster', padded, np.float32),
        ('ClusterTiming', padded, np.float32),
        ('ClusterType', flat, np.ubyte),
        ('ClusterE', flat, np.float32),
        ('ClusterPt', flat, np.float32),
        ('ClusterModuleNumber', padded, np.ubyte),
        ('ClusterCol', padded, np.ubyte),
        ('ClusterRow', padded, np.ubyte),
        ('ClusterM20', flat, np.float32),
        ('ClusterM02', flat, np.float32),
        ('ClusterDistFromVert', flat, np.float32),
        ('PartE', flat, np.float32),
        ('PartPt', flat, np.float32),
        ('PartEta', flat, np.float32),
        ('PartPhi', flat, np.float32),
        ('PartIsPrimary', flat, bool),
        ('PartPID', flat, np.short),
    ]

    # allocate every target first, then fill them one after another
    buffers = {}
    for name, shape, dtype in entries:
        buffers[name] = np.zeros(shape, dtype=dtype)
    for name, _shape, _dtype in entries:
        load_into(buffers[name], name, 'test', ClusN_max)

    # order in which the entries are written to the archive
    archive_order = [
        'ClusterN', 'Cluster', 'ClusterTiming', 'ClusterE', 'ClusterPt',
        'ClusterModuleNumber', 'ClusterType', 'ClusterCol', 'ClusterRow',
        'ClusterM02', 'ClusterM20', 'ClusterDistFromVert',
        'PartE', 'PartPt', 'PartEta', 'PartPhi', 'PartIsPrimary', 'PartPID',
    ]
    archive = {'Size': buffers['ClusterN'].size}
    archive.update((name, buffers[name]) for name in archive_order)

    print("Saving test-dataset")
    np.savez_compressed('../CNN/Data/data_test', **archive)

In [5]:
def _split_size_and_max_cells(dataset):
    """Return (summed 'Size', largest 'ClusterN') over the pi0, eta and bck files of ``dataset``."""
    total_size = 0
    largest = 0
    for species in ('pi0', 'eta', 'bck'):
        path = 'Data-Split/{}_{}.npz'.format(species, dataset)
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code - only use this on trusted files.
        with np.load(path, allow_pickle=True) as data:
            total_size += int(data['Size'].item())
            cluster_n = data['ClusterN']
        # np.max raises on an empty array, so an empty part is simply skipped
        if cluster_n.size:
            largest = max(largest, int(np.max(cluster_n)))
    return total_size, largest


def get_SizeAndCellN():
    """Determine the size of the merged train/test datasets and the largest cluster.

    The dataset sizes are needed to set up the merged arrays; the largest
    ``ClusterN`` value found in any of the six split files is the width the
    per-cell arrays are padded to.

    Prints both dataset sizes.

    Returns
    -------
    Size_train : int
        Sum of the ``Size`` entries of the three train files.
    Size_test : int
        Sum of the ``Size`` entries of the three test files.
    ClusN_max : int
        Largest ``ClusterN`` value over all train and test files (0 if every
        file is empty). Returned as a plain Python int so that later integer
        arithmetic with it is not carried out in a small unsigned NumPy type.
    """
    Size_train, ClusN_train_max = _split_size_and_max_cells('train')
    Size_test, ClusN_test_max = _split_size_and_max_cells('test')

    print("Size of train dataset: {}".format(Size_train))
    print("Size of test dataset: {}\n".format(Size_test))

    ClusN_max = max(ClusN_train_max, ClusN_test_max)

    return Size_train, Size_test, ClusN_max

In [6]:
def merge_all():
    """Run the complete merge: determine the sizes, then merge the train set and the test set.

    The dataset sizes and the maximum cluster size come from
    ``get_SizeAndCellN``; ``merge_train`` and ``merge_test`` are then called in
    that order, each framed by a start and a finish banner.
    """
    n_train, n_test, max_cells = get_SizeAndCellN()

    # (start banner, merge function, number of rows, finish banner)
    stages = (
        ("-------- Start Merging Trainset -------", merge_train, n_train,
         "------ Finished Merging Trainset ------\n"),
        ("-------- Start Merging Testset --------", merge_test, n_test,
         "------ Finished Merging Testset -------\n"),
    )
    for opening, merge, n_rows, closing in stages:
        print(opening)
        merge(n_rows, max_cells)
        print(closing)

    print("Merging finished!\n")

In [7]:
# Run the full merge; this writes the train dataset, the normalization data and the test dataset.
merge_all()

Size of train dataset: 5508407
Size of test dataset: 1377103

-------- Start Merging Trainset -------
Currently merging ClusterN
Currently merging Cluster
Currently merging ClusterTiming
Currently merging ClusterType
Currently merging ClusterE
Currently merging ClusterPt
Currently merging ClusterModuleNumber
Currently merging ClusterRow
Currently merging ClusterCol
Currently merging ClusterM20
Currently merging ClusterM02
Currently merging ClusterDistFromVert
Currently merging PartE
Currently merging PartPt
Currently merging PartEta
Currently merging PartPhi
Currently merging PartIsPrimary
Currently merging PartPID
Saving train-dataset
Saving normalization data
------ Finished Merging Trainset ------

-------- Start Merging Testset --------
Currently merging ClusterN
Currently merging Cluster
Currently merging ClusterTiming
Currently merging ClusterType
Currently merging ClusterE
Currently merging ClusterPt
Currently merging ClusterModuleNumber
Currently merging ClusterCol
Currently me