In [1]:
import gc
import os
from os import path

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Absolute paths of the three raw input archives (pi0, eta, background),
# resolved against the current working directory.
pi0_data_path, eta_data_path, bck_data_path = map(
    path.abspath,
    ('Data-Raw/pi0.npz', 'Data-Raw/eta.npz', 'Data-Raw/bck.npz'),
)

Read in the raw data and split it into train and test sets.
This is done inside a function so that Python's garbage collector can free the large intermediate arrays as soon as they go out of scope when the function returns.

In [3]:
def load_data(path, test_size):
    """Load one raw ``.npz`` sample and split each of its arrays into train/test parts.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the raw ``.npz`` archive. When the *file name* contains
        ``'bck'`` the ``PartIsPrimary`` entry is not read from the archive;
        an all-``False`` boolean array shaped like ``PartE`` is used instead.
    test_size : float or int
        Passed through unchanged to ``train_test_split``.

    Returns
    -------
    list
        The result of ``train_test_split`` (``random_state=42``). For every
        array the train part sits at the even index listed below and the
        matching test part at that index + 1:

        0 ClusterCellN, 2 CellEnergy, 4 CellTiming, 6 CellModuleNumber,
        8 CellCol, 10 CellRow, 12 ClusterE, 14 ClusterType, 16 ClusterPt,
        18 ClusterM02, 20 ClusterM20, 22 ClusterDistFromVert, 24 PartE,
        26 PartPt, 28 PartEta, 30 PartPhi, 32 PartIsPrimary, 34 PartPID

    Raises
    ------
    KeyError
        If the archive lacks one of the keys that has to be read from it.
    """
    # Order in which the arrays are handed to train_test_split; this fixes the
    # index layout documented above.
    split_order = (
        "ClusterCellN", "CellEnergy", "CellTiming", "CellModuleNumber", "CellCol", "CellRow",
        "ClusterE", "ClusterType", "ClusterPt", "ClusterM02", "ClusterM20", "ClusterDistFromVert",
        "PartE", "PartPt", "PartEta", "PartPhi", "PartIsPrimary", "PartPID",
    )

    # Look at the file name only: the caller passes absolute paths, and a
    # parent directory that happens to contain "bck" must not turn every
    # sample into background.
    # (``os`` is used explicitly because the parameter ``path`` shadows the
    # notebook-level ``from os import path``.)
    is_background = 'bck' in os.path.basename(path)

    # NOTE(security): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code. Only load archives from a trusted source.
    # The context manager closes the underlying zip file once every array has
    # been read into memory.
    with np.load(path, allow_pickle=True) as data:
        print("Start Loading Data")
        arrays = {
            key: data[key]
            for key in split_order
            if not (is_background and key == "PartIsPrimary")
        }

    if is_background:
        arrays["PartIsPrimary"] = np.zeros_like(arrays["PartE"], dtype=bool)

    print("Start Splitting Data")
    return train_test_split(
        *(arrays[key] for key in split_order),
        test_size=test_size, random_state=42)

In [4]:
def split_data(path, name, test_size=0.2):
    """Split one raw sample into train/test parts and save each as a compressed ``.npz``.

    The arrays come from ``load_data(path, test_size)``. Two archives are
    written: ``Data-Split/<name>_train.npz`` and ``Data-Split/<name>_test.npz``.
    Each contains one entry per array key plus ``Size``, the number of elements
    of the respective ``ClusterCellN`` array.

    Parameters
    ----------
    path : str
        Location of the raw ``.npz`` archive, forwarded to ``load_data``.
    name : str
        Sample label used in the progress messages and in the output file names.
    test_size : float or int, default 0.2
        Forwarded to ``load_data``.

    Returns
    -------
    None
    """
    print("------- Start Splitting {} Data -------".format(name))
    data = load_data(path, test_size)

    # Keys in the order load_data passes the arrays to train_test_split:
    # the train part of keys[i] is data[2*i], the test part is data[2*i + 1].
    keys = (
        "ClusterCellN", "CellEnergy", "CellTiming", "CellModuleNumber", "CellCol", "CellRow",
        "ClusterE", "ClusterType", "ClusterPt", "ClusterM02", "ClusterM20", "ClusterDistFromVert",
        "PartE", "PartPt", "PartEta", "PartPhi", "PartIsPrimary", "PartPID",
    )
    train = {key: data[2 * i] for i, key in enumerate(keys)}
    test = {key: data[2 * i + 1] for i, key in enumerate(keys)}

    # np.savez_compressed does not create missing directories.
    # (``os`` is used explicitly because the parameter ``path`` shadows the
    # notebook-level ``from os import path``.)
    os.makedirs('Data-Split', exist_ok=True)

    print("Saving {} Train Data".format(name))
    np.savez_compressed('Data-Split/'+name+'_train',
                        Size=train["ClusterCellN"].size, **train)

    print("Saving {} Test Data".format(name))
    # The test archive used to store the cell-count array under "ClusterN"
    # only, unlike the train archive ("ClusterCellN"). It is now written under
    # the consistent key; "ClusterN" is kept as an alias in case existing
    # readers still look up the old name.
    np.savez_compressed('Data-Split/'+name+'_test',
                        Size=test["ClusterCellN"].size,
                        ClusterN=test["ClusterCellN"], **test)

    print("----- Finished Splitting {} Data ------\n".format(name))

In [5]:
# Split the pi0 sample with the default test fraction (test_size=0.2).
split_data(pi0_data_path, 'pi0')

------- Start Splitting pi0 Data -------
Start Loading Data
Start Splitting Data
Saving pi0 Train Data
Saving pi0 Test Data
----- Finished Splitting pi0 Data ------



In [6]:
# Split the eta sample with the default test fraction (test_size=0.2).
split_data(eta_data_path, 'eta')

------- Start Splitting eta Data -------
Start Loading Data
Start Splitting Data
Saving eta Train Data
Saving eta Test Data
----- Finished Splitting eta Data ------



In [7]:
# Split the background sample with the default test fraction (test_size=0.2).
split_data(bck_data_path, 'bck')

------- Start Splitting bck Data -------
Start Loading Data
Start Splitting Data
Saving bck Train Data
Saving bck Test Data
----- Finished Splitting bck Data ------

