In [1]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Raw input archives, one per sample; all of them live in the same folder.
RAW_DATA_DIR = 'Data-Raw'
pi0_data_path = RAW_DATA_DIR + '/pi0.npz'
eta_data_path = RAW_DATA_DIR + '/eta.npz'
bck_data_path = RAW_DATA_DIR + '/bck.npz'

Read in the data and split the raw data into train and test sets.
This is done inside a function so that Python's garbage collector can free the memory of any arrays that go out of scope when the function returns.

In [6]:
# Keys read from a raw archive, in the order load_data returns their
# (train, test) pairs. Callers unpack the result positionally, so this order
# is part of the function's contract and must not change.
_LOAD_KEYS = (
    "ClusterN", "Cluster", "ClusterTiming", "ClusterE", "ClusterType", "ClusterPt",
    "ClusterModuleNumber", "ClusterCol", "ClusterRow", "ClusterM02", "ClusterM20",
    "ClusterDistFromVert", "PartE", "PartPt", "PartEta", "PartPhi", "PartIsPrimary",
    "PartPID",
)


def load_data(path, test_size):
    """Load one raw ``.npz`` archive and split every array into train/test parts.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.npz`` archive. It must contain every key listed in
        ``_LOAD_KEYS``; the one exception is ``"PartIsPrimary"``, which is not
        read when the path contains the substring ``'bck'``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        36 arrays: for each key in ``_LOAD_KEYS`` (in that order) the train
        part followed by the test part. All arrays are split in a single
        ``train_test_split`` call with ``random_state=42``.

    Notes
    -----
    When the path contains ``'bck'``, ``"PartIsPrimary"`` is replaced by an
    all-``False`` boolean array shaped like ``"PartE"`` instead of being read
    from the file.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code -- only use this on trusted files.
    # Opening the archive in a ``with`` block closes the underlying file
    # handle as soon as all arrays have been read into memory.
    with np.load(path, allow_pickle=True) as data:
        print("Start Loading Data")
        # str() so that pathlib.Path inputs work with the substring check.
        is_background = 'bck' in str(path)

        loaded = {}
        for key in _LOAD_KEYS:
            if key == "PartIsPrimary" and is_background:
                # "PartE" precedes "PartIsPrimary" in _LOAD_KEYS, so it is
                # already available here as the shape template.
                loaded[key] = np.zeros_like(loaded["PartE"], dtype=bool)
            else:
                loaded[key] = data[key]

    print("Start Splitting Data")
    # One joint call so that all arrays are split together; the result is
    # train_0, test_0, train_1, test_1, ... in _LOAD_KEYS order.
    pieces = train_test_split(
        *[loaded[key] for key in _LOAD_KEYS],
        test_size=test_size, random_state=42)

    return tuple(pieces)

In [7]:
# Order in which load_data returns its (train, test) array pairs.
_SPLIT_RETURN_KEYS = (
    "ClusterN", "Cluster", "ClusterTiming", "ClusterE", "ClusterType", "ClusterPt",
    "ClusterModuleNumber", "ClusterCol", "ClusterRow", "ClusterM02", "ClusterM20",
    "ClusterDistFromVert", "PartE", "PartPt", "PartEta", "PartPhi", "PartIsPrimary",
    "PartPID",
)

# Order in which arrays are written to each output archive, after the leading
# "Size" entry.
_ARCHIVE_KEYS = (
    "ClusterN", "Cluster", "ClusterTiming", "ClusterE", "ClusterPt",
    "ClusterModuleNumber", "ClusterType", "ClusterRow", "ClusterCol", "ClusterM02",
    "ClusterM20", "ClusterDistFromVert", "PartE", "PartPt", "PartEta", "PartPhi",
    "PartIsPrimary", "PartPID",
)


def _save_split(out_dir, name, suffix, arrays):
    """Write one split to ``<out_dir>/<name>_<suffix>.npz`` with a leading ``Size`` entry."""
    np.savez_compressed(
        os.path.join(out_dir, name + '_' + suffix),
        Size=arrays["ClusterE"].size,
        **{key: arrays[key] for key in _ARCHIVE_KEYS}
    )


def split_data(path, name, test_size=0.2, out_dir='Data-Split'):
    """Split one raw archive into train/test parts and save both as compressed ``.npz``.

    Parameters
    ----------
    path : str
        Raw archive to read; passed to ``load_data``.
    name : str
        Sample label, used in the progress messages and as the output file
        prefix (``<name>_train.npz`` and ``<name>_test.npz``).
    test_size : float or int, default 0.2
        Passed to ``load_data``.
    out_dir : str, default 'Data-Split'
        Folder the two archives are written to. It is created if missing.

    Raises
    ------
    ValueError
        If ``load_data`` does not return exactly one (train, test) pair per
        expected key.

    Notes
    -----
    Each archive holds a ``Size`` entry (the ``.size`` of that split's
    ``ClusterE`` array) followed by the arrays listed in ``_ARCHIVE_KEYS``.
    """
    print("------- Start Splitting {} Data -------".format(name))
    pieces = load_data(path, test_size)
    if len(pieces) != 2 * len(_SPLIT_RETURN_KEYS):
        raise ValueError("load_data returned {} arrays, expected {}".format(
            len(pieces), 2 * len(_SPLIT_RETURN_KEYS)))

    # load_data interleaves its result as train_0, test_0, train_1, test_1, ...
    train = dict(zip(_SPLIT_RETURN_KEYS, pieces[0::2]))
    test = dict(zip(_SPLIT_RETURN_KEYS, pieces[1::2]))

    # np.savez_compressed does not create missing folders, so make sure the
    # target exists before writing.
    os.makedirs(out_dir, exist_ok=True)

    print("Saving {} Train Data".format(name))
    _save_split(out_dir, name, 'train', train)

    print("Saving {} Test Data".format(name))
    _save_split(out_dir, name, 'test', test)

    print("----- Finished Splitting {} Data ------\n".format(name))

In [8]:
# Split the pi0 sample into train/test archives using the default test fraction.
split_data(path=pi0_data_path, name='pi0')

------- Start Splitting pi0 Data -------
Start Loading Data
Start Splitting Data
Saving pi0 Train Data
Saving pi0 Test Data
----- Finished Splitting pi0 Data ------



In [9]:
# Split the eta sample into train/test archives using the default test fraction.
split_data(path=eta_data_path, name='eta')

------- Start Splitting eta Data -------
Start Loading Data
Start Splitting Data
Saving eta Train Data
Saving eta Test Data
----- Finished Splitting eta Data ------



In [10]:
# Split the bck sample into train/test archives using the default test fraction.
split_data(path=bck_data_path, name='bck')

------- Start Splitting bck Data -------
Start Loading Data
Start Splitting Data
Saving bck Train Data
Saving bck Test Data
----- Finished Splitting bck Data ------

