In [1]:
import numpy as np
import torch

In [120]:
x_train =np.load("data/X_train_cleaned.npy")
x_test =np.load("data/X_test_cleaned.npy")

In [3]:
y_train =np.load("data/Y_train.npy")
y_test =np.load("data/Y_test.npy")

In [121]:
def get_edge_indices(x):
        # Getting all the info for the edges NB: this only needs to be done once->outside loop
        ## Edge indices: 2 lists, first is source node indices and second is destination node indices
        ## x has shape, (instances, nodes, attributes)
        source = np.arange(0,x.shape[1]).repeat(x.shape[1]-1)
        d = np.arange(0,x.shape[1])
        dest = np.asarray([np.delete(d,i) for i in d]).flatten()
        return np.asarray([source,dest])

In [124]:
indices = get_edge_indices(x_test)
indices


array([[ 0,  0,  0, ..., 26, 26, 26],
       [ 1,  2,  3, ..., 23, 24, 25]])

### Need to get the info for each oindiceshe nodes from the variables 

In [5]:
node_indices = [1,2,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23]
%time x_train_ = x_train[:,node_indices] #These are the object-related variables
x_train_.shape

CPU times: user 576 ms, sys: 130 ms, total: 706 ms
Wall time: 705 ms


(3167060, 17)

In [104]:
def populate_nodes(x):
    # Getting all the info for the individual nodes
    ## Last index is the id of the node-jets have their b-quantile, leptons are 0, met is -1
    ### met
    met_arr = x[:,met_vars]
    met_eta_quant = np.full((x.shape[0],2), [0, -1])
    met_arr = np.concatenate((met_arr, met_eta_quant), axis=1) # Appending the eta and quant values to met
    ### lepton
    lept_arr = x[:,l1_vars]
    lep_quant = np.zeros((x.shape[0],1))
    lept_arr = np.concatenate((lept_arr, lep_quant), axis=1) # Appending the eta and quant values to met
    ### Jets
    b1_arr = x[:,b1_vars]
    b2_arr = x[:,b2_vars]
    j1_arr = x[:,j1_vars]
    ### Put them all together
    return np.stack((met_arr, lept_arr, b1_arr, b2_arr, j1_arr),axis=1)

In [105]:
def populate_edges(x, edge_indices):
    sources  = x[:,edge_indices[0],:]# Map a matrix of values by the source indices
    sources  = sources[:,:,[0,1,2]]
    dests    = x[:,edge_indices[1],:]# Map a matrix of values by the dest indices
    dests  = dests[:,:,[0,1,2]]
    deltas = sources - dests
    return deltas

### Need to pick out the variables we'll use for building the edges
This will be a function of the angular distance between objets and their pT:
$$\delta_{ij} = |\phi_i-\phi_j|,  |\eta_i-\eta_j|,  |p_{T_i}-p_{T_j}|$$

Think this is the fastest way to do it:

Populate matrices containing the $p_T, \phi, \eta$ of the source and dests then subtract in one go 

In [109]:
from torch_geometric.data import Data

In [49]:
%time x = torch.tensor(x_arr,dtype=torch.float)
edge_index = torch.tensor([source,dest],dtype=torch.long)

data = Data(x=x, edge_index=edge_index, edge_attr= y=y_train[0])

In [110]:
from torch_geometric.data import InMemoryDataset

In [143]:
class wh_dataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
    @property
    def raw_file_names(self):
        return["X_train.npy", "Y_train.npy", "W_train.npy"]
    @property
    def processed_file_names(self):
        return ["data.pt"]
    @property
    def processed_dir(self):
        return "data/processed/"
    @property
    def processed_paths(self):
        return [self.processed_dir+fname for fname in self.processed_file_names]
    
    #Helper functions for process
    def get_variables(self):
        import NN_driver as driver #The variable names are stored in the driver config
        variables = driver.variables
        variables  = [v for v in variables if "j3" not in v]
        variables  = [v for v in variables if "j2" not in v]
        return variables
    
    def get_edge_indices(self, x):
        # Getting all the info for the edges NB: this only needs to be done once->outside loop
        ## Edge indices: 2 lists, first is source node indices and second is destination node indices
        ## x has shape, (instances, nodes, attributes)
        source = np.arange(0,x.shape[1]).repeat(x.shape[1]-1)
        d = np.arange(0,x.shape[1])
        dest = np.asarray([np.delete(d,i) for i in d]).flatten()
        return np.asarray([source,dest])
    
    def populate_nodes(self, x):
        print("Populating the nodes with attributes")
        # Getting all the info for the individual nodes
        ## Last index is the id of the node-jets have their b-quantile, leptons are 0, met is -1
        ### met
        met_arr = x[:,self.met_vars]
        met_eta_quant = np.full((x.shape[0],2), [0, -1])
        met_arr = np.concatenate((self.met_arr, met_eta_quant), axis=1) # Appending the eta and quant values to met
        ### lepton
        lept_arr = x[:,self.l1_vars]
        lep_quant = np.zeros((x.shape[0],1))
        lept_arr = np.concatenate((self.lept_arr, lep_quant), axis=1) # Appending the eta and quant values to met
        ### Jets
        b1_arr = x[:,self.b1_vars]
        b2_arr = x[:,self.b2_vars]
        j1_arr = x[:,self.j1_vars]
        ### Put them all together
        return np.stack((met_arr, lept_arr, b1_arr, b2_arr, j1_arr),axis=1)
    
    def populate_edges(self, x, edge_indices):
        print("Populating the edges with attributes")
        sources  = x[:,edge_indices[0],:]# Map a matrix of values by the source indices
        sources  = sources[:,:,[0,1,2]]
        dests    = x[:,edge_indices[1],:]# Map a matrix of values by the dest indices
        dests  = dests[:,:,[0,1,2]]
        deltas = sources - dests
        return deltas
    
    def process(self):
        # Sort out the variables we need based on the driver config
        variables = get_variables()
        self.met_vars = [variables.index('ETMiss'),variables.index('ETMissPhi')]
        self.l1_vars = [variables.index('pTl1'),variables.index('phil1'),variables.index('etal1')]
        self.b1_vars = [variables.index('pTb1'),variables.index('phib1'),variables.index('etab1'),variables.index('b1_quantile')]
        self.b2_vars = [variables.index('pTb2'),variables.index('phib2'),variables.index('etab2'),variables.index('b2_quantile')]
        self.j1_vars = [variables.index('pTj1'),variables.index('phij1'),variables.index('etaj1'),variables.index('j1_quantile')]
        print("Loading input data")
        x = np.load(self.root+"/"+self.raw_file_names[0])
        y = np.load(self.root+"/"+self.raw_file_names[1])
        
        # Get the variables associated to the objects in the event
        #node_indices = [1,2,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23]
        x = x[:100,:] # These are the object-related variables
        node_attrs = populate_nodes(x) # Convert the objects to node data
        # Get the edges and populate them with edge data
        edge_indices = get_edge_indices(node_attrs)
        edge_attrs = populate_edges(node_attrs, edge_indices)
        
        node_attrs = torch.tensor(node_attrs, dtype=torch.float)
        edge_indices = torch.tensor(edge_indices, dtype=torch.long)
        edge_attrs = torch.tensor(edge_attrs, dtype=torch.float)
        
        data_list = [Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y) for x,edge_index,edge_attr,y in zip(node_attrs, edge_indices, edge_attrs, y)]
        
        
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        data, slices = self.collate(data_list)
        torch.save((data,slices), self.processed_paths+"/"+self.processed_file_names[0])

In [144]:
data = wh_dataset("data/")

In [145]:
data

wh_dataset(2)