## Required package

In [1]:
#import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
#import open3d as o3d
import random
random.seed(42)
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import add_self_loops
from torch_geometric.transforms import ToUndirected
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from torch.nn import BatchNorm1d
from torch.optim.lr_scheduler import LambdaLR
import pickle
#Import needed python libraries.

In [2]:
from ipynb.fs.full.GeneralFunctions import CustomDataset, collate, collate_true, collate_bkg_lone, collate_bkg_cluster_lone, collate_bkg_cluster_cluster, collate_bkg_total, EdgeClassifier, lr_schedule, train, testModel, pickleData, makeTruthArray

## Required data generated by GNNonCalo_Scaling_DataPreparation.ipynb

In [3]:
hf_cellFeaturesScaled_neighbor= h5py.File("./cellFeaturesScaled_train_70evs_all2D.hdf5", 'r')
hf_train_edge_source_BD = h5py.File("./train_edge_source_BD_70evs_all2D.hdf5", 'r')
hf_train_edge_dest_BD = h5py.File("./train_edge_dest_BD_70evs_all2D.hdf5", 'r')
hf_train_edge_source_noBD = h5py.File("./train_edge_source_noBD_70evs_all2D.hdf5", 'r')
hf_train_edge_dest_noBD = h5py.File("./train_edge_dest_noBD_70evs_all2D.hdf5", 'r')
hf_truth_label_train_neighbor= h5py.File("./truth_label_train_70evs_all2D.hdf5", 'r')
#Open needed hdf5 files.

In [4]:
cellFeaturesScaled = hf_cellFeaturesScaled_neighbor.get("cellFeatures_trainS")[:]
train_edge_source_BD = hf_train_edge_source_BD.get("train_edge_source_BD")[:]
train_edge_dest_BD = hf_train_edge_dest_BD.get("train_edge_dest_BD")[:]
train_edge_source_noBD = hf_train_edge_source_noBD.get("train_edge_source_noBD")[:]
train_edge_dest_noBD = hf_train_edge_dest_noBD.get("train_edge_dest_noBD")[:]
truth_label_train = hf_truth_label_train_neighbor.get("truth_label_train")[:]
#Pull the data from hdf5 files as arrays.

In [5]:
hf_cellFeaturesScaled_neighbor.close()
hf_train_edge_source_BD.close()
hf_train_edge_dest_BD.close()
hf_train_edge_source_noBD.close()
hf_train_edge_dest_noBD.close()
hf_truth_label_train_neighbor.close()
#Close hdf5 files.

In [6]:
hf_test_edge_source_true_BD = h5py.File("./test_edge_source_true_BD_all2D.hdf5", "r")
hf_test_edge_dest_true_BD = h5py.File("./test_edge_dest_true_BD_all2D.hdf5", "r")
hf_test_edge_source_true_noBD = h5py.File("./test_edge_source_true_noBD_all2D.hdf5", "r")
hf_test_edge_dest_true_noBD = h5py.File("./test_edge_dest_true_noBD_all2D.hdf5", "r")

hf_test_edge_source_bkg_lone_BD = h5py.File("./test_edge_source_bkg_lone_BD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_lone_BD = h5py.File("./test_edge_dest_bkg_lone_BD_all2D.hdf5", "r")
hf_test_edge_source_bkg_lone_noBD = h5py.File("./test_edge_source_bkg_lone_noBD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_lone_noBD = h5py.File("./test_edge_dest_bkg_lone_noBD_all2D.hdf5", "r")

hf_test_edge_source_bkg_cluster_lone_BD = h5py.File("./test_edge_source_bkg_cluster_lone_BD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_cluster_lone_BD = h5py.File("./test_edge_dest_bkg_cluster_lone_BD_all2D.hdf5", "r")
hf_test_edge_source_bkg_cluster_lone_noBD = h5py.File("./test_edge_source_bkg_cluster_lone_noBD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_cluster_lone_noBD = h5py.File("./test_edge_dest_bkg_cluster_lone_noBD_all2D.hdf5", "r")

hf_test_edge_source_bkg_cluster_cluster_BD = h5py.File("./test_edge_source_bkg_cluster_cluster_BD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_cluster_cluster_BD = h5py.File("./test_edge_dest_bkg_cluster_cluster_BD_all2D.hdf5", "r")
hf_test_edge_source_bkg_cluster_cluster_noBD = h5py.File("./test_edge_source_bkg_cluster_cluster_noBD_all2D.hdf5", "r")
hf_test_edge_dest_bkg_cluster_cluster_noBD = h5py.File("./test_edge_dest_bkg_cluster_cluster_noBD_all2D.hdf5", "r")


In [7]:
test_edge_source_true_BD = hf_test_edge_source_true_BD.get("test_edge_source_true_BD")[:]
test_edge_dest_true_BD = hf_test_edge_dest_true_BD.get("test_edge_dest_true_BD")[:]
test_edge_source_true_noBD = hf_test_edge_source_true_noBD.get("test_edge_source_true_noBD")[:]
test_edge_dest_true_noBD = hf_test_edge_dest_true_noBD.get("test_edge_dest_true_noBD")[:]

test_edge_source_bkg_lone_BD = hf_test_edge_source_bkg_lone_BD.get("test_edge_source_bkg_lone_BD")[:]
test_edge_dest_bkg_lone_BD = hf_test_edge_dest_bkg_lone_BD.get("test_edge_dest_bkg_lone_BD")[:]
test_edge_source_bkg_lone_noBD = hf_test_edge_source_bkg_lone_noBD.get("test_edge_source_bkg_lone_noBD")[:]
test_edge_dest_bkg_lone_noBD = hf_test_edge_dest_bkg_lone_noBD.get("test_edge_dest_bkg_lone_noBD")[:]

test_edge_source_bkg_cluster_lone_BD = hf_test_edge_source_bkg_cluster_lone_BD.get("test_edge_source_bkg_cluster_lone_BD")[:]
test_edge_dest_bkg_cluster_lone_BD = hf_test_edge_dest_bkg_cluster_lone_BD.get("test_edge_dest_bkg_cluster_lone_BD")[:]
test_edge_source_bkg_cluster_lone_noBD = hf_test_edge_source_bkg_cluster_lone_noBD.get("test_edge_source_bkg_cluster_lone_noBD")[:]
test_edge_dest_bkg_cluster_lone_noBD = hf_test_edge_dest_bkg_cluster_lone_noBD.get("test_edge_dest_bkg_cluster_lone_noBD")[:]

test_edge_source_bkg_cluster_cluster_BD = hf_test_edge_source_bkg_cluster_cluster_BD.get("test_edge_source_bkg_cluster_cluster_BD")[:]
test_edge_dest_bkg_cluster_cluster_BD = hf_test_edge_dest_bkg_cluster_cluster_BD.get("test_edge_dest_bkg_cluster_cluster_BD")[:]
test_edge_source_bkg_cluster_cluster_noBD = hf_test_edge_source_bkg_cluster_cluster_noBD.get("test_edge_source_bkg_cluster_cluster_noBD")[:]
test_edge_dest_bkg_cluster_cluster_noBD = hf_test_edge_dest_bkg_cluster_cluster_noBD.get("test_edge_dest_bkg_cluster_cluster_noBD")[:]

In [8]:
hf_test_edge_source_true_BD.close()
hf_test_edge_dest_true_BD.close()
hf_test_edge_source_true_noBD.close()
hf_test_edge_dest_true_noBD.close()

hf_test_edge_source_bkg_lone_BD.close()
hf_test_edge_dest_bkg_lone_BD.close()
hf_test_edge_source_bkg_lone_noBD.close()
hf_test_edge_dest_bkg_lone_noBD.close() 

hf_test_edge_source_bkg_cluster_lone_BD.close()
hf_test_edge_dest_bkg_cluster_lone_BD.close()
hf_test_edge_source_bkg_cluster_lone_noBD.close()
hf_test_edge_dest_bkg_cluster_lone_noBD.close()

hf_test_edge_source_bkg_cluster_cluster_BD.close()
hf_test_edge_dest_bkg_cluster_cluster_BD.close()
hf_test_edge_source_bkg_cluster_cluster_noBD.close()
hf_test_edge_dest_bkg_cluster_cluster_noBD.close()


In [9]:
cellFeaturesScaled.shape
#Print the shape of the cellFeaturesScaled array.

(70, 187652, 8)

In [10]:
cellFeaturesScaled[2][1]
#Print an element of the cellFeaturesScaled array.

array([0.58421445, 0.51289224, 0.16380877, 0.23466876, 0.52418755,
       0.26086957, 0.09700815, 0.22900586])

In [11]:
x = torch.tensor(cellFeaturesScaled, dtype=torch.float)
#Convert cellFeaturesScaled into a torch tensor called x.

In [12]:
x.shape
#Print the shape of x.

torch.Size([70, 187652, 8])

In [13]:
train_edge_source_BD.shape
#Print the shape of train_edge_source_BD.

(70, 89600)

## Preparing bi directional edges (align source and destination) for GNN

In [14]:
edge_index = torch.tensor([train_edge_source_BD, train_edge_dest_BD], dtype=torch.long)
#Create a torch tensor called edge_index out of train_edge_source_BD and train_edge_dest_BD.

  edge_index = torch.tensor([train_edge_source_BD, train_edge_dest_BD], dtype=torch.long)


In [15]:
edge_index.shape
#Print the shape of edge_index.

torch.Size([2, 70, 89600])

In [16]:
edge_index_ch = edge_index.permute(1, 0, 2)
#Make a new torch tensor called edge_index_ch by permuting the dimensions of edge_index.

In [17]:
edge_index_ch.shape
#Print the shape of edge_index_ch.

torch.Size([70, 2, 89600])

## Preparing uni directional edges for final binary classification

In [18]:
edge_index_out = torch.tensor([train_edge_source_noBD, train_edge_dest_noBD], dtype=torch.long)
#Create a torch tensor called edge_index_out out of train_edge_source_noBD and train_edge_dest_noBD.

In [19]:
edge_index_out.shape
#Print the shape of edge_index_out.

torch.Size([2, 70, 44800])

In [20]:
edge_index_out_ch = edge_index_out.permute(1, 0, 2)
#Make a new torch tensor called edge_index_out_ch by permuting the dimensions of edge_index_out.

In [21]:
edge_index_out_ch.shape
#Print the shape of edge_index_out_ch.

torch.Size([70, 2, 44800])

## Preparing label (true/Fake) tensor 

In [22]:
truth_label_train = np.expand_dims(truth_label_train, axis=1)
#Expands the dimensions of truth_label_train.

In [23]:
truth_label_train.shape
#Prints the shape of truth_label_train.

(70, 1, 44800)

In [24]:
y_train = torch.tensor(truth_label_train, dtype=torch.float)
#Converts truth_label_train into a torch tensor named y_train.

In [25]:
y_train.shape
#Prints the shape of y_train.

torch.Size([70, 1, 44800])

## Data customization specific to pytorch 

In [26]:
# Generate data_list
data_list = []
for i in range(70):
    x_mat = x[i]
    edge_index = edge_index_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_out_ch[i], y=y_train[i])
    data = ToUndirected()(data)
    data_list.append(data)

In [27]:
ind0 = data_list[0].edge_index
ind1 = data_list[1].edge_index
ind2 = data_list[2].edge_index

In [28]:
print(ind0.shape)
print(ind1.shape)
print(ind2.shape)

torch.Size([2, 277248])
torch.Size([2, 277232])
torch.Size([2, 277235])


In [29]:
custom_dataset = CustomDataset(data_list)

In [30]:
batch_size = 20
data_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=batch_size, collate_fn=collate)
#Formats the data and loads it into a data_loader. Iâ€™m honestly not sure about some of the details here.

In [31]:
for batch_x, batch_edge_index, batch_edge_index_out, _ in data_loader:
    print(len(batch_edge_index))

20
20
20
10


In [32]:
#y_test.shape

In [33]:
data.edge_index.shape

torch.Size([2, 277250])

In [34]:
data.y.shape

torch.Size([1, 44800])

In [35]:
x.size(1)
#Prints some information about the shape of the data.

187652

In [36]:
edge_index_true = torch.tensor([test_edge_source_true_BD, test_edge_dest_true_BD], dtype=torch.long)
edge_index_bkg_lone = torch.tensor([test_edge_source_bkg_lone_BD, test_edge_dest_bkg_lone_BD], dtype=torch.long)
edge_index_bkg_cluster_lone = torch.tensor([test_edge_source_bkg_cluster_lone_BD, test_edge_dest_bkg_cluster_lone_BD], dtype=torch.long)
edge_index_bkg_cluster_cluster = torch.tensor([test_edge_source_bkg_cluster_cluster_BD, test_edge_dest_bkg_cluster_cluster_BD], dtype=torch.long)

In [37]:
edge_index_true_ch = edge_index_true.permute(1, 0, 2)
edge_index_bkg_lone_ch = edge_index_bkg_lone.permute(1, 0, 2)
edge_index_bkg_cluster_lone_ch = edge_index_bkg_cluster_lone.permute(1, 0, 2)
edge_index_bkg_cluster_cluster_ch = edge_index_bkg_cluster_cluster.permute(1, 0, 2)

In [38]:
edge_index_true_out = torch.tensor([test_edge_source_true_noBD, test_edge_dest_true_noBD], dtype=torch.long)
edge_index_bkg_lone_out = torch.tensor([test_edge_source_bkg_lone_noBD, test_edge_dest_bkg_lone_noBD], dtype=torch.long)
edge_index_bkg_cluster_lone_out = torch.tensor([test_edge_source_bkg_cluster_lone_noBD, test_edge_dest_bkg_cluster_lone_noBD], dtype=torch.long)
edge_index_bkg_cluster_cluster_out = torch.tensor([test_edge_source_bkg_cluster_cluster_noBD, test_edge_dest_bkg_cluster_cluster_noBD], dtype=torch.long)

In [39]:
edge_index_true_out_ch = edge_index_true_out.permute(1, 0, 2)
edge_index_bkg_lone_out_ch = edge_index_bkg_lone_out.permute(1, 0, 2)
edge_index_bkg_cluster_lone_out_ch = edge_index_bkg_cluster_lone_out.permute(1, 0, 2)
edge_index_bkg_cluster_cluster_out_ch = edge_index_bkg_cluster_cluster_out.permute(1, 0, 2)

In [40]:
edge_index_bkg_cluster_cluster_out_ch.shape

torch.Size([30, 2, 1900])

In [41]:
y_test_bkg_lone = makeTruthArray(edge_index_bkg_lone_out_ch, False)

In [42]:
y_test_bkg_cluster_lone = makeTruthArray(edge_index_bkg_cluster_lone_out_ch, False)

In [43]:
y_test_bkg_cluster_cluster = makeTruthArray(edge_index_bkg_cluster_cluster_out_ch, False)

In [44]:
y_test_true = makeTruthArray(edge_index_true_out_ch, True)

In [45]:
data_list_true = []
for i in range(30):
    x_mat = x[i]
    edge_index = edge_index_true_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_true_out_ch[i], y= y_test_true[i])
    data = ToUndirected()(data)
    data_list_true.append(data)

In [46]:
data_list_bkg_lone = []
for i in range(30):
    x_mat = x[i]
    edge_index = edge_index_bkg_lone_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_bkg_lone_out_ch[i], y = y_test_bkg_lone[i])
    data = ToUndirected()(data)
    data_list_bkg_lone.append(data)

In [47]:
data_list_bkg_cluster_lone = []
for i in range(30):
    x_mat = x[i]
    edge_index = edge_index_bkg_cluster_lone_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_bkg_cluster_lone_out_ch[i], y = y_test_bkg_cluster_lone[i])
    data = ToUndirected()(data)
    data_list_bkg_cluster_lone.append(data)

In [48]:
data_list_bkg_cluster_cluster = []
for i in range(30):
    x_mat = x[i]
    edge_index = edge_index_bkg_cluster_cluster_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_bkg_cluster_cluster_out_ch[i], y = y_test_bkg_cluster_cluster[i])
    data = ToUndirected()(data)
    data_list_bkg_cluster_cluster.append(data)

In [49]:
custom_dataset_true = CustomDataset(data_list_true)
#custom_dataset_bkg_lone = CustomDataset(data_list_bkg_lone)
#custom_dataset_bkg_cluster_lone = CustomDataset(data_list_bkg_cluster_lone)
#custom_dataset_bkg_cluster_cluster = CustomDataset(data_list_bkg_cluster_cluster)

In [50]:
batch_size = 20
data_loader_true = torch.utils.data.DataLoader(custom_dataset_true, batch_size = batch_size, collate_fn = collate_true)
#data_loader_bkg_lone = torch.utils.data.DataLoader(custom_dataset_bkg_lone, batch_size = batch_size, collate_fn = collate_bkg_lone)
#data_loader_bkg_cluster_lone = torch.utils.data.DataLoader(custom_dataset_bkg_cluster_lone, batch_size = batch_size, collate_fn = collate_bkg_cluster_lone)
#data_loader_bkg_cluster_cluster = torch.utils.data.DataLoader(custom_dataset_bkg_cluster_cluster, batch_size = batch_size, collate_fn = collate_bkg_cluster_cluster)

In [51]:
data_list_total_bkg = data_list_bkg_lone + data_list_bkg_cluster_lone + data_list_bkg_cluster_cluster

In [52]:
custom_dataset_total_bkg = CustomDataset(data_list_total_bkg)

In [53]:
data_loader_total_bkg = torch.utils.data.DataLoader(custom_dataset_total_bkg, batch_size = batch_size, collate_fn = collate_bkg_total)

## Edge Classifier Model

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the model
input_dim = 8
hidden_dim = 256
output_dim = 1  # Binary classification (citing or not citing)
model = EdgeClassifier(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Define the learning rate scheduler
scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)
#Sets up the GPU. Creates an edge classifier with 8 input dimensions, 256 hidden dimensions, and 1 output dimensions. Create a loss function
#(binary cross entropy) and optimizer (Adam). Uses lambdaLR to create a scheduler.

In [None]:
num_epochs = 500
lossPerEpochTraining = []
scores = []
truth_labels = []
allLossPerEpochTestTrue = []
allLossPerEpochTestBackground = []
for epoch in range(num_epochs):
    lossPerEpochTraining.append(train(model, device, data_loader, optimizer, criterion))
    # Update the learning rate at the end of each epoch
    scheduler.step()
    epoch_scores, epoch_truth_labels, totalLossPerEpochTestTrue, totalLossPerEpochTestBackground = testModel(model, device, data_loader_true, data_loader_total_bkg, criterion)
    scores.append(epoch_scores)
    truth_labels.append(epoch_truth_labels)
    allLossPerEpochTestTrue.append(totalLossPerEpochTestTrue)
    allLossPerEpochTestBackground.append(totalLossPerEpochTestBackground)
#Trains the GNN for 500 epochs.

total_loss_per_epoch: tensor(0.7263, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6870, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6725, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6518, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6374, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6267, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6136, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6062, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.6009, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5955, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5917, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5908, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5899, device='cuda:0', grad_fn=<D

In [None]:
lossPerEpochTraining = [tensor.cpu() for tensor in lossPerEpochTraining]
allLossPerEpochTestTrue = [tensor.cpu() for tensor in allLossPerEpochTestTrue]
allLossPerEpochTestBackground = [tensor.cpu() for tensor in allLossPerEpochTestBackground]

In [None]:
lossPerEpochTraining = [tensor.detach().numpy() for tensor in lossPerEpochTraining]
allLossPerEpochTestTrue = [tensor.detach().numpy() for tensor in allLossPerEpochTestTrue]
allLossPerEpochTestBackground = [tensor.detach().numpy() for tensor in allLossPerEpochTestBackground]
#moves the loss back to the cpu and changes it into a numpy array

In [None]:
path = "./GNNCalo_cluster_all2D.pth"
#torch.save(model, path)
torch.save(model.state_dict(), path)
#Saves the trained model to an external file.

In [None]:
pickleData("lossDataTraining_all2D", lossPerEpochTraining)
pickleData("lossDataTestingTrue_all2D", allLossPerEpochTestTrue)
pickleData("lossDataTestingBackground_all2D", allLossPerEpochTestBackground)
pickleData("scores_all2D", scores)
pickleData("truth_labels_all2D", truth_labels)