In [1]:
import pandas as pd
import os
import numpy as np

import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, Dataset, random_split
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from functools import partial
from tqdm import tqdm

import os
import sys

from pyspark.sql import SparkSession, Row

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.config("spark.driver.memory", "40g").appName("ml4sci").getOrCreate()

24/03/24 14:53:57 WARN Utils: Your hostname, fff002 resolves to a loopback address: 127.0.1.1; using 200.145.157.3 instead (on interface enp5s0)
24/03/24 14:53:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/24 14:53:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
run0_path = "datasets/task_ii/jet0_run0.parquet"
run1_path = "datasets/task_ii/jet0_run1.parquet"
run2_path = "datasets/task_ii/jet0_run2.parquet"

In [4]:
def partition_to_tensor(iterator):
    """Lazily turn every row of a partition into a ``torch.Tensor``.

    Parameters
    ----------
    iterator : iterable
        Rows supporting ``row['X_jets']`` lookup; the value is passed
        unchanged to ``torch.tensor``.

    Yields
    ------
    torch.Tensor
        One tensor per input row, in input order.
    """
    # Delegating to a generator expression keeps the conversion lazy:
    # a row is only converted when the consumer asks for the next tensor.
    yield from (torch.tensor(record['X_jets']) for record in iterator)


In [108]:
import joblib

def save_train_tensor_to_disk(path, tensor, row_index):
    """Write ``tensor`` to ``<path>X_jet_partition_<row_index>.pt``.

    ``path`` is used as a plain string prefix (no separator is inserted),
    so it has to end with the directory separator already.
    """
    file_name = "X_jet_partition_{}.pt".format(row_index)
    torch.save(tensor, path + file_name)

# First, mapPartitionsWithIndex(f) has to receive a function f that takes the index and the iterator of
# each partition, and returns (partition_index, iter).
# Then, foreachPartition() receives the iterator over the contents of each partition.

    
# Use foreachPartition to save each partition's tensors to disk.
# The iterator yields (tensor, tensor_idx) pairs, so every tensor arrives together with its row index.
def save_train_partition(iterator):
    """Concatenate one partition's tensors and save them as a single file.

    ``iterator`` yields ``(tensor, tensor_idx)`` pairs. The tensors are
    concatenated along dimension 0 and written under ``train_path`` as
    ``X_jet_partition_<idx>.pt``, where ``<idx>`` is the index of the first
    pair the iterator produced. An empty partition writes nothing.
    """
    chunks = []
    first_idx = -1  # -1 means "no index recorded yet"
    for tensor, tensor_idx in iterator:
        first_idx = tensor_idx if first_idx == -1 else first_idx
        chunks.append(tensor)

    # Nothing to concatenate for an empty partition, so skip the save.
    if not chunks:
        return

    stacked = torch.cat(chunks, axis=0)
    file_name = "X_jet_partition_{}.pt".format(str(first_idx))
    torch.save(stacked, train_path + file_name)

In [128]:
def save_val_tensor_to_disk(tensor, index):
    """Write ``tensor`` to ``X_jet_partition_<index>.pt`` under the notebook-level ``val_path``."""
    file_name = "X_jet_partition_{}.pt".format(index)
    torch.save(tensor, val_path + file_name)

# Use foreachPartition to save each partition's tensors to disk
def save_val_partition(iterator):
    """Concatenate one validation partition's tensors and save them as a single file.

    ``iterator`` yields ``(tensor, tensor_idx)`` pairs. The tensors are
    concatenated along dimension 0 and written under ``val_path`` as
    ``X_jet_partition_<idx>.pt``, where ``<idx>`` is the index of the first
    pair the iterator produced. An empty partition writes nothing.
    """
    chunks = []
    first_idx = -1  # -1 means "no index recorded yet"
    for tensor, tensor_idx in iterator:
        first_idx = tensor_idx if first_idx == -1 else first_idx
        chunks.append(tensor)

    # Nothing to concatenate for an empty partition, so skip the save.
    if not chunks:
        return

    stacked = torch.cat(chunks, axis=0)
    file_name = "X_jet_partition_{}.pt".format(str(first_idx))
    torch.save(stacked, val_path + file_name)

### Now, we'll load all parquet files via spark.

In [7]:
run0_df = spark.read.parquet(run0_path)

                                                                                

#### Let's also check the schema and take a peek of the data:

In [8]:
# let's check the schema..
run0_df.printSchema()

root
 |-- X_jets: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: double (containsNull = true)
 |-- pt: double (nullable = true)
 |-- m0: double (nullable = true)
 |-- y: double (nullable = true)



In [9]:
# ..and take a peek:
run0_df.show(3, truncate=True)

                                                                                

+--------------------+------------------+------------------+---+
|              X_jets|                pt|                m0|  y|
+--------------------+------------------+------------------+---+
|[[[0.0, 0.0, 0.0,...|112.41109466552734|21.098247528076172|0.0|
|[[[0.0, 0.0, 0.0,...| 95.22040557861328|14.030599594116211|1.0|
|[[[0.0, 0.0, 0.0,...| 97.00731658935547|17.728967666625977|1.0|
+--------------------+------------------+------------------+---+
only showing top 3 rows



In [8]:
run1_df = spark.read.parquet(run1_path)

In [9]:
run2_df = spark.read.parquet(run2_path)

                                                                                

### Now let's concat all spark dataframes into one single dataframe.

In [10]:
final_df = run0_df.union(run1_df)

In [11]:
final_df = final_df.union(run2_df)

In [8]:
final_df.count()

                                                                                

139306

In [126]:
(139306 * 0.180 * 0.2) / 80

62.68769999999999

### Next, we'll split into train and validation

In [12]:
train_df, val_df = final_df.randomSplit([0.8, 0.2], seed=42)

### Now let's save the tensors locally in our directory

In [2]:
train_path = "datasets/task_ii/train/X_jets/"
val_path = "datasets/task_ii/validation/X_jets/"

In [120]:
"""
This is important: from the spark DataFrame, we need to create an RDD for efficient data storage.
zipWithIndex() will allow us to retrieve, for each jet, not only its tensor, but also its corresponding index.
"""
# Range-partition the rows by their zipWithIndex() index instead of calling repartition().
# repartition() shuffles rows round-robin, so a saved partition would hold scattered row indices,
# while the index -> partition lookup table and the label array built later both assume that
# partition k holds a contiguous, ordered block of rows. sortBy() keeps every partition
# contiguous and internally ordered, so tensors and labels stay aligned.
rdd_train = (
    train_df.rdd
    .zipWithIndex()
    .sortBy(lambda row: row[1], numPartitions=300)
    .map(lambda row: (torch.Tensor(row[0]['X_jets']).expand(1, 3, 125, 125), row[1]))
)

                                                                                

In [357]:
rdd_train.foreachPartition(save_train_partition)

                                                                                

In [127]:
# Range-partition the validation rows by their zipWithIndex() index instead of calling repartition().
# repartition() shuffles rows round-robin, which would scatter row indices across the saved files and
# break the alignment with the label array; sortBy() keeps each partition contiguous and ordered.
rdd_val = (
    val_df.rdd
    .zipWithIndex()
    .sortBy(lambda row: row[1], numPartitions=80)
    .map(lambda row: (torch.Tensor(row[0]['X_jets']).expand(1, 3, 125, 125), row[1]))
)

                                                                                

In [358]:
rdd_val.foreachPartition(save_val_partition)

                                                                                

## Lastly, we'll just rename each partition

In [5]:
# Renumber the saved training partitions to consecutive ids 0..N-1, ordered by the row index
# embedded in each file name. Only real ".pt" files are considered: a plain substring test would
# also pick up unrelated names that merely contain "pt" and then fail on the integer parse.
tensor_list = sorted(
    (tensor_name for tensor_name in os.listdir(train_path) if tensor_name.endswith(".pt")),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)

# Files are processed in ascending order and the new id never exceeds the old number,
# so a rename cannot overwrite a file that has not been processed yet.
for idx, partition in enumerate(tensor_list):
    os.rename(train_path + partition, train_path + "X_jet_partition_{}.pt".format(str(idx)))

In [6]:
# Renumber the saved validation partitions to consecutive ids 0..N-1, ordered by the row index
# embedded in each file name. Only real ".pt" files are considered: a plain substring test would
# also pick up unrelated names that merely contain "pt" and then fail on the integer parse.
tensor_list = sorted(
    (tensor_name for tensor_name in os.listdir(val_path) if tensor_name.endswith(".pt")),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)

# Files are processed in ascending order and the new id never exceeds the old number,
# so a rename cannot overwrite a file that has not been processed yet.
for idx, partition in enumerate(tensor_list):
    os.rename(val_path + partition, val_path + "X_jet_partition_{}.pt".format(str(idx)))

### Just some sanity checking, to see the shape of saved tensors and number of partitions

In [138]:
torch.load(train_path + "X_jet_partition_0.pt").shape

torch.Size([332, 3, 125, 125])

In [140]:
torch.load(val_path + "X_jet_partition_0.pt").shape

torch.Size([398, 3, 125, 125])

In [129]:
rdd_train.getNumPartitions()

300

In [141]:
rdd_val.getNumPartitions()

80

In [241]:
tensor_list

['X_jet_partition_0.pt',
 'X_jet_partition_10.pt',
 'X_jet_partition_20.pt',
 'X_jet_partition_30.pt',
 'X_jet_partition_40.pt',
 'X_jet_partition_50.pt',
 'X_jet_partition_60.pt',
 'X_jet_partition_70.pt',
 'X_jet_partition_80.pt',
 'X_jet_partition_90.pt',
 'X_jet_partition_100.pt',
 'X_jet_partition_110.pt',
 'X_jet_partition_120.pt',
 'X_jet_partition_130.pt',
 'X_jet_partition_140.pt',
 'X_jet_partition_150.pt',
 'X_jet_partition_160.pt',
 'X_jet_partition_170.pt',
 'X_jet_partition_180.pt',
 'X_jet_partition_190.pt',
 'X_jet_partition_200.pt',
 'X_jet_partition_210.pt',
 'X_jet_partition_220.pt',
 'X_jet_partition_230.pt',
 'X_jet_partition_240.pt',
 'X_jet_partition_250.pt',
 'X_jet_partition_260.pt',
 'X_jet_partition_270.pt',
 'X_jet_partition_280.pt',
 'X_jet_partition_290.pt',
 'X_jet_partition_300.pt',
 'X_jet_partition_310.pt',
 'X_jet_partition_320.pt',
 'X_jet_partition_330.pt',
 'X_jet_partition_340.pt',
 'X_jet_partition_350.pt',
 'X_jet_partition_360.pt',
 'X_jet_part

In [3]:
"""
Right now, we will create a map of tensor_idx -> partition_num.
It's basically a lookup table for faster performance.

Since the training data is already shuffled, we'll use a SequentialSampler in Torch,
and for each tensor idx, we load each partition only as we need, or keep it in cache.
"""

# Lookup table: global sample index -> name of the partition file that stores it.
train_tensor_map = {}

# Partition files in numeric order of the id embedded in their name (".pt" files only).
tensor_list = sorted(
    (tensor_name for tensor_name in os.listdir(train_path) if tensor_name.endswith(".pt")),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)

# cum_sum is the global index of the first sample of the partition being processed.
cum_sum = 0
for tensor_name in tqdm(tensor_list):  # iterating the list directly lets tqdm show the total
    partition_size = torch.load(train_path + tensor_name).shape[0]
    train_tensor_map.update(
        (cum_sum + tensor_idx, tensor_name) for tensor_idx in range(partition_size)
    )
    cum_sum += partition_size
list(train_tensor_map.items())[:3]

300it [00:13, 21.55it/s]


[(0, 'X_jet_partition_0.pt'),
 (1, 'X_jet_partition_0.pt'),
 (2, 'X_jet_partition_0.pt')]

In [4]:
# Lookup table: global sample index -> name of the validation partition file that stores it.
val_tensor_map = {}

# Partition files in numeric order of the id embedded in their name (".pt" files only).
tensor_list = sorted(
    (tensor_name for tensor_name in os.listdir(val_path) if tensor_name.endswith(".pt")),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)

# cum_sum is the global index of the first sample of the partition being processed.
cum_sum = 0
for tensor_name in tqdm(tensor_list):  # iterating the list directly lets tqdm show the total
    partition_size = torch.load(val_path + tensor_name).shape[0]
    val_tensor_map.update(
        (cum_sum + tensor_idx, tensor_name) for tensor_idx in range(partition_size)
    )
    cum_sum += partition_size
list(val_tensor_map.items())[:3]

80it [00:03, 22.84it/s]


[(0, 'X_jet_partition_0.pt'),
 (1, 'X_jet_partition_0.pt'),
 (2, 'X_jet_partition_0.pt')]

### We also need to save the labels; since they take far less memory than the tensors, the process is straightforward

In [170]:
y_train = train_df.select(['y']).collect()

                                                                                

In [171]:
y_val = val_df.select(['y']).collect()

                                                                                

In [172]:
len(y_train)

111510

In [173]:
len(y_val)

27796

In [174]:
np.array(y_train)

array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [175]:
# we need to flatten this array
y_train = np.array(y_train).flatten()

In [176]:
# we need to flatten this array
y_val = np.array(y_val).flatten()

In [177]:
np.savez_compressed("datasets/task_ii/train/y/y.npz", y= y_train)

In [178]:
np.savez_compressed("datasets/task_ii/validation/y/y.npz", y= y_val)

### And define our Quark-Gluon dataset in Torch format:

In [20]:
# Define a PyTorch Dataset that serves jets from the partitioned tensor files saved on disk
class QuarkGluonDataset(Dataset):
    """Dataset over jet tensors stored as partition files, keeping one partition cached.

    Parameters
    ----------
    data_path : str
        Directory prefix (ending with a separator) that contains
        ``X_jets/<partition file>`` and ``y/y.npz`` (labels under key ``'y'``).
    tensor_map : dict
        Maps every global sample index to the name of the partition file
        holding it. Index ``0`` must be present.
    transform : callable, optional
        Applied to every returned sample tensor.

    Notes
    -----
    Only the most recently used partition is kept in memory. A different
    partition is loaded from disk whenever the requested index lives in
    another file, so sequential access touches each file once per epoch
    while arbitrary access order still returns the correct sample.
    """

    def __init__(self, data_path, tensor_map, transform=None):
        self.transform = transform
        self.data_path = data_path
        self.tensor_map = tensor_map

        # Smallest global index stored in each partition file; subtracting it
        # from a global index gives the row inside that file.
        self._partition_start = {}
        for tensor_idx, partition_name in tensor_map.items():
            known_start = self._partition_start.get(partition_name)
            if known_start is None or tensor_idx < known_start:
                self._partition_start[partition_name] = tensor_idx

        self.end_tensor_idx = max(tensor_map)

        # The context manager closes the .npz archive once the labels are copied.
        with np.load(data_path + "y/y.npz") as archive:
            self.labels = torch.tensor(archive['y'], dtype=torch.long)

        self.flag = 0  # unused by this class; kept so existing attribute access keeps working

        # Cache state, filled by _load_partition().
        self.partition = None
        self.starting_tensor_idx = 0
        self.cached_tensors = None
        self._load_partition(tensor_map[0])

    def _load_partition(self, partition_name):
        """Replace the cached partition with ``partition_name`` read from disk."""
        self.partition = partition_name
        self.starting_tensor_idx = self._partition_start[partition_name]
        self.cached_tensors = torch.load(self.data_path + "X_jets/" + partition_name)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for global index ``idx``."""
        # Swap the cache only when idx lives in a different partition file.
        partition_name = self.tensor_map[idx]
        if partition_name != self.partition:
            self._load_partition(partition_name)

        X = self.cached_tensors[idx - self.starting_tensor_idx]
        if self.transform:
            X = self.transform(X)

        # The label is always looked up with the global index.
        return X, self.labels[idx]

In [21]:
training_data = QuarkGluonDataset("datasets/task_ii/train/", train_tensor_map)

In [22]:
val_data = QuarkGluonDataset("datasets/task_ii/validation/", val_tensor_map)

In [23]:
train_dataloader = DataLoader(training_data, batch_size=8, shuffle=False)

In [24]:
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)

In [25]:
# train_model() reads the validation loader from the "val" key; the older "test_dataloader"
# key is kept as an alias so any code still using it keeps working.
dataloaders = {"train": train_dataloader,
               "val": val_dataloader,
               "test_dataloader": val_dataloader}

## Now, for the 12-layered VGG:

In [None]:
"""
VGG is a deep convolutional neural network with 16 trainable layers, known for its use of small 3x3 filters
throughout the network. It is a relatively simple model that nevertheless performs very well on classical image-recognition
tasks.

In a nutshell, each layer consists of convolution (3,3) -> ReLU -> maxpool.
At the end, some fully-connected layers are stacked before the final logits, e.g.:

                                            *** VGG-12 architecture ***
          ___________________     ____      _______                ______      _____     __________
   M -->  |convolution (3,3)| -> |ReLU| -> |maxpool| -> ...... -> |FC (1)| -> |ReLU| -> |FC(logits)| -> Quarks/Gluons
          -------------------    ------    ---------              --------    ------    ------------

Where M is a tensor of shape (batch_size, 3, 125, 125). 

Here, we modify the network to have only 12 layers, consisting of 10 convolutions and 2 FC.

"""
class VGG12(nn.Module):
    """VGG-style classifier with 10 convolutions and 2 fully-connected layers.

    The first linear layer takes 25088 = 512 * 7 * 7 features, so the input
    must be reduced to 7x7 by the four 2x2 max-pools; a
    ``(batch, 3, 125, 125)`` input does this (125 -> 62 -> 31 -> 15 -> 7).
    The output has shape ``(batch, 2)`` and holds raw logits, suitable for
    ``nn.CrossEntropyLoss``.
    """

    def __init__(self):
        # Zero-argument super() always refers to this class.
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1)

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)

        self.conv5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.conv8 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv9 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)

        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc14 = nn.Linear(25088, 64)
        self.fc15 = nn.Linear(64, 2)

    def forward(self, x):
        """Return the two class logits for every sample in ``x``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.maxpool(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.maxpool(x)
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv7(x))
        x = self.maxpool(x)
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv9(x))
        x = F.relu(self.conv10(x))
        x = self.maxpool(x)
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc14(x))
        # F.dropout defaults to training=True; tie it to the module mode so that
        # model.eval() really disables dropout. Dropout is here to curb overfitting.
        x = F.dropout(x, 0.5, training=self.training)
        # No activation on the last layer: the loss expects unrestricted logits.
        x = self.fc15(x)
        return x

In [13]:
model = VGG12()

In [224]:
model(next(iter(train_dataloader))[0])

tensor([[ 8.0340e-02,  3.0663e-02],
        [ 9.0766e-02,  9.8202e-05],
        [ 1.0704e-01,  5.9923e-02],
        [ 1.2804e-01,  3.3648e-02],
        [ 9.4983e-02, -1.6154e-02],
        [ 1.1228e-01, -2.0752e-02],
        [ 9.5117e-02,  3.1757e-02],
        [ 8.2396e-02,  1.1045e-02]], grad_fn=<AddmmBackward0>)

### Let's do a simple sanity check to see if the shape of the returned tensors is as expected, and also the output of our model!

In [215]:
next(iter(train_dataloader))[0].shape

torch.Size([64, 3, 125, 125])

In [218]:
model(next(iter(train_dataloader))[0]).shape

torch.Size([64, 2])

In [None]:
"""
Indeed, the input tensors come in batches of 64 samples, each being a 3-channel 125 x 125 matrix M.

The output, as expected, consists of 64 samples, each a 2d-tensor representing the logits, 
that'll later represent probabilities of being either quark or gluon.
"""

In [27]:
from tqdm import tqdm

def train_model(
        model,
        dataloaders,
        device: str = 'cpu',
        num_epoch: int = 50
):
    """Train ``model`` with SGD and print the mean train/validation loss of every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise. It is moved to ``device`` before training.
    dataloaders : dict
        Must provide iterables under the keys ``'train'`` and ``'val'``.
        Every batch is indexed as ``batch[0]`` (features) and ``batch[1]``
        (class-index labels).
    device : str
        Device the model and every batch are moved to.
    num_epoch : int
        Number of passes over the training loader.

    Returns
    -------
    torch.nn.Module
        The trained model (the same object that was passed in).
    """
    # Model and batches must live on the same device.
    model = model.to(device)

    loss_fun = torch.nn.CrossEntropyLoss()
    opt = torch.optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(num_epoch):
        # ---- training pass ----
        model.train()
        train_loss, train_batches = 0.0, 0
        for data in tqdm(dataloaders['train']):
            features = data[0].to(device)
            label = data[1].to(device)

            opt.zero_grad()
            loss = loss_fun(model(features), label)
            loss.backward()
            opt.step()

            train_loss += loss.item()
            train_batches += 1

        # max(..., 1) keeps an empty loader from raising on the division.
        print(f'Epoch {epoch} average train loss: {train_loss / max(train_batches, 1)}')

        # ---- validation pass (no gradients, no parameter updates) ----
        model.eval()
        val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for data in tqdm(dataloaders['val']):
                features = data[0].to(device)
                label = data[1].to(device)

                val_loss += loss_fun(model(features), label).item()
                val_batches += 1

        print(f'Epoch {epoch} average validation loss: {val_loss / max(val_batches, 1)}')

    return model

In [None]:
train_model(model, dataloaders)

9555it [1:07:18,  2.37it/s]

In [None]:
from pyspark.sql import SparkSession
import horovod.spark
import horovod.torch as hvd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os



# Define a PyTorch Dataset that reads one saved tensor file per sample
class QuarkGluonDataset(Dataset):
    """Per-sample dataset: item ``idx`` is loaded from ``X_jets/X_jet_<idx>.pt``.

    Parameters
    ----------
    data_path : str
        Directory prefix (ending with a separator). Labels are read from
        ``<data_path>X_jets/labels.npy`` and samples from
        ``<data_path>X_jets/X_jet_<idx>.pt``.
    transform : callable, optional
        Applied to every loaded sample.

    NOTE(review): this definition re-binds the name ``QuarkGluonDataset`` and
    therefore shadows the partition-caching dataset defined earlier in the
    notebook, which has a different constructor signature. The file layout
    expected here is not produced by any visible cell -- confirm before use.
    """

    def __init__(self, data_path, transform=None):
        self.transform = transform
        self.data_path = data_path + "X_jets/"
        # A .npy file is a binary NumPy array, so it is read with np.load.
        self.labels = np.load(data_path + "X_jets/labels.npy")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``."""
        # The file extension needs its dot: "X_jet_<idx>.pt".
        image = torch.load(self.data_path + "X_jet_" + str(idx) + ".pt")

        if self.transform:
            image = self.transform(image)

        return image, self.labels[idx]
        

# Define transformations for the MNIST images
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Create a PyTorch Dataset
mnist_dataset = MNISTDataset(mnist_df, transform=transform)

# Horovod: initialize library
hvd.init()
torch.cuda.set_device(hvd.local_rank())

# Horovod: adjust learning rate based on number of GPUs
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr * hvd.size())

# Horovod: wrap optimizer with DistributedOptimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

# Horovod: broadcast parameters & optimizer state
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Create a DataLoader for the MNIST dataset
train_sampler = torch.utils.data.distributed.DistributedSampler(
    mnist_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = DataLoader(mnist_dataset, batch_size=64, sampler=train_sampler)

# Define a simple neural network model
class SimpleNN(torch.nn.Module):
    """One linear layer mapping 784 input features to 10 outputs, followed by ReLU."""

    def __init__(self):
        super().__init__()
        # 784 = 28 * 28 flattened pixels of one MNIST image
        self.linear = torch.nn.Linear(784, 10)

    def forward(self, x):
        """Apply the linear layer to ``x`` and clamp negative outputs to zero."""
        projected = self.linear(x)
        return torch.relu(projected)

model = SimpleNN().cuda()

# Train the model
def train(epoch):
    """Run one training epoch over the notebook-level ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``train_sampler`` and
    ``args``; progress is printed every ``args.log_interval`` batches.
    """
    model.train()
    # Passes the epoch number on to the sampler.
    train_sampler.set_epoch(epoch)
    for step, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = torch.nn.functional.cross_entropy(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Only report on every log_interval-th batch.
        if step % args.log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), batch_loss.item()))

for epoch in range(1, args.epochs + 1):
    train(epoch)

# Close the Spark session
spark.stop()

In [72]:
def print_device_usage(device):
  """Print total, reserved, allocated and reserved-but-unallocated CUDA memory of ``device``.

  All figures are byte counts divided by 1024**3.
  """
  bytes_per_unit = 1024.0**3
  total = torch.cuda.get_device_properties(device).total_memory / bytes_per_unit
  reserved = torch.cuda.memory_reserved(device) / bytes_per_unit
  allocated = torch.cuda.memory_allocated(device) / bytes_per_unit
  # "free" here means reserved memory that is not currently allocated
  unallocated = reserved - allocated
  report = (
      ('Total memory in Gb: %.2f', total),
      ('Reserved memory in Gb: %.2f', reserved),
      ('Allocated memory in Gb: %.2f', allocated),
      ('Free memory in Gb: %.2f', unallocated),
  )
  for template, value in report:
    print(template % value)


# check if a GPU is available. Otherwise run on CPU
device = 'cpu'
args_cuda = torch.cuda.is_available()
if args_cuda: device = "cuda:0"
print('device : ',device)
if args_cuda: print_device_usage(device)

device :  cuda:0
Total memory in Gb: 11.77
Reserved memory in Gb: 0.00
Allocated memory in Gb: 0.00
Free memory in Gb: 0.00


In [74]:
# let's open the file
data_dir = 'datasets/Data-MLtutorial/JetDataset/'
fileIN = data_dir+'jetImage_7_100p_30000_40000.h5'
f = h5py.File(fileIN)
# and see what it contains
print(list(f.keys()))

['jetConstituentList', 'jetFeatureNames', 'jetImage', 'jetImageECAL', 'jetImageHCAL', 'jets', 'particleFeatureNames']


In [78]:
f['jetImageHCAL']

<HDF5 dataset "jetImageHCAL": shape (10000, 100, 100), type "<f8">

In [12]:
class VGG16_NET(nn.Module):
    """VGG-style classifier with 10 convolutions and 3 fully-connected layers.

    The last convolution group of the classic VGG-16 (conv11-conv13) is left
    out. The first linear layer takes 25088 = 512 * 7 * 7 features, so the
    four 2x2 max-pools must reduce the input to a 7x7 map. The output has
    shape ``(batch, 2)`` and holds raw logits.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1)

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)

        self.conv5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.conv8 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv9 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)

        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc14 = nn.Linear(25088, 64)
        self.fc15 = nn.Linear(64, 64)
        self.fc16 = nn.Linear(64, 2)

    def forward(self, x):
        """Return the two class logits for every sample in ``x``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.maxpool(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.maxpool(x)
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv7(x))
        x = self.maxpool(x)
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv9(x))
        x = F.relu(self.conv10(x))
        x = self.maxpool(x)
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc14(x))
        # F.dropout defaults to training=True; tie it to the module mode so that
        # model.eval() really disables dropout. Dropout is here to combat overfitting.
        x = F.dropout(x, 0.5, training=self.training)
        x = F.relu(self.fc15(x))
        x = F.dropout(x, 0.5, training=self.training)
        x = self.fc16(x)
        return x

In [None]:
# Hyperparameters for the training loop below.
# NOTE(review): num_classes and batch_size are not read by any visible cell -- confirm they are still needed.
num_classes = 2
num_epochs = 300
batch_size = 16
learning_rate = 0.0003

# NOTE(review): neither ResNet nor ResidualBlock is defined in the visible part of this notebook --
# confirm where they come from before running this cell.
model = ResNet(ResidualBlock).to(device)

# Loss and optimizer
# NOTE(review): the optimizer is bound to the parameters of the model created just above. Later cells
# delete `model` and re-create it as VGG16_NET without rebuilding the optimizer -- verify that the
# optimizer is re-created for whichever model is actually trained.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9)  

# Train the model
# NOTE(review): train_loader is only assigned in the Horovod/MNIST cell above -- confirm it is the intended loader.
total_step = len(train_loader)

In [40]:
del model

In [37]:
model = VGG16_NET()

In [None]:
import gc
device = 'cpu'
total_step = len(train_loader)
# F.pad reads the tuple as (left, right, top, bottom): 40 zero pixels are added on every
# side of the last two (spatial) dimensions.
p2d = (40, 40, 40, 40)

# NOTE(review): `optimizer` must have been built from the parameters of the current `model`;
# an optimizer created for an earlier model object will not update this one -- confirm.
for epoch in range(num_epochs):
    # Training mode for the optimisation pass.
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        images = F.pad(images, p2d, "constant", 0)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Release per-batch references before the next iteration.
        del images, labels, outputs
        torch.cuda.empty_cache()
        gc.collect()

    # Loss of the last training batch of this epoch.
    print ('Epoch [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, loss.item()))

    # Validation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            # Same padding as in training: the first linear layer of the model only
            # accepts one spatial input size.
            images = F.pad(images, p2d, "constant", 0)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        # Report the number of images actually evaluated instead of a fixed count.
        print('Accuracy of the network on the {} validation images: {} %'.format(total, 100 * correct / total))