In [100]:
import numpy as np
import pandas as pd
import matplotlib as plt
import uproot
import awkward as ak
import os
from utils import *
from tqdm import tqdm, trange

In [2]:
path  = "/Users/cpare/Downloads/val_5M/HToBB_120.root"
HtoBB = uproot.open(path)
BB_branch = HtoBB['tree']
BB_branch.keys()
awkArray_ptx = BB_branch['part_px'].array()
list_ptx  = list(awkArray_ptx)
print(list_ptx[:4])
print(list_ptx[0])

[<Array [-67.3, -48, -42, ... 0.0173, -0.52] type='53 * float32'>, <Array [-32.4, -40.6, ... -0.316, -0.328] type='68 * float32'>, <Array [-30.7, -8.82, ... -0.0422, 0.0874] type='26 * float32'>, <Array [40, 34.2, 16.2, ... 0.593, 0.598] type='48 * float32'>]
[-67.3, -48, -42, -47.5, -9.11, -4.85, ... -0.748, -0.694, -0.107, 0.0173, -0.52]


In [3]:
def getting_Data(list_hep):
    list_hep_max = findMaxLengthList(list_hep)
    design_matrix = np.zeros((1, list_hep_max))
    for sub_list in list_hep:
        sub_list = np.array(sub_list)
        pad_list = np.pad(sub_list, (0, list_hep_max - len(sub_list)), 'constant', constant_values = 0)
        design_matrix = np.vstack([design_matrix, pad_list])
    new_design_matrix = np.delete(design_matrix, 0 , axis = 0)
    return new_design_matrix




# Notes on the Branch

- **The particle features are the awkward arrays that we can't simply transform to numpy arrays**


- We only have to do zero-padding on the particle features as the nested/sub-arrays are all of different lengths.



In [4]:
jet_pt_arr = BB_branch['jet_sdmass'].array()
jet_pt_arr = np.array(jet_pt_arr)

print(jet_pt_arr)
print(type(jet_pt_arr))
print(len(jet_pt_arr))



[236.60558  244.01183  113.344536 ... 125.35571   45.36766  131.21666 ]
<class 'numpy.ndarray'>
100000


## Testing method that works efficiently

In [165]:
path  = "/Users/cpare/Downloads/val_5M/HToBB_122.root"
HtoBB = uproot.open(path)
BB_branch = HtoBB['tree']
len(BB_branch.keys())

41

In [166]:
arr_awk = BB_branch['part_px'].array()
awk_list = list(arr_awk)

Processing Data: :   0%|          | 0/50 [05:56<?, ?it/s]


In [167]:
m = findMaxLengthList(awk_list)
m

126

In [88]:
from utils import *
zero_pad_arr = zero_padding(awk_list)
zero_pad_arr.shape


(100000, 136)

In [97]:
path  = "/Users/cpare/Downloads/val_5M/HToBB_120.root"
particle_features = ["part_px", "part_py"]
open_file = uproot.open(path)
branch = open_file['tree']
particle_matrix = np.zeros((1, 136))
for i in branch.keys():
    for particle in particle_features:
        if particle in i:
            arr_awk = branch[i].array()
            awk_list = list(arr_awk)
            zero_pad_arr = zero_padding(awk_list)
            particle_matrix = np.vstack([particle_matrix, zero_pad_arr])
            updated_particle_matrix = np.delete(particle_matrix, 0 , axis = 0)


print(updated_particle_matrix.shape)


(200000, 136)


In [127]:
updated_particle_matrix[2,135]

0.0

# Shape of Particle features Matrix is [100000,136]

In [81]:
zero_pad_arr.shape

(100000, 136)

## Testing validation (if file is already downloaded)

In [133]:
jet_type = ["HToBB"]
jet_features = ["jet_pt", "jet_eta", "jet_phi"]
particle_features = ["part_px"]
data_dir = "/Users/cpare/Downloads"

In [65]:
jet_type[0]

'HToBB'

In [180]:
def getData(jet_type, jet_features, particle_features, data_dir):
    dataset_name = "JetClass Validation Set"
    file_download_name = "Val_5M"
    key = "JetClass_Pythia_val_5M.tar"
    record_id = 6619768
    jet_matrix = np.zeros((1, 100000))
    particle_matrix = np.zeros((1, 136))
    file_path = checkDownloadZenodoDataset(data_dir, dataset_name, record_id, key, file_download_name)
    print("Processing Data: ...")
    for jet_file in os.listdir(file_path):
        f = os.path.join(file_path, jet_file)
        for jet in jet_type:
            if jet in f:
                open_file = uproot.open(f)
                branch = open_file['tree']
                for i in branch.keys():
                    for feature in jet_features:
                        if feature in i:
                            arr = branch[i].array()
                            arr = np.array(arr)
                            jet_matrix = np.vstack([jet_matrix, arr])
                    for particle in particle_features:
                        if particle in i:
                            arr_awk = branch[i].array()
                            awk_list = list(arr_awk)
                            zero_pad_arr = zero_padding(awk_list)
                            length_curr = findMaxLengthList(zero_pad_arr)
                            length_matrix = findMaxLengthList(particle_matrix)
                            zeros = np.zeros(100001)
                            if (length_curr > length_matrix) :
                                zeros = np.zeros(100001)
                                diff = length_curr - length_matrix
                                for i in range(diff):
                                    particle_matrix = np.column_stack((particle_matrix,zeros))
                            elif (length_curr < length_matrix):
                                zeros = np.zeros(100000)
                                diff = length_matrix - length_curr
                                for i in range(diff):
                                    zero_pad_arr = np.column_stack((zero_pad_arr,zeros))
                            particle_matrix = np.vstack([particle_matrix, zero_pad_arr])
                            updated_particle_matrix = np.delete(particle_matrix, 0 , axis = 0)   
                
    updated_jet_matrix = np.delete(jet_matrix, 0 , axis = 0)                
    dim1 = updated_jet_matrix.shape[0]
    dim2 = updated_jet_matrix.shape[1]
    dim_res = dim1/len(jet_features)
    dim = int(dim_res * dim2)
    return updated_jet_matrix.reshape(dim,len(jet_features)) , updated_particle_matrix

                      




In [181]:
jet_data, particle_data = getData(jet_type, jet_features, particle_features,data_dir)
## Combibing the first axis, instead having 500k by 3
jet_data.shape

Processing Data: ...


Processing Data: :   0%|          | 0/50 [14:32<?, ?it/s]


KeyboardInterrupt: 

In [177]:
jet_data.shape

(500000, 3)

## To-DO List

- [x] Generalize to more Jet_types
- [x] Generalize to more Jet_features
- [x]  Generalize to particle features
- [x] Generalize for validation dataset
- [] Write Class
- [x] Use tqdm library to show progress bars
- [] TEST Clas

In [150]:
a = np.array([1,2,3])
b = np.array((2,3,4))
zeros = np.zeros(3)
c = np.column_stack((a,b))
c = np.column_stack((c,zeros))
c


array([[1., 2., 0.],
       [2., 3., 0.],
       [3., 4., 0.]])

In [152]:
for i in range(6):
    print(i)

0
1
2
3
4
5


In [154]:
xd = np.zeros(5)
xd

array([0., 0., 0., 0., 0.])

In [184]:
class JetClass:
    """
    PyTorch ``torch.unit.data.Dataset`` class for the JetClass dataset.
    If root files are not found in the ``data_dir`` directory then dataset will be downloaded
    from Zenodo (https://zenodo.org/record/6975118 or https://zenodo.org/record/6975117).
    Args:
        jet_type (Union[str, Set[str]], optional): individual type or set of types out of
            'g' (gluon), 'q' (light quarks), 't' (top quarks), 'w' (W bosons), or 'z' (Z bosons).
            "all" will get all types. Defaults to "all".
        data_dir (str, optional): directory in which data is (to be) stored. Defaults to "./".
        particle_features (List[str], optional): list of particle features to retrieve. If empty
            or None, gets no particle features. Defaults to
            ``["etarel", "phirel", "ptrel", "mask"]``.
        jet_features (List[str], optional): list of jet features to retrieve.  If empty or None,
            gets no jet features. Defaults to
            ``["type", "pt", "eta", "mass", "num_particles"]``.
        particle_normalisation (NormaliseABC, optional): optional normalisation to apply to
            particle data. Defaults to None.
        jet_normalisation (NormaliseABC, optional): optional normalisation to apply to jet data.
            Defaults to None.
        particle_transform (callable, optional): A function/transform that takes in the particle
            data tensor and transforms it. Defaults to None.
        jet_transform (callable, optional): A function/transform that takes in the jet
            data tensor and transforms it. Defaults to None.
        num_particles (int, optional): number of particles to retain per jet, max of 150.
            Defaults to 30.
        split (str, optional): dataset split, out of {"train", "valid", "test", "all"}. Defaults
            to "train".
        split_fraction (List[float], optional): splitting fraction of training, validation,
            testing data respectively. Defaults to [0.7, 0.15, 0.15].
        seed (int, optional): PyTorch manual seed - important to use the same seed for all
            dataset splittings. Defaults to 42.
    """

    zenodo_record_id = 6619768

    jet_type = ["HtoBB", "HtoCC", "HtoGG", "HtoWW", "HtoWW2Q1L", "HtoWW4Q", "TTBar", "TTBarLep", 
                "WtoQQ", "ZJetstoNuNu", "ZtoQQ"]
    all_particle_features = ["part_px", "part_py", "part_pz", "part_energy", "part_deta", "part_dphi", "part_d0val", "part_d0err", "part_dzval",
                             "part_dzerr", "part_charge", "part_isChargedHadron", "part_isNeutralHadron", "part_isPhoton", "part_isElectron", "part_isMuon"]
    all_jet_features = ["jet_pt", "jet_eta", "jet_phi", "jet_energy", "jet_nparticles", "jet_sdmass", "jet_tau1", "jet_tau2", "jet_tau3", "jet_tau4"]
    splits = ["train", "valid", "test", "all"]

    def __init__(
        self,
        jet_type: Union[str, Set[str]] = "all",
        data_dir: str = "./",
        particle_features: List[str] = all_particle_features,
        jet_features: List[str] = all_jet_features,
        split: str = "train",
        split_fraction: List[float] = [0.7, 0.15, 0.15],
        seed: int = 42,
    ):
        self.particle_data, self.jet_data = self.getData(
            jet_type,
            data_dir,
            particle_features,
            jet_features
        )

        super().__init__(
            data_dir=data_dir,
            particle_features=particle_features,
            jet_features=jet_features
        )

        self.jet_type = jet_type
        self.split = split
        self.split_fraction = split_fraction

    @classmethod
    def getData(jet_type, data_dir, particle_features, jet_features):
        dataset_name = "JetClass Validation Set"
        file_download_name = "Val_5M"
        key = "JetClass_Pythia_val_5M.tar"
        record_id = 6619768
        jet_matrix = np.zeros((1, 100000))
        particle_matrix = np.zeros((1, 136))
        file_path = checkDownloadZenodoDataset(data_dir, dataset_name, record_id, key, file_download_name)
        print("Processing Data: ...")
        for jet_file in os.listdir(file_path):
            f = os.path.join(file_path, jet_file)
            for jet in jet_type:
                if jet in f:
                    open_file = uproot.open(f)
                    branch = open_file['tree']
                    for i in branch.keys():
                        for feature in jet_features:
                            if feature in i:
                                arr = branch[i].array()
                                arr = np.array(arr)
                                jet_matrix = np.vstack([jet_matrix, arr])
                        for particle in particle_features:
                            if particle in i:
                                arr_awk = branch[i].array()
                                awk_list = list(arr_awk)
                                zero_pad_arr = zero_padding(awk_list)
                                length_curr = findMaxLengthList(zero_pad_arr)
                                length_matrix = findMaxLengthList(particle_matrix)
                                zeros = np.zeros(100001)
                                if (length_curr > length_matrix) :
                                    zeros = np.zeros(100001)
                                    diff = length_curr - length_matrix
                                    for i in range(diff):
                                        particle_matrix = np.column_stack((particle_matrix,zeros))
                                elif (length_curr < length_matrix):
                                    zeros = np.zeros(100000)
                                    diff = length_matrix - length_curr
                                    for i in range(diff):
                                        zero_pad_arr = np.column_stack((zero_pad_arr,zeros))
                                particle_matrix = np.vstack([particle_matrix, zero_pad_arr])
                                updated_particle_matrix = np.delete(particle_matrix, 0 , axis = 0)   
                
        updated_jet_matrix = np.delete(jet_matrix, 0 , axis = 0)                
        dim1 = updated_jet_matrix.shape[0]
        dim2 = updated_jet_matrix.shape[1]
        dim_res = dim1/len(jet_features)
        dim = int(dim_res * dim2)
        return updated_jet_matrix.reshape(dim,len(jet_features)) , updated_particle_matrix


In [185]:
model = JetClass()

TypeError: getData() takes 4 positional arguments but 5 were given