diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ccea816fa..e22af0ddb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,12 +10,12 @@ on: jobs: deps: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt deps-pyg: @@ -25,68 +25,68 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu - run: pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-1.13.0+cpu.html tf-clic-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: ./scripts/local_test_clic_pipeline.sh tf-clic-hits-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: ./scripts/local_test_clic_hits_pipeline.sh tf-delphes-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: ./scripts/local_test_delphes_pipeline.sh tf-cms-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: ./scripts/local_test_cms_pipeline.sh pyg-cms-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps-pyg] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu @@ -94,13 +94,13 @@ jobs: - run: ./scripts/local_test_pyg_cms.sh pyg-delphes-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps-pyg] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu @@ -108,13 +108,13 @@ jobs: - run: ./scripts/local_test_pyg_delphes.sh pyg-clic-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps-pyg] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu @@ -122,13 +122,13 @@ jobs: - run: ./scripts/local_test_pyg_clic.sh pyg-ssl-pipeline: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [deps-pyg] steps: - 
uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.10' cache: 'pip' - run: pip install -r requirements.txt - run: pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/.gitignore b/.gitignore index c265ed700..2caa84efa 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,9 @@ nohup.out slurm-*.out .vscode + +models + +*.root + +logs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62be6ee40..4b8a0cdf0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,4 +46,4 @@ repos: # E203 is not PEP8 compliant # E402 due to logging.basicConfig in pipeline.py args: ['--max-line-length=125', # github viewer width - '--extend-ignore=E203,E402'] + '--extend-ignore=E203,E402,W605'] diff --git a/apptainer/python_base.txt b/apptainer/python_base.txt index 086caa0e1..6984a5bb6 100644 --- a/apptainer/python_base.txt +++ b/apptainer/python_base.txt @@ -25,6 +25,8 @@ imageio-ffmpeg ipyparallel isort jupyter +jupyterlab +kaleido line_profiler lmfit lz4 @@ -45,6 +47,7 @@ pre-commit pyarrow pydot pygraphviz +pyhf pymultinest pynbody pytest diff --git a/apptainer/python_tf.txt b/apptainer/python_tf.txt index 306d9a06d..8bbf178af 100644 --- a/apptainer/python_tf.txt +++ b/apptainer/python_tf.txt @@ -12,6 +12,5 @@ tensorflow-datasets tensorflow-estimator tensorflow-model-optimization tensorflow-text -tf-models-official tf2onnx transformers diff --git a/apptainer/tf-2.13.0.singularity b/apptainer/tf-2.13.0.singularity index ad717f289..07b724439 100644 --- a/apptainer/tf-2.13.0.singularity +++ b/apptainer/tf-2.13.0.singularity @@ -3,11 +3,12 @@ Bootstrap: docker From: tensorflow/tensorflow:2.13.0-gpu-jupyter %files - python_base.txt /opt/python_base.txt - python_tf.txt /opt/python_tf.txt + specs/python_base.txt /opt/python_base.txt + specs/python_tf.txt /opt/python_tf.txt %post apt update -y --fix-missing + DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata apt install -y make cmake parallel gcc g++ gfortran binutils apt install -y libblas3 libblas-dev liblapack3 liblapack-dev libatlas3-base libatlas-base-dev apt install -y libtcmalloc-minimal4 @@ -19,6 +20,8 @@ From: tensorflow/tensorflow:2.13.0-gpu-jupyter python3 -m pip install -r /opt/python_base.txt python3 -m pip install -r /opt/python_tf.txt python3 -m pip install hls4ml[profiling] + HOROVOD_WITH_TENSORFLOW=1 python3 -m pip install horovod[tensorflow,keras] + python3 -m pip install open3d-cpu %environment export PIP_DEFAULT_TIMEOUT=500 diff --git a/mlpf/data_clic/__init__.py b/mlpf/data_clic/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpf/data_clic/postprocessing.py b/mlpf/data_clic/postprocessing.py deleted file mode 100644 index 1526b1974..000000000 --- a/mlpf/data_clic/postprocessing.py +++ /dev/null @@ -1,321 +0,0 @@ -import awkward -import networkx as nx -import numpy as np - -# 12,14,16 are neutrinos. 
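Note on the flake8 change above: W605 flags invalid escape sequences (e.g. "\m" inside non-raw strings), which the LaTeX label strings added to mlpf/plotting/plot_utils.py later in this diff would otherwise trigger. A minimal sketch of the two equivalent spellings:

# Sketch: W605 fires on non-raw strings containing LaTeX escapes such
# as "\m"; raw strings avoid the warning without the flake8 ignore.
label_plain = "$p_{\mathrm{T}}^{\mathrm{miss}}$ [GeV]"   # triggers W605
label_raw = r"$p_{\mathrm{T}}^{\mathrm{miss}}$ [GeV]"    # no warning
assert label_plain == label_raw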
-neutrinos = [12, 14, 16] - -# this is what I can reconstruct -labels_ys_cand = [0, 211, 130, 22, 11, 13] -labels_ys_gen = [0, 211, 130, 22, 11, 13] - - -def map_pdgid_to_candid(pdgid, charge): - if pdgid in [0, 22, 11, 13]: - return pdgid - - # charged hadron - if abs(charge) > 0: - return 211 - - # neutral hadron - return 130 - - -def track_pt(omega): - a = 3 * 10**-4 - b = 5 # B-field in tesla - - return a * np.abs(b / omega) - - -# this defines the track features -def track_as_array(df_tr, itr): - row = df_tr[itr] - return np.array( - [ - 1, # tracks are type 1 - row["px"], - row["py"], - row["pz"], - row["nhits"], - row["d0"], - row["z0"], - row["dedx"], - row["radius_innermost_hit"], - row["tan_lambda"], - row["nhits"], - row["chi2"], - ] - ) - - -# this defines the cluster features -def cluster_as_array(df_cl, icl): - row = df_cl[icl] - return np.array( - [ - 2, - row["x"], - row["y"], - row["z"], - row["nhits_ecal"], - row["nhits_hcal"], - row["energy"], - ] # clusters are type 2 - ) - - -# this defines the genparticle features -def gen_as_array(df_gen, igen): - if igen: - row = df_gen[igen] - return np.array( - [ - abs(row["pdgid"]), - row["charge"], - row["px"], - row["py"], - row["pz"], - row["energy"], - ] - ) - else: - return np.zeros(6) - - -# this defines the PF particle features -def pf_as_array(df_pfs, igen): - if igen: - row = df_pfs[igen] - return np.array( - [ - abs(row["type"]), - row["charge"], - row["px"], - row["py"], - row["pz"], - row["energy"], - ] - ) - else: - return np.zeros(6) - - -def filter_gp(df_gen, gp): - row = df_gen[gp] - # status 1 is stable particle in this case - # energy cutoff 0.2 is arbitrary and might need to be tuned - if row["status"] == 1 and row["energy"] > 0.2: - return True - return False - - -def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs): - Xs_tracks = [] - Xs_clusters = [] - ys_gen = [] - ys_cand = [] - - # find all track-associated particles - for itr in range(len(df_tr)): - - k = ("tr", itr) - gp = None - rp = None - if k in pairs: - gp = pairs[k][0] - rp = pairs[k][1] - - # normalize ysgen and yscand - ys = gen_as_array(df_gen, gp) - cand = pf_as_array(df_pfs, rp) - - # skip the neutrinos - if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos): - continue - else: - ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[1])) - cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[1])) - - ys_gen.append(ys) - ys_cand.append(cand) - Xs_tracks.append(track_as_array(df_tr, itr)) - - # find all cluster-associated particles - for icl in range(len(df_cl)): - - k = ("cl", icl) - gp = None - rp = None - if k in pairs: - gp = pairs[k][0] - rp = pairs[k][1] - - # normalize ysgen and yscand - ys = gen_as_array(df_gen, gp) - cand = pf_as_array(df_pfs, rp) - # skip the neutrinos - if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos): - continue - else: - ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[1])) - cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[1])) - - ys_gen.append(ys) - ys_cand.append(cand) - Xs_clusters.append(cluster_as_array(df_cl, icl)) - - Xs_clusters = np.stack(Xs_clusters, axis=-1).T # [Nclusters, Nfeat_cluster] - Xs_tracks = np.stack(Xs_tracks, axis=-1).T # [Ntracks, Nfeat_track] - - # Here we pad the tracks and clusters to the same shape along the feature dimension - if Xs_tracks.shape[1] > Xs_clusters.shape[-1]: - Xs_clusters = np.pad( - Xs_clusters, - [(0, 0), (0, Xs_tracks.shape[1] - Xs_clusters.shape[-1])], - ) - elif Xs_tracks.shape[1] < 
Xs_clusters.shape[-1]: - Xs_clusters = np.pad( - Xs_clusters, - [(0, 0), (0, Xs_clusters.shape[-1] - Xs_tracks.shape[1])], - ) - - Xs = np.concatenate([Xs_tracks, Xs_clusters], axis=0) # [Ntracks+Nclusters, max(Nfeat_cluster, Nfeat_track)] - ys_gen = np.stack(ys_gen, axis=-1).T - ys_cand = np.stack(ys_cand, axis=-1).T - - return Xs, ys_gen, ys_cand - - -def prepare_data_clic(fn): - """ - Processing function that takes as input a raw parquet file and processes it. - - Returns - a list of events, each containing three arrays [Xs, ygen, ycand]. - - """ - - data = awkward.from_parquet(fn) - - ret = [] - # loop over the events in the dataset - for iev in range(len(data)): - df_gen = data[iev]["genparticles"] - - df_cl = data[iev]["clusters"] - df_tr = data[iev]["tracks"] - df_pfs = data[iev]["pfs"] - # print("Clusters={}, tracks={}, PFs={}, Gen={}".format(len(df_cl), len(df_tr), len(df_pfs), len(df_gen))) - - # skip events that don't have enough activity from training - if len(df_pfs) < 2 or len(df_gen) < 2 or len(df_tr) < 2 or len(df_cl) < 2: - continue - - # compute pt, px,py,pz - df_tr["pt"] = track_pt(df_tr["omega"]) - df_tr["px"] = np.cos(df_tr["phi"]) * df_tr["pt"] - df_tr["py"] = np.sin(df_tr["phi"]) * df_tr["pt"] - df_tr["pz"] = df_tr["tan_lambda"] * df_tr["pt"] - - # fill track/cluster to genparticle contributions - matrix_tr_to_gp = np.zeros((len(df_tr), len(df_gen))) - matrix_cl_to_gp = np.zeros((len(df_cl), len(df_gen))) - - for itr in range(len(df_tr)): - gps = df_tr[itr]["gp_contributions"] - for gp, val in zip(gps["0"], gps["1"]): - matrix_tr_to_gp[itr, int(gp)] += val - - for icl in range(len(df_cl)): - gps = df_cl[icl]["gp_contributions"] - for gp, val in zip(gps["0"], gps["1"]): - matrix_cl_to_gp[icl, int(gp)] += val - - # fill track/cluster to PF map - reco_to_pf = {} - for ipf in range(len(df_pfs)): - row = df_pfs[ipf] - if row["track_idx"] != -1: - k = ("tr", int(row["track_idx"])) - assert not (k in reco_to_pf) - reco_to_pf[k] = ipf - elif row["cluster_idx"] != -1: - k = ("cl", int(row["cluster_idx"])) - assert not (k in reco_to_pf) - reco_to_pf[k] = ipf - else: - # PF should always have a track or a cluster associated - assert False - - dg = nx.Graph() - gps = set() - - # loop over clusters, get all genparticles associated to clusters - for icl in range(len(df_cl)): - dg.add_node(("cl", icl)) - gp_contrib = df_cl[icl]["gp_contributions"] - for gp, weight in zip(gp_contrib["0"], gp_contrib["1"]): - gp = int(gp) - if filter_gp(df_gen, gp): - dg.add_node(("gp", gp)) - gps.add(gp) - dg.add_edge(("gp", gp), ("cl", icl), weight=weight) - - # loop over tracks, get all genparticles associated to tracks - for itr in range(len(df_tr)): - dg.add_node(("tr", itr)) - gp_contrib = df_tr[itr]["gp_contributions"] - for gp, weight in zip(gp_contrib["0"], gp_contrib["1"]): - gp = int(gp) - if filter_gp(df_gen, gp): - dg.add_node(("gp", gp)) - gps.add(gp) - - # the track is added to the genparticle with a very high weight - # because we always want to associate the genparticle to a track if it's possible - dg.add_edge(("gp", gp), ("tr", itr), weight=9999.0) - - # uniqe genparticles - gps = set(gps) - - # now loop over all the genparticles - pairs = {} - for gp in gps: - gp_node = ("gp", gp) - - # find the neighboring reco elements (clusters and tracks) - neighbors = list(dg.neighbors(gp_node)) - weights = [dg.edges[gp_node, n]["weight"] for n in neighbors] - nw = zip(neighbors, weights) - - # sort the neighbors by the edge weight (deposited energy) - nw = sorted(nw, key=lambda x: x[1], 
reverse=True) - reco_obj = None - if len(nw) > 0: - # choose the closest neighbor as the "key" reco element - reco_obj = nw[0][0] - - # remove the reco element from the list, so it can't be associated to anything else - dg.remove_node(reco_obj) - - # this genparticle had a unique reco element - if reco_obj: - pf_obj = None - if reco_obj and reco_obj in reco_to_pf: - pf_obj = reco_to_pf[reco_obj] - - assert not (reco_obj in pairs) - pairs[reco_obj] = (gp, pf_obj) - - # this is a case where a genparticle did not have a key reco element, but instead was smeared between others - # else: - # print("genparticle {} is merged and cannot be reconstructed".format(gp)) - # print(df_gen.loc[gp]) - - Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs) - - ret.append([Xs, ys_gen, ys_cand]) - - return ret diff --git a/mlpf/heptfds/clic_pf_edm4hep/qq.py b/mlpf/heptfds/clic_pf_edm4hep/qq.py index fbe2aa345..c723f6a62 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep/qq.py @@ -10,17 +10,24 @@ ) import tensorflow_datasets as tfds +import numpy as np _DESCRIPTION = """ -CLIC EDM4HEP dataset with ee -> gamma/Z* -> quarks +CLIC EDM4HEP dataset with ee -> gamma/Z* -> quarks at 380GeV. + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.4.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", @@ -28,11 +35,19 @@ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): "1.3.0": "Update stats to ~1M events", "1.3.1": "Update stats to ~2M events", "1.4.0": "Fix ycand matching", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
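The postprocessing code deleted above converts track helix parameters to momenta. For reference, a self-contained sketch of that conversion, assuming a 5 T field and the same unit convention as the removed track_pt (pT [GeV] = 3e-4 * |B / omega|):

import numpy as np

def track_momentum(omega, phi, tan_lambda, b_field=5.0):
    # pT from track curvature omega, as in the removed track_pt()
    pt = 3e-4 * np.abs(b_field / omega)
    # px, py, pz as computed in the removed prepare_data_clic()
    px = pt * np.cos(phi)
    py = pt * np.sin(phi)
    pz = pt * tan_lambda
    return pt, px, py, pz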
+ + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmQqPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( @@ -47,8 +62,8 @@ def _info(self) -> tfds.core.DatasetInfo: ), dtype=tf.float32, ), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), } ), supervised_keys=None, diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 09fb0c9fb..21bf35966 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -12,26 +12,40 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with ttbar +CLIC EDM4HEP dataset with ee -> ttbar at 380GeV. + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.4.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", "1.2.0": "sin/cos phi separately", "1.3.0": "Update stats to ~1M events", "1.4.0": "Fix ycand matching", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmTtbarPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py index b5993434a..3c79e04db 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py @@ -12,23 +12,38 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with ttbar + PU10 +CLIC EDM4HEP dataset with ee -> ttbar + PU10 at 380 GeV. +PU is generated with ee->gg, overlaying random events from Poisson(10). + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). 
+Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmTtbarPu10Pf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.4.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.3.0": "Update stats to ~1M events", "1.4.0": "Fix ycand matching", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmTtbarPu10Pf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index 2d07aace0..ef5baff85 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -51,7 +51,7 @@ def split_sample(path, test_frac=0.8): files = sorted(list(path.glob("*.parquet"))) - print("Found {} files in {}".format(files, path)) + print("Found {} files in {}".format(len(files), path)) assert len(files) > 0 idx_split = int(test_frac * len(files)) files_train = files[:idx_split] diff --git a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py index b4db98f30..1054d41e0 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with WW fullhad +CLIC EDM4HEP dataset with ee -> WW -> fully hadronic at 380 GeV. + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmWwFullhadPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.4.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.3.0": "Update stats to ~1M events", "1.4.0": "Fix ycand matching", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
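The split_sample fix above prints the number of files rather than the full file list. Note the parameter semantics: test_frac is in fact the fraction assigned to the train split. A minimal sketch of that (unchanged) logic, using a hypothetical data directory:

from pathlib import Path

# test_frac=0.8 puts the first 80% of the sorted files into train.
files = sorted(Path("clic_edm_qq_pf").glob("*.parquet"))  # hypothetical path
idx_split = int(0.8 * len(files))
files_train, files_test = files[:idx_split], files[idx_split:]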
+ + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmWwFullhadPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep/zh.py b/mlpf/heptfds/clic_pf_edm4hep/zh.py index a97ec64ca..e8247fa76 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/zh.py +++ b/mlpf/heptfds/clic_pf_edm4hep/zh.py @@ -13,22 +13,36 @@ _DESCRIPTION = """ CLIC EDM4HEP dataset with ZH->tautau + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmZhTautauPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.4.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.3.0": "First version", "1.4.0": "Fix ycand matching", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmZhTautauPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py index 126da7cff..fdf43fd57 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py @@ -12,25 +12,39 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with qq with raw hits +CLIC EDM4HEP dataset with qq with raw calorimeter hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmQqHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": "Initial release", "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
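The recurring __init__ override in these builders pins the on-disk format to ARRAY_RECORD, which supports random access. A consumer-side sketch, assuming the builder is registered and the dataset has been built locally (as_data_source is the TFDS entry point for random-access formats):

import tensorflow_datasets as tfds

# ARRAY_RECORD datasets are read back as an indexable data source
# rather than a sequential tf.data pipeline.
builder = tfds.builder("clic_edm_qq_pf")
ds = builder.as_data_source(split="train")
print(len(ds), ds[0]["X"].shape)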
+ + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmQqHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py index bafdb70ef..22ff25578 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py @@ -12,20 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single electron with raw hits +CLIC EDM4HEP dataset with single electron with raw calorimeter hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSingleElectronHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") - RELEASE_NOTES = {"1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticels"} + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.1.0": "Remove track referencepoint feature", + "1.2.0": "Keep all interacting genparticels", + "1.5.0": "Regenerate with ARRAY_RECORD", + } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleElectronHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py index 9cde2e27d..ab4493370 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single gamma with raw hits +CLIC EDM4HEP dataset with single gamma with raw calorimeter hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. 
https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSingleGammaHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleGammaHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py index 54b47d28b..a5cc947f3 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single kaon0L with raw hits +CLIC EDM4HEP dataset with single kaon0L with raw calorimeter hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSingleKaon0lHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleKaon0lHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py index 346efb2b4..a40cc466d 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py @@ -12,20 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single muon with raw hits +CLIC EDM4HEP dataset with single muon with raw hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. 
https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSingleMuonHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") - RELEASE_NOTES = {"1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticels"} + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.1.0": "Remove track referencepoint feature", + "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", + } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleMuonHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py index e9a9948df..ef9569259 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single neutron with raw hits +CLIC EDM4HEP dataset with single neutron with raw hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSingleNeutronHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleNeutronHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py index 74d812d53..8b4ca7b0e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single pi- with raw hits +CLIC EDM4HEP dataset with single pi- with raw hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. 
https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSinglePiHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSinglePiHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py index 611948b27..6570fd8a6 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py @@ -12,23 +12,37 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with single pi0 with raw hits +CLIC EDM4HEP dataset with single pi0 with raw hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmSinglePi0HitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSinglePi0HitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py index 21f9be93e..0c9cb3d4e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py @@ -12,25 +12,39 @@ import tensorflow_datasets as tfds _DESCRIPTION = """ -CLIC EDM4HEP dataset with ttbar with raw hits +CLIC EDM4HEP dataset with ttbar with raw hits. + - X: reconstructed tracks and calorimeter hits, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event """ _CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. 
https://doi.org/10.5281/zenodo.8260741 """ class ClicEdmTtbarHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.2.0") + VERSION = tfds.core.Version("1.5.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": "Initial release", "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", + "1.5.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmTtbarHitsPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" return tfds.core.DatasetInfo( diff --git a/mlpf/heptfds/cms_pf/cms_pf_test.py b/mlpf/heptfds/cms_pf/cms_pf_test.py deleted file mode 100644 index 4172ad271..000000000 --- a/mlpf/heptfds/cms_pf/cms_pf_test.py +++ /dev/null @@ -1,25 +0,0 @@ -"""cms_pf dataset.""" -import tensorflow_datasets as tfds - -from . import cms_pf - - -class CmsPfTest(tfds.testing.DatasetBuilderTestCase): - """Tests for cms_pf dataset.""" - - # TODO(cms_pf): - DATASET_CLASS = cms_pf.CmsPf - SPLITS = { - "train": 3, # Number of fake train example - "test": 1, # Number of fake test example - } - - # If you are calling `download/download_and_extract` with a dict, like: - # dl_manager.download({'some_key': 'http://a.org/out.txt', ...}) - # then the tests needs to provide the fake output paths relative to the - # fake data directory - # DL_EXTRACT_RESULT = {'some_key': 'output_file1.txt', ...} - - -if __name__ == "__main__": - tfds.testing.test_main() diff --git a/mlpf/heptfds/cms_pf/qcd.py b/mlpf/heptfds/cms_pf/qcd.py index b098ef631..c9bc22747 100644 --- a/mlpf/heptfds/cms_pf/qcd.py +++ b/mlpf/heptfds/cms_pf/qcd.py @@ -21,18 +21,23 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", "1.4.0": "Add gen jet index information", "1.5.0": "No padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfQcd, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/qcd_high_pt.py b/mlpf/heptfds/cms_pf/qcd_high_pt.py index 02b65d4a1..b6b0c2dbf 100644 --- a/mlpf/heptfds/cms_pf/qcd_high_pt.py +++ b/mlpf/heptfds/cms_pf/qcd_high_pt.py @@ -21,18 +21,23 @@ class CmsPfQcdHighPt(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd_high_pt dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ FIXME """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + 
super(CmsPfQcdHighPt, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singleele.py b/mlpf/heptfds/cms_pf/singleele.py index 554c128e9..ad2b36606 100644 --- a/mlpf/heptfds/cms_pf/singleele.py +++ b/mlpf/heptfds/cms_pf/singleele.py @@ -21,7 +21,7 @@ class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleele dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Initial release.", @@ -29,11 +29,16 @@ class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleElectronFlatPt1To100_pythia8_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleElectron, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singlegamma.py b/mlpf/heptfds/cms_pf/singlegamma.py index 2786c29cd..48853f59c 100644 --- a/mlpf/heptfds/cms_pf/singlegamma.py +++ b/mlpf/heptfds/cms_pf/singlegamma.py @@ -21,18 +21,23 @@ class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlegamma dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleGammaFlatPt10To100_pythia8_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleGamma, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singlemu.py b/mlpf/heptfds/cms_pf/singlemu.py index e19fd30b7..fb6cee68e 100644 --- a/mlpf/heptfds/cms_pf/singlemu.py +++ b/mlpf/heptfds/cms_pf/singlemu.py @@ -21,18 +21,23 @@ class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlemu dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleMuFlatLogPt_100MeVto2TeV_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleMu, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies 
the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singleneutron.py b/mlpf/heptfds/cms_pf/singleneutron.py index 4952f4a83..560cc145e 100644 --- a/mlpf/heptfds/cms_pf/singleneutron.py +++ b/mlpf/heptfds/cms_pf/singleneutron.py @@ -21,18 +21,23 @@ class CmsPfSingleNeutron(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleneutron dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleNeutronFlatPt0p7To1000_cfi/data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleNeutron, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singlepi.py b/mlpf/heptfds/cms_pf/singlepi.py index b97302cc0..be5cc6b94 100644 --- a/mlpf/heptfds/cms_pf/singlepi.py +++ b/mlpf/heptfds/cms_pf/singlepi.py @@ -21,7 +21,7 @@ class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -29,11 +29,16 @@ class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): "1.4.0": "Add genjet information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePiMinusFlatPt0p7To1000_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSinglePi, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singlepi0.py b/mlpf/heptfds/cms_pf/singlepi0.py index f2391019e..d4b242f7b 100644 --- a/mlpf/heptfds/cms_pf/singlepi0.py +++ b/mlpf/heptfds/cms_pf/singlepi0.py @@ -21,18 +21,23 @@ class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi0 dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePi0E10_pythia8_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSinglePi0, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singleproton.py b/mlpf/heptfds/cms_pf/singleproton.py index a78608370..a0b20a896 100644 --- 
a/mlpf/heptfds/cms_pf/singleproton.py +++ b/mlpf/heptfds/cms_pf/singleproton.py @@ -23,18 +23,23 @@ class CmsPfSingleProton(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleproton dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.4.0": "Add gen jet index information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleProtonMinusFlatPt0p7To1000_cfi/data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleProton, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/singletau.py b/mlpf/heptfds/cms_pf/singletau.py index fc702f513..b9403397c 100644 --- a/mlpf/heptfds/cms_pf/singletau.py +++ b/mlpf/heptfds/cms_pf/singletau.py @@ -23,18 +23,23 @@ class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singletau dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.1.0": "Add muon type, fix electron GSF association", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", "1.4.0": "Add genjet information", "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleTauFlatPt1To1000_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfSingleTau, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py index ea0fe3f4c..16c367436 100644 --- a/mlpf/heptfds/cms_pf/ttbar.py +++ b/mlpf/heptfds/cms_pf/ttbar.py @@ -21,7 +21,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf dataset.""" - VERSION = tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -31,12 +31,17 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): "1.4.0": "Add gen jet index information", "1.5.0": "No padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ mkdir -p data rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfTtbar, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/cms_pf/ztt.py b/mlpf/heptfds/cms_pf/ztt.py index 7c6b50c03..0f0fff4ac 100644 --- a/mlpf/heptfds/cms_pf/ztt.py +++ b/mlpf/heptfds/cms_pf/ztt.py @@ -21,19 +21,24 @@ class CmsPfZtt(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_ztt dataset.""" - VERSION = 
tfds.core.Version("1.5.1") + VERSION = tfds.core.Version("1.6.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", "1.4.0": "Add gen jet index information", "1.5.0": "No padding", "1.5.1": "Remove outlier caps", + "1.6.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ mkdir -p data rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi data/ """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfZtt, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object diff --git a/mlpf/heptfds/delphes_pf/delphes_pf.py b/mlpf/heptfds/delphes_pf/delphes_data_pf.py similarity index 66% rename from mlpf/heptfds/delphes_pf/delphes_pf.py rename to mlpf/heptfds/delphes_pf/delphes_data_pf.py index 4ee91f400..a66ff9780 100644 --- a/mlpf/heptfds/delphes_pf/delphes_pf.py +++ b/mlpf/heptfds/delphes_pf/delphes_data_pf.py @@ -1,10 +1,8 @@ from pathlib import Path -import tensorflow as tf -import tqdm +from utils_delphes import prepare_data_delphes, X_FEATURES, Y_FEATURES import tensorflow_datasets as tfds - -from delphes_utils import prepare_data_delphes, X_FEATURES, Y_FEATURES +import numpy as np _DESCRIPTION = """ Dataset generated with Delphes. @@ -12,41 +10,38 @@ TTbar and QCD events with PU~200. """ -# TODO(delphes_pf): BibTeX citation _CITATION = """ https://zenodo.org/record/4559324#.YTs853tRVH4 """ -class DelphesPf(tfds.core.GeneratorBasedBuilder): - """DatasetBuilder for delphes_pf dataset.""" - - VERSION = tfds.core.Version("1.1.0") +class DelphesDataPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Do not pad events to the same size", + "1.2.0": "Regenerate with ARRAY_RECORD", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ Download from https://zenodo.org/record/4559324#.YTs853tRVH4 """ + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(DelphesDataPf, self).__init__(*args, **kwargs) + def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - # TODO(delphes_pf): Specifies the tfds.core.DatasetInfo object return tfds.core.DatasetInfo( builder=self, description=_DESCRIPTION, features=tfds.features.FeaturesDict( { - "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=np.float32), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), } ), - # If there's a common (input, target) tuple from the - # features, specify them here. They'll be used if - # `as_supervised=True` in `builder.as_dataset`. 
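Two related changes in the renamed delphes_data_pf builder: the feature dtypes above move from tf.float32 to np.float32 (tfds accepts NumPy dtypes, dropping the TensorFlow import from the spec), and just below supervised_keys is set to None, since MLPF trains against both ygen and ycand rather than a single (input, target) pair. A read-back sketch under that convention (assumes the dataset was built locally):

import tensorflow_datasets as tfds

# With supervised_keys=None each example is a feature dict, keeping
# both targets available; ARRAY_RECORD data is read via as_data_source.
ds = tfds.builder("delphes_data_pf").as_data_source(split="train")
ev = ds[0]
print(ev["X"].shape, ev["ygen"].shape, ev["ycand"].shape)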
- supervised_keys=("X", "ygen"), # Set to `None` to disable + supervised_keys=None, homepage="https://zenodo.org/record/4559324#.YTs853tRVH4", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES), @@ -60,8 +55,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): } def _generate_examples(self, path): - """Yields examples.""" - for fi in tqdm.tqdm(list(path.glob("*.pkl.bz2"))): + for fi in list(path.glob("*.pkl.bz2")): Xs, ygens, ycands = prepare_data_delphes(str(fi)) for iev in range(len(Xs)): yield str(fi) + "_" + str(iev), { diff --git a/mlpf/heptfds/delphes_pf/delphes_pf_test.py b/mlpf/heptfds/delphes_pf/delphes_pf_test.py deleted file mode 100644 index 95bd2e685..000000000 --- a/mlpf/heptfds/delphes_pf/delphes_pf_test.py +++ /dev/null @@ -1,25 +0,0 @@ -"""delphes_pf dataset.""" -import tensorflow_datasets as tfds - -from . import delphes_pf - - -class DelphesPfTest(tfds.testing.DatasetBuilderTestCase): - """Tests for delphes_pf dataset.""" - - # TODO(delphes_pf): - DATASET_CLASS = delphes_pf.DelphesPf - SPLITS = { - "train": 3, # Number of fake train example - "test": 1, # Number of fake test example - } - - # If you are calling `download/download_and_extract` with a dict, like: - # dl_manager.download({'some_key': 'http://a.org/out.txt', ...}) - # then the tests needs to provide the fake output paths relative to the - # fake data directory - # DL_EXTRACT_RESULT = {'some_key': 'output_file1.txt', ...} - - -if __name__ == "__main__": - tfds.testing.test_main() diff --git a/mlpf/heptfds/delphes_pf/delphes_utils.py b/mlpf/heptfds/delphes_pf/utils_delphes.py similarity index 100% rename from mlpf/heptfds/delphes_pf/delphes_utils.py rename to mlpf/heptfds/delphes_pf/utils_delphes.py diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 1bd9c75f5..734a6a6cb 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1247,6 +1247,9 @@ def plots(train_dir, max_files): get_class_names, plot_rocs, plot_particle_multiplicity, + compute_3dmomentum_and_ratio, + plot_3dmomentum_ratio, + plot_3dmomentum_response_binned, ) mplhep.set_style(mplhep.styles.CMS) @@ -1309,6 +1312,16 @@ def plots(train_dir, max_files): plot_jet_response_binned(yvals, cp_dir=cp_dir, title=_title) plot_met_response_binned(met_data, cp_dir=cp_dir, title=_title) + mom_data = compute_3dmomentum_and_ratio(yvals) + plot_3dmomentum_ratio(mom_data, cp_dir=cp_dir, title=_title, bins=np.linspace(0, 20, 100), logy=True) + plot_3dmomentum_ratio( + mom_data, cp_dir=cp_dir, title=_title, bins=np.linspace(0, 2, 100), logy=True, file_modifier="_bins_0_2" + ) + plot_3dmomentum_ratio( + mom_data, cp_dir=cp_dir, title=_title, bins=np.linspace(0, 5, 100), logy=True, file_modifier="_bins_0_5" + ) + plot_3dmomentum_response_binned(mom_data, cp_dir=cp_dir, title=_title) + if __name__ == "__main__": main() diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index caaa28235..409ce9ec6 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -77,6 +77,20 @@ r"$\mu^\pm$", ] +labels = { + "met": "$p_{\mathrm{T}}^{\mathrm{miss}}$ [GeV]", + "gen_met": "$p_{\mathrm{T,gen}}^\text{miss}$ [GeV]", + "gen_mom": "$p_{\mathrm{gen}}$ [GeV]", + "gen_jet": "jet $p_{\mathrm{T,gen}}$ [GeV]", + "reco_met": "$p_{\mathrm{T,reco}}^\text{miss}$ [GeV]", + "reco_gen_met_ratio": "$p_{\mathrm{T,reco}}^\mathrm{miss} / p_{\\mathrm{T,gen}}^\mathrm{miss}$", + "reco_gen_mom_ratio": "$p_{\mathrm{reco}} / p_{\\mathrm{gen}}$", + "reco_gen_jet_ratio": "jet $p_{\mathrm{T,reco}} / p_{\\mathrm{T,gen}}$", + 
"gen_met_range": "${} \less p_{{\mathrm{{T,gen}}}}^\mathrm{{miss}}\leq {}$", + "gen_mom_range": "${} \less p_{{\mathrm{{gen}}}}\leq {}$", + "gen_jet_range": "${} \less p_{{\mathrm{{T,gen}}}} \leq {}$", +} + def get_class_names(dataset_name): if dataset_name.startswith("clic_"): @@ -90,17 +104,16 @@ def get_class_names(dataset_name): EVALUATION_DATASET_NAMES = { - "clic_ttbar_pf": r"CLIC $ee \rightarrow \mathrm{t}\overline{\mathrm{t}}$", - "delphes_pf": r"Delphes-CMS $pp \rightarrow \mathrm{QCD}$", + "delphes_data_pf": r"Delphes-CMS $pp \rightarrow \mathrm{QCD}$", # qcd is the validation set "cms_pf_qcd_high_pt": r"CMS high-$p_T$ QCD+PU events", "cms_pf_ttbar": r"CMS $\mathrm{t}\overline{\mathrm{t}}$+PU events", "cms_pf_single_neutron": r"CMS single neutron particle gun events", - "clic_edm_ttbar_pf": r"CLIC $ee \rightarrow \mathrm{t}\overline{\mathrm{t}}$", - "clic_edm_ttbar_pu10_pf": r"CLIC $ee \rightarrow \mathrm{t}\overline{\mathrm{t}}$, PU10", - "clic_edm_ttbar_hits_pf": r"CLIC $ee \rightarrow \mathrm{t}\overline{\mathrm{t}}$", - "clic_edm_qq_pf": r"CLIC $ee \rightarrow \gamma/\mathrm{Z}^* \rightarrow \mathrm{hadrons}$", - "clic_edm_ww_fullhad_pf": r"CLIC $ee \rightarrow WW \rightarrow \mathrm{hadrons}$", - "clic_edm_zh_tautau_pf": r"CLIC $ee \rightarrow ZH \rightarrow \tau \tau$", + "clic_edm_ttbar_pf": r"$e^+e^- \rightarrow \mathrm{t}\overline{\mathrm{t}}$", + "clic_edm_ttbar_pu10_pf": r"$e^+e^- \rightarrow \mathrm{t}\overline{\mathrm{t}}$, PU10", + "clic_edm_ttbar_hits_pf": r"$e^+e^- \rightarrow \mathrm{t}\overline{\mathrm{t}}$", + "clic_edm_qq_pf": r"$e^+e^- \rightarrow \gamma/\mathrm{Z}^* \rightarrow \mathrm{hadrons}$", + "clic_edm_ww_fullhad_pf": r"$e^+e^- \rightarrow WW \rightarrow \mathrm{hadrons}$", + "clic_edm_zh_tautau_pf": r"$e^+e^- \rightarrow ZH \rightarrow \tau \tau$", } @@ -246,10 +259,11 @@ def load_eval_data(path, max_files=None): for typ in ["gen", "cand", "pred"]: - # Compute phi, px, py + # Compute phi, px, py, pz yvals[typ + "_phi"] = np.arctan2(yvals[typ + "_sin_phi"], yvals[typ + "_cos_phi"]) yvals[typ + "_px"] = yvals[typ + "_pt"] * yvals[typ + "_cos_phi"] yvals[typ + "_py"] = yvals[typ + "_pt"] * yvals[typ + "_sin_phi"] + yvals[typ + "_pz"] = yvals[typ + "_pt"] * np.sinh(yvals[typ + "_eta"]) # Get the jet vectors jetvec = vector.awk(data["jets"][typ]) @@ -327,6 +341,44 @@ def compute_met_and_ratio(yvals): } +def compute_3dmomentum_and_ratio(yvals): + msk_gen = yvals["gen_cls_id"] != 0 + gen_px = yvals["gen_px"][msk_gen] + gen_py = yvals["gen_py"][msk_gen] + gen_pz = yvals["gen_pz"][msk_gen] + + msk_pred = yvals["pred_cls_id"] != 0 + pred_px = yvals["pred_px"][msk_pred] + pred_py = yvals["pred_py"][msk_pred] + pred_pz = yvals["pred_pz"][msk_pred] + + msk_cand = yvals["cand_cls_id"] != 0 + cand_px = yvals["cand_px"][msk_cand] + cand_py = yvals["cand_py"][msk_cand] + cand_pz = yvals["cand_pz"][msk_cand] + + gen_mom = awkward.to_numpy( + np.sqrt(np.sum(gen_px, axis=1) ** 2 + np.sum(gen_py, axis=1) ** 2 + np.sum(gen_pz, axis=1) ** 2) + ) + pred_mom = awkward.to_numpy( + np.sqrt(np.sum(pred_px, axis=1) ** 2 + np.sum(pred_py, axis=1) ** 2 + np.sum(pred_pz, axis=1) ** 2) + ) + cand_mom = awkward.to_numpy( + np.sqrt(np.sum(cand_px, axis=1) ** 2 + np.sum(cand_py, axis=1) ** 2 + np.sum(cand_pz, axis=1) ** 2) + ) + + mom_ratio_pred = awkward.to_numpy(pred_mom / gen_mom) + mom_ratio_cand = awkward.to_numpy(cand_mom / gen_mom) + + return { + "gen_mom": gen_mom, + "pred_mom": pred_mom, + "cand_mom": cand_mom, + "ratio_pred": mom_ratio_pred, + "ratio_cand": mom_ratio_cand, + 
} + + def save_img(outfile, epoch, cp_dir=None, comet_experiment=None): if cp_dir: image_path = str(cp_dir / outfile) @@ -376,7 +428,7 @@ def plot_jets(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None) plt.xscale("log") plt.xlabel("jet $p_T$") - plt.ylabel("number of jets / bin") + plt.ylabel("Jets / bin") plt.legend(loc="best") if title: plt.title(title) @@ -416,8 +468,8 @@ def plot_jet_ratio( lw=2, label="MLPF $(M={:.2f}, IQR={:.2f}, f_m={:.2f})$".format(p[0], p[1], n_matched / n_jets), ) - plt.xlabel("jet $p_T$ reco/gen") - plt.ylabel("number of matched jets") + plt.xlabel(labels["reco_gen_jet_ratio"]) + plt.ylabel("Matched jets / bin") plt.legend(loc="best", title=title) plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0)) @@ -481,8 +533,8 @@ def plot_met(met_ratio, epoch=None, cp_dir=None, comet_experiment=None, title=No lw=2, label="Truth $(M={:.2f}, IQR={:.2f})$".format(p[0], p[1]), ) - plt.xlabel("MET [GeV]") - plt.ylabel("Number of events / bin") + plt.xlabel(labels["met"]) + plt.ylabel("Events / bin") plt.legend(loc="best", title=title) plt.xscale("log") save_img("met.png", epoch, cp_dir=cp_dir, comet_experiment=comet_experiment) @@ -512,8 +564,8 @@ def plot_met_ratio( lw=2, label="MLPF $(M={:.2f}, IQR={:.2f})$".format(p[0], p[1]), ) - plt.xlabel("MET reco/gen") - plt.ylabel("number of events") + plt.xlabel(labels["reco_gen_met_ratio"]) + plt.ylabel("Events / bin") plt.legend(loc="best", title=title) ylim = ax.get_ylim() @@ -531,6 +583,49 @@ def plot_met_ratio( ) +def plot_3dmomentum_ratio( + mom_ratio, epoch=None, cp_dir=None, comet_experiment=None, title=None, bins=None, file_modifier="", logy=False +): + plt.figure() + ax = plt.axes() + if bins is None: + bins = np.linspace(0, 20, 100) + + p = med_iqr(mom_ratio["ratio_cand"]) + plt.hist( + mom_ratio["ratio_cand"], + bins=bins, + histtype="step", + lw=2, + label="PF $(M={:.2f}, IQR={:.2f})$".format(p[0], p[1]), + ) + p = med_iqr(mom_ratio["ratio_pred"]) + plt.hist( + mom_ratio["ratio_pred"], + bins=bins, + histtype="step", + lw=2, + label="MLPF $(M={:.2f}, IQR={:.2f})$".format(p[0], p[1]), + ) + plt.xlabel(labels["reco_gen_mom_ratio"]) + plt.ylabel("Events / bin") + plt.legend(loc="best", title=title) + + ylim = ax.get_ylim() + ax.set_ylim(ylim[0], 1.2 * ylim[1]) + + if logy: + ax.set_yscale("log") + ax.set_ylim(10, 10 * ylim[1]) + + save_img( + "mom_res{}.png".format(file_modifier), + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + + def compute_distances(distribution_1, distribution_2, ratio): if len(distribution_1) > 0 and len(distribution_2) > 0: wd = scipy.stats.wasserstein_distance(distribution_1, distribution_2) @@ -550,8 +645,8 @@ def plot_rocs(yvals, class_names, epoch=None, cp_dir=None, comet_experiment=None ncls = len(yvals["gen_cls"][0, 0]) plt.figure() for icls in range(ncls): - predvals = awkward.flatten(yvals["pred_cls"][:, :, icls]) - truevals = awkward.flatten(yvals["gen_cls_id"] == icls) + predvals = awkward.to_numpy(awkward.flatten(yvals["pred_cls"][:, :, icls])) + truevals = awkward.to_numpy(awkward.flatten(yvals["gen_cls_id"] == icls)) fpr, tpr, _ = sklearn.metrics.roc_curve(truevals, predvals) plt.plot(fpr, tpr, label=class_names[icls]) plt.xlim(1e-7, 1) @@ -579,8 +674,8 @@ def plot_num_elements(X, epoch=None, cp_dir=None, comet_experiment=None, title=N plt.figure() plt.hist(num_Xelems, bins=np.linspace(0, int(1.2 * maxval), 100)) - plt.xlabel("Number of PFElements / event") - plt.ylabel("Number of events / bin") + plt.xlabel("PFElements / event") + 
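A usage sketch (not part of the patch): the new 3D-momentum helpers above chain together as follows, assuming `yvals` has the jagged per-event layout produced by `load_eval_data` and that `mlpf/` is on `sys.path`; the toy values are illustrative only.

import awkward as ak
import numpy as np
from plotting.plot_utils import compute_3dmomentum_and_ratio, plot_3dmomentum_ratio

# two toy events with three particle slots each; cls_id == 0 means "no particle"
yvals = {}
for typ in ["gen", "pred", "cand"]:
    yvals[typ + "_cls_id"] = ak.Array([[1, 2, 0], [1, 1, 1]])
    for comp in ["px", "py", "pz"]:
        yvals[typ + "_" + comp] = ak.Array([[1.0, 2.0, 0.5], [0.3, 0.1, 0.2]])

mom_data = compute_3dmomentum_and_ratio(yvals)  # summed |p| per event plus reco/gen ratios
plot_3dmomentum_ratio(mom_data, bins=np.linspace(0, 2, 100), logy=True, file_modifier="_bins_0_2")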
plt.ylabel("Events / bin") if title: plt.title(title) save_img( @@ -931,10 +1026,10 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.xlim(0, 2) plt.xticks([0, 0.5, 1, 1.5, 2]) plt.ylabel("Matched jets / bin") - plt.xlabel("jet $p_{T,reco} / p_{T,gen}$") + plt.xlabel(labels["reco_gen_jet_ratio"]) plt.axvline(1.0, ymax=0.7, color="black", ls="--") plt.legend(loc=1, fontsize=16) - plt.title(r"${} \less p_{{T,gen}} \leq {}$".format(lim_low, lim_hi)) + plt.title(labels["gen_jet_range"].format(lim_low, lim_hi)) plt.yscale("log") plt.tight_layout() @@ -957,18 +1052,13 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.ylim(0.75, 1.25) plt.axhline(1.0, color="black", ls="--") plt.ylabel("Response median") - if title: - plt.title(title) - plt.legend() + plt.legend(title=title) plt.sca(axs[1]) plt.plot(x_vals, pf_vals[:, 2] - pf_vals[:, 0], marker="o", label="PF") plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") plt.ylabel("Response IQR") - plt.legend() - if title: - plt.title(title) - plt.xlabel("gen-jet $p_T$ [GeV]") + plt.xlabel(labels["gen_jet"]) plt.tight_layout() save_img( @@ -1029,10 +1119,10 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.xlim(0, 2) plt.xticks([0, 0.5, 1, 1.5, 2]) plt.ylabel("Events / bin") - plt.xlabel("MET reco / gen") + plt.xlabel(labels["reco_gen_met_ratio"]) plt.axvline(1.0, ymax=0.7, color="black", ls="--") plt.legend(loc=1, fontsize=16) - plt.title(r"${} \less MET_{{gen}} \leq {}$".format(lim_low, lim_hi)) + plt.title(labels["gen_met_range"].format(lim_low, lim_hi)) plt.yscale("log") plt.tight_layout() @@ -1066,7 +1156,7 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.legend() if title: plt.title(title) - plt.xlabel("gen MET [GeV]") + plt.xlabel(labels["gen_met"]) plt.tight_layout() save_img( @@ -1075,3 +1165,96 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No cp_dir=cp_dir, comet_experiment=comet_experiment, ) + + +def plot_3dmomentum_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None): + + genmet = yvals["gen_mom"] + + pf_response = yvals["ratio_cand"] + mlpf_response = yvals["ratio_pred"] + + genmet_bins = [10, 20, 40, 60, 80, 100, 200] + + x_vals = [] + pf_vals = [] + mlpf_vals = [] + b = np.linspace(0, 2, 100) + + fig, axs = plt.subplots(2, 3, figsize=(3 * 5, 2 * 5)) + axs = axs.flatten() + for ibin in range(len(genmet_bins) - 1): + lim_low = genmet_bins[ibin] + lim_hi = genmet_bins[ibin + 1] + x_vals.append(np.mean([lim_low, lim_hi])) + + mask_gen = (genmet > lim_low) & (genmet <= lim_hi) + pf_subsample = pf_response[mask_gen] + if len(pf_subsample) > 0: + pf_p25 = np.percentile(pf_subsample, 25) + pf_p50 = np.percentile(pf_subsample, 50) + pf_p75 = np.percentile(pf_subsample, 75) + else: + pf_p25 = 0.0 + pf_p50 = 0.0 + pf_p75 = 0.0 + pf_vals.append([pf_p25, pf_p50, pf_p75]) + + mlpf_subsample = mlpf_response[mask_gen] + if len(pf_subsample) > 0: + mlpf_p25 = np.percentile(mlpf_subsample, 25) + mlpf_p50 = np.percentile(mlpf_subsample, 50) + mlpf_p75 = np.percentile(mlpf_subsample, 75) + else: + mlpf_p25 = 0.0 + mlpf_p50 = 0.0 + mlpf_p75 = 0.0 + mlpf_vals.append([mlpf_p25, mlpf_p50, mlpf_p75]) + + plt.sca(axs[ibin]) + plt.hist(pf_subsample, bins=b, histtype="step", lw=2, label="PF") + plt.hist(mlpf_subsample, bins=b, histtype="step", lw=2, label="MLPF") + plt.xlim(0, 2) + plt.xticks([0, 0.5, 1, 1.5, 2]) + 
plt.ylabel("Events / bin") + plt.xlabel(labels["reco_gen_mom_ratio"]) + plt.axvline(1.0, ymax=0.7, color="black", ls="--") + plt.legend(loc=1, fontsize=16) + plt.title(labels["gen_mom_range"].format(lim_low, lim_hi)) + plt.yscale("log") + + plt.tight_layout() + save_img( + "mom_response_binned.png", + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + + x_vals = np.array(x_vals) + pf_vals = np.array(pf_vals) + mlpf_vals = np.array(mlpf_vals) + + # Plot median and IQR as a function of gen pt + fig, axs = plt.subplots(2, 1, sharex=True) + plt.sca(axs[0]) + plt.plot(x_vals, pf_vals[:, 1], marker="o", label="PF") + plt.plot(x_vals, mlpf_vals[:, 1], marker="o", label="MLPF") + plt.ylim(0.75, 1.25) + plt.axhline(1.0, color="black", ls="--") + plt.ylabel("Response median") + plt.legend(title=title) + + plt.sca(axs[1]) + plt.plot(x_vals, pf_vals[:, 2] - pf_vals[:, 0], marker="o", label="PF") + plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") + plt.ylabel("Response IQR") + plt.xlabel(labels["gen_mom"]) + + plt.tight_layout() + save_img( + "mom_response_med_iqr.png", + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) diff --git a/mlpf/pyg/PFGraphDataset.py b/mlpf/pyg/PFGraphDataset.py index 79abf4fe3..586d34043 100644 --- a/mlpf/pyg/PFGraphDataset.py +++ b/mlpf/pyg/PFGraphDataset.py @@ -10,7 +10,7 @@ sys.path.append(sys.path[0] + "/..") # temp hack from heptfds.cms_pf.cms_utils import prepare_data_cms -from heptfds.delphes_pf.delphes_utils import prepare_data_delphes +from heptfds.delphes_pf.utils_delphes import prepare_data_delphes from heptfds.clic_pf_edm4hep.utils_edm import prepare_data_clic diff --git a/mlpf/tfmodel/datasets/BaseDatasetFactory.py b/mlpf/tfmodel/datasets/BaseDatasetFactory.py index 9045e93ef..b13584b9b 100644 --- a/mlpf/tfmodel/datasets/BaseDatasetFactory.py +++ b/mlpf/tfmodel/datasets/BaseDatasetFactory.py @@ -46,17 +46,29 @@ def unpack_target(y, num_output_classes, config): return ret +def my_getitem(self, vals): + records = self.data_source.__getitems__(vals) + return [self.dataset_info.features.deserialize_example_np(record, decoders=self.decoders) for record in records] + + def mlpf_dataset_from_config(dataset_name, full_config, split, max_events=None, horovod_enabled=False): dataset_config = full_config["datasets"][dataset_name] - tf_dataset = tfds.load( - "{}:{}".format(dataset_name, dataset_config["version"]), - split=split, - as_supervised=False, - data_dir=dataset_config["data_dir"], - with_info=False, - shuffle_files=False, - download=False, - ) + + def yield_from_ds(): + for elem in dss: + yield {"X": elem["X"], "ygen": elem["ygen"], "ycand": elem["ycand"]} + + # when the dataset is saved with file_format=array_record, we cannot do tfds.load, but instead must do the following + dss = tfds.builder( + "{}:{}".format(dataset_name, dataset_config["version"]), data_dir=dataset_config["data_dir"] + ).as_data_source(split) + # hack to prevent a warning from tfds about accessing sequences of indices + dss.__class__.__getitems__ = my_getitem + + output_signature = {k: tf.TensorSpec(shape=(None, v.shape[1])) for (k, v) in dss.dataset_info.features.items()} + + tf_dataset = tf.data.Dataset.from_generator(yield_from_ds, output_signature=output_signature) + if max_events: tf_dataset = tf_dataset.take(max_events) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index c1efe8e84..57249c6f7 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -156,10 +156,7 @@ def 
prepare_callbacks( callbacks = [] callbacks.append(tf.keras.callbacks.TerminateOnNaN()) - - # these checkpoints don't seem to work in horovod (maybe it's Tensorboard) - if not horovod_enabled: - callbacks += get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, horovod_enabled, is_hpo_run) + callbacks += get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, horovod_enabled, is_hpo_run) if not horovod_enabled or hvd.rank() == 0: if benchmark_dir: @@ -201,49 +198,53 @@ def prepare_callbacks( def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, horovod_enabled, is_hpo_run=False): + callbacks = [] - cp_dir = Path(outdir) / "weights" - cp_dir.mkdir(parents=True, exist_ok=True) - cp_callback = ModelOptimizerCheckpoint( - filepath=str(cp_dir / "weights-{epoch:02d}-{val_loss:.6f}.hdf5"), - save_weights_only=True, - verbose=0, - monitor=config["callbacks"]["checkpoint"]["monitor"], - save_best_only=False, - ) - cp_callback.opt_path = str(cp_dir / "opt-{epoch:02d}-{val_loss:.6f}.pkl") - callbacks += [cp_callback] - - history_path = Path(outdir) / "history" - history_path.mkdir(parents=True, exist_ok=True) - history_path = str(history_path) - cb = CustomCallback( - history_path, - dataset.tensorflow_dataset.take(config["validation_num_events"]), - config, - plot_freq=config["callbacks"]["plot_freq"], - horovod_enabled=horovod_enabled, - comet_experiment=comet_experiment, - is_hpo_run=is_hpo_run, - ) - if config.get("do_validation_callback", True): - callbacks += [cb] - - tb = CustomTensorBoard( - log_dir=outdir + "/logs", - histogram_freq=config["callbacks"]["tensorboard"]["hist_freq"], - write_graph=False, - write_images=False, - update_freq="batch", - profile_batch=config["callbacks"]["tensorboard"]["profile_batch"] - if "profile_batch" in config["callbacks"]["tensorboard"].keys() - else 0, - dump_history=config["callbacks"]["tensorboard"]["dump_history"], - ) - # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it - tb.__class__.__name__ = "TensorBoard" - callbacks += [tb] + if not horovod_enabled or hvd.rank() == 0: + cp_dir = Path(outdir) / "weights" + cp_dir.mkdir(parents=True, exist_ok=True) + cp_callback = ModelOptimizerCheckpoint( + filepath=str(cp_dir / "weights-{epoch:02d}-{val_loss:.6f}.hdf5"), + save_weights_only=True, + verbose=1, + monitor=config["callbacks"]["checkpoint"]["monitor"], + save_best_only=False, + ) + cp_callback.opt_path = str(cp_dir / "opt-{epoch:02d}-{val_loss:.6f}.pkl") + callbacks += [cp_callback] + + if not horovod_enabled: + history_path = Path(outdir) / "history" + history_path.mkdir(parents=True, exist_ok=True) + history_path = str(history_path) + cb = CustomCallback( + history_path, + dataset.tensorflow_dataset.take(config["validation_num_events"]), + config, + plot_freq=config["callbacks"]["plot_freq"], + horovod_enabled=horovod_enabled, + comet_experiment=comet_experiment, + is_hpo_run=is_hpo_run, + ) + + if config.get("do_validation_callback", True): + callbacks += [cb] + + tb = CustomTensorBoard( + log_dir=outdir + "/logs", + histogram_freq=config["callbacks"]["tensorboard"]["hist_freq"], + write_graph=False, + write_images=False, + update_freq="batch", + profile_batch=config["callbacks"]["tensorboard"]["profile_batch"] + if "profile_batch" in config["callbacks"]["tensorboard"].keys() + else 0, + dump_history=config["callbacks"]["tensorboard"]["dump_history"], + ) + # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it + 
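The callback changes above stop disabling checkpoints under Horovod and instead guard them behind a rank check. This follows the usual Horovod-Keras convention, sketched here outside the codebase (names illustrative): rank 0 alone writes files so parallel workers do not overwrite each other.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
# all ranks: keep initial weights in sync
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
# rank 0 only: persist checkpoints
if hvd.rank() == 0:
    callbacks.append(tf.keras.callbacks.ModelCheckpoint("weights-{epoch:02d}.hdf5", save_weights_only=True))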
tb.__class__.__name__ = "TensorBoard" + callbacks += [tb] return callbacks @@ -457,10 +458,6 @@ def eval_model( def freeze_model(model, config, outdir): - import tf2onnx - - num_features = config["dataset"]["num_input_features"] - def model_output(ret): return tf.concat( [ @@ -500,12 +497,19 @@ def model_output(ret): # we need to use opset 12 for the version of ONNXRuntime in CMSSW # the warnings "RuntimeError: Opset (12) must be >= 13 for operator 'batch_dot'." do not seem to be critical - model_proto, _ = tf2onnx.convert.from_function( - full_model, - opset=12, - input_signature=(tf.TensorSpec((None, None, num_features), tf.float32, name="x:0"),), - output_path=str(Path(outdir) / "model.onnx"), - ) + + # Note on 2023.08.24: currently there is a conflict between latest tensorflow and tf2onnx + # The conflict is caused by: + # onnxruntime 1.12.0 depends on flatbuffers + # tensorflow 2.13.0 depends on flatbuffers>=23.1.21 + # tf2onnx 1.15.0 depends on flatbuffers<3.0 and >=1.12 + # import tf2onnx + # model_proto, _ = tf2onnx.convert.from_function( + # full_model, + # opset=12, + # input_signature=(tf.TensorSpec((None, None, num_features), tf.float32, name="x:0"),), + # output_path=str(Path(outdir) / "model.onnx"), + # ) class LearningRateLoggingCallback(tf.keras.callbacks.Callback): diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 480813a5d..4a08e35c2 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -230,7 +230,11 @@ def get_singlenode_strategy(num_cpus=None): num_batches_multiplier = 1 if num_gpus > 1: num_batches_multiplier = num_gpus - logging.info("Multiple GPUs detected, num_batches_multiplier={}".format(num_batches_multiplier)) + logging.info( + "Multiple GPUs detected, batch size will be increased by num_batches_multiplier={}".format( + num_batches_multiplier + ) + ) return strategy, num_gpus, num_batches_multiplier @@ -783,12 +787,13 @@ def model_weight_setting(): if loaded_opt: opt.set_weights(loaded_opt["weights"]) - # FIXME: check that this still works with multiple GPUs + logging.info("distributing optimizer state") strategy = tf.distribute.get_strategy() strategy.run(model_weight_setting) initial_epoch = int(weights.split("/")[-1].split("-")[1]) + logging.info("setting model weights") config = set_config_loss(config, config["setup"]["trainable"]) configure_model_weights(model, config["setup"]["trainable"]) @@ -805,6 +810,7 @@ def model_weight_setting(): loss_dict, loss_weights = get_loss_dict(config) + logging.info("compiling model") model.compile( loss=loss_dict, optimizer=opt, diff --git a/notebooks/clic/clic-hitbased.ipynb b/notebooks/clic/clic-hitbased.ipynb deleted file mode 100644 index d5d76df55..000000000 --- a/notebooks/clic/clic-hitbased.ipynb +++ /dev/null @@ -1,330 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "foster-monte", - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import glob\n", - "import tqdm\n", - "import awkward as ak\n", - "import boost_histogram as bh\n", - "import sys\n", - "import vector\n", - "import pickle\n", - "import scipy\n", - "\n", - "import mplhep\n", - "mplhep.style.use(mplhep.style.CMS)\n", - "\n", - "import sys\n", - "sys.path.append(\"../mlpf/\")\n", - "from plotting.plot_utils import pid_to_text, EVALUATION_DATASET_NAMES, load_eval_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d05b0fec", - "metadata": {}, - "outputs": [], - "source": [ - 
"yvals, X, _ = load_eval_data(\n", - " \"../experiments/clic-hits_20230421_213012_921390.gpu1.local/evaluation\"+\n", - " \"/epoch_18/clic_edm_ttbar_hits_pf/*.parquet\",\n", - " max_files=200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7bb38e96", - "metadata": {}, - "outputs": [], - "source": [ - "yvals[\"pred_cls_sm\"] = ak.softmax(yvals[\"pred_cls\"], axis=-1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab10652c", - "metadata": {}, - "outputs": [], - "source": [ - "msk1=yvals[\"gen_cls_id\"]==2\n", - "msk2=yvals[\"pred_cls_id\"]==2\n", - "msk3=yvals[\"cand_cls_id\"]==2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83e77dd2", - "metadata": {}, - "outputs": [], - "source": [ - "X[msk1][:, :, 10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ca30f0b", - "metadata": {}, - "outputs": [], - "source": [ - "X[msk1&msk2][:, :, 10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94550667", - "metadata": {}, - "outputs": [], - "source": [ - "plt.hist2d(\n", - " ak.to_numpy(ak.flatten(X[msk1])[:, 7]),\n", - " ak.to_numpy(ak.flatten(yvals[\"pred_cls\"][msk1][:, :, 2])), bins=100);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22233a6e", - "metadata": {}, - "outputs": [], - "source": [ - "b = np.logspace(-2,1,61)\n", - "plt.hist(ak.flatten(X[msk1])[:, 5], bins=b, histtype=\"step\", lw=2)\n", - "plt.hist(ak.flatten(X[msk1&msk2])[:, 5], bins=b, histtype=\"step\", lw=2)\n", - "plt.hist(ak.flatten(X[msk1&msk3])[:, 5], bins=b, histtype=\"step\", lw=2)\n", - "plt.xscale(\"log\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5aac5e8d", - "metadata": {}, - "outputs": [], - "source": [ - "iev = 7\n", - "plt.scatter(\n", - " X[X[:, :, 0]==1][iev, :, 2],\n", - " np.arctan2(X[X[:, :, 0]==1][iev, :, 3], X[X[:, :, 0]==1][iev, :, 4]),\n", - " s=X[X[:, :, 0]==1][iev, :, 5]\n", - ")\n", - "\n", - "plt.scatter(\n", - " X[X[:, :, 0]==2][iev, :, 2],\n", - " np.arctan2(X[X[:, :, 0]==2][iev, :, 3], X[X[:, :, 0]==2][iev, :, 4]),\n", - " s=X[X[:, :, 0]==2][iev, :, 5]\n", - ")\n", - "\n", - "\n", - "plt.scatter(\n", - " yvals[\"gen_eta\"][yvals[\"gen_cls_id\"]!=0][iev],\n", - " yvals[\"gen_phi\"][yvals[\"gen_cls_id\"]!=0][iev],\n", - " s=yvals[\"gen_energy\"][yvals[\"gen_cls_id\"]!=0][iev]\n", - ")\n", - "\n", - "plt.scatter(\n", - " yvals[\"pred_eta\"][yvals[\"pred_cls_id\"]!=0][iev],\n", - " yvals[\"pred_phi\"][yvals[\"pred_cls_id\"]!=0][iev],\n", - " s=yvals[\"pred_energy\"][yvals[\"pred_cls_id\"]!=0][iev]\n", - ")\n", - "\n", - "plt.xlim(-4,4)\n", - "plt.ylim(-4,4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38f03dfc", - "metadata": {}, - "outputs": [], - "source": [ - "iev = 8\n", - "plt.scatter(\n", - " X[X[:, :, 0]==2][iev, :, 6],\n", - " X[X[:, :, 0]==2][iev, :, 7],\n", - " s=X[X[:, :, 0]==2][iev, :, 5]\n", - ")\n", - "plt.xlim(-4000, 4000)\n", - "plt.ylim(-4000, 4000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6f882a2", - "metadata": {}, - "outputs": [], - "source": [ - "iev = 9\n", - "msk1 = X[:, :, 0]==2\n", - "colors = [\n", - " \"gray\", \"red\", \"green\", \"blue\", \"orange\", \"purple\"\n", - "]\n", - "for icls in range(0,6):\n", - " msk2 = yvals[\"gen_cls_id\"]==icls\n", - " mult = 5 if icls==0 else 50\n", - " plt.scatter(\n", - " X[msk1 & msk2][iev, :, 6],\n", - " X[msk1 & msk2][iev, :, 7],\n", - " s=mult*X[msk1 & msk2][iev, :, 5],\n", - " c=colors[icls],\n", 
- " label=icls,\n", - " alpha=0.2 if icls==0 else 1.0\n", - " )\n", - "plt.legend()\n", - "plt.xlim(-4000, 4000)\n", - "plt.ylim(-4000, 4000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f8d2f78", - "metadata": {}, - "outputs": [], - "source": [ - "for icls in range(0,6):\n", - " msk2 = yvals[\"pred_cls_id\"]==icls\n", - " mult = 5 if icls==0 else 50\n", - " plt.scatter(\n", - " X[msk1 & msk2][iev, :, 6],\n", - " X[msk1 & msk2][iev, :, 7],\n", - " s=mult*X[msk1 & msk2][iev, :, 5],\n", - " c=colors[icls],\n", - " label=icls,\n", - " alpha=0.2 if icls==0 else 1.0\n", - " )\n", - "plt.legend()\n", - "plt.xlim(-4000, 4000)\n", - "plt.ylim(-4000, 4000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc6e9043", - "metadata": {}, - "outputs": [], - "source": [ - "for icls in range(0,6):\n", - " msk2 = yvals[\"gen_cls_id\"]==icls\n", - " mult = 5 if icls==0 else 50\n", - " plt.scatter(\n", - " X[msk1 & msk2][iev, :, 6],\n", - " X[msk1 & msk2][iev, :, 8],\n", - " s=mult*X[msk1 & msk2][iev, :, 5],\n", - " c=colors[icls],\n", - " label=icls,\n", - " alpha=0.2 if icls==0 else 1.0\n", - " )\n", - "plt.legend()\n", - "plt.xlim(-4000, 4000)\n", - "plt.ylim(-4000, 4000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25b67dcd", - "metadata": {}, - "outputs": [], - "source": [ - "for icls in range(0,6):\n", - " msk2 = yvals[\"pred_cls_id\"]==icls\n", - " mult = 5 if icls==0 else 50\n", - " plt.scatter(\n", - " X[msk1 & msk2][iev, :, 6],\n", - " X[msk1 & msk2][iev, :, 8],\n", - " s=mult*X[msk1 & msk2][iev, :, 5],\n", - " c=colors[icls],\n", - " label=icls,\n", - " alpha=0.2 if icls==0 else 1.0\n", - " )\n", - "plt.legend()\n", - "plt.xlim(-4000, 4000)\n", - "plt.ylim(-4000, 4000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "431cbd4a", - "metadata": {}, - "outputs": [], - "source": [ - "import plotly.express as px\n", - "import pandas\n", - "\n", - "iev = 2\n", - "\n", - "df = pandas.DataFrame()\n", - "df[\"px\"] = X[msk1][iev, :, 6].to_numpy()\n", - "df[\"py\"] = X[msk1][iev, :, 7].to_numpy()\n", - "df[\"pz\"] = X[msk1][iev, :, 8].to_numpy()\n", - "df[\"energy\"] = 10*X[msk1][iev, :, 5].to_numpy()\n", - "df[\"gen_cls_id\"] = yvals[\"gen_cls_id\"][msk1][iev].to_numpy()\n", - "df[\"cand_cls_id\"] = yvals[\"cand_cls_id\"][msk1][iev].to_numpy()\n", - "df[\"pred_cls_sm_0\"] = yvals[\"pred_cls_sm\"][msk1][iev, :, 0].to_numpy()\n", - "df[\"pred_cls_sm_1\"] = yvals[\"pred_cls_sm\"][msk1][iev, :, 1].to_numpy()\n", - "df[\"pred_cls_sm_2\"] = yvals[\"pred_cls_sm\"][msk1][iev, :, 2].to_numpy()\n", - "df[\"pred_cls_sm_3\"] = yvals[\"pred_cls_sm\"][msk1][iev, :, 3].to_numpy()\n", - "df[\"subdetector\"] = X[msk1][iev, :, 10].to_numpy()\n", - "\n", - "fig = px.scatter_3d(\n", - " df,\n", - " x='px',\n", - " y='pz',\n", - " z='py',\n", - " color='cand_cls_id',\n", - " size='energy',\n", - " hover_data=[\"pred_cls_sm_0\", \"pred_cls_sm_1\", \"pred_cls_sm_2\", \"pred_cls_sm_3\", \"cand_cls_id\", \"gen_cls_id\", \"subdetector\"],\n", - " color_continuous_scale=px.colors.diverging.Spectral)\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8bf9a68", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/clic/clic-visualize.ipynb b/notebooks/clic/clic-visualize.ipynb index 2be2ca68c..c7a8f9e26 100644 --- a/notebooks/clic/clic-visualize.ipynb +++ b/notebooks/clic/clic-visualize.ipynb @@ -4,24 +4,39 @@ "cell_type": "code", "execution_count": null, "id": "10d72c28", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%matplotlib inline" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3e2b13c-ed1d-4c6b-ab6b-c48eb4b3f275", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import sys" + ] + }, { "cell_type": "code", "execution_count": null, "id": "088f63e0", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import uproot\n", "import numpy as np\n", "import awkward\n", "\n", - "import plotly.express as px\n", "import pandas\n", "\n", "import matplotlib.pyplot as plt" @@ -31,11 +46,24 @@ "cell_type": "code", "execution_count": null, "id": "b5c9bf64", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def hits_to_features(hit_data, iev, coll, feats):\n", - " feat_arr = {f: hit_data[coll + \".\" + f][iev] for f in feats}\n", + " \n", + " if \"TrackerHit\" in coll:\n", + " new_feats = []\n", + " for feat in feats:\n", + " feat_to_get = feat\n", + " if feat == \"energy\":\n", + " feat_to_get = \"eDep\"\n", + " new_feats.append((feat, feat_to_get))\n", + " else:\n", + " new_feats = [(f, f) for f in feats]\n", + " \n", + " feat_arr = {f1: hit_data[coll + \".\" + f2][iev] for f1, f2 in new_feats}\n", "\n", " sdcoll = \"subdetector\"\n", " feat_arr[sdcoll] = np.zeros(len(feat_arr[\"type\"]), dtype=np.int32)\n", @@ -43,8 +71,10 @@ " feat_arr[sdcoll][:] = 0\n", " elif coll.startswith(\"HCAL\"):\n", " feat_arr[sdcoll][:] = 1\n", - " else:\n", + " elif coll.startswith(\"MUON\"):\n", " feat_arr[sdcoll][:] = 2\n", + " else:\n", + " feat_arr[sdcoll][:] = 3\n", " return awkward.Record(feat_arr)" ] }, @@ -52,40 +82,24 @@ "cell_type": "code", "execution_count": null, "id": "6e577c86", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "fi = uproot.open(\"/media/joosep/data/clic_edm4hep_2023_02_27/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_1.root\")\n", + "fi = uproot.open(\"reco_p8_ee_tt_ecm380_1302.root\")\n", "ev = fi[\"events\"]\n", "\n", - "iev = 0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56b59774", - "metadata": {}, - "outputs": [], - "source": [ - "ev[\"SiTracks_1\"].arrays()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02921670", - "metadata": {}, - "outputs": [], - "source": [ - "ev[\"ECALBarrel\"][\"ECALBarrel.cellID\"].array()[0]" + "iev = 1" ] }, { "cell_type": "code", "execution_count": null, "id": "9f8364d2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "collectionIDs = {k: v for k, v in\n", @@ -94,6 +108,10 @@ "collectionIDs_reverse = {v: k for k, v in collectionIDs.items()}\n", "\n", "hit_data = {\n", + " \"VXDTrackerHits\": ev[\"VXDTrackerHits\"].array(),\n", + " \"VXDEndcapTrackerHits\": ev[\"VXDEndcapTrackerHits\"].array(),\n", + " \"ITrackerHits\": ev[\"ITrackerHits\"].array(),\n", + " \"OTrackerHits\": ev[\"OTrackerHits\"].array(),\n", " \"ECALBarrel\": ev[\"ECALBarrel\"].array(),\n", " \"ECALEndcap\": ev[\"ECALEndcap\"].array(),\n", " \"ECALOther\": 
ev[\"ECALOther\"].array(),\n", @@ -103,129 +121,212 @@ " \"MUON\": ev[\"MUON\"].array(),\n", "}\n", " \n", - "feats = [\"type\", \"cellID\", \"energy\", \"energyError\", \"time\", \"position.x\", \"position.y\", \"position.z\"]\n", + "feats = [\"position.x\", \"position.y\", \"position.z\", \"energy\", \"type\"]\n", "\n", - "hit_idx_global = 0\n", - "hit_idx_global_to_local = {}\n", "hit_feature_matrix = []\n", "for col in sorted(hit_data.keys()):\n", " icol = collectionIDs[col]\n", " hit_features = hits_to_features(hit_data[col], iev, col, feats)\n", " hit_feature_matrix.append(hit_features)\n", - " for ihit in range(len(hit_data[col][col+\".energy\"][iev])):\n", - " hit_idx_global_to_local[hit_idx_global] = (icol, ihit)\n", - " hit_idx_global += 1\n", - "hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()}\n", + " \n", "hit_feature_matrix = awkward.Record({\n", - " k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields})\n", - "\n", - "dd = ev.arrays(\n", - " [\"PandoraClusters/PandoraClusters.energy\",\n", - " \"PandoraClusters/PandoraClusters.position.x\",\n", - " \"PandoraClusters/PandoraClusters.position.y\",\n", - " \"PandoraClusters/PandoraClusters.position.z\",\n", - " \"PandoraClusters/PandoraClusters.hits_begin\",\n", - " \"PandoraClusters/PandoraClusters.hits_end\",\n", - " \"PandoraClusters#1/PandoraClusters#1.collectionID\",\n", - " \"PandoraClusters#1/PandoraClusters#1.index\",\n", - " ]\n", - ")\n", - "\n", - "coll_arr = dd[\"PandoraClusters#1/PandoraClusters#1.collectionID\"][iev]\n", - "idx_arr = dd[\"PandoraClusters#1/PandoraClusters#1.index\"][iev]\n", - "hits_begin = dd[\"PandoraClusters/PandoraClusters.hits_begin\"][iev]\n", - "hits_end = dd[\"PandoraClusters/PandoraClusters.hits_end\"][iev]\n", - "\n", - "#index in the array of all hits\n", - "hit_to_cluster_matrix_coo0 = []\n", - "#index in the cluster array\n", - "hit_to_cluster_matrix_coo1 = []\n", - "\n", - "#weight\n", - "hit_to_cluster_matrix_w = []\n", - "\n", - "#loop over all clusters\n", - "for icluster in range(len(hits_begin)):\n", - "\n", - " #get the slice in the hit array corresponding to this cluster\n", - " hbeg = hits_begin[icluster]\n", - " hend = hits_end[icluster]\n", - " idx_range = idx_arr[hbeg:hend]\n", - " coll_range = coll_arr[hbeg:hend]\n", - "\n", - " #add edges from hit to cluster\n", - " for icol, idx in zip(coll_range, idx_range):\n", - " hit_to_cluster_matrix_coo0.append(hit_idx_local_to_global[(icol, idx)])\n", - " hit_to_cluster_matrix_coo1.append(icluster)\n", - " hit_to_cluster_matrix_w.append(1.0)\n", - "hit_to_cluster_matrix_coo0 = np.array(hit_to_cluster_matrix_coo0)\n", - "hit_to_cluster_matrix_coo1 = np.array(hit_to_cluster_matrix_coo1)" + " k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields})" ] }, { "cell_type": "code", "execution_count": null, - "id": "91b6864a", - "metadata": {}, + "id": "b306fa60-4431-408e-b733-c537c1c9c1ab", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "hit_cluster_idx = -1*np.ones(len(hit_feature_matrix[\"position.x\"]))" + "msk_gen = ev[\"MCParticles/MCParticles.generatorStatus\"].array()==1" ] }, { "cell_type": "code", "execution_count": null, - "id": "24008966", - "metadata": {}, + "id": "26d67c20-4211-437f-9f78-dfbf5d291ca6", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "for cl in np.unique(hit_to_cluster_matrix_coo1):\n", - " 
hit_cluster_idx[hit_to_cluster_matrix_coo0[hit_to_cluster_matrix_coo1==cl]] = cl" + "px = ev[\"MCParticles/MCParticles.momentum.x\"].array()[msk_gen][iev]\n", + "py = ev[\"MCParticles/MCParticles.momentum.y\"].array()[msk_gen][iev]\n", + "pz = ev[\"MCParticles/MCParticles.momentum.z\"].array()[msk_gen][iev]\n", + "mass = ev[\"MCParticles/MCParticles.mass\"].array()[msk_gen][iev]\n", + "charge = ev[\"MCParticles/MCParticles.charge\"].array()[msk_gen][iev]" ] }, { "cell_type": "code", "execution_count": null, "id": "399439f1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "df = pandas.DataFrame()\n", "df[\"px\"] = hit_feature_matrix[\"position.x\"].to_numpy()\n", "df[\"py\"] = hit_feature_matrix[\"position.y\"].to_numpy()\n", "df[\"pz\"] = hit_feature_matrix[\"position.z\"].to_numpy()\n", - "df[\"energy\"] = 10*hit_feature_matrix[\"energy\"].to_numpy()\n", - "df[\"cluster\"] = hit_cluster_idx" + "df[\"energy\"] = 1000*hit_feature_matrix[\"energy\"].to_numpy()\n", + "df[\"plotsize\"] = 0\n", + "df[\"subdetector\"] = hit_feature_matrix[\"subdetector\"].to_numpy()" ] }, { "cell_type": "code", "execution_count": null, - "id": "70d8ae19", - "metadata": {}, + "id": "aa8d68a6-b03c-48a4-ab33-e6a83027bbb2", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "plt.figure(figsize=(5,5))\n", - "plt.scatter(df[\"px\"], df[\"py\"], c=df[\"cluster\"], s=df[\"energy\"])\n", - "plt.xlim(-3000,3000)\n", - "plt.ylim(-3000,3000)" + "df.loc[df[\"subdetector\"]==0, \"plotsize\"] = df.loc[df[\"subdetector\"]==0, \"energy\"]/5\n", + "df.loc[df[\"subdetector\"]==1, \"plotsize\"] = df.loc[df[\"subdetector\"]==1, \"energy\"]/10\n", + "df.loc[df[\"subdetector\"]==2, \"plotsize\"] = df.loc[df[\"subdetector\"]==2, \"energy\"]*100\n", + "df.loc[df[\"subdetector\"]==3, \"plotsize\"] = df.loc[df[\"subdetector\"]==3, \"energy\"]*100" ] }, { "cell_type": "code", "execution_count": null, "id": "97989b0d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "fig = px.scatter_3d(df, x='px', y='pz', z='py', color='cluster', size='energy', color_continuous_scale=px.colors.diverging.Spectral)\n", - "fig.show()" + "import plotly.graph_objects as go\n", + "import vector\n", + "\n", + "B = 4.0 # magnetic field in T\n", + "c = 3e8 # speed of light in m/s\n", + "scale = 2500\n", + "\n", + "mc_x = []\n", + "mc_y = []\n", + "mc_z = []\n", + "for irow in range(len(px)):\n", + " # convert to vector\n", + " v = vector.obj(\n", + " px=px[irow],\n", + " py=py[irow],\n", + " pz=pz[irow],\n", + " mass=mass[irow])\n", + " if charge[irow] == 0:\n", + " # pass\n", + " mc_x += [0, np.clip(scale * v.px/v.mag, -2000, 2000)]\n", + " mc_y += [0, np.clip(scale * v.py/v.mag, -2000, 2000)]\n", + " mc_z += [0, np.clip(scale * v.pz/v.mag, -4000, 4000)]\n", + " else:\n", + " # radius of the helix in m\n", + " R = v.pt / (np.abs(charge[irow]) * 0.3 * B)\n", + " # angular frequency\n", + " omega = np.abs(charge[irow]) * 0.3 * B / (v.gamma * v.mass)\n", + " # time values\n", + " t_values = np.linspace(0, 1/(c * v.beta), 50)\n", + " # trajectory in 3D space\n", + " mc_x += list(scale * R * np.cos(omega * c * t_values + v.phi - np.pi/2) - scale * R * np.cos(v.phi - np.pi/2))\n", + " mc_y += list(scale * R * np.sin(omega * c * t_values + v.phi - np.pi/2) - scale * R * np.sin(v.phi - np.pi/2))\n", + " mc_z += list(scale * v.pz * c * t_values / (v.gamma * v.mass))\n", + " \n", + " mc_x += [None]\n", + " mc_y += [None]\n", + " mc_z += [None]\n", + "\n", + "\n", + "fig = go.Figure(\n", + " 
data=[\n", + " go.Scatter3d(\n", + " x=np.clip(df[\"px\"], -2000, 2000),\n", + " y=np.clip(df[\"py\"], -2000, 2000),\n", + " z=np.clip(df[\"pz\"], -4000, 4000),\n", + " mode='markers',\n", + " marker=dict(\n", + " size=np.clip(2+2*np.log(df[\"plotsize\"]), 1, 15),\n", + " color=df[\"subdetector\"],\n", + " colorscale='Viridis',\n", + " opacity=0.8,\n", + " )\n", + " ),\n", + " go.Scatter3d(\n", + " x=mc_x,\n", + " y=mc_y,\n", + " z=mc_z,\n", + " mode='lines',\n", + " )\n", + " ],\n", + ")\n", + "fig.update_traces(marker_line_width=0, selector=dict(type='scatter3d'))\n", + "fig.update_layout(\n", + " width=1000,\n", + " height=1000,\n", + " margin=dict(l=20, r=20, t=20, b=20),\n", + ")\n", + "fig.update_layout(\n", + " scene=dict(\n", + " xaxis=dict(showgrid=False, showticklabels=False, range=[-2000, 2000]),\n", + " yaxis=dict(showgrid=False, showticklabels=False, range=[-4000, 4000]),\n", + " zaxis=dict(showgrid=False, showticklabels=False, range=[-2000, 2000]),\n", + " ),\n", + " scene_camera=dict(\n", + " up=dict(x=0, y=1, z=0),\n", + " center=dict(x=0, y=0, z=0),\n", + " eye=dict(\n", + " x=1.2,\n", + " y=0.8,\n", + " z=0\n", + " )\n", + " ),\n", + " showlegend=False\n", + ")\n", + "\n", + "# uncomment this to create a rotating animation\n", + "# idx = 0\n", + "# for angle in range(0, 360, 1):\n", + "# camera = dict(\n", + "# up=dict(x=0, y=1, z=1),\n", + "# center=dict(x=0, y=0, z=0),\n", + "# eye=dict(x=1.2 * np.cos(np.radians(angle)),\n", + "# y=0.8,\n", + "# z=1.2 * np.sin(np.radians(angle)))\n", + "# )\n", + "# fig.update_layout(scene_camera=camera)\n", + "# frame_filename = f\"frame_{idx:03d}.png\"\n", + "# idx += 1\n", + "# fig.write_image(frame_filename, scale=2)" + ] + }, + { + "cell_type": "markdown", + "id": "70561f34-e691-4649-adf2-4d80676179df", + "metadata": {}, + "source": [ + "Now compile the video with\n", + "```\n", + "ffmpeg -framerate 10 -pattern_type glob -i '*.png' -c:v libx264 -pix_fmt yuv420p out.mp4\n", + "```" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf5e1e37-0577-49fe-a1dc-5de2f748e786", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -239,7 +340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/notebooks/clic/paper_plots_2023_datasets.ipynb b/notebooks/clic/paper_plots_2023_datasets.ipynb index ac95337e3..3f2ab960c 100644 --- a/notebooks/clic/paper_plots_2023_datasets.ipynb +++ b/notebooks/clic/paper_plots_2023_datasets.ipynb @@ -22,7 +22,7 @@ "mplhep.style.use(mplhep.style.CMS)\n", "\n", "import sys\n", - "sys.path.append(\"../mlpf/\")\n", + "sys.path.append(\"../../mlpf/\")\n", "from plotting.plot_utils import pid_to_text, EVALUATION_DATASET_NAMES" ] }, @@ -202,9 +202,7 @@ "cell_type": "code", "execution_count": null, "id": "415989d8", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "b = np.linspace(0, 200, 101)\n", @@ -228,7 +226,7 @@ "plt.legend()\n", "plt.ylim(0, 10*num_files)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - "plt.savefig(\"plots/clic/num_tracks.pdf\")" + "plt.savefig(\"plots_mlpf_clic_2023/num_tracks.pdf\")" ] }, { @@ -257,7 +255,7 @@ "plt.legend()\n", "plt.ylim(0,15*num_files)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - 
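For reference, the helix drawn through the hits in the event display above uses the standard bending-radius relation R [m] = p_T [GeV] / (0.3 |q| B [T]), with the 0.3 factor converting GeV, tesla, and metres. A quick numeric check (values illustrative, B = 4 T as set in the notebook):

B = 4.0    # solenoid field in tesla, as set in the notebook
pt = 10.0  # GeV, an illustrative track
R = pt / (1.0 * 0.3 * B)  # bending radius of a unit-charge track: 10 / 1.2 ≈ 8.33 m
print(round(R, 2))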
"plt.savefig(\"plots/clic/num_clusters.pdf\")" + "plt.savefig(\"plots_mlpf_clic_2023/num_clusters.pdf\")" ] }, { @@ -280,16 +278,14 @@ "plt.legend()\n", "plt.ylim(0,500)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - "plt.savefig(\"plots/clic/num_hits.pdf\")" + "plt.savefig(\"plots_mlpf_clic_2023/num_hits.pdf\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4ea58410", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "gen_pt1 = ak.flatten(data_tt[\"ygen\"][data_tt[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", @@ -321,29 +317,29 @@ "fig = plt.figure()\n", "ax = plt.axes()\n", "\n", - "prev = mplhep.histplot(h1, histtype=\"step\", lw=1, label=label_tt, ls=\"--\")\n", - "mplhep.histplot(h1c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "prev = mplhep.histplot(h1, flow=\"sum\", histtype=\"step\", lw=1, label=label_tt, ls=\"--\")\n", + "mplhep.histplot(h1c, flow=\"sum\", histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", "\n", - "prev = mplhep.histplot(h2, histtype=\"step\", lw=1, label=label_qq, ls=\"--\")\n", - "mplhep.histplot(h2c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "prev = mplhep.histplot(h2, flow=\"sum\", histtype=\"step\", lw=1, label=label_qq, ls=\"--\")\n", + "mplhep.histplot(h2c, flow=\"sum\", histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", "\n", - "prev = mplhep.histplot(h3, histtype=\"step\", lw=1, label=label_zh, ls=\"--\")\n", - "mplhep.histplot(h3c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "prev = mplhep.histplot(h3, flow=\"sum\", histtype=\"step\", lw=1, label=label_zh, ls=\"--\")\n", + "mplhep.histplot(h3c, flow=\"sum\", histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", "\n", - "prev = mplhep.histplot(h4, histtype=\"step\", lw=1, label=label_ww, ls=\"--\")\n", - "mplhep.histplot(h4c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "prev = mplhep.histplot(h4, flow=\"sum\", histtype=\"step\", lw=1, label=label_ww, ls=\"--\")\n", + "mplhep.histplot(h4c, flow=\"sum\", histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", "\n", - "prev = mplhep.histplot(h5, histtype=\"step\", lw=1, label=label_tt + \" PU10\", ls=\"--\")\n", - "mplhep.histplot(h5c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "prev = mplhep.histplot(h5, flow=\"sum\", histtype=\"step\", lw=1, label=label_tt + \" PU10\", ls=\"--\")\n", + "mplhep.histplot(h5c, flow=\"sum\", histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", "\n", "plt.xscale(\"log\")\n", "plt.xlabel(\"particle $p_T$ [GeV]\")\n", - "plt.ylabel(\"Number of particles / bin\")\n", + "plt.ylabel(\"Particles / bin\")\n", "plt.legend()\n", "plt.text(0.03, 0.97, \"dashed - stable generator particles\\nsolid - reconstructed Pandora PF particles\", transform=ax.transAxes, va=\"top\", ha=\"left\", fontsize=16)\n", "plt.ylim(0,500*num_files)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - "plt.savefig(\"plots/clic/gen_cand_particle_pt.pdf\")" + "plt.savefig(\"plots_mlpf_clic_2023/gen_cand_particle_pt.pdf\")\n" ] }, { diff --git a/notebooks/clic/paper_plots_2023_loss_curves.ipynb b/notebooks/clic/paper_plots_2023_loss_curves.ipynb deleted file mode 100644 index a916d34be..000000000 --- 
a/notebooks/clic/paper_plots_2023_loss_curves.ipynb +++ /dev/null @@ -1,260 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "from pathlib import Path\n", - "import numpy as np\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_dirs = [\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211025_083504_712103.workergpu007/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211025_083504_715900.workergpu006/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211025_083504_717867.workergpu020/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211025_083504_732144.workergpu010/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211025_083504_859436.workergpu036/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211026_042043_168888.workergpu007/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211026_042043_175689.workergpu036/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211026_042043_178263.workergpu010/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211026_042043_309903.workergpu020/\",\n", - " \"../experiments/all_data_cms-best-of-asha-scikit_20211026_042043_311628.workergpu006/\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "paths = []\n", - "for train_dir in train_dirs:\n", - " paths.append(Path(train_dir))\n", - "\n", - "histories = []\n", - "for path in paths:\n", - " with open(path / \"history/history.json\") as h:\n", - " histories.append(json.load(h))\n", - "\n", - "for history in histories:\n", - " reg_loss = np.sum(\n", - " np.array([history[\"{}_loss\".format(l)] for l in [\"energy\", \"pt\", \"eta\", \"sin_phi\", \"cos_phi\", \"charge\"]]), axis=0\n", - " )\n", - " val_reg_loss = np.sum(\n", - " np.array([history[\"val_{}_loss\".format(l)] for l in [\"energy\", \"pt\", \"eta\", \"sin_phi\", \"cos_phi\", \"charge\"]]), axis=0\n", - " )\n", - " history.update({\"reg_loss\": reg_loss})\n", - " history.update({\"val_reg_loss\": val_reg_loss})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def cms_label(x0=0.12, x1=0.23, x2=0.67, y=0.90):\n", - " plt.figtext(x0, y, \"CMS\", fontweight=\"bold\", wrap=True, horizontalalignment=\"left\", fontsize=16)\n", - " plt.figtext(x1, y, \"Simulation Preliminary\", style=\"italic\", wrap=True, horizontalalignment=\"left\", fontsize=14)\n", - " plt.figtext(\n", - " x2,\n", - " y,\n", - " r\"Run 3 (14 TeV), $\\mathrm{t}\\overline{\\mathrm{t}}$, QCD with PU50; $\\mu, \\pi, \\pi_0, \\tau, \\gamma$, single particle guns\",\n", - " wrap=False,\n", - " horizontalalignment=\"left\",\n", - " fontsize=12,\n", - " )\n", - "\n", - "\n", - "def get_combined_array(key):\n", - " combined_array = np.array(histories[0][key])\n", - " for ii in range(1, len(histories)):\n", - " combined_array = np.vstack([combined_array, np.array(histories[ii][key])])\n", - " return combined_array" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "loss_array = get_combined_array(\"loss\")\n", - "reg_loss_array = get_combined_array(\"reg_loss\")\n", - "cls_loss_array = 
get_combined_array(\"cls_loss\")\n", - "\n", - "val_loss_array = get_combined_array(\"val_loss\")\n", - "val_reg_loss_array = get_combined_array(\"val_reg_loss\")\n", - "val_cls_loss_array = get_combined_array(\"val_cls_loss\")\n", - "\n", - "cls_acc_weighted_array = get_combined_array(\"cls_acc_weighted\")\n", - "val_cls_acc_weighted_array = get_combined_array(\"val_cls_acc_weighted\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "loss_array.shape, val_loss_array.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_variance_curve(array_list, labels, skip=0, ylim=None, save_path=None):\n", - " fig = plt.figure()\n", - " final_means = []\n", - " final_stds = []\n", - " for ii, array in enumerate(array_list):\n", - " xx = np.array(range(array.shape[1])) + 1 # Epochs\n", - "\n", - " xx = xx[skip:]\n", - " array = array[:, skip:]\n", - "\n", - " std = np.std(array, axis=0)\n", - " mean = np.mean(array, axis=0)\n", - "\n", - " plt.plot(xx, mean, label=labels[ii])\n", - " plt.fill_between(xx, mean - std, mean + std, alpha=0.4)\n", - "\n", - " # Add individual loss curves\n", - " # plt.plot(np.tile(xx, reps=[10,1]).transpose(), array.transpose(), linewidth=0.2)\n", - "\n", - " print(labels[ii] + \": {:.5f} +/- {:.5f}\".format(mean[-1], std[-1]))\n", - " final_means.append(mean[-1])\n", - " final_stds.append(std[-1])\n", - "\n", - " plt.legend(bbox_to_anchor=(0.98, 0.78), loc=\"center right\")\n", - " plt.xlabel(\"Epochs\")\n", - " plt.figtext(x=0.45, y=0.53, s=\"Mean and standard deviation\\nof {} trainings\".format(array.shape[0]), fontsize=14)\n", - " s = \"\"\n", - " for ii, label in enumerate(labels):\n", - " s += \"Final {}: {:.5f} +/- {:.5f}\\n\".format(label.lower(), final_means[ii], final_stds[ii])\n", - " plt.figtext(x=0.45, y=0.45, s=s, fontsize=12)\n", - " if ylim:\n", - " plt.ylim(top=ylim[1], bottom=ylim[0])\n", - "\n", - " cms_label(x0=0.13, x1=0.185, x2=0.43)\n", - " if save_path:\n", - " plt.savefig(save_path)\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "mpl.rc_file(\"my_matplotlib_rcparams\")\n", - "plot_variance_curve(\n", - " [loss_array, val_loss_array],\n", - " labels=[\"Training loss\", \"Validation loss\"],\n", - " skip=0,\n", - " ylim=[0.0, 0.25],\n", - " save_path=\"loss_curves_std.pdf\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_variance_curve(\n", - " [reg_loss_array, val_reg_loss_array],\n", - " labels=[\"Training regression loss\", \"Validation regression loss\"],\n", - " skip=25,\n", - " ylim=(0.0, 0.04),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_variance_curve(\n", - " [cls_loss_array, val_cls_loss_array],\n", - " labels=[\"Training classification loss\", \"Validation classification loss\"],\n", - " skip=25,\n", - " ylim=(0.0, 0.005),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_variance_curve(\n", - " [cls_acc_weighted_array, val_cls_acc_weighted_array],\n", - " [\"Weighted train accuracy\", \"Weighted valididation 
accuracy\"],\n", - " skip=0,\n", - " ylim=(0.8, 1),\n", - " save_path=\"cls_acc_std.pdf\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tf2", - "language": "python", - "name": "tf2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/clic/paper_plots_2023_ml_training.ipynb b/notebooks/clic/paper_plots_2023_ml_training.ipynb index b1f40ca6a..52724fb67 100644 --- a/notebooks/clic/paper_plots_2023_ml_training.ipynb +++ b/notebooks/clic/paper_plots_2023_ml_training.ipynb @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -f plots_mlpf_clic_2023" + "!mkdir plots_mlpf_clic_2023" ] }, { @@ -101,11 +101,11 @@ "metadata": {}, "outputs": [], "source": [ - "histories_gnn_before = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning/clic_gnn_beforeHPO/*\")))\n", - "histories_gnn_after = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_gnn_afterHPO/*\")))\n", + "histories_gnn_before = get_histories(list(glob.glob(\"../../models/clic2023_20230802/hypertuning/clic_gnn_beforeHPO/*\")))\n", + "histories_gnn_after = get_histories(list(glob.glob(\"../../models/clic2023_20230802/hypertuning//clic_gnn_afterHPO/*\")))\n", "\n", - "histories_tf_before = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_transformer_beforeHPO/*\")))\n", - "histories_tf_after = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_transformer_afterHPO/*\")))\n" + "histories_tf_before = get_histories(list(glob.glob(\"../../models/clic2023_20230802/hypertuning//clic_transformer_beforeHPO/*\")))\n", + "histories_tf_after = get_histories(list(glob.glob(\"../../models/clic2023_20230802/hypertuning//clic_transformer_afterHPO/*\")))" ] }, { @@ -205,10 +205,11 @@ " plt.axhline(threshold, ls=\"--\", color=\"black\", label=\"baseline PF\") \n", " \n", "# plt.legend(bbox_to_anchor=(0.98, 0.78), loc=\"center right\")\n", + " legtitle = r\"$\\mathrm{t}\\overline{\\mathrm{t}}, \\gamma/\\mathrm{Z}^* \\rightarrow \\mathrm{hadrons}$\"\n", " if loc is not None:\n", - " plt.legend(loc=loc)\n", + " plt.legend(loc=loc, title=legtitle)\n", " else:\n", - " plt.legend()\n", + " plt.legend(title=legtitle)\n", " plt.xlabel(\"Epochs\")\n", " if ylabel:\n", " plt.ylabel(ylabel)\n", @@ -216,17 +217,17 @@ " s=\"Mean and stddev of {:d} trainings\\n\".format(array.shape[0])\n", " for ii, label in enumerate(labels):\n", " if custom_info:\n", - " s += \"Final {}:${:s}$\\n\".format(label, sigdigits(custom_info[ii]['mean'], custom_info[ii][\"std\"]))\n", + " s += \"Final {}: ${:s}$\\n\".format(label, sigdigits(custom_info[ii]['mean'], custom_info[ii][\"std\"]))\n", " else:\n", - " s += \"Final {}:${:s}$\\n\".format(label, sigdigits(final_means[ii], final_stds[ii]))\n", + " s += \"Final {}: ${:s}$\\n\".format(label, sigdigits(final_means[ii], final_stds[ii]))\n", "\n", " if ylim:\n", " plt.ylim(top=ylim[1], bottom=ylim[0])\n", "\n", - " plt.subplots_adjust(left=0.14)\n", + " # plt.subplots_adjust(left=0.14)\n", " \n", " cms_label(x0=x, y=y, s=s, fz=24)\n", - " 
run_label(x=0.9, y=0.89, fz=22)\n", + " # run_label(x=0.9, y=0.89, fz=22)\n", " if save_path:\n", " plt.savefig(Path(save_path).with_suffix('.png'))\n", " plt.savefig(Path(save_path).with_suffix('.pdf'))\n", @@ -243,9 +244,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_variance_curve([ret[\"gnn\"][\"before\"][\"val_loss\"], ret[\"gnn\"][\"after\"][\"val_loss\"],\n", @@ -255,7 +254,7 @@ " skip=1,\n", " ylim=[0, 20],\n", " save_path=\"plots_mlpf_clic_2023/loss.png\",\n", - " x=0.25,\n", + " x=0.20,\n", " y=0.85,\n", " ylabel=\"Total validation loss (a.u.)\"\n", " )" @@ -273,7 +272,7 @@ " [(\"red\", \"--\"), (\"red\", \"-\"), (\"blue\", \"--\"), (\"blue\", \"-\")],\n", " skip=1,\n", " save_path=\"plots_mlpf_clic_2023/jet_iqr.png\",\n", - " x=0.25,\n", + " x=0.20,\n", " y=0.85,\n", " ylim=(0, 0.3),\n", " ylabel=r\"jet response IQR\")" @@ -291,7 +290,7 @@ " [(\"red\", \"--\"), (\"red\", \"-\"), (\"blue\", \"--\"), (\"blue\", \"-\")],\n", " skip=1,\n", " save_path=\"plots_mlpf_clic_2023/met_iqr.png\",\n", - " x=0.25,\n", + " x=0.2,\n", " y=0.85,\n", " ylim=(0, 2),\n", " ylabel=r\"MET response IQR\"\n", @@ -302,7 +301,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Figure 5: scaling of timing" + "## Figure 5: scaling of inference timing with number of inputs" ] }, { @@ -311,9 +310,9 @@ "metadata": {}, "outputs": [], "source": [ - "timing_data_gpu_1 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_1.txt\").read()\n", - "timing_data_gpu_2 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_2.txt\").read()\n", - "timing_data_gpu_3 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_2.txt\").read()" + "timing_data_gpu_1 = open(\"../../models/clic2023_20230802/timing/mlpf-gnn/gpu_timing_1.txt\").read()\n", + "timing_data_gpu_2 = open(\"../../models/clic2023_20230802/timing/mlpf-gnn/gpu_timing_2.txt\").read()\n", + "timing_data_gpu_3 = open(\"../../models/clic2023_20230802/timing/mlpf-gnn/gpu_timing_2.txt\").read()" ] }, { @@ -367,11 +366,8 @@ " label=\"B={}\".format(batch),\n", " marker=m)\n", "plt.legend(loc=\"best\")\n", - "\n", - "plt.legend(loc=\"best\")\n", - "plt.ylabel(\"relative time per event\\nT(N,B) / T(256,16)\")\n", - "plt.xlabel(\"number of input elements per event, N\")\n", - "plt.title(\"MLPF-GNN on 8GB GPU\")\n", + "plt.ylabel(\"Relative time per event\\nT(N,B) / T(256,16)\")\n", + "plt.xlabel(\"Input elements per event, N\")\n", "plt.savefig(\"plots_mlpf_clic_2023/mlpf_gnn.png\")\n", "plt.savefig(\"plots_mlpf_clic_2023/mlpf_gnn.pdf\")\n", "plt.show()\n" @@ -455,37 +451,58 @@ "plt.plot([25,200], [1,8], color=\"black\", label=\"linear scaling\", ls=\"--\")\n", "plt.legend()\n", "plt.ylabel(\"relative time per event, $T(N)/T(25)$\")\n", - "plt.xlabel(\"number of $\\pi^-$ particles per event, $N$\")\n", + "plt.xlabel(\"$\\pi^-$ particles per event, $N$\")\n", "#plt.xlim(0,100)\n", "#plt.ylim(0,10)\n", - "plt.title(\"Baseline PF on CPU\")\n", "plt.savefig(\"plots_mlpf_clic_2023/baseline_pf.png\")\n", "plt.savefig(\"plots_mlpf_clic_2023/baseline_pf.pdf\")\n", "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Figure 6: jet and MET IQR for the hit-based training" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# gpu_scaling_x = np.array([1,2,4,8])\n", - "# gpu_scaling_y = np.array([443.72, 209.913, 
92.476, 34.263])\n", - "# gpu_scaling_y /= gpu_scaling_y[0]\n", - "# plt.plot(gpu_scaling_x, 1.0/gpu_scaling_y, lw=0, marker=\"o\")\n", - "# plt.plot([1,8],[1,8], color=\"black\", ls=\"--\", label=\"linear scaling\")\n", - "# plt.xlabel(\"Number of GPUs\")\n", - "# plt.ylabel(\"Training epoch throughput\\nincrease over 1 GPU\")\n", - "# plt.title(\"Scaling test on LUMI HPC: MI250X\")\n", - "# plt.show()" + "hists = sorted(list(glob.glob(\"../../models/clic2023_20230802/hits/clic-hits-ln_*/logs/history/*\")))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Figure 6: jet and MET IQR for the hit-based training" + "loss_vals = []\n", + "val_loss_vals = []\n", + "\n", + "for hist in hists:\n", + " loss_vals.append(json.load(open(hist))[\"loss\"])\n", + " val_loss_vals.append(json.load(open(hist))[\"val_loss\"])\n", + "\n", + "loss_vals = np.array(loss_vals)\n", + "val_loss_vals = np.array(val_loss_vals)\n", + "\n", + "plt.plot(loss_vals, label=\"train loss\", color=\"black\", ls=\"--\", marker=\"s\")\n", + "plt.plot(val_loss_vals, label=\"val loss\", color=\"black\", marker=\"o\")\n", + "\n", + "plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", + "plt.ylim(0,0.4)\n", + "plt.xlim(0, 22)\n", + "plt.ylabel(\"Loss (a.u.)\")\n", + "plt.xlabel(\"Training epoch\")\n", + "#plt.title(\"Training on tracks and calorimeter hits\")\n", + "#plt.savefig(\"plots_mlpf_clic_2023/hitbased_loss.png\")\n", + "#plt.savefig(\"plots_mlpf_clic_2023/hitbased_loss.pdf\")\n", + "plt.show()" ] }, { @@ -497,6 +514,7 @@ "jet_iqr_vals = []\n", "met_iqr_vals = []\n", "\n", + "xvals = np.arange(1,11)\n", "for hist in hists:\n", " jet_iqr_vals.append(json.load(open(hist))[\"val_jet_iqr\"])\n", " met_iqr_vals.append(json.load(open(hist))[\"val_met_iqr\"])\n", @@ -504,11 +522,11 @@ "jet_iqr_vals = np.array(jet_iqr_vals)\n", "met_iqr_vals = np.array(met_iqr_vals)\n", "\n", - "plt.plot(jet_iqr_vals, label=\"jet response IQR\", marker=\"o\")\n", - "plt.plot(met_iqr_vals/5, label=\"MET response IQR / 5\", marker=\"o\")\n", + "plt.plot(xvals, jet_iqr_vals, label=\"jet response IQR\", marker=\"o\")\n", + "plt.plot(xvals, met_iqr_vals/5, label=\"MET response IQR / 5\", marker=\"o\")\n", "plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", "plt.ylim(0,0.2)\n", - "plt.xlim(0, 12)\n", + "plt.xlim(1, 10)\n", "plt.ylabel(\"Response IQR (a.u.)\")\n", "plt.xlabel(\"Training epoch\")\n", "#plt.title(\"Training on tracks and calorimeter hits\")\n", @@ -538,7 +556,7 @@ "plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", "plt.axhline(1.0, color=\"black\", ls=\"--\")\n", "plt.ylim(0.8,1.2)\n", - "plt.xlim(0, 12)\n", + "#plt.xlim(0, 12)\n", "\n", "plt.ylabel(\"Response median (a.u.)\")\n", "plt.xlabel(\"Training epoch\")\n", @@ -552,7 +570,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### GPU scaling on AMD" + "## Scaling with a varying number of GPUs" ] }, { @@ -561,10 +579,19 @@ "metadata": {}, "outputs": [], "source": [ - "hist_1gpu = get_histories(list(glob.glob(\"../../experiments/gpu_scaling/amd/clic-test_20230804_102126_135296.nid007240/\")))\n", - "hist_2gpu = get_histories(list(glob.glob(\"../../experiments/gpu_scaling/amd/clic-test_20230804_103542_563525.nid007250/\")))\n", - "hist_4gpu = get_histories(list(glob.glob(\"../../experiments/gpu_scaling/amd/clic-test_20230804_104304_603100.nid007259/\")))\n", - "hist_8gpu = 
get_histories(list(glob.glob(\"../../experiments/gpu_scaling/amd/clic-test_20230804_105224_350322.nid007259/\")))" + "def import_scale_test(path=\"../../models/clic2023_20230802/gpu_scaling/scale_test_gnn_h100/scale_testV3_*/result.json\"):\n", + " num_gpus = []\n", + " epoch_times = []\n", + " df = pandas.DataFrame()\n", + " for fn in glob.glob(path):\n", + " data = json.load(open(fn))\n", + " epoch_times.append(np.mean(data[\"wl-stats\"][\"epoch_times\"][1:]))\n", + " num_gpus.append(data[\"wl-stats\"][\"GPU\"])\n", + "\n", + " df[\"num_gpus\"] = num_gpus\n", + " df[\"epoch_times\"] = epoch_times\n", + " df = df.sort_values(\"num_gpus\")\n", + " return df" ] }, { @@ -573,10 +600,9 @@ "metadata": {}, "outputs": [], "source": [ - "times_1gpu = np.diff(hist_1gpu[0][\"time\"])\n", - "times_2gpu = np.diff(hist_2gpu[0][\"time\"])\n", - "times_4gpu = np.diff(hist_4gpu[0][\"time\"])\n", - "times_8gpu = np.diff(hist_8gpu[0][\"time\"])" + "df_h100 = import_scale_test(\"../../models/clic2023_20230802/gpu_scaling/scale_test_gnn_h100/scale_testV3_*/result.json\")\n", + "df_mi250x = import_scale_test(\"../../models/clic2023_20230802/gpu_scaling/scale_test_gnn_mi250x/*/result.json\")\n", + "df_hpu = import_scale_test(\"../../models/clic2023_20230802/gpu_scaling/scale_test_habana/*/result.json\")" ] }, { @@ -585,24 +611,24 @@ "metadata": {}, "outputs": [], "source": [ - "times = [times_1gpu, times_2gpu, times_4gpu, times_8gpu]\n", - "times_m = np.array(list(map(np.mean, times)))\n", - "times_m_1 = times_m[0]\n", - "times_m = times_m_1 / times_m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gpus = [1,2,4,8]\n", - "plt.plot(gpus, times_m, marker=\"o\", lw=1, label=\"AMD MI250X\")\n", - "plt.plot(gpus, [1,2,4,8], color=\"black\", ls=\"--\")\n", - "plt.ylabel(\"speedup over 1 GPU\")\n", - "plt.xlabel(\"number of GPUs\")\n", - "plt.legend(loc=2)" + "plt.plot([1,8],[1,8], color=\"black\", ls=\"--\", label=\"linear scaling\")\n", + "plt.plot(\n", + " df_h100[\"num_gpus\"].values,\n", + " df_h100[\"epoch_times\"].values[0]/df_h100[\"epoch_times\"].values,\n", + " marker=\"o\", label=\"H100\")\n", + "plt.plot(\n", + " df_mi250x[\"num_gpus\"].values,\n", + " df_mi250x[\"epoch_times\"].values[0]/df_mi250x[\"epoch_times\"].values,\n", + " marker=\"s\", label=\"MI250X\")\n", + "plt.plot(\n", + " df_hpu[\"num_gpus\"].values,\n", + " df_hpu[\"epoch_times\"].values[0]/df_hpu[\"epoch_times\"].values,\n", + " marker=\"^\", label=\"Habana\")\n", + "plt.legend(loc=\"best\")\n", + "plt.xlabel(\"Accelerator processors, N\")\n", + "plt.ylabel(\"Speedup over single accelerator, T(1)/T(N)\")\n", + "plt.savefig(\"./plots_mlpf_clic_2023/scale_test.pdf\")\n", + "plt.savefig(\"./plots_mlpf_clic_2023/scale_test.png\")" ] }, { diff --git a/parameters/clic-hits.yaml b/parameters/clic-hits.yaml index d740a8d1c..de62a96a8 100644 --- a/parameters/clic-hits.yaml +++ b/parameters/clic-hits.yaml @@ -245,38 +245,38 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_qq_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_kaon0l_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_gamma_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_pi_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_pi0_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: 
clic_edm_single_neutron_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_electron_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_single_muon_hits_pf: - version: 1.2.0 + version: 1.5.0 data_dir: manual_dir: diff --git a/parameters/clic-test.yaml b/parameters/clic-test.yaml index 1e6522aa7..866322e4d 100644 --- a/parameters/clic-test.yaml +++ b/parameters/clic-test.yaml @@ -252,6 +252,6 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: diff --git a/parameters/clic.yaml b/parameters/clic.yaml index 58f0ecd2f..768ca7ca5 100644 --- a/parameters/clic.yaml +++ b/parameters/clic.yaml @@ -61,26 +61,7 @@ setup: batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu - bucket_by_sequence_length: yes - # these sizes were sort of tuned for an 8GB GPU - # - max_sequence_length, batch_size_per_gpu - -#on 8GB GPU - bucket_batch_sizes: - - 25, 200 - - 50, 100 - - 100, 50 - - 200, 20 - - 500, 10 - - 1000, 5 - - 2000, 3 - - 3000, 2 - - 4000, 2 - - 5000, 1 - - 6000, 1 - - inf, 1 - # use this batch multiplier to increase all batch sizes by a constant factor - batch_multiplier: 1 + bucket_by_sequence_length: no optimizer: adam: @@ -234,7 +215,7 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 1 + batch_per_gpu: 50 datasets: - clic_edm_ttbar_pf - clic_edm_qq_pf @@ -264,22 +245,22 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_ttbar_pu10_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_qq_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_ww_fullhad_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: clic_edm_zh_tautau_pf: - version: 1.4.0 + version: 1.5.0 data_dir: manual_dir: diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 382e30a34..98b5d205d 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -285,50 +285,50 @@ evaluation_jet_algo: antikt_algorithm datasets: cms_pf_ttbar: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_ztt: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_qcd: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_qcd_high_pt: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_electron: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_gamma: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_pi0: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_neutron: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_pi: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_tau: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_mu: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: cms_pf_single_proton: - version: 1.5.1 + version: 1.6.0 data_dir: manual_dir: diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 0ebf8e673..da3822f06 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -231,21 +231,21 @@ train_test_datasets: delphes: batch_per_gpu: 5 datasets: - - delphes_pf + - delphes_data_pf -validation_dataset: delphes_pf +validation_dataset: delphes_data_pf validation_batch_size: 5 validation_num_events: 100 evaluation_datasets: - delphes_pf: + delphes_data_pf: batch_size: 5 num_events: -1 evaluation_jet_algo: antikt_algorithm 
datasets: - delphes_pf: - version: 1.1.0 + delphes_data_pf: + version: 1.2.0 data_dir: manual_dir: diff --git a/requirements.txt b/requirements.txt index 217fec984..efb43dbb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +array-record autopep8 awkward boost_histogram @@ -15,27 +16,22 @@ networkx nevergrad notebook numba -numpy==1.23.5 # later versions are incompatible with tf2onnx v1.14.0 (latest as of this commit) +numpy onnxruntime pandas papermill plotly pre-commit -protobuf==3.20.3 +protobuf pyarrow -ray[default] ray[tune] scikit-learn scikit-optimize scipy seaborn setGPU -tensorflow -tensorflow-datasets==4.8.0 -tensorflow-estimator -tensorflow-probability -tf-models-official -tf2onnx +tensorflow>=2.13.0 +tensorflow-datasets>=4.9.2 tqdm uproot vector diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index b642fbe28..b3624f388 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -3,16 +3,15 @@ # Tallinn export MANUAL_DIR=/local/joosep/mlpf/cms/v2 export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets -export IMG=/home/software/singularity/tf-2.11.0.simg +export IMG=/home/software/singularity/tf-2.13.0.simg export PYTHONPATH=`pwd`/mlpf export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$PYTHONPATH $IMG tfds build " # Desktop -# IMG=/home/joosep/HEP-KBFI/singularity/tf-2.10.0.simg -# MANUAL_DIR=data/ +# IMG=/home/joosep/HEP-KBFI/singularity/tf-2.13.0.simg # DATA_DIR=/home/joosep/tensorflow_datasets # export PYTHONPATH="mlpf:$PYTHONPATH" -# CMD="singularity exec --env PYTHONPATH=$PYTHONPATH $IMG tfds build " +# CMD="singularity exec -B /media/joosep/data --env PYTHONPATH=$PYTHONPATH $IMG tfds build " # CMS # $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & @@ -30,7 +29,8 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # CLIC cluster-based -# export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep_2023_05_09/ +# export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep/ +# export MANUAL_DIR=/media/joosep/data/mlpf/clic_edm4hep_2023_02_27/ # $CMD mlpf/heptfds/clic_pf_edm4hep/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq.log & # $CMD mlpf/heptfds/clic_pf_edm4hep/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & # $CMD mlpf/heptfds/clic_pf_edm4hep/zh --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_zh.log & @@ -40,6 +40,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # CLIC hit-based # export MANUAL_DIR=/local/joosep/mlpf_hits/clic_edm4hep/ +# export MANUAL_DIR=/media/joosep/data/mlpf_hits/clic_edm4hep/ # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_kaon0L_hits.log & @@ -52,5 +53,6 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # Delphes -# $CMD mlpf/heptfds/delphes_pf/delphes_pf &> logs/tfds_delphes.log & +# export MANUAL_DIR=/local/joosep/mlpf/delphes/ +# $CMD mlpf/heptfds/delphes_pf/delphes_data_pf --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite # wait diff --git 
a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 6d4c0bad3..fe8a629b0 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -11,7 +11,7 @@ mv tev14_pythia8_ttbar_0_0.pkl.bz2 data/delphes_pf/pythia8_ttbar/ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 mv tev14_pythia8_qcd_10_0.pkl.bz2 data/delphes_pf/pythia8_qcd/ -tfds build mlpf/heptfds/delphes_pf --download_dir data/ --manual_dir data/delphes_pf +tfds build mlpf/heptfds/delphes_pf/delphes_data_pf --download_dir data/ --manual_dir data/delphes_pf #Run a simple training on a few events python mlpf/pipeline.py train --config parameters/delphes.yaml --nepochs 1 --ntrain 5 --ntest 5 --customize pipeline_test diff --git a/scripts/tallinn/mlpf-train-a100-mir2.sh b/scripts/tallinn/mlpf-train-a100-hits.sh similarity index 64% rename from scripts/tallinn/mlpf-train-a100-mir2.sh rename to scripts/tallinn/mlpf-train-a100-hits.sh index d05a403d9..7a9c14715 100755 --- a/scripts/tallinn/mlpf-train-a100-mir2.sh +++ b/scripts/tallinn/mlpf-train-a100-hits.sh @@ -11,9 +11,11 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/clic-test.yaml \ - --plot-freq 0 --num-cpus 32 --batch-multiplier 5 \ - --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir + $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml \ + --plot-freq 1 --num-cpus 32 --batch-multiplier 2 --ntrain 10000 --ntest 10000 + +# $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml \ +# --plot-freq 1 --num-cpus 32 --batch-multiplier 2 # --env TF_GPU_THREAD_MODE=gpu_private \ # --env TF_GPU_THREAD_COUNT=8 \ diff --git a/scripts/tallinn/mlpf-train-a100.sh b/scripts/tallinn/mlpf-train-a100.sh index 9b3f82a59..7cb5a4635 100755 --- a/scripts/tallinn/mlpf-train-a100.sh +++ b/scripts/tallinn/mlpf-train-a100.sh @@ -11,9 +11,11 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml \ - --plot-freq 1 --num-cpus 32 --batch-multiplier 2 \ - --weights experiments/clic-hits_20230804_231819_093246.gpu1.local/weights/weights-04-0.195574.hdf5 + $IMG python mlpf/pipeline.py train -c parameters/clic.yaml \ + --plot-freq 1 --num-cpus 32 --batch-multiplier 5 + +# $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml \ +# --plot-freq 1 --num-cpus 32 --batch-multiplier 2 # --env TF_GPU_THREAD_MODE=gpu_private \ # --env TF_GPU_THREAD_COUNT=8 \
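
Note on the dataset version bumps above: the updated parameters files (clic.yaml, clic-hits.yaml, cms-gen.yaml, delphes.yaml) now reference TFDS builds 1.5.0 / 1.6.0 / 1.2.0, which must already exist under the TFDS data directory before the training scripts are launched. A minimal sanity-check sketch, assuming the TFDS_DATA_DIR used in the Tallinn training scripts; the path and the two datasets picked here are illustrative, not part of this diff:

import tensorflow_datasets as tfds

# TFDS_DATA_DIR from scripts/tallinn/mlpf-train-a100.sh; adjust to the local store.
data_dir = "/scratch/persistent/joosep/tensorflow_datasets"

# (name, version) pairs taken from the updated parameters/*.yaml files.
for name, version in [("clic_edm_ttbar_pf", "1.5.0"), ("cms_pf_ttbar", "1.6.0")]:
    # builder_from_directory reads a prepared dataset straight from disk,
    # without needing the custom builder code under mlpf/heptfds on PYTHONPATH.
    builder = tfds.builder_from_directory(f"{data_dir}/{name}/{version}")
    print(name, builder.info.version, list(builder.info.splits))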