In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
import energyflow as ef
import energyflow.archs
from energyflow.archs import PFN
from matplotlib import gridspec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Layer, concatenate
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from tensorflow.keras.layers import BatchNormalization

# Notebook-wide matplotlib text styling: 20 pt serif font for all figures.
plt.rc('font', size=20, family='serif')


In [3]:
# Same datasets as used in the OmniFold paper (https://arxiv.org/abs/1911.09107);
# further details at https://energyflow.network/docs/datasets/.
# Pythia and Herwig are two event generators: one is treated here as the
# "simulation" and the other as "data".
# One million events are requested per generator.
datasets = {
    generator: ef.zjets_delphes.load(generator, num_data=1000000)
    for generator in ('Pythia26', 'Herwig')
}



In [4]:
# Charge sign for each recognised PID code, keyed by the code multiplied by ten
# (so 0.1 -> key 1, ..., 1.3 -> key 13). Integer keys avoid exact float equality.
# NOTE(review): the codes look like EnergyFlow's remapped particle IDs for the
# Z+jets samples -- confirm the mapping against the dataset documentation.
_CHARGE_SIGN_BY_TENTHS = {
    0: 0,
    1: 1,
    2: -1,
    3: 0,
    4: -1,
    5: 1,
    6: -1,
    7: 1,
    8: 1,
    9: -1,
    10: 1,
    11: -1,
    12: 0,
    13: 0,
}

# Largest accepted distance (in units of tenths) between a code and the nearest
# table entry. Generous enough for float32/float64 rounding noise, tight enough
# that a value such as 0.15 is still reported as unknown.
_PID_CODE_TOLERANCE = 1e-3


def is_charged(myin):
    """Return the electric-charge sign associated with a PID code.

    The recognised codes are the multiples of 0.1 from 0.0 to 1.3. Matching is
    done on the code rounded to the nearest tenth, so values that carry
    floating-point rounding error (for example ``0.1 + 0.2``, or a float32
    value widened to float64) are classified the same way as the exact code.

    Args:
        myin: Scalar PID code (Python or NumPy number).

    Returns:
        ``1``, ``-1`` or ``0`` for a recognised code. ``None`` when the value
        is not (within tolerance) one of the recognised codes, is NaN or
        infinite, or cannot be converted to a float.
    """
    try:
        tenths = float(myin) * 10.0
        slot = round(tenths)  # raises for NaN (ValueError) and inf (OverflowError)
    except (TypeError, ValueError, OverflowError):
        return None
    if abs(tenths - slot) > _PID_CODE_TOLERANCE:
        return None
    return _CHARGE_SIGN_BY_TENTHS.get(slot)

In [5]:
def _charge_observables(particles):
    """Compute two per-jet charge observables from per-jet constituent arrays.

    Args:
        particles: Sequence with one 2-D array per jet, one row per
            constituent. Column 0 is read as the constituent pT and column 3
            as the PID code handed to ``is_charged``.

    Returns:
        A pair of lists, each with one entry per jet:
        * the sqrt(pT)-weighted mean of the constituent charge signs,
          ``sum(q * pT**0.5) / sum(pT**0.5)``;
        * the fraction of the summed pT carried by constituents whose charge
          sign is non-zero, ``sum(|q| * pT) / sum(pT)``.

    A jet without constituents (or with zero summed pT) is not guarded
    against: the 0/0 division yields ``nan`` for that jet.
    """
    jet_charges = []
    charged_pt_fractions = []
    for constituents in particles:
        pTs = constituents[:, 0]
        charges = np.array([is_charged(pid) for pid in constituents[:, 3]])
        sqrt_pTs = pTs**0.5  # weight used by the jet-charge observable
        jet_charges.append(np.sum(charges*sqrt_pTs)/np.sum(sqrt_pTs))
        charged_pt_fractions.append(np.sum(np.abs(charges)*pTs)/np.sum(pTs))
    return jet_charges, charged_pt_fractions


# Attach the observables to every sample, once for the 'gen_particles' and once
# for the 'sim_particles' collection, under the keys '<level>_charge' and
# '<level>_pTcharge'.
for dataset in datasets:
    for level in ('gen', 'sim'):
        jet_charges, charged_pt_fractions = _charge_observables(
            datasets[dataset][level + '_particles'])
        datasets[dataset][level + '_charge'] = jet_charges
        datasets[dataset][level + '_pTcharge'] = charged_pt_fractions

In [9]:
# Short handles for the two samples; the "_alt" suffix below always refers to
# the Herwig sample, the unsuffixed names to Pythia26.
_pythia, _herwig = datasets['Pythia26'], datasets['Herwig']

# "reco" names read the 'sim_*' entries, "true" names read the 'gen_*' entries.
tau2s_reco, tau2s_reco_alt = _pythia['sim_tau2s'], _herwig['sim_tau2s']

# tau1s_* are filled from the '*_widths' entries.
# NOTE(review): presumably the jet width is being used as tau_1 -- confirm.
tau1s_reco, tau1s_reco_alt = _pythia['sim_widths'], _herwig['sim_widths']

tau2s_true, tau2s_true_alt = _pythia['gen_tau2s'], _herwig['gen_tau2s']

tau1s_true, tau1s_true_alt = _pythia['gen_widths'], _herwig['gen_widths']

In [7]:
# Short handles for the two samples; the "_alt" suffix below always refers to
# the Herwig sample, the unsuffixed names to Pythia26. "true" names read the
# 'gen_*' entries and "reco" names read the 'sim_*' entries.
_pythia = datasets['Pythia26']
_herwig = datasets['Herwig']

# Jet kinematics: column 0 of the jets array is taken as pT, column 3 as mass.
pT_true, m_true = _pythia['gen_jets'][:, 0], _pythia['gen_jets'][:, 3]
pT_reco, m_reco = _pythia['sim_jets'][:, 0], _pythia['sim_jets'][:, 3]

pT_true_alt, m_true_alt = _herwig['gen_jets'][:, 0], _herwig['gen_jets'][:, 3]
pT_reco_alt, m_reco_alt = _herwig['sim_jets'][:, 0], _herwig['sim_jets'][:, 3]

# Jet widths.
w_true, w_reco = _pythia['gen_widths'], _pythia['sim_widths']
w_true_alt, w_reco_alt = _herwig['gen_widths'], _herwig['sim_widths']

# Jet charge: the per-jet '*_charge' lists attached to `datasets` by the
# charge-computation cell, converted to arrays.
q_true, q_reco = np.array(_pythia['gen_charge']), np.array(_pythia['sim_charge'])
q_true_alt, q_reco_alt = np.array(_herwig['gen_charge']), np.array(_herwig['sim_charge'])

# Charged pT fraction: the per-jet '*_pTcharge' lists, converted to arrays.
r_true, r_reco = np.array(_pythia['gen_pTcharge']), np.array(_pythia['sim_pTcharge'])
r_true_alt, r_reco_alt = np.array(_herwig['gen_pTcharge']), np.array(_herwig['sim_pTcharge'])

In [11]:
# Names of the notebook variables to export: each observable prefix combined
# with the four sample suffixes, grouped by observable in the order
# true, true_alt, reco, reco_alt.
variables_to_save = [
    f'{observable}_{sample}'
    for observable in ('pT', 'm', 'q', 'w', 'r', 'tau1s', 'tau2s')
    for sample in ('true', 'true_alt', 'reco', 'reco_alt')
]

In [13]:
# Collect the arrays to export by looking each listed name up in the
# notebook's global namespace; a name that is not defined raises KeyError.
_notebook_namespace = globals()
arrays_dict = {}
for _export_name in variables_to_save:
    arrays_dict[_export_name] = _notebook_namespace[_export_name]

In [14]:
# Write all collected arrays into one .npz archive; passing them as keyword
# arguments stores each array under its key in arrays_dict.
# Note: the final cell of this notebook also writes 'rawdata.npz', so on a
# top-to-bottom run that cell's output is what remains on disk.
np.savez('rawdata.npz', **arrays_dict)

In [10]:
# Save every observable array to 'rawdata.npz'.
# The arrays are passed as keyword arguments so that each one is stored under
# its own variable name. Passed positionally, np.savez stores them as
# 'arr_0', 'arr_1', ... and, because this is the last cell to write the file,
# a top-to-bottom run would end with an archive that has lost the names.
np.savez('rawdata.npz',
         pT_true=pT_true, pT_true_alt=pT_true_alt,
         pT_reco=pT_reco, pT_reco_alt=pT_reco_alt,
         m_true=m_true, m_true_alt=m_true_alt,
         m_reco=m_reco, m_reco_alt=m_reco_alt,
         q_true=q_true, q_true_alt=q_true_alt,
         q_reco=q_reco, q_reco_alt=q_reco_alt,
         w_true=w_true, w_true_alt=w_true_alt,
         w_reco=w_reco, w_reco_alt=w_reco_alt,
         r_true=r_true, r_true_alt=r_true_alt,
         r_reco=r_reco, r_reco_alt=r_reco_alt,
         tau1s_true=tau1s_true, tau1s_true_alt=tau1s_true_alt,
         tau1s_reco=tau1s_reco, tau1s_reco_alt=tau1s_reco_alt,
         tau2s_true=tau2s_true, tau2s_true_alt=tau2s_true_alt,
         tau2s_reco=tau2s_reco, tau2s_reco_alt=tau2s_reco_alt
        )