In [None]:
import concurrent
import concurrent.futures
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

import mg5qs_import as qs

### Storing data and varying multiple parameters


This example demonstrates a way to store data at the time of generation using Python's built-in object serialization module, `pickle`. This approach has several benefits: arbitrarily large runs without exhausting memory (data is written to disk instead of being held in memory), safety (data is stored as soon as it is generated), and compatibility with parallel (multi-process) execution.

This example includes:
- looping over multiple model parameters
- use of pickle to store pT data
- reading pickle files
- reassociating data with corresponding model parameters

### 1. Generate a MadGraph framework

**1.1 call run_MG5**

For the purposes of this example, the proc_card and run_card will not be changed. 

In [None]:
# Build the MadGraph framework from the unmodified proc_card. FRAMEWORK_PATH is
# reused by every later step of this notebook.
output_name, FRAMEWORK_PATH = qs.run_MG5(
    qs.MG5_PATH,
    qs.INPUT_PATH,
    proc_card_name='proc_card.dat',
)

### 2. Generate LHEs varying multiple parameters 

**2.1 load param_card as ParamCard object**

In [None]:
# Load the framework's param_card as a ParamCard object so that individual
# parameters can be changed (via card.set_value) before each LHE generation.
card = qs.ParamCard(FRAMEWORK_PATH)

In [None]:
# Show the 'MASS' entry of the card: the block whose IDs 25 and 23 are varied below.
# NOTE(review): dfs() presumably returns one table per param_card block - confirm in mg5qs_import.
card.dfs()['MASS']

**2.2 Vary Higgs mass and mass of the Z-boson while generating LHEs**

`MH_MZ` maps each (MH, MZ) pair to its run number; it is required later for reassociating the runs with their respective parameter values.

In [None]:
MHs = [42, 125]      # Higgs-mass values to be looped over
MZs = [42, 91.188]   # Z-boson-mass values to be looped over
count = 0            # run number assigned to the next generated LHE
MH_MZ = {}           # nested dict MH -> MZ -> run number, used for run association

for MH in MHs:
    for MZ in MZs:
        # Associate this parameter pair with its run number before generating.
        # NOTE(review): the later lookup assumes runs are numbered from 0 in
        # generation order and that qs.get_LHEs returns them in that order;
        # re-running this cell would presumably add further runs and break
        # that assumption - confirm against mg5qs_import.
        MH_MZ.setdefault(MH, {})[MZ] = count
        count += 1
        card.set_value('MASS', 25, MH)  # ID 25: Higgs
        card.set_value('MASS', 23, MZ)  # ID 23: Z boson
        print('Working on MH =',MH, '& MZ =',MZ,'...')
        qs.generate_LHE(card, FRAMEWORK_PATH)
print('done')

In [None]:
MH_MZ # display the mapping MH -> MZ -> run number built in the loop above

### 3. Generate pT while storing to pickle files

**3.1 get LHEs, shower using Pythia, and store pT in pickle files**

This process can generate any quantity of pT values, limited only by space in the file system (instead of in memory). The overhead of reading and writing pickle files is low, and the files are very small (containing only pT values, and no other data from the shower). This is in contrast to the standard .hepmc files that are necessarily large, each containing every detail of every particle in the shower (on the order of tens of gigabytes, compared to 1-2 megabytes for a pickle file). Also, intercepting the pT values at runtime eliminates the need to post-process .hepmc files to calculate pT.

In this example, each pickle file contains pT associated with a unique set of parameters, but it is equally easy to produce many runs with the same parameters, allowing for generation of very large data sets. 

In [None]:
output_dir = Path('pT_tau_runs')  # output path relative to Jupyter
cpu_cores = 10                    # set an appropriate value based on CPU hardware

TAU = 15 # Particle ID for Tau
LHEs = qs.get_LHEs(FRAMEWORK_PATH)

# Create the output path if it does not exist. Path.mkdir with exist_ok=True
# avoids the check-then-create race of os.path.isdir + os.mkdir.
output_dir.mkdir(parents=True, exist_ok=True)

# function to process a single LHE file, storing result in a pickle file
def process_LHE(n, LHE, PID):
    """Generate pT data for one LHE file and pickle it to ``output_dir``.

    Parameters
    ----------
    n : int
        Run number; becomes part of the output file name
        ``<framework name>.<n>.pkl`` so the data can be looked up later.
    LHE
        One entry of the list returned by ``qs.get_LHEs``.
    PID : int
        Particle ID passed to ``qs.generate_pT``.

    Returns
    -------
    None
        The result is written to disk instead of being returned, so the
        parent process never holds the data in memory.
    """
    pTs = qs.generate_pT(PID, LHE)
    fname = FRAMEWORK_PATH.name + '.' + str(n) + '.pkl'
    # The with-statement closes the file on exit; no explicit close() is needed.
    with open(output_dir / fname, 'wb') as f:
        pickle.dump(pTs, f)

# run multiple processes in parallel
# NOTE: ProcessPoolExecutor requires the submitted function to be importable by
# the worker processes; with the 'spawn' start method a function defined in a
# notebook cell may not be, which surfaces below as a failed future.
with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_cores) as executor:
    futures = {executor.submit(process_LHE, i, LHE, TAU): i for i, LHE in enumerate(LHEs)}
    # Collect every future so that a failure in a worker is reported here
    # instead of being silently dropped (and only noticed later as a missing
    # pickle file).
    for future in concurrent.futures.as_completed(futures):
        run_number = futures[future]
        try:
            future.result()
        except Exception as exc:
            raise RuntimeError(f'processing LHE run {run_number} failed') from exc

**3.2 read the pickle files and reassociate the data with its run parameters**

In [None]:
# retrieve pT values from pickle files
def get_pT(mh, mz):
    """Load the pickled pT data of the run generated with the given MH and MZ.

    The run number is looked up in ``MH_MZ`` and used to rebuild the file name
    ``<framework name>.<run number>.pkl`` inside ``output_dir``.
    """
    run_number = MH_MZ[mh][mz]
    pkl_name = FRAMEWORK_PATH.name + '.' + str(run_number) + '.pkl'
    pkl_path = output_dir / pkl_name
    with open(pkl_path, 'rb') as pkl_file:
        pT_data = pickle.load(pkl_file)
    return pT_data

# One (MH, MZ, pT data) tuple per run, in the same nested order as the scan:
# MH is the outer loop, MZ the inner loop.
results = [
    (mh_value, mz_value, get_pT(mh_value, mz_value))
    for mh_value in MHs
    for mz_value in MZs
]

**3.3 graph results**

In [None]:
# One histogram per (MH, MZ) run, stacked vertically.
# squeeze=False makes subplots always return a 2-D array, so indexing also
# works when there is only a single run.
fig, ax = plt.subplots(len(results), figsize=(7, len(results)*6), squeeze=False)
ax = ax[:, 0]  # single column -> 1-D array of Axes

# Each result tuple already carries its own MH and MZ, so the titles are taken
# from the data itself rather than from a parallel index table.
for axis, (mh, mz, pT_data) in zip(ax, results):
    # NOTE(review): element 1 of the stored data is what gets histogrammed;
    # its meaning depends on what qs.generate_pT returns - confirm.
    axis.hist(pT_data[1], bins=30)
    axis.set_title(f'MH = {mh}, MZ = {mz}')
    axis.set_yscale('log')
    axis.set_xlabel("pT (GeV)")
plt.show()