In [None]:
# Root of the hepaccelerate-cms checkout. initialize_worker() appends several
# directories under this root to sys.path and chdir()s into it.
HMM_PATH = "/storage/user/jpata/hmm/dev/hepaccelerate-cms"

In [None]:
from argparse import Namespace
import os
import dask
import distributed
import glob
import shutil
import copy
import time
import random

from dask.distributed import get_worker, wait, progress

In [None]:
class LibResource():
    """Lazily-initialised holder for the objects returned by initialize_worker().

    The three slots start out as None. The first get_resource() call fills
    them in; later calls hand back the stored values without re-initialising.
    """
    _args = None
    _analysis_parameters = None
    _analysis_corrections = None

    def get_resource(self):
        """Return (args, analysis_parameters, analysis_corrections), building them on first use."""
        not_yet_loaded = self._args is None
        if not_yet_loaded:
            loaded = initialize_worker()
            self._args, self._analysis_parameters, self._analysis_corrections = loaded
        return (
            self._args,
            self._analysis_parameters,
            self._analysis_corrections,
        )

def initialize_worker():
    """Prepare the current process for the analysis and build its inputs.

    Process-wide side effects:
      * directories under HMM_PATH are added to sys.path (only if missing,
        so calling this function repeatedly does not grow sys.path),
      * NUMBA_NUM_THREADS and OMP_NUM_THREADS are set to "1",
      * the working directory is changed to HMM_PATH,
      * hmumu_utils.NUMPY_LIB / hmumu_utils.ha are set to the non-CUDA backend,
      * two entries of analysis_parameters["baseline"] are switched off.

    Returns:
        tuple: (args, analysis_parameters, analysis_corrections) where args is
        an argparse.Namespace carrying the run configuration.
    """
    import sys

    # Append only the entries that are not present yet; a plain `+=` would add
    # the same four directories again on every call.
    for lib_dir in (HMM_PATH + "/hepaccelerate", HMM_PATH + "/coffea", HMM_PATH, HMM_PATH + "/tests/hmm"):
        if lib_dir not in sys.path:
            sys.path.append(lib_dir)

    # Set before the analysis libraries are imported below.
    os.environ["NUMBA_NUM_THREADS"] = str(1)
    os.environ["OMP_NUM_THREADS"] = str(1)

    import hmumu_utils, hmumu_lib
    from analysis_hmumu import AnalysisCorrections
    import hepaccelerate
    from pars import analysis_parameters

    # Relative paths used below (e.g. args.out = "./out") resolve against HMM_PATH.
    os.chdir(HMM_PATH)
    hmumu_utils.NUMPY_LIB, hmumu_utils.ha = hepaccelerate.choose_backend(use_cuda=False)
    analysis_parameters["baseline"]["do_factorized_jec"] = False
    analysis_parameters["baseline"]["save_dnn_vars"] = False

    # Run configuration, built in code instead of being parsed from a command line.
    args = Namespace()
    args.out = "./out"
    args.cache_location = "/storage/user/jpata/hmm/cache"
    args.datapath = "/storage/group/allcit/"
    args.do_fsr = False
    args.async_data = False
    args.use_cuda = False
    args.do_sync = False
    args.nthreads = 1
    do_tensorflow = False

    analysis_corrections = AnalysisCorrections(args, do_tensorflow)
    return args, analysis_parameters, analysis_corrections

In [None]:
# Build the driver-side args / analysis parameters / corrections used by the cells below.
# This also applies initialize_worker()'s process-wide side effects (sys.path,
# environment variables, chdir to HMM_PATH) to the notebook kernel itself.
args, analysis_parameters, analysis_corrections = initialize_worker()

In [None]:
!rm -Rf ./out

In [None]:
import pars
import hmumu_utils
from hmumu_utils import seed_generator, create_dataset_jobfiles
from analysis_hmumu import merge_partial_results
# One seed generator is shared by every create_dataset_jobfiles call below.
seed_gen = seed_generator()

jobfiles_dataset = {}
# Drop job files left over from an earlier run before regenerating them.
if os.path.isdir("./out/jobfiles"):
    shutil.rmtree("./out/jobfiles")
for dataset_name, dataset_era, globpattern, is_mc in pars.datasets:
    full_pattern = args.datapath + globpattern
    filenames = glob.glob(full_pattern, recursive=True)
    if not filenames:
        # Print the pattern that matched nothing and stop the loop.
        # NOTE(review): datasets after this one are not processed either -- confirm
        # that stopping (rather than skipping or raising) is intended.
        print(full_pattern)
        break
    dataset_key = (dataset_name, dataset_era)
    jobfiles_dataset[dataset_key] = create_dataset_jobfiles(
        dataset_name, dataset_era, filenames, is_mc, 1, args.out, seed_gen)

In [None]:
!du -csh out/jobfiles/*.json | tail -n1

In [None]:
def fix_filename(jobfile_desc):
    """Return a deep copy of jobfile_desc whose filenames have args.datapath removed.

    The input description is left untouched. In the copy, every occurrence of
    the args.datapath string inside each entry of "filenames" is replaced by "".
    """
    stripped = copy.deepcopy(jobfile_desc)
    names = stripped["filenames"]
    for idx, name in enumerate(names):
        names[idx] = name.replace(args.datapath, "")
    return stripped

# Flatten the per-dataset job descriptions into a single list.
jobfiles_to_process = []
for k, per_dataset_jobs in jobfiles_dataset.items():
    # NOTE(review): n is assigned but never applied to the list below -- it looks
    # like a leftover per-dataset job limit; confirm whether a slice was intended.
    n = 100 if k[0] == "data" else 5
    jobfiles_to_process.extend(per_dataset_jobs)

# Strip args.datapath from the filenames, then randomise the processing order.
# `random` is not seeded in the visible cells, so the order differs between runs.
jobfiles_to_process = list(map(fix_filename, jobfiles_to_process))
random.shuffle(jobfiles_to_process)

In [None]:
# Directory handed to hmumu_utils.run_analysis and merge_partial_results in later cells.
outpath_partial = "./out/partial_results"
# exist_ok=True replaces the separate exists() check: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(outpath_partial, exist_ok=True)
# Local debugging call on the first two jobs, kept for reference:
#hmumu_utils.run_analysis(args, outpath_partial, jobfiles_to_process[:2], analysis_parameters, analysis_corrections)

In [None]:
#nworkers = 20
#cluster = distributed.LocalCluster(n_workers=nworkers, threads_per_worker=1, nanny=None, processes=True, memory_limit=0)
#client = distributed.Client(cluster)

#submit using condor_submit dask-worker.jdl
#singularity exec /storage/user/jpata/gpuservers/singularity/images/cupy.simg dask-scheduler --host 10.3.18.196 --dashboard-address 131.215.207.131:8178
#singularity exec /storage/user/jpata/gpuservers/singularity/images/cupy.simg dask-worker --nthreads 1 --nprocs 20 --memory-limit 0 10.3.18.196:8786
# Connect to the externally started dask scheduler (same host:port that the
# dask-worker command in the comment above points at).
client = distributed.Client("tcp://10.3.18.196:8786")

In [None]:
#make sure we load the calibration stuff only once
def get_library():
    """Return the LibResource attached to the current dask worker, creating it if absent."""
    worker = get_worker()
    if not hasattr(worker, "library"):
        # First task on this worker: attach a fresh holder so later tasks reuse it.
        worker.library = LibResource()
    return worker.library

#Actually run the physics computation!
def run_on_worker(job_descriptions):
    """Run hmumu_utils.run_analysis on this worker for the given job descriptions.

    The per-worker args / analysis parameters / corrections come from the
    cached LibResource; output goes to the module-level outpath_partial.
    Returns whatever hmumu_utils.run_analysis returns.
    """
    resources = get_library().get_resource()
    worker_args, parameters, corrections = resources
    # Import only after get_resource(): on first use it runs initialize_worker(),
    # which puts the analysis directories on sys.path.
    import hmumu_utils
    return hmumu_utils.run_analysis(
        worker_args, outpath_partial, job_descriptions, parameters, corrections
    )
    
def chunks(l, n):
    """Lazily yield consecutive slices of l, each holding at most n items."""
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [None]:
# chunks(..., 1) wraps each job description in its own single-element list,
# so every task submitted later receives exactly one job.
arglist = list(chunks(jobfiles_to_process, 1))
print("Will process {0} jobs".format(len(arglist)))

In [None]:
# Restart the cluster before submitting -- presumably so that each worker starts
# without a cached worker.library from an earlier run (confirm).
client.restart()
# t0/t1 bracket submission plus the blocking wait; the throughput cell uses t1 - t0.
t0 = time.time()
# One task per entry of arglist; retries=3 is passed through to dask.
jobs = client.map(run_on_worker, arglist, retries=3)
wait(jobs)
t1 = time.time()

In [None]:
# Collect every future that did not end in the "finished" state and report it.
failed = []
for f in jobs:
    if f.status == "finished":
        continue
    print("job {0} failed".format(f))
    failed.append(f)

# Stop the notebook here if anything failed; later cells need every result.
if failed:
    raise Exception("Some jobs failed")
print("All jobs were successful")

In [None]:
# Total events over all jobs, wall-clock time of the map+wait cell, and events per second.
nev_total = sum(f.result()["nev_total"] for f in jobs)
dt = t1 - t0
print("Processed {0:.2E} events in {1:.1f} seconds, {2:.2E}".format(nev_total, dt, nev_total/dt))

In [None]:
!du -csh out/partial_results/*.pkl | tail -n1

In [None]:
# Submit one merge task per entry of pars.datasets and block until all are done.
res_merge = []
for dataset_name, dataset_era, globpattern, is_mc in pars.datasets:
    merge_job = client.submit(merge_partial_results, dataset_name, dataset_era, args.out, outpath_partial)
    res_merge.append(merge_job)
wait(res_merge)
# Every merge must have completed before the results are read back.
for j in res_merge:
    assert j.status == "finished"

In [None]:
!du -csh out/results/*.pkl | tail -n1

In [None]:
from plotting import make_pdf_plot, get_cross_section
from pars import cross_sections, categories, combined_categories, process_groups, extra_plot_kwargs
import pickle

In [None]:
# Era and analysis whose merged results are loaded and plotted below.
era = "2018"
analysis = "baseline"

# NOTE: pickle.load can execute arbitrary code from the file; only load result
# files produced by this workflow.
res = {}
# `with` closes each result file right after it is read (the bare open() calls leaked handles).
with open(args.out + "/results/data_{0}.pkl".format(era), "rb") as fi:
    res["data"] = pickle.load(fi)
# Names of all datasets configured for this era.
mc_samples_load = set([d[0] for d in pars.datasets if d[1] == era])
for mc_samp in mc_samples_load:
    res_file_name = args.out + "/results/{0}_{1}.pkl".format(mc_samp, era)
    # Samples without a merged result file are silently skipped.
    if os.path.isfile(res_file_name):
        with open(res_file_name, "rb") as fi:
            res[mc_samp] = pickle.load(fi)
int_lumi = res["data"]["baseline"]["int_lumi"]

# Per-sample normalisation: cross section * integrated luminosity / sum of generator weights.
genweights = {}
weight_xs = {}
for mc_samp in res.keys():
    if mc_samp != "data":
        genweights[mc_samp] = res[mc_samp]["genEventSumw"]
        weight_xs[mc_samp] = get_cross_section(cross_sections, mc_samp, era) * int_lumi / genweights[mc_samp]

In [None]:
# Show the first ten keys stored for the data sample under the chosen analysis.
list(res["data"][analysis].keys())[:10]

In [None]:
# Key of the histogram to plot; it is looked up in res[sample][analysis] in the next cell.
var = "hist__dimuon_invmass_z_peak_cat5__eta_jj"
# Process lists configured in pars for the "z_peak" category.
mc_samples = categories["z_peak"]["datacard_processes"]
# NOTE(review): combined_mc_samples is not referenced by the remaining visible cells.
combined_mc_samples = combined_categories["z_peak"]["datacard_processes"]

In [None]:
# Pick the selected histogram entry for every MC sample plus data.
# A sample listed in mc_samples but missing from res raises KeyError here.
histos = {}
for sample in mc_samples + ["data"]:
    histos[sample] = res[sample][analysis][var]
# The "nominal" entry of the data histogram is passed to the plotter separately.
hdata = res["data"][analysis][var]["nominal"]

In [None]:
# Output directory handed to make_pdf_plot: <args.out>/<analysis>/plots/<era>.
outpath = os.path.join(args.out, analysis, "plots", era)
# Name of the weight entry to plot; the same "nominal" key was used to select hdata.
weight_nominal = "nominal"
# Positional arguments for make_pdf_plot, packed as one tuple inside a one-element list.
plot_args = [(
    histos, hdata, mc_samples, analysis,
    var, weight_nominal, weight_xs, int_lumi, outpath, era, process_groups, extra_plot_kwargs.get(var, {}))]

In [None]:
# make_pdf_plot takes the whole argument tuple as a single parameter.
make_pdf_plot(plot_args[0])

In [None]:
from IPython.display import Image

In [None]:
# Display the PNG expected at <outpath>/png/<analysis>_<var>_<weight>.png --
# presumably written by make_pdf_plot; confirm the naming scheme against plotting.py.
Image(filename="{0}/png/{1}_{2}_{3}.png".format(outpath, analysis, var, weight_nominal), width=300)