# Parsing Reports

This notebook parses the pickled reports stored in `reports/` to understand the speed and reliability of experiments that run the AGC.

In [45]:
import awkward as ak
import pickle as pkl

In [46]:
# File names of the pickled reports analysed below; each one is opened as
# f"reports/{NAME}" when a ParsedReport is built.
# NOTE(review): the trailing "_<a>_<b>" fields look like run limits -- the
# section headings pair "_5_None" with "Max 5 Files per Sample" and
# "_None_None" with "All Files All Chunks" -- TODO confirm the exact meaning.
REMOTE_NONE_NONE = "RemoteRead_reports20240822_221923_250_None_None.pkl"
REMOTE_5_NONE = "RemoteRead_reports20240822_202120_250_5_None.pkl"
# FIRST / SECOND: per the section headings, the first XCache run read files
# that were not yet cached and the second run read files already in XCache.
XCACHE_5_NONE_FIRST = "Xcache_reports20240822_204219_250_5_None.pkl"
XCACHE_5_NONE_SECOND = "Xcache_reports20240822_205535_250_5_None.pkl"
XCACHE_NONE_NONE_FIRST = "Xcache_reports20240823_192620_250_None_None.pkl"

In [47]:
class ParsedReport:
    """Aggregate read-reliability metrics computed from one pickled report.

    The pickle is expected to contain a mapping from dataset name to an array
    of per-chunk records that expose a ``message`` field (``None`` when the
    chunk was read without error) and an ``args`` field. ``args[:, 0]`` is
    treated as the file path of the chunk.
    NOTE(review): ``args[:, 2] == "0"`` is used to pick one chunk per file,
    presumably the chunk starting at entry 0 -- confirm against the producer.

    Attributes populated by ``__init__``:
        reports: the unpickled mapping, dataset -> all chunk records.
        errors: dataset -> chunk records whose ``message`` is not None.
        chunk_fail_rates / file_fail_rates: dataset -> failed fraction.
        num_chunks, num_error_chunks, tot_chunk_fail_rate: totals over datasets.
        num_files, num_error_files, tot_file_fail_rate: totals over datasets.
        site_counts / site_error_counts: site prefix -> counted (failed) files.
        messages: set of distinct error messages.
        messages_count: error message -> number of chunks carrying it.

    Every rate is a fraction in [0, 1]; a rate whose denominator is zero
    (empty dataset, empty report, site without counted files) is reported as
    0.0 instead of raising or producing NaN.
    """

    def __init__(self, rep):
        """Load the report pickle at path ``rep`` and compute all metrics.

        Args:
            rep: path of the pickle file to open.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only open report files that come from a trusted source.
        with open(rep, "rb") as f:
            self.reports = pkl.load(f)

        # A chunk counts as failed when its message field is not None.
        self.errors = {}
        for dset in self.reports.keys():
            chunks = self.reports[dset]
            self.errors[dset] = chunks[~ak.is_none(chunks.message)]

        # Calculate some aggregate metrics
        self._chunk_fail_rate()
        self._count_chunks()
        self._file_fail_rate()
        self._count_files()
        self._count_sites()
        self._count_messages()

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
        if denominator == 0:
            return 0.0
        return numerator / denominator

    @staticmethod
    def _unique_files(chunks):
        """Return the set of distinct file paths (``args[:, 0]``) in ``chunks``."""
        return set(chunks.args[:, 0])

    def print_metrics(self, sites=False):
        """Print aggregate, per-dataset and per-message metrics to stdout.

        Args:
            sites: when True, additionally print the per-site file counts and
                failure rates.
        """
        print("AGGREGATE INFO:\n----------------------------------------------------")
        print(f"Total number of files: {self.num_files}")
        print(f"Total file read error rate: {100*self.tot_file_fail_rate}%")
        print(f"Total number of chunks: {self.num_chunks}")
        print(f"Total chunk read error rate: {100*self.tot_chunk_fail_rate}%")

        print("\n========================================================\n")
        print("PER-DATASET INFO:\n----------------------------------------------------")
        for dset in self.reports.keys():
            chunks = self.reports[dset]
            print(f"Dataset: {dset}")
            print(f"\tNumber of files: {len(self._unique_files(chunks))}")
            print(f"\tFile read error rate: {100*self.file_fail_rates[dset]}%")
            print(f"\tNumber of chunks: {ak.num(chunks,axis=0)}")
            print(f"\tChunk read error rate: {100*self.chunk_fail_rates[dset]}%")
        print("\n========================================================\n")
        print("ERROR MESSAGES:\n----------------------------------------------------")
        for msg in self.messages:
            print(f"Error message: {msg}")
            print(f"\tChunks with this message: {self.messages_count[msg]}")
        if not sites:
            return
        print("\n========================================================\n")
        print("SITES INFO:\n----------------------------------------------------")
        for site in self.site_counts.keys():
            print(f"Site: {site}")
            print(f"\tNumber of files: {self.site_counts[site]}")
            # Scale before dividing so the printed value matches 100*errors/count.
            print(f"\tFile read failure rate at this site: {self._safe_ratio(100*self.site_error_counts[site], self.site_counts[site])}%")

    def _chunk_fail_rate(self):
        """Fill ``chunk_fail_rates`` with failed/total chunks for each dataset."""
        self.chunk_fail_rates = {}
        for dset in self.reports.keys():
            self.chunk_fail_rates[dset] = self._safe_ratio(
                ak.num(self.errors[dset], axis=0),
                ak.num(self.reports[dset], axis=0),
            )

    def _count_chunks(self):
        """Total the chunk and failed-chunk counts and derive the overall rate."""
        self.num_chunks = 0
        self.num_error_chunks = 0
        for dset in self.reports.keys():
            self.num_chunks += ak.num(self.reports[dset], axis=0)
            self.num_error_chunks += ak.num(self.errors[dset], axis=0)
        self.tot_chunk_fail_rate = self._safe_ratio(self.num_error_chunks, self.num_chunks)

    def _file_fail_rate(self):
        """Fill ``file_fail_rates``: files with at least one failed chunk / all files."""
        self.file_fail_rates = {}
        for dset in self.reports.keys():
            num_files = len(self._unique_files(self.reports[dset]))
            num_error_files = len(self._unique_files(self.errors[dset]))
            self.file_fail_rates[dset] = self._safe_ratio(num_error_files, num_files)

    def _count_files(self):
        """Total the per-dataset distinct (failed) file counts and the overall rate."""
        self.num_files = 0
        self.num_error_files = 0
        for dset in self.reports.keys():
            self.num_files += len(self._unique_files(self.reports[dset]))
            self.num_error_files += len(self._unique_files(self.errors[dset]))
        self.tot_file_fail_rate = self._safe_ratio(self.num_error_files, self.num_files)

    def _count_sites(self):
        """Count files and failed files per site.

        The site is the part of the file path before the first '/store'.
        A file contributes the number of its chunks whose ``args[:, 2]`` equals
        "0"; only failures on those chunks count towards the site error count.
        """
        self.site_counts = {}
        self.site_error_counts = {}
        for dset in self.reports.keys():
            files = self._unique_files(self.reports[dset])
            if not files:
                continue
            # The column extractions do not depend on the file, so do them
            # once per dataset rather than once per file.
            report_files = self.reports[dset].args[:, 0]
            report_selected = self.reports[dset].args[:, 2] == "0"
            error_files = self.errors[dset].args[:, 0]
            error_selected = self.errors[dset].args[:, 2] == "0"
            for f in files:
                file_count = ak.sum((report_files == f) & report_selected)
                error_file_count = ak.sum((error_files == f) & error_selected)
                site = f.split('/store')[0]
                self.site_counts[site] = self.site_counts.get(site, 0) + file_count
                self.site_error_counts[site] = self.site_error_counts.get(site, 0) + error_file_count

    def _count_messages(self):
        """Collect the distinct error messages and count the chunks carrying each."""
        self.messages = set()
        for dset in self.errors.keys():
            self.messages.update(set(self.errors[dset].message))

        self.messages_count = {}
        for dset in self.errors.keys():
            dset_messages = self.errors[dset].message
            # Datasets without failed chunks contribute nothing.
            if ak.num(dset_messages, axis=0) == 0:
                continue
            for msg in self.messages:
                self.messages_count[msg] = self.messages_count.get(msg, 0) + ak.sum(dset_messages == msg)

## Results for Remote Reading, Max 5 Files per Sample

In [48]:
# Parse the remote-read report named by REMOTE_5_NONE.
rep1 = ParsedReport(f"reports/{REMOTE_5_NONE}")

In [49]:
# Print every metric section, including the per-site breakdown.
rep1.print_metrics(sites=True)

AGGREGATE INFO:
----------------------------------------------------
Total number of files: 40
Total file read error rate: 2.5%
Total number of chunks: 3209
Total chunk read error rate: 4.799002804612028%


PER-DATASET INFO:
----------------------------------------------------
Dataset: /TTToHadronic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAOD-106X_upgrade2018_realistic_v11_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 532
	Chunk read error rate: 0.0%
Dataset: /ST_s-channel_4f_hadronicDecays_TuneCP5_13TeV-amcatnlo-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 292
	Chunk read error rate: 0.0%
Dataset: /ST_t-channel_top_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 503
	Chunk read err

## Results for XCache Reading, Max 5 Files per Sample, Files not yet in XCache

In [50]:
# Parse the first XCache report named by XCACHE_5_NONE_FIRST.
rep2 = ParsedReport(f"reports/{XCACHE_5_NONE_FIRST}")

In [51]:
# Print every metric section, including the per-site breakdown.
rep2.print_metrics(sites=True)

AGGREGATE INFO:
----------------------------------------------------
Total number of files: 40
Total file read error rate: 2.5%
Total number of chunks: 3209
Total chunk read error rate: 2.243689622935494%


PER-DATASET INFO:
----------------------------------------------------
Dataset: /TTToHadronic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAOD-106X_upgrade2018_realistic_v11_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 532
	Chunk read error rate: 0.0%
Dataset: /ST_s-channel_4f_hadronicDecays_TuneCP5_13TeV-amcatnlo-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 292
	Chunk read error rate: 0.0%
Dataset: /ST_t-channel_top_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 503
	Chunk read err

## Results for XCache Reading, Max 5 Files per Sample, Files already in XCache

In [52]:
# Parse the second XCache report named by XCACHE_5_NONE_SECOND.
rep3 = ParsedReport(f"reports/{XCACHE_5_NONE_SECOND}")

In [53]:
# Print the aggregate, per-dataset and error-message sections (sites defaults to False).
rep3.print_metrics()

AGGREGATE INFO:
----------------------------------------------------
Total number of files: 40
Total file read error rate: 2.5%
Total number of chunks: 3209
Total chunk read error rate: 2.243689622935494%


PER-DATASET INFO:
----------------------------------------------------
Dataset: /TTToHadronic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAOD-106X_upgrade2018_realistic_v11_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 532
	Chunk read error rate: 0.0%
Dataset: /ST_s-channel_4f_hadronicDecays_TuneCP5_13TeV-amcatnlo-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 292
	Chunk read error rate: 0.0%
Dataset: /ST_t-channel_top_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 5
	File read error rate: 0.0%
	Number of chunks: 503
	Chunk read err

## Results for Remote Reading, All Files All Chunks

In [54]:
# Parse the remote-read report named by REMOTE_NONE_NONE.
rep4 = ParsedReport(f"reports/{REMOTE_NONE_NONE}")

In [55]:
# Print every metric section, including the per-site breakdown.
rep4.print_metrics(sites=True)

AGGREGATE INFO:
----------------------------------------------------
Total number of files: 823
Total file read error rate: 4.981773997569866%
Total number of chunks: 79735
Total chunk read error rate: 3.620743713551138%


PER-DATASET INFO:
----------------------------------------------------
Dataset: /TTToHadronic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAOD-106X_upgrade2018_realistic_v11_L1v1-v1/NANOAODSIM
	Number of files: 329
	File read error rate: 3.0395136778115504%
	Number of chunks: 31863
	Chunk read error rate: 0.7343941248470013%
Dataset: /ST_s-channel_4f_hadronicDecays_TuneCP5_13TeV-amcatnlo-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 30
	File read error rate: 0.0%
	Number of chunks: 1627
	Chunk read error rate: 0.0%
Dataset: /ST_t-channel_top_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 124
	File read e

## Results for XCache Reading, All Files All Chunks

In [56]:
# Parse the XCache report named by XCACHE_NONE_NONE_FIRST.
rep5 = ParsedReport(f"reports/{XCACHE_NONE_NONE_FIRST}")

In [57]:
# Print the aggregate, per-dataset and error-message sections (sites defaults to False).
rep5.print_metrics()

AGGREGATE INFO:
----------------------------------------------------
Total number of files: 823
Total file read error rate: 13.365735115431349%
Total number of chunks: 79735
Total chunk read error rate: 10.410735561547627%


PER-DATASET INFO:
----------------------------------------------------
Dataset: /TTToHadronic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAOD-106X_upgrade2018_realistic_v11_L1v1-v1/NANOAODSIM
	Number of files: 329
	File read error rate: 17.325227963525837%
	Number of chunks: 31863
	Chunk read error rate: 14.116687066503466%
Dataset: /ST_s-channel_4f_hadronicDecays_TuneCP5_13TeV-amcatnlo-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Number of files: 30
	File read error rate: 6.666666666666667%
	Number of chunks: 1627
	Chunk read error rate: 16.65642286416718%
Dataset: /ST_t-channel_top_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM
	Num