Merge d2e4488 into 1dc1c52

mlincett committed Oct 6, 2022
2 parents 1dc1c52 + d2e4488 commit d7e1e8d
Showing 15 changed files with 489 additions and 41 deletions.
6 changes: 6 additions & 0 deletions docs/source/api.rst
@@ -9,6 +9,12 @@
.. automodule:: flarestack.core.data_types
:members:
#########################
Data formats and datasets
#########################
.. automodule:: flarestack.data.dataset_index
:members:

###############
Base PDFs
###############
46 changes: 46 additions & 0 deletions docs/source/datasets.md
@@ -0,0 +1,46 @@
# Datasets
*flarestack* is designed to work with different types of datasets.

Datasets are stored under the *flarestack* data directory (`$FLARESTACK_DATA_DIR`). Note that this is different from the `flarestack__data` directory that is automatically created under `$FLARESTACK_SCRATCH_DIR`: the former is a static repository of datasets, while the latter is the actual working directory of *flarestack*. Python modules acting as interfaces to the stored datasets are included under `flarestack/data`.
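As a quick sanity check, you can inspect both locations from Python. This is a minimal sketch that only reads the two environment variables described above:

```python
import os

# Static repository of datasets, read by the dataset interface modules.
data_dir = os.environ.get("FLARESTACK_DATA_DIR")

# Working directory root; `flarestack__data` is created underneath it.
scratch_dir = os.environ.get("FLARESTACK_SCRATCH_DIR")

print(f"Datasets are read from: {data_dir}")
print(f"Working files live under: {scratch_dir}/flarestack__data")
```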

## Dataset index
*flarestack* currently implements a dataset index, an auxiliary dictionary that allows you to retrieve datasets by name (instead of having to look up an object in the corresponding interface module). You can access the index by importing `flarestack.data.dataset_index` and use it as in the following example:

```python
from flarestack.data.dataset_index import dataset_index
print(dataset_index.get_dataset_list())
dataset_name = dataset_index.get_dataset_list()[0] # just get the first dataset name in the list
dataset = dataset_index.get_dataset(dataset_name)
```

## Reduce a dataset to the relevant seasons
A dataset is usually composed of different seasons. When conducting time-dependent analyses, it can be more efficient to discard the seasons that do not overlap with the time frame of the chosen signal injection and search. The module `flarestack.utils.custom_dataset` helps with this:

```python
import numpy as np

from flarestack.data.dataset_index import dataset_index
from flarestack.utils.custom_dataset import custom_dataset

dataset = dataset_index.get_dataset(dataset_name)
catalogue = np.load(catalogue_path)  # catalogue_path points to your source catalogue (.npy)
common_time_pdf = {"time_pdf_name": "custom_source_box"}  # example time PDF

reduced_dataset = custom_dataset(dataset, catalogue, common_time_pdf)
```

## Adding a new dataset
To add a new dataset to *flarestack*:
- store the corresponding files under `$FLARESTACK_DATA_DIR`. If the dataset is a new version of an existing one, follow the same directory hierarchy. Otherwise, you will likely have to create your own path specification;
- create an interface module under `flarestack/data`;
- import the corresponding dataset object in `flarestack/data/__init__.py`.

To add the dataset to the index, first import the index in the dataset interface module:
```python
from flarestack.data.dataset_index import dataset_index
from flarestack.data.icecube import IceCubeDataset

sample_name = "ps_tracks_v004_p02"  # give the dataset a meaningful name
ps_v004_p02 = IceCubeDataset()  # instantiate the dataset
"""
[...] dataset is populated here [...]
"""
dataset_index.add_dataset("icecube." + sample_name, ps_v004_p02)  # add the dataset to the index
```

**Important**: for the correct population of the index, the dataset needs to be added to `flarestack/data/__init__.py` (see above).
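Once the interface module is imported in `flarestack/data/__init__.py`, the new dataset should be discoverable through the index. A minimal check, using the example name from above:

```python
from flarestack.data.dataset_index import dataset_index

# The name passed to add_dataset(), "icecube." prefix included.
assert "icecube.ps_tracks_v004_p02" in dataset_index.get_dataset_list()
```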
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -50,6 +50,7 @@ CONTENTS
.. toctree::
setup
data_types
datasets
flarestack_llh_workshop
api
:maxdepth: 2
2 changes: 2 additions & 0 deletions flarestack/cluster/make_desy_cluster_script.py
@@ -4,6 +4,8 @@

logger = logging.getLogger(__name__)

logger.warning("WARNING: this module is deprecated. Check `flarestack.cluster.submitter` instead.")

username = os.path.basename(os.environ["HOME"])

root_dir = os.path.dirname(fs_dir[:-1])
6 changes: 4 additions & 2 deletions flarestack/cluster/submitter.py
@@ -1,4 +1,4 @@
import os, subprocess, time, logging, shutil, copy
import os, subprocess, time, logging, shutil, copy, sys
import numpy as np
from flarestack.shared import (
fs_dir,
@@ -437,7 +437,9 @@ def make_cluster_submission_script(self):
"eval $(/cvmfs/icecube.opensciencegrid.org/py3-v4.1.0/setup.sh) \n"
"export PYTHONPATH=" + DESYSubmitter.root_dir + "/ \n"
"export FLARESTACK_SCRATCH_DIR=" + flarestack_scratch_dir + " \n"
"python " + fs_dir + "core/multiprocess_wrapper.py -f $1 -n $2 \n"
f"{sys.executable} "
+ fs_dir
+ "core/multiprocess_wrapper.py -f $1 -n $2 \n"
"cp $TMPDIR/${JOB_ID}_stdout.txt " + log_dir + "\n"
"cp $TMPDIR/${JOB_ID}_stderr.txt " + log_dir + "\n "
)
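Using `sys.executable` instead of a bare `python` pins cluster jobs to the same interpreter that generated the submission script (for instance, the one from an active virtual environment). A minimal illustration of the value being interpolated:

```python
import sys

# Absolute path of the running interpreter, e.g. "/usr/bin/python3"
# or the python of an active virtual environment.
print(sys.executable)
```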
87 changes: 66 additions & 21 deletions flarestack/core/results.py
@@ -50,6 +50,8 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):

self.allow_extrapolation = rh_dict.get("allow_extrapolated_sensitivity", True)

self.valid = True

# Checks if the code should search for flares. By default, this is
# not done.
# self.flare = self.mh_name == "flare"
@@ -112,12 +114,20 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
self.extrapolated_disc = False
self.flux_to_ns = np.nan

# if self.show_inj:
self.inj = self.load_injection_values()
self._inj_dict = rh_dict["inj_dict"]
self._dataset = rh_dict["dataset"]
# else:
# self.inj = None
try:
# if self.show_inj:
self.inj = self.load_injection_values()
self._inj_dict = rh_dict["inj_dict"]
self._dataset = rh_dict["dataset"]
# else:
# self.inj = None
except FileNotFoundError as err:
logger.error(
"Unable to load injection values. Have you run this analysis at least once?"
)
logger.error(err)
self.valid = False
return

try:
self.merge_pickle_data()
Expand All @@ -137,7 +147,7 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
try:
self.find_sensitivity()
except ValueError as e:
logger.warning("RuntimeError for discovery potential: \n {0}".format(e))
logger.warning("RuntimeError for sensitivity: \n {0}".format(e))

if do_disc:
try:
@@ -149,6 +159,15 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
except ValueError as e:
logger.warning("TypeError for discovery potential: \n {0}".format(e))

def is_valid(self):
"""If results are valid, returns True.
If something went wrong during the instantiation, returns False.
Returns:
bool: whether results are valid or not.
"""
return self.valid

@property
def scales_float(self):
"""directly return the injected scales as floats"""
@@ -550,8 +569,13 @@ def find_overfluctuations(self, ts_val, savepath=None):
x_acc.append(float(scale))
yerr.append(1.0 / np.sqrt(float(len(ts_array))))

self.make_plots(scale)

if frac != 0.0:
logger.info(f"Making plot for {scale=}, {frac=}")
self.make_plots(scale)
else:
logger.warning(
f"Fraction of overfluctuations is {frac=}, skipping plot for {scale=}"
)
if len(np.where(np.array(y) < 0.95)[0]) < 2:
raise OverfluctuationError(
f"Not enough points with overfluctuations under 95%, lower injection scale!"
@@ -674,19 +698,28 @@ def find_disc_potential(self):

x = [scale_shortener(i) for i in sorted([float(j) for j in x])]

if np.isnan(disc_threshold):
logger.warning(
f"Invalid discovery threshold {disc_threshold=} will be ingnored. Using TS = 25.0 only."
)

for scale in x:
ts_array = np.array(self.results[scale]["TS"])
frac = float(len(ts_array[ts_array > disc_threshold])) / (
float(len(ts_array))
)

logger.info(
"Fraction of overfluctuations is {0:.2f} above {1:.2f} (N_trials={2}) (Scale={3})".format(
frac, disc_threshold, len(ts_array), scale
if not np.isnan(disc_threshold):

frac = float(len(ts_array[ts_array > disc_threshold])) / (
float(len(ts_array))
)
)

y.append(frac)
logger.info(
"Fraction of overfluctuations is {0:.2f} above {1:.2f} (N_trials={2}) (Scale={3})".format(
frac, disc_threshold, len(ts_array), scale
)
)

y.append(frac)

frac_25 = float(len(ts_array[ts_array > 25.0])) / (float(len(ts_array)))

logger.info(
@@ -697,7 +730,13 @@ def best_f(x):

y_25.append(frac_25)

# if frac != 0.0:
# logger.info(f"Making plot for {scale=}, {frac=}")
self.make_plots(scale)
# else:
# logger.warning(
# f"Fraction of overfluctuations is {frac=}, skipping plot for {scale=}"
# )

x = np.array([float(s) for s in x])

@@ -707,7 +746,14 @@

sols = []

for i, y_val in enumerate([y, y_25]):
if np.isnan(disc_threshold):
y_list = [y_25]
out_list = ["disc_potential_25"]
else:
y_list = [y, y_25]
out_list = ["disc_potential", "disc_potential_25"]

for i, y_val in enumerate(y_list):

def f(x, a, b, c):
value = scipy.stats.gamma.cdf(x, a, b, c)
@@ -728,9 +774,7 @@ def best_f(x):
return f(x, best_a, best_b, best_c)

sol = scipy.stats.gamma.ppf(0.5, best_a, best_b, best_c)
setattr(
self, ["disc_potential", "disc_potential_25"][i], k_to_flux(sol)
)
setattr(self, out_list[i], k_to_flux(sol))

except RuntimeError as e:
logger.warning(f"RuntimeError for discovery potential!: {e}")
@@ -779,6 +823,7 @@ def best_f(x):
)

def noflare_plots(self, scale):

ts_array = np.array(self.results[scale]["TS"])
ts_path = os.path.join(self.plot_dir, "ts_distributions/" + str(scale) + ".pdf")

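The new `valid` flag gives callers a way to bail out gracefully when injection values are missing instead of failing later. A hedged usage sketch, assuming the enclosing class is `ResultsHandler` from this module and `rh_dict` is the usual configuration dictionary:

```python
from flarestack.core.results import ResultsHandler

rh = ResultsHandler(rh_dict, do_sens=True, do_disc=True)

if not rh.is_valid():
    # Injection values could not be loaded (see the FileNotFoundError
    # handling above): the analysis must be run at least once first.
    raise RuntimeError("No results available; run the analysis first.")
```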
10 changes: 8 additions & 2 deletions flarestack/core/ts_distributions.py
@@ -307,9 +307,17 @@ def plot_background_ts_distribution(

ts_array = ts_array[~np.isnan(ts_array)]

max_ts = np.max(ts_array)

if np.median(ts_array) < 0.0:
plot_expanded_negative(ts_array, path)

if max_ts == 0:
logger.warning(
f"Maximum of TS is {max_ts=}, unable to calculate discovery threshold (too few trials?)"
)
return np.NaN

fig = plt.figure()

ax = plt.subplot(1, 1, 1)
Expand All @@ -322,8 +330,6 @@ def plot_background_ts_distribution(

plt.axhline(frac_over * (1 - five_sigma), color="r", linestyle="--")

max_ts = np.max(ts_array)

disc_potential = scipy.stats.chi2.ppf(five_sigma, df, loc, scale)

x_range = np.linspace(0.0, max(max_ts, disc_potential), 100)
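Since `plot_background_ts_distribution` can now return `np.nan`, downstream callers should guard against it, as `find_disc_potential` above does. A sketch of the pattern (the full argument list is elided in this diff, so the call below is illustrative):

```python
import numpy as np

from flarestack.core.ts_distributions import plot_background_ts_distribution

# ts_array and path come from the enclosing analysis; illustrative only.
disc_threshold = plot_background_ts_distribution(ts_array, path)

if np.isnan(disc_threshold):
    # Too few trials produced a non-zero TS: fall back to the fixed
    # TS = 25.0 threshold, as find_disc_potential does.
    disc_threshold = 25.0
```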
66 changes: 66 additions & 0 deletions flarestack/data/dataset_index.py
@@ -0,0 +1,66 @@
""" This module provides the functionality to create a dataset index by instantiating a DatasetIndex object and importing all the available datasets. Each dataset, in turns, is expect to import `dataset_index` from this module, and adding its own information.
"""

import logging
from typing import List
from flarestack.data import Dataset

logger = logging.getLogger(__name__)


class DatasetIndex:
    """Class storing an index for available datasets"""

    def __init__(self) -> None:
        """constructor"""
        self.index = dict()

    def add_dataset(self, name: str, object: Dataset) -> None:
        """adds a dataset to the index
        Args:
            name (str): assigned name of the dataset
            object (Dataset): dataset object
        """
        self.index[name] = object

    def add_alias(self, alias: str, name: str) -> None:
        """adds an alias for a dataset
        Args:
            alias (str): alias name
            name (str): dataset name
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            self.index[alias] = name
        else:
            logger.warning(f"{name} is already an alias, aliasing {dest} instead.")
            self.index[alias] = dest

    def get_dataset(self, name: str) -> Dataset:
        """retrieve a dataset by name
        Args:
            name (str): dataset name
        Returns:
            Dataset: dataset
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            return dest
        else:
            logger.info(f"{name} is an alias for {dest}")
            return self.index[dest]

    def get_dataset_list(self) -> List[str]:
        """Get the list of indexed dataset names"""
        return list(self.index.keys())


dataset_index = DatasetIndex()

# Datasets will in turn import dataset_index and register themselves in the index.
import flarestack.data.public
import flarestack.data.icecube
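A short usage sketch of the alias mechanism (the alias name is illustrative; the dataset name follows the `icecube.` convention used elsewhere):

```python
from flarestack.data.dataset_index import dataset_index

# Register a convenient short name for a versioned dataset.
dataset_index.add_alias("ps_tracks_latest", "icecube.ps_tracks_v004_p02")

# get_dataset() resolves the alias transparently.
dataset = dataset_index.get_dataset("ps_tracks_latest")
```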
2 changes: 2 additions & 0 deletions flarestack/data/icecube/__init__.py
@@ -3,8 +3,10 @@
from flarestack.data.icecube.ps_tracks.ps_v003_p01 import ps_v003_p01
from flarestack.data.icecube.ps_tracks.ps_v003_p02 import ps_v003_p02
from flarestack.data.icecube.ps_tracks.ps_v004_p00 import ps_v004_p00
from flarestack.data.icecube.ps_tracks.ps_v004_p02 import ps_v004_p02
from flarestack.data.icecube.northern_tracks.nt_v002_p05 import nt_v002_p05
from flarestack.data.icecube.northern_tracks.nt_v005_p00 import nt_v005_p00
from flarestack.data.icecube.northern_tracks.nt_v005_p01 import nt_v005_p01
from flarestack.data.icecube.gfu.gfu_v002_p01 import gfu_v002_p01, txs_sample_v1
from flarestack.data.icecube.gfu.gfu_v002_p02 import gfu_v002_p02, txs_sample_v2
from flarestack.data.icecube.gfu.gfu_v002_p04 import gfu_v002_p04