Skip to content

Commit

Permalink
Merge 240e8c4 into bbc5d20
Browse files Browse the repository at this point in the history
  • Loading branch information
mlincett committed Oct 7, 2022
2 parents bbc5d20 + 240e8c4 commit 364b414
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 7 deletions.
6 changes: 6 additions & 0 deletions docs/source/api.rst
Expand Up @@ -9,6 +9,12 @@
.. automodule:: flarestack.core.data_types
:members:
#########################
Data formats and datasets
#########################
.. automodule:: flarestack.data.dataset_index
:members:

###############
Base PDFs
###############
Expand Down
46 changes: 46 additions & 0 deletions docs/source/datasets.md
@@ -0,0 +1,46 @@
# Datasets
*flarestack* is designed to work with different types of datasets.

Datasets are stored under the *flarestack* data directory (`$FLARESTACK_DATA_DIR`). Note that this is different from the `flarestack__data` directory that is automatically created under `$FLARESTACK_SCRATCH_DIR`. The former is a static repository of datasets, the latter is the actual working directory of *flarestack*. Python modules acting as interfaces to the stored datasets are included under `flarestack/data`.

## Dataset index
*flarestack* currently implements a dataset index, an auxiliary dictionary that allows to retrieve datasets by name (instead of having to look up an object in the corresponding interface module). You can access the index by importing `flarestack.data.dataset_index`. You can use it by following this example:

```python
from flarestack.data.dataset_index import dataset_index
print(dataset_index.get_dataset_list())
dataset_name = dataset_index.get_dataset_list()[0] # just get the first dataset name in the list
dataset = dataset_index.get_dataset(dataset_name)
```

## Reduce a dataset to the relevant seasons
A dataset is usually composed of different seasons. When conducting time-dependent analyses, it can be more efficient to discard the seasons that do not overlap with the time frame of the chosen signal injection and search. The module `flarestack.utils.custom_dataset` comes to help:

```python
dataset = dataset_index.get_dataset(dataset_name)
catalogue = np.load(catalogue_path)
common_time_pdf = { "time_pdf_name": "custom_source_box" } # example time PDF

from flarestack.utils.custom_dataset import custom_dataset
reduced_dataset = custom_dataset(dataset, catalogue, common_time_pdf)
```

## Adding a new dataset
To add a new dataset to *flarestack*:
- store the corresponding files under `$FLARESTACK_DATA_DIR`. If the dataset is a new version of an existing one, follow the same directory hierarchy. Otherwise, you will likely have to create your own path specification;
- create an interface module under `flarestack/data`;
- import the corresponding dataset object in `flarestack/data/__init__.py`.

To add the dataset to the index, first import the index in the dataset interface module:
```python
from flarestack.data.dataset_index import dataset_index

sample_name = "ps_tracks_v004_p02" # give the dataset a meaningful name
ps_v004_p02 = IceCubeDataset() # instantiate the dataset
"""
[...] dataset is populated here [...]
"""
dataset_index.add_dataset("icecube." + sample_name, ps_v004_p02) # add the dataset to the index
```

**Important**: for the correct population of the index, the dataset needs to be added to `flarestack/data/__init__.py` (see above).
1 change: 1 addition & 0 deletions docs/source/index.rst
Expand Up @@ -50,6 +50,7 @@ CONTENTS
.. toctree::
setup
data_types
datasets
flarestack_llh_workshop
api
:maxdepth: 2
Expand Down
66 changes: 66 additions & 0 deletions flarestack/data/dataset_index.py
@@ -0,0 +1,66 @@
""" This module provides the functionality to create a dataset index by instantiating a DatasetIndex object and importing all the available datasets. Each dataset, in turns, is expect to import `dataset_index` from this module, and adding its own information.
"""

import logging
from typing import List
from flarestack.data import Dataset

logger = logging.getLogger(__name__)


class DatasetIndex:
    """Class storing an index for available datasets.

    The index maps a name either directly to a `Dataset` object or to the
    name of another entry (i.e. an alias).
    """

    def __init__(self) -> None:
        """Create an empty index."""
        self.index: dict = dict()

    def add_dataset(self, name: str, object: "Dataset") -> None:
        """adds a dataset to the index
        Args:
            name (str): assigned name of the dataset
            object (Dataset): dataset object
        """
        # NOTE(review): the parameter name `object` shadows the builtin; it is
        # kept unchanged for backward compatibility with keyword callers.
        self.index[name] = object

    def add_alias(self, alias: str, name: str) -> None:
        """adds an alias for a dataset

        If `name` is itself an alias, the new alias is pointed directly at
        the underlying dataset name instead of chaining aliases.

        Args:
            alias (str): alias name
            name (str): dataset name

        Raises:
            KeyError: if `name` is not in the index.
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            self.index[alias] = name
        else:
            # `dest` is a string, so `name` is itself an alias.
            # Bug fix: the original message was "f{name}..." (the f-string
            # prefix inside the quotes), so nothing was interpolated.
            logger.warning(f"{name} is already an alias, aliasing {dest} instead.")
            self.index[alias] = dest

    def get_dataset(self, name: str) -> "Dataset":
        """retrieve a dataset by name, resolving aliases

        Args:
            name (str): dataset name or alias

        Returns:
            Dataset: dataset

        Raises:
            KeyError: if `name` is not in the index.
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            return dest
        else:
            logger.info(f"{name} is an alias for {dest}")
            return self.index[dest]

    def get_dataset_list(self) -> List[str]:
        """Get list of indexed dataset names (including aliases).

        Returns a real ``list`` rather than a ``dict_keys`` view, so that
        callers can index into it (e.g. ``get_dataset_list()[0]`` as shown
        in the documentation); the view returned previously is not
        subscriptable.
        """
        return list(self.index)


# Module-level singleton: importing this module builds the index exactly once.
dataset_index = DatasetIndex()

# Datasets will in turn import dataset_index and register themselves in the index.
# NOTE: these imports are deliberately placed *after* the DatasetIndex
# instantiation — the imported modules import `dataset_index` from this
# module, so it must already exist to avoid a circular-import failure.
import flarestack.data.public
import flarestack.data.icecube
16 changes: 9 additions & 7 deletions flarestack/data/icecube/ic_season.py
Expand Up @@ -11,6 +11,8 @@
from scipy.interpolate import interp1d
import logging
from pathlib import Path
from typing import Tuple


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -43,20 +45,17 @@
"""
mirror_7yr_dirname = "mirror-7year-PS-sens" # expected identical at all mirrors

DESY_data_path = Path("/lustre/fs22/group/icecube/data_mirror")
DESY_sens_path = DESY_data_path / "ref_sensitivity"

if icecube_dataset_dir is not None:
logger.info(f"Loading datasets from {icecube_dataset_dir} (local)")

icecube_dataset_path = Path(icecube_dataset_dir)
icecube_dataset_dir = Path(icecube_dataset_dir)

ref_dir_7yr = icecube_dataset_path / mirror_7yr_dirname
ref_dir_7yr = icecube_dataset_dir / mirror_7yr_dirname
if not ref_dir_7yr.is_dir():
logger.warning(f"No 7yr sensitivity directory found at {ref_dir_7yr}")
ref_dir_7yr = None

ref_10yr = Path(icecube_dataset_dir) / ref_10yr_filename
ref_10yr = icecube_dataset_dir / ref_10yr_filename
if not ref_10yr.is_file():
logger.warning(f"No 10yr sensitivity found at {ref_10yr}")
ref_10yr = None
Expand All @@ -65,6 +64,9 @@
"Local dataset directory not found. Assuming we are running on an supported datacenter (WIPAC, DESY), I will try to fetch the data from central storage."
)

DESY_data_path = Path("/lustre/fs22/group/icecube/data_mirror")
DESY_sens_path = DESY_data_path / "ref_sensitivity"

# Only load from central storage if $FLARESTACK_DATASET_DIR is not set.
if icecube_dataset_dir is None:
    # NOTE: the following block has no failsafe against changes in the directory structure.
Expand Down Expand Up @@ -96,7 +98,7 @@ def get_dataset_dir() -> str:
return dataset_dir


def get_published_sens_ref_dir() -> (Path, Path):
def get_published_sens_ref_dir() -> Tuple[Path, Path]:
"""
Returns the paths to reference sensitivities.
"""
Expand Down

0 comments on commit 364b414

Please sign in to comment.