Skip to content

Commit

Permalink
Merge 240e8c4 into bbc5d20
Browse files Browse the repository at this point in the history
  • Loading branch information
mlincett committed Oct 7, 2022
2 parents bbc5d20 + 240e8c4 commit 364b414
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 7 deletions.
6 changes: 6 additions & 0 deletions docs/source/api.rst
Expand Up @@ -9,6 +9,12 @@
.. automodule:: flarestack.core.data_types
:members:
#########################
Data formats and datasets
#########################
.. automodule:: flarestack.data.dataset_index
:members:

###############
Base PDFs
###############
Expand Down
46 changes: 46 additions & 0 deletions docs/source/datasets.md
@@ -0,0 +1,46 @@
# Datasets
*flarestack* is designed to work with different types of datasets.

Datasets are stored under the *flarestack* data directory (`$FLARESTACK_DATA_DIR`). Note that this is different from the `flarestack__data` directory that is automatically created under `$FLARESTACK_SCRATCH_DIR`. The former is a static repository of datasets, the latter is the actual working directory of *flarestack*. Python modules acting as interfaces to the stored datasets are included under `flarestack/data`.

## Dataset index
*flarestack* currently implements a dataset index, an auxiliary dictionary that allows to retrieve datasets by name (instead of having to look up an object in the corresponding interface module). You can access the index by importing `flarestack.data.dataset_index`. You can use it by following this example:

```python
from flarestack.data.dataset_index import dataset_index
print(dataset_index.get_dataset_list())
dataset_name = dataset_index.get_dataset_list()[0] # just get the first dataset name in the list
dataset = dataset_index.get_dataset(dataset_name)
```

## Reduce a dataset to the relevant seasons
A dataset is usually composed of different seasons. When conducting time-dependent analyses, it can be more efficient to discard the seasons that do not overlap with the time frame of the chosen signal injection and search. The module `flarestack.utils.custom_dataset` comes to help:

```python
dataset = dataset_index.get_dataset(dataset_name)
catalogue = np.load(catalogue_path)
common_time_pdf = { "time_pdf_name": "custom_source_box" } # example time PDF

from flarestack.utils.custom_dataset import custom_dataset
reduced_dataset = custom_dataset(dataset, catalogue, common_time_pdf)
```

## Adding a new dataset
To add a new dataset to *flarestack*:
- store the corresponding files under `$FLARESTACK_DATA_DIR`. If the dataset is a new version of an existing one, follow the same directory hierarchy. Otherwise, you will likely have to create your own path specification;
- create an interface module under `flarestack/data`;
- import the corresponding dataset object in `flarestack/data/__init__.py`.

To add the dataset to the index, first import the index in the dataset interface module:
```python
from flarestack.data.dataset_index import dataset_index

sample_name = "ps_tracks_v004_p02" # give the dataset a meaningful name
ps_v004_p02 = IceCubeDataset() # instantiate the dataset
"""
[...] dataset is populated here [...]
"""
dataset_index.add_dataset("icecube." + sample_name, ps_v004_p02) # add the dataset to the index
```

**Important**: for the correct population of the index, the dataset needs to be added to `flarestack/data/__init__.py` (see above).
1 change: 1 addition & 0 deletions docs/source/index.rst
Expand Up @@ -50,6 +50,7 @@ CONTENTS
.. toctree::
setup
data_types
datasets
flarestack_llh_workshop
api
:maxdepth: 2
Expand Down
66 changes: 66 additions & 0 deletions flarestack/data/dataset_index.py
@@ -0,0 +1,66 @@
""" This module provides the functionality to create a dataset index by instantiating a DatasetIndex object and importing all the available datasets. Each dataset, in turns, is expect to import `dataset_index` from this module, and adding its own information.
"""

import logging
from typing import List
from flarestack.data import Dataset

logger = logging.getLogger(__name__)


class DatasetIndex:
    """Class storing an index for available datasets.

    The index maps a name either directly to a `Dataset` object or to the
    name of another entry (i.e. an alias).
    """

    def __init__(self) -> None:
        """Create an empty index."""
        self.index: dict = dict()

    def add_dataset(self, name: str, object: "Dataset") -> None:
        """adds a dataset to the index
        Args:
            name (str): assigned name of the dataset
            object (Dataset): dataset object
        """
        # NOTE(review): the parameter name `object` shadows the builtin; it is
        # kept unchanged for backward compatibility with keyword callers.
        self.index[name] = object

    def add_alias(self, alias: str, name: str) -> None:
        """adds an alias for a dataset

        If `name` is itself an alias, the new alias is pointed directly at
        the underlying dataset name instead of chaining aliases.

        Args:
            alias (str): alias name
            name (str): dataset name

        Raises:
            KeyError: if `name` is not in the index.
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            self.index[alias] = name
        else:
            # `dest` is a string, so `name` is itself an alias.
            # Bug fix: the original message was "f{name}..." (the f-string
            # prefix inside the quotes), so nothing was interpolated.
            logger.warning(f"{name} is already an alias, aliasing {dest} instead.")
            self.index[alias] = dest

    def get_dataset(self, name: str) -> "Dataset":
        """retrieve a dataset by name, resolving aliases

        Args:
            name (str): dataset name or alias

        Returns:
            Dataset: dataset

        Raises:
            KeyError: if `name` is not in the index.
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            return dest
        else:
            logger.info(f"{name} is an alias for {dest}")
            return self.index[dest]

    def get_dataset_list(self) -> List[str]:
        """Get list of indexed dataset names (including aliases).

        Returns a real ``list`` rather than a ``dict_keys`` view, so that
        callers can index into it (e.g. ``get_dataset_list()[0]`` as shown
        in the documentation); the view returned previously is not
        subscriptable.
        """
        return list(self.index)


# Module-level singleton: importing this module builds the index exactly once.
dataset_index = DatasetIndex()

# Datasets will in turn import dataset_index and register themselves in the index.
# NOTE: these imports are deliberately placed *after* the DatasetIndex
# instantiation — the imported modules import `dataset_index` from this
# module, so it must already exist to avoid a circular-import failure.
import flarestack.data.public
import flarestack.data.icecube
16 changes: 9 additions & 7 deletions flarestack/data/icecube/ic_season.py
Expand Up @@ -11,6 +11,8 @@
from scipy.interpolate import interp1d
import logging
from pathlib import Path
from typing import Tuple


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -43,20 +45,17 @@
"""
mirror_7yr_dirname = "mirror-7year-PS-sens" # expected identical at all mirrors

DESY_data_path = Path("/lustre/fs22/group/icecube/data_mirror")
DESY_sens_path = DESY_data_path / "ref_sensitivity"

if icecube_dataset_dir is not None:
logger.info(f"Loading datasets from {icecube_dataset_dir} (local)")

icecube_dataset_path = Path(icecube_dataset_dir)
icecube_dataset_dir = Path(icecube_dataset_dir)

ref_dir_7yr = icecube_dataset_path / mirror_7yr_dirname
ref_dir_7yr = icecube_dataset_dir / mirror_7yr_dirname
if not ref_dir_7yr.is_dir():
logger.warning(f"No 7yr sensitivity directory found at {ref_dir_7yr}")
ref_dir_7yr = None

ref_10yr = Path(icecube_dataset_dir) / ref_10yr_filename
ref_10yr = icecube_dataset_dir / ref_10yr_filename
if not ref_10yr.is_file():
logger.warning(f"No 10yr sensitivity found at {ref_10yr}")
ref_10yr = None
Expand All @@ -65,6 +64,9 @@
"Local dataset directory not found. Assuming we are running on an supported datacenter (WIPAC, DESY), I will try to fetch the data from central storage."
)

DESY_data_path = Path("/lustre/fs22/group/icecube/data_mirror")
DESY_sens_path = DESY_data_path / "ref_sensitivity"

# Only load from central storage if $FLARESTACK_DATASET_DIR is not set.
if icecube_dataset_dir is None:
    # NOTE: the following block has no failsafe against changes in the directory structure.
Expand Down Expand Up @@ -96,7 +98,7 @@ def get_dataset_dir() -> str:
return dataset_dir


def get_published_sens_ref_dir() -> (Path, Path):
def get_published_sens_ref_dir() -> Tuple[Path, Path]:
"""
Returns the paths to reference sensitivities.
"""
Expand Down

0 comments on commit 364b414

Please sign in to comment.