Merge d2e4488 into 1dc1c52

mlincett committed Oct 6, 2022
2 parents 1dc1c52 + d2e4488 commit d7e1e8d
Showing 15 changed files with 489 additions and 41 deletions.
6 changes: 6 additions & 0 deletions docs/source/api.rst
@@ -9,6 +9,12 @@
.. automodule:: flarestack.core.data_types
:members:
#########################
Data formats and datasets
#########################
.. automodule:: flarestack.data.dataset_index
:members:

###############
Base PDFs
###############
46 changes: 46 additions & 0 deletions docs/source/datasets.md
@@ -0,0 +1,46 @@
# Datasets
*flarestack* is designed to work with different types of datasets.

Datasets are stored under the *flarestack* data directory (`$FLARESTACK_DATA_DIR`). Note that this is different from the `flarestack__data` directory that is automatically created under `$FLARESTACK_SCRATCH_DIR`: the former is a static repository of datasets, while the latter is the actual working directory of *flarestack*. Python modules acting as interfaces to the stored datasets are included under `flarestack/data`.
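As a quick sanity check, you can inspect both locations from Python. This is a minimal sketch that only reads the two environment variables described above:

```python
import os

# Static repository of datasets, read by the dataset interface modules.
data_dir = os.environ.get("FLARESTACK_DATA_DIR")

# Working directory root; `flarestack__data` is created underneath it.
scratch_dir = os.environ.get("FLARESTACK_SCRATCH_DIR")

print(f"Datasets are read from: {data_dir}")
print(f"Working files live under: {scratch_dir}/flarestack__data")
```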

## Dataset index
*flarestack* currently implements a dataset index, an auxiliary dictionary that allows you to retrieve datasets by name (instead of having to look up an object in the corresponding interface module). You can access the index by importing `flarestack.data.dataset_index` and use it as in the following example:

```python
from flarestack.data.dataset_index import dataset_index
print(dataset_index.get_dataset_list())
dataset_name = dataset_index.get_dataset_list()[0] # just get the first dataset name in the list
dataset = dataset_index.get_dataset(dataset_name)
```

## Reduce a dataset to the relevant seasons
A dataset is usually composed of different seasons. When conducting time-dependent analyses, it can be more efficient to discard the seasons that do not overlap with the time frame of the chosen signal injection and search. The module `flarestack.utils.custom_dataset` helps with this:

```python
import numpy as np

from flarestack.data.dataset_index import dataset_index
from flarestack.utils.custom_dataset import custom_dataset

dataset = dataset_index.get_dataset(dataset_name)
catalogue = np.load(catalogue_path)  # catalogue_path points to your source catalogue (.npy)
common_time_pdf = {"time_pdf_name": "custom_source_box"}  # example time PDF

reduced_dataset = custom_dataset(dataset, catalogue, common_time_pdf)
```

## Adding a new dataset
To add a new dataset to *flarestack*:
- store the corresponding files under `$FLARESTACK_DATA_DIR`. If the dataset is a new version of an existing one, follow the same directory hierarchy. Otherwise, you will likely have to create your own path specification;
- create an interface module under `flarestack/data`;
- import the corresponding dataset object in `flarestack/data/__init__.py`.

To add the dataset to the index, first import the index in the dataset interface module:
```python
from flarestack.data.dataset_index import dataset_index
from flarestack.data.icecube import IceCubeDataset

sample_name = "ps_tracks_v004_p02"  # give the dataset a meaningful name
ps_v004_p02 = IceCubeDataset()  # instantiate the dataset
"""
[...] dataset is populated here [...]
"""
dataset_index.add_dataset("icecube." + sample_name, ps_v004_p02)  # add the dataset to the index
```

**Important**: for the correct population of the index, the dataset needs to be added to `flarestack/data/__init__.py` (see above).
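Once the interface module is imported in `flarestack/data/__init__.py`, the new dataset should be discoverable through the index. A minimal check, using the example name from above:

```python
from flarestack.data.dataset_index import dataset_index

# The name passed to add_dataset(), "icecube." prefix included.
assert "icecube.ps_tracks_v004_p02" in dataset_index.get_dataset_list()
```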
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -50,6 +50,7 @@ CONTENTS
.. toctree::
setup
data_types
datasets
flarestack_llh_workshop
api
:maxdepth: 2
2 changes: 2 additions & 0 deletions flarestack/cluster/make_desy_cluster_script.py
@@ -4,6 +4,8 @@

logger = logging.getLogger(__name__)

logger.warning("WARNING: this module is deprecated. Check `flarestack.cluster.submitter` instead.")

username = os.path.basename(os.environ["HOME"])

root_dir = os.path.dirname(fs_dir[:-1])
6 changes: 4 additions & 2 deletions flarestack/cluster/submitter.py
@@ -1,4 +1,4 @@
import os, subprocess, time, logging, shutil, copy
import os, subprocess, time, logging, shutil, copy, sys
import numpy as np
from flarestack.shared import (
fs_dir,
@@ -437,7 +437,9 @@ def make_cluster_submission_script(self):
"eval $(/cvmfs/icecube.opensciencegrid.org/py3-v4.1.0/setup.sh) \n"
"export PYTHONPATH=" + DESYSubmitter.root_dir + "/ \n"
"export FLARESTACK_SCRATCH_DIR=" + flarestack_scratch_dir + " \n"
"python " + fs_dir + "core/multiprocess_wrapper.py -f $1 -n $2 \n"
f"{sys.executable} "
+ fs_dir
+ "core/multiprocess_wrapper.py -f $1 -n $2 \n"
"cp $TMPDIR/${JOB_ID}_stdout.txt " + log_dir + "\n"
"cp $TMPDIR/${JOB_ID}_stderr.txt " + log_dir + "\n "
)
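Using `sys.executable` instead of a bare `python` pins cluster jobs to the same interpreter that generated the submission script (for instance, the one from an active virtual environment). A minimal illustration of the value being interpolated:

```python
import sys

# Absolute path of the running interpreter, e.g. "/usr/bin/python3"
# or the python of an active virtual environment.
print(sys.executable)
```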
87 changes: 66 additions & 21 deletions flarestack/core/results.py
@@ -50,6 +50,8 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):

self.allow_extrapolation = rh_dict.get("allow_extrapolated_sensitivity", True)

self.valid = True

# Checks if the code should search for flares. By default, this is
# not done.
# self.flare = self.mh_name == "flare"
@@ -112,12 +114,20 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
self.extrapolated_disc = False
self.flux_to_ns = np.nan

# if self.show_inj:
self.inj = self.load_injection_values()
self._inj_dict = rh_dict["inj_dict"]
self._dataset = rh_dict["dataset"]
# else:
# self.inj = None
try:
# if self.show_inj:
self.inj = self.load_injection_values()
self._inj_dict = rh_dict["inj_dict"]
self._dataset = rh_dict["dataset"]
# else:
# self.inj = None
except FileNotFoundError as err:
logger.error(
"Unable to load injection values. Have you run this analysis at least once?"
)
logger.error(err)
self.valid = False
return

try:
self.merge_pickle_data()
Expand All @@ -137,7 +147,7 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
try:
self.find_sensitivity()
except ValueError as e:
logger.warning("RuntimeError for discovery potential: \n {0}".format(e))
logger.warning("RuntimeError for sensitivity: \n {0}".format(e))

if do_disc:
try:
@@ -149,6 +159,15 @@ def __init__(self, rh_dict, do_sens=True, do_disc=True, bias_error="std"):
except ValueError as e:
logger.warning("TypeError for discovery potential: \n {0}".format(e))

def is_valid(self):
"""If results are valid, returns True.
If something went wrong during the instantiation, returns False.
Returns:
bool: whether results are valid or not.
"""
return self.valid

@property
def scales_float(self):
"""directly return the injected scales as floats"""
@@ -550,8 +569,13 @@ def find_overfluctuations(self, ts_val, savepath=None):
x_acc.append(float(scale))
yerr.append(1.0 / np.sqrt(float(len(ts_array))))

self.make_plots(scale)

if frac != 0.0:
logger.info(f"Making plot for {scale=}, {frac=}")
self.make_plots(scale)
else:
logger.warning(
f"Fraction of overfluctuations is {frac=}, skipping plot for {scale=}"
)
if len(np.where(np.array(y) < 0.95)[0]) < 2:
raise OverfluctuationError(
f"Not enough points with overfluctuations under 95%, lower injection scale!"
@@ -674,19 +698,28 @@ def find_disc_potential(self):

x = [scale_shortener(i) for i in sorted([float(j) for j in x])]

if np.isnan(disc_threshold):
logger.warning(
f"Invalid discovery threshold {disc_threshold=} will be ingnored. Using TS = 25.0 only."
)

for scale in x:
ts_array = np.array(self.results[scale]["TS"])
frac = float(len(ts_array[ts_array > disc_threshold])) / (
float(len(ts_array))
)

logger.info(
"Fraction of overfluctuations is {0:.2f} above {1:.2f} (N_trials={2}) (Scale={3})".format(
frac, disc_threshold, len(ts_array), scale
if not np.isnan(disc_threshold):

frac = float(len(ts_array[ts_array > disc_threshold])) / (
float(len(ts_array))
)
)

y.append(frac)
logger.info(
"Fraction of overfluctuations is {0:.2f} above {1:.2f} (N_trials={2}) (Scale={3})".format(
frac, disc_threshold, len(ts_array), scale
)
)

y.append(frac)

frac_25 = float(len(ts_array[ts_array > 25.0])) / (float(len(ts_array)))

logger.info(
@@ -697,7 +730,13 @@ def best_f(x):

y_25.append(frac_25)

# if frac != 0.0:
# logger.info(f"Making plot for {scale=}, {frac=}")
self.make_plots(scale)
# else:
# logger.warning(
# f"Fraction of overfluctuations is {frac=}, skipping plot for {scale=}"
# )

x = np.array([float(s) for s in x])

@@ -707,7 +746,14 @@

sols = []

for i, y_val in enumerate([y, y_25]):
if np.isnan(disc_threshold):
y_list = [y_25]
out_list = ["disc_potential_25"]
else:
y_list = [y, y_25]
out_list = ["disc_potential", "disc_potential_25"]

for i, y_val in enumerate(y_list):

def f(x, a, b, c):
value = scipy.stats.gamma.cdf(x, a, b, c)
@@ -728,9 +774,7 @@ def best_f(x):
return f(x, best_a, best_b, best_c)

sol = scipy.stats.gamma.ppf(0.5, best_a, best_b, best_c)
setattr(
self, ["disc_potential", "disc_potential_25"][i], k_to_flux(sol)
)
setattr(self, out_list[i], k_to_flux(sol))

except RuntimeError as e:
logger.warning(f"RuntimeError for discovery potential!: {e}")
@@ -779,6 +823,7 @@ def best_f(x):
)

def noflare_plots(self, scale):

ts_array = np.array(self.results[scale]["TS"])
ts_path = os.path.join(self.plot_dir, "ts_distributions/" + str(scale) + ".pdf")

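The new `valid` flag gives callers a way to bail out gracefully when injection values are missing instead of failing later. A hedged usage sketch, assuming the enclosing class is `ResultsHandler` from this module and `rh_dict` is the usual configuration dictionary:

```python
from flarestack.core.results import ResultsHandler

rh = ResultsHandler(rh_dict, do_sens=True, do_disc=True)

if not rh.is_valid():
    # Injection values could not be loaded (see the FileNotFoundError
    # handling above): the analysis must be run at least once first.
    raise RuntimeError("No results available; run the analysis first.")
```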
10 changes: 8 additions & 2 deletions flarestack/core/ts_distributions.py
@@ -307,9 +307,17 @@ def plot_background_ts_distribution(

ts_array = ts_array[~np.isnan(ts_array)]

max_ts = np.max(ts_array)

if np.median(ts_array) < 0.0:
plot_expanded_negative(ts_array, path)

if max_ts == 0:
logger.warning(
f"Maximum of TS is {max_ts=}, unable to calculate discovery threshold (too few trials?)"
)
return np.NaN

fig = plt.figure()

ax = plt.subplot(1, 1, 1)
Expand All @@ -322,8 +330,6 @@ def plot_background_ts_distribution(

plt.axhline(frac_over * (1 - five_sigma), color="r", linestyle="--")

max_ts = np.max(ts_array)

disc_potential = scipy.stats.chi2.ppf(five_sigma, df, loc, scale)

x_range = np.linspace(0.0, max(max_ts, disc_potential), 100)
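Since `plot_background_ts_distribution` can now return `np.nan`, downstream callers should guard against it, as `find_disc_potential` above does. A sketch of the pattern (the full argument list is elided in this diff, so the call below is illustrative):

```python
import numpy as np

from flarestack.core.ts_distributions import plot_background_ts_distribution

# ts_array and path come from the enclosing analysis; illustrative only.
disc_threshold = plot_background_ts_distribution(ts_array, path)

if np.isnan(disc_threshold):
    # Too few trials produced a non-zero TS: fall back to the fixed
    # TS = 25.0 threshold, as find_disc_potential does.
    disc_threshold = 25.0
```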
66 changes: 66 additions & 0 deletions flarestack/data/dataset_index.py
@@ -0,0 +1,66 @@
""" This module provides the functionality to create a dataset index by instantiating a DatasetIndex object and importing all the available datasets. Each dataset, in turns, is expect to import `dataset_index` from this module, and adding its own information.
"""

import logging
from typing import List
from flarestack.data import Dataset

logger = logging.getLogger(__name__)


class DatasetIndex:
    """Class storing an index for available datasets"""

    def __init__(self) -> None:
        """constructor"""
        self.index = dict()

    def add_dataset(self, name: str, object: Dataset) -> None:
        """adds a dataset to the index
        Args:
            name (str): assigned name of the dataset
            object (Dataset): dataset object
        """
        self.index[name] = object

    def add_alias(self, alias: str, name: str) -> None:
        """adds an alias for a dataset
        Args:
            alias (str): alias name
            name (str): dataset name
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            self.index[alias] = name
        else:
            logger.warning(f"{name} is already an alias, aliasing {dest} instead.")
            self.index[alias] = dest

    def get_dataset(self, name: str) -> Dataset:
        """retrieve a dataset by name
        Args:
            name (str): dataset name
        Returns:
            Dataset: dataset
        """
        dest = self.index[name]
        if isinstance(dest, Dataset):
            return dest
        else:
            logger.info(f"{name} is an alias for {dest}")
            return self.index[dest]

    def get_dataset_list(self) -> List[str]:
        """Get the list of indexed dataset names"""
        return list(self.index.keys())


dataset_index = DatasetIndex()

# Datasets will in turn import dataset_index and register themselves in the index.
import flarestack.data.public
import flarestack.data.icecube
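A short usage sketch of the alias mechanism (the alias name is illustrative; the dataset name follows the `icecube.` convention used elsewhere):

```python
from flarestack.data.dataset_index import dataset_index

# Register a convenient short name for a versioned dataset.
dataset_index.add_alias("ps_tracks_latest", "icecube.ps_tracks_v004_p02")

# get_dataset() resolves the alias transparently.
dataset = dataset_index.get_dataset("ps_tracks_latest")
```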
2 changes: 2 additions & 0 deletions flarestack/data/icecube/__init__.py
@@ -3,8 +3,10 @@
from flarestack.data.icecube.ps_tracks.ps_v003_p01 import ps_v003_p01
from flarestack.data.icecube.ps_tracks.ps_v003_p02 import ps_v003_p02
from flarestack.data.icecube.ps_tracks.ps_v004_p00 import ps_v004_p00
from flarestack.data.icecube.ps_tracks.ps_v004_p02 import ps_v004_p02
from flarestack.data.icecube.northern_tracks.nt_v002_p05 import nt_v002_p05
from flarestack.data.icecube.northern_tracks.nt_v005_p00 import nt_v005_p00
from flarestack.data.icecube.northern_tracks.nt_v005_p01 import nt_v005_p01
from flarestack.data.icecube.gfu.gfu_v002_p01 import gfu_v002_p01, txs_sample_v1
from flarestack.data.icecube.gfu.gfu_v002_p02 import gfu_v002_p02, txs_sample_v2
from flarestack.data.icecube.gfu.gfu_v002_p04 import gfu_v002_p04