Skip to content

Commit

Permalink
Add download_plasticc.py to create HDF5 files for PLAsTiCC
Browse files Browse the repository at this point in the history
  • Loading branch information
kboone committed May 14, 2019
1 parent f462711 commit e6e3666
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 122 deletions.
2 changes: 2 additions & 0 deletions avocado/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class Augmentor():
astropy.cosmology.FlatLambdaCDM.
"""
def __init__(self, **cosmology_kwargs):
print("TODO: Update Augmentor to use specz, not truez!")

# Default cosmology to use. This is the one assumed for the PLAsTiCC
# dataset.
cosmology_parameters = {
Expand Down
44 changes: 44 additions & 0 deletions avocado/dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import StratifiedKFold

Expand Down Expand Up @@ -57,6 +59,48 @@ def __init__(self, name, metadata, observations=None):

self.objects[meta_index] = new_object

@classmethod
def load(cls, dataset_name, metadata_only=False):
    """Load a dataset that has been saved in HDF5 format in the data
    directory.

    For an example of how to create such a dataset, see
    `scripts/download_plasticc.py`.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to load
    metadata_only : bool (optional)
        If False (default), the observations are loaded. Otherwise, only
        the metadata is loaded. This is useful for very large datasets.

    Returns
    -------
    dataset : :class:`Dataset`
        The loaded dataset.

    Raises
    ------
    AvocadoException
        If no HDF5 file for `dataset_name` exists in the data directory.
    """
    data_directory = settings['data_directory']

    data_path = os.path.join(data_directory, dataset_name + '.h5')

    if not os.path.exists(data_path):
        raise AvocadoException("Couldn't find dataset %s!" % dataset_name)

    metadata = pd.read_hdf(data_path, 'metadata')

    if metadata_only:
        observations = None
    else:
        observations = pd.read_hdf(data_path, 'observations')

    # Use cls rather than hard-coding Dataset so that subclasses of
    # Dataset load as instances of themselves.
    dataset = cls(dataset_name, metadata, observations)

    return dataset


def label_folds(self):
"""Separate the dataset into groups for k-folding
Expand Down
124 changes: 2 additions & 122 deletions avocado/plasticc.py
Original file line number Diff line number Diff line change
@@ -1,134 +1,13 @@
"""Utility functions to interact with the PLAsTiCC dataset"""

import numpy as np
import os
import pandas as pd
from scipy.special import erf

from .dataset import Dataset
from .utils import settings, AvocadoException, logger

from .augment import Augmentor

def update_plasticc_names(dataset_kind, metadata, observations=None):
    """Rename columns in PLAsTiCC tables to follow the avocado naming scheme.

    The input DataFrames are modified in place and also returned.

    Parameters
    ----------
    dataset_kind : str {'training', 'test'}
    metadata : pandas.DataFrame
        Original metadata
    observations : pandas.DataFrame (optional)
        Original observations DataFrame

    Returns
    -------
    renamed_metadata : pandas.DataFrame
        metadata DataFrame with renamed columns to follow the avocado
        naming scheme.
    renamed_observations : pandas.DataFrame
        observations DataFrame with renamed columns to follow the avocado
        naming scheme. This is only returned if observations is not None.
    """
    # Map the raw PLAsTiCC metadata columns onto the avocado standard.
    metadata.rename(
        {
            'target': 'category',
            'hostgal_photoz_err': 'host_photoz_error',
            'hostgal_photoz': 'host_photoz',
            'hostgal_specz': 'host_specz',
        },
        axis=1,
        inplace=True,
    )

    # The ddf column is stored as an integer flag; represent it as a boolean.
    metadata['ddf'] = metadata['ddf'].astype(bool)

    if dataset_kind == 'training':
        # For the PLAsTiCC training set, the host spectroscopic redshift is
        # the true redshift.
        metadata['redshift'] = metadata['host_specz']

    # Objects with a host photo-z of exactly zero are flagged as galactic.
    metadata['galactic'] = metadata['host_photoz'] == 0.

    if observations is None:
        return metadata

    # Translate the numeric passband index into an LSST band label.
    lsst_bands = ('lsstu', 'lsstg', 'lsstr', 'lssti', 'lsstz', 'lssty')
    observations['band'] = observations['passband'].map(
        dict(enumerate(lsst_bands)))
    observations.drop('passband', axis=1, inplace=True)

    # Map the raw observation columns onto the avocado standard.
    observations.rename(
        {'mjd': 'time', 'flux_err': 'flux_error'},
        axis=1,
        inplace=True,
    )

    return metadata, observations


def load_plasticc_training():
    """Load the PLAsTiCC training set.

    Returns
    =======
    training_dataset : :class:`Dataset`
        The PLAsTiCC training dataset.
    """
    data_directory = settings['data_directory']

    # Read the raw CSV tables out of the configured data directory.
    observations = pd.read_csv(
        os.path.join(data_directory, 'training_set.csv'))
    metadata = pd.read_csv(
        os.path.join(data_directory, 'training_set_metadata.csv'))

    # Convert the PLAsTiCC column names to the avocado naming scheme.
    metadata, observations = update_plasticc_names(
        'training', metadata, observations)

    return Dataset('plasticc_training', metadata, observations)


def load_plasticc_test():
    """Load the metadata of the full PLAsTiCC test set.

    Only the metadata is loaded, not the individual observations. The
    individual observations can't all fit in memory at the same time on
    normal computers.

    Returns
    =======
    test_dataset : :class:`Dataset`
        The PLAsTiCC test dataset (metadata only).
    """
    data_directory = settings['data_directory']

    # Only the metadata table is read; the observations are left on disk.
    metadata = pd.read_csv(
        os.path.join(data_directory, 'test_set_metadata.csv'))

    # Convert the PLAsTiCC column names to the avocado naming scheme.
    metadata = update_plasticc_names('test', metadata)

    return Dataset('plasticc_test', metadata)


class PlasticcAugmentor(Augmentor):
"""Implementation of an Augmentor for the PLAsTiCC dataset"""
Expand All @@ -154,7 +33,8 @@ def _load_test_dataset(self):
The test dataset loaded with metadata only.
"""
if self._test_dataset is None:
self._test_dataset = load_plasticc_test()
self._test_dataset = Dataset.load('plasticc_test',
metadata_only=True)

return self._test_dataset

Expand Down

0 comments on commit e6e3666

Please sign in to comment.