In [1]:
import pyarrow.parquet as pq
import numpy as np
import os
import lsdb
import tape

from lsdb.core.search import BoxSearch, ConeSearch, PolygonSearch
from tape import Ensemble, ColumnMapper
from hipscat.io.file_io import read_parquet_metadata
# Record which lsdb and tape releases produced the outputs shown in this notebook.
for _pkg in (lsdb, tape):
    print(_pkg.__version__)


0.2.1
0.4.1


In [7]:
# Root directory that holds both ZTF HiPSCat catalogs. The default is the
# location this notebook was originally written against; set the
# ZTF_HIPSCAT_DIR environment variable to point at another copy of the data
# without editing this cell.
ztf_catalog_dir = os.environ.get(
    "ZTF_HIPSCAT_DIR", "/data3/epyc/data3/hipscat/catalogs/ztf_axs"
).rstrip("/")

# Paths are built with "/" rather than os.path.join so the default values stay
# identical to the original literals on every platform.
ztf_object_path = f"{ztf_catalog_dir}/ztf_dr14"
ztf_source_path = f"{ztf_catalog_dir}/ztf_source"



In [8]:
# Both catalogs are restricted to the same cone, so its parameters are defined
# once here instead of being repeated (and possibly drifting apart) in each call.
# NOTE(review): the RA is negative; the object table displayed further down has
# ra values near 300, so lsdb presumably wraps it -- confirm.
CONE_RA = -60
CONE_DEC = 20
CONE_RADIUS_ARCSEC = 1600

# Only these columns are read from the source catalog.
ztf_source_columns = [
    'index', 'ps1_objid',
    'ra', 'dec',
    'catflags',
    'fieldID',
    'mjd', 'band', 'mag', 'magerr', 'Npix',
]

ztf_object = lsdb.read_hipscat(
    ztf_object_path,
    search_filter=ConeSearch(ra=CONE_RA, dec=CONE_DEC, radius_arcsec=CONE_RADIUS_ARCSEC),
)
# sources load takes a minute, since it creates a healpix alignment on load
ztf_source = lsdb.read_hipscat(
    ztf_source_path,
    columns=ztf_source_columns,
    search_filter=ConeSearch(ra=CONE_RA, dec=CONE_DEC, radius_arcsec=CONE_RADIUS_ARCSEC),
)

In [10]:
# Keep only objects that have more than 100 observations in both g and r.
well_observed_query = "nobs_g > 100 and nobs_r > 100"
ztf_object_100 = ztf_object.query(well_observed_query)

# Joining through the object catalog is what gives the source rows the
# objects' hipscat index. Object-side columns that clash get the "_object"
# suffix; source-side columns keep their names.
join_options = dict(
    left_on="ps1_objid",
    right_on="ps1_objid",
    suffixes=("_object", ""),
)
ztf_joined_source_cat = ztf_object_100.join(ztf_source, **join_options)



In [11]:
# Tell TAPE which catalog column plays which role.
column_roles = {
    "id_col": "_hipscat_index",
    "time_col": "mjd",
    "flux_col": "mag",
    "err_col": "magerr",
    "band_col": "band",
}
colmap = ColumnMapper(**column_roles)

ens = Ensemble(client=True)

# The LSDB catalogs are handed over as-is: joined sources first, then objects.
# NOTE(review): the object table passed here is the unfiltered `ztf_object`,
# not `ztf_object_100` that the sources were joined against -- confirm this is
# intended.
ens.from_lsdb(ztf_joined_source_cat, ztf_object, column_mapper=colmap)

# Materialise the object table so it is displayed as the cell output.
ens.object.compute()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34112 instead


Unnamed: 0_level_0,ps1_objid,ra,dec,ps1_gMeanPSFMag,ps1_rMeanPSFMag,ps1_iMeanPSFMag,nobs_g,nobs_r,nobs_i,mean_mag_g,mean_mag_r,mean_mag_i,Norder,Dir,Npix
_hipscat_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3645663898796818432,129852995315513961,299.531509,18.211121,-999.000000,21.667700,20.771200,0,1,74,,16.707520,20.641304,5,0,3238
3645663899514044416,129852995331574656,299.533154,18.211779,18.025499,17.122601,16.712799,577,1232,158,18.084419,17.038139,16.639050,5,0,3238
3645663900969467904,129852995302075776,299.530212,18.212697,20.562799,19.651300,19.229000,11,231,108,20.619924,19.703492,19.299216,5,0,3238
3645663901456007168,129852995294816101,299.529480,18.212954,21.752100,20.551500,19.889799,1,179,97,21.444788,20.630898,19.817250,5,0,3238
3645663901678305280,129852995314306059,299.531427,18.212937,17.094400,16.523100,16.258400,580,1240,159,17.086404,16.376977,16.122447,5,0,3238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3647915693078216704,134423009353274611,300.935318,22.020049,22.195299,21.428101,20.957899,0,45,0,,21.347121,,5,0,3239
3647915693963214848,134423009380234354,300.938141,22.019936,-999.000000,22.041100,21.512699,0,4,0,,21.526417,,5,0,3239
3647915694005157888,134423009378754660,300.937897,22.020140,-999.000000,21.795500,20.581800,0,41,0,,21.339633,,5,0,3239
3647915696442048512,134423009365776649,300.936569,22.021715,20.138500,18.959700,17.492901,275,959,0,20.297452,18.777850,,5,0,3239


In [12]:
# Defining a simple function
def my_flux_average(flux_array, band_array, method="mean", band=None):
    """Average the fluxes that were observed in a single band.

    Parameters
    ----------
    flux_array : array-like
        Flux values, one per observation.
    band_array : array-like
        Band label of each observation, aligned element-wise with
        ``flux_array``.
    method : {"mean", "median"}, default "mean"
        Which average to compute.
    band : optional
        Band label to select. When ``None`` nothing is selected and the
        function returns ``None``.

    Returns
    -------
    float or None
        The requested average of the fluxes whose band equals ``band``, or
        ``None`` when ``band`` is ``None``.

    Raises
    ------
    ValueError
        If ``method`` is neither "mean" nor "median". The original code fell
        through both branches and failed with an ``UnboundLocalError``.
    """
    # Identity check: `band != None` would broadcast if band were an array.
    if band is None:
        return None

    reducers = {"mean": np.mean, "median": np.median}
    if method not in reducers:
        raise ValueError(
            f"Unsupported method {method!r}; expected one of {sorted(reducers)}"
        )

    # Coerce to arrays so plain Python lists are masked element-wise too.
    fluxes = np.asarray(flux_array)
    in_band = np.asarray(band_array) == band
    return reducers[method](fluxes[in_band])

In [14]:
# Run my_flux_average over the ensemble: "mag" and "band" are the columns fed
# to the function, and the remaining keywords are collected here before being
# passed to ens.batch.
batch_options = {"meta": None, "method": "median", "band": "g"}
res = ens.batch(my_flux_average, "mag", "band", **batch_options)

# The batch result is lazy; compute() evaluates it.
res_computed = res.compute()

Using generated label, result_2, for a batch result.


In [15]:
# Display the computed batch result (a single "result" column indexed by _hipscat_index).
res_computed

Unnamed: 0_level_0,result
_hipscat_index,Unnamed: 1_level_1
3646116897268695040,18.736000
3646116901299421184,18.156498
3646116903954415616,18.641582
3646116907842535424,17.747654
3646116915115458560,18.484097
...,...
3647494549216952320,21.129456
3647494570431741952,20.318192
3647494571937497088,18.787664
3647494575540404224,18.749584
