# 2MASS

This notebook demonstrates the importing process for 2MASS.

In [2]:
import glob
import lsdb
import numpy as np
import numpy.testing as npt
import pandas as pd
from dask.distributed import Client
from hats_import.pipeline import pipeline_with_client
from hats_import.catalog.arguments import ImportArguments
from hats_import.catalog.file_readers import CsvReader

Load all raw point source file paths:

In [3]:
# Collect every gzipped point-source file, sorted so the input order is deterministic.
in_file_paths = sorted(glob.glob("/epyc/data3/hipscat/raw/two_mass/psc_**.gz"))
print(f"{len(in_file_paths)} files")

92 files


Load the column names and types from a side file (inferred from the official [schema](https://irsa.ipac.caltech.edu/2MASS/download/allsky/twomass_psc_schema)):

In [4]:
# Read the schema side file and build a column-name -> dtype lookup
# (passed to the CSV reader as `type_map` further down).
type_frame = pd.read_csv("/epyc/data3/hipscat/raw/two_mass/schema.csv")
type_map = {
    column_name: column_type
    for column_name, column_type in zip(type_frame["name"], type_frame["type"])
}
type_map

{'ra': 'float64',
 'decl': 'float64',
 'err_maj': 'Float32',
 'err_min': 'Float32',
 'err_ang': 'Int16',
 'designation': 'str',
 'j_m': 'Float32',
 'j_cmsig': 'Float32',
 'j_msigcom': 'Float32',
 'j_snr': 'Float32',
 'h_m': 'Float32',
 'h_cmsig': 'Float32',
 'h_msigcom': 'Float32',
 'h_snr': 'Float32',
 'k_m': 'Float32',
 'k_cmsig': 'Float32',
 'k_msigcom': 'Float32',
 'k_snr': 'Float32',
 'ph_qual': 'str',
 'rd_flg': 'str',
 'bl_flg': 'str',
 'cc_flg': 'str',
 'ndet': 'str',
 'prox': 'Float32',
 'pxpa': 'Int16',
 'pxcntr': 'Int64',
 'gal_contam': 'Int16',
 'mp_flg': 'Int16',
 'pts_key': 'Int64',
 'hemis': 'str',
 'date': 'str',
 'scan': 'Int16',
 'glon': 'Float32',
 'glat': 'Float32',
 'x_scan': 'Float32',
 'jdate': 'Float64',
 'j_psfchi': 'Float32',
 'h_psfchi': 'Float32',
 'k_psfchi': 'Float32',
 'j_m_stdap': 'Float32',
 'j_msig_stdap': 'Float32',
 'h_m_stdap': 'Float32',
 'h_msig_stdap': 'Float32',
 'k_m_stdap': 'Float64',
 'k_msig_stdap': 'Float64',
 'dist_edge_ns': 'Int64',
 'dis

In [5]:
# Reader for the raw files: no header row, pipe-delimited, gzip-compressed,
# with the literal two-character sequence \N treated as a missing value.
two_mass_reader = CsvReader(
    header=None,
    delimiter="|",
    column_names=type_frame["name"].tolist(),
    type_map=type_map,
    compression="gzip",
    na_values="\\N",
)

# Import configuration: where to read from, where to write, and which
# columns hold the sky coordinates.
args = ImportArguments(
    output_artifact_name="two_mass",
    input_file_list=in_file_paths,
    file_reader=two_mass_reader,
    output_path="/epyc/data3/hats/catalogs",
    dask_tmp="/epyc/data3/hats/tmp",
    ra_column="ra",
    dec_column="decl",
    resume=False,
)

In [6]:
# Run the import on a local Dask cluster (32 workers, 8 GiB memory limit per worker).
# The `with` block closes the client once the pipeline call returns.
with Client(n_workers=32, memory_limit="8GiB") as client:
    pipeline_with_client(args,client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39929 instead
Planning  : 100%|██████████| 4/4 [00:00<00:00, 57.25it/s]
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:
  with pd.read_csv(csv_file, chunksize=chunksize, **kwargs) as reader:


Let's see what the schema looks like:

In [9]:
from pyarrow.parquet import read_metadata
# Read the catalog-level parquet `_metadata` file and display the schema it records.
meta = read_metadata("/epyc/data3/hats/catalogs/two_mass/dataset/_metadata")
meta.schema

<pyarrow._parquet.ParquetSchema object at 0x7f1f1411f400>
required group field_id=-1 schema {
  optional double field_id=-1 ra;
  optional double field_id=-1 decl;
  optional float field_id=-1 err_maj;
  optional float field_id=-1 err_min;
  optional int32 field_id=-1 err_ang (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 designation (String);
  optional float field_id=-1 j_m;
  optional float field_id=-1 j_cmsig;
  optional float field_id=-1 j_msigcom;
  optional float field_id=-1 j_snr;
  optional float field_id=-1 h_m;
  optional float field_id=-1 h_cmsig;
  optional float field_id=-1 h_msigcom;
  optional float field_id=-1 h_snr;
  optional float field_id=-1 k_m;
  optional float field_id=-1 k_cmsig;
  optional float field_id=-1 k_msigcom;
  optional float field_id=-1 k_snr;
  optional binary field_id=-1 ph_qual (String);
  optional binary field_id=-1 rd_flg (String);
  optional binary field_id=-1 bl_flg (String);
  optional binary field_id=-1 cc_flg (String);
  o

### Verification checks

IPAC provides a [query](https://irsa.ipac.caltech.edu/2MASS/download/allsky/verification_query_psc.html) for data verification. Let's use it on our newly imported catalog and make sure columns sum up to the expected amounts.

In [11]:
# Open the freshly imported catalog with LSDB and display its summary.
two_mass = lsdb.read_hats("/epyc/data3/hats/catalogs/two_mass")
two_mass

Unnamed: 0_level_0,ra,decl,err_maj,err_min,err_ang,designation,j_m,j_cmsig,j_msigcom,j_snr,h_m,h_cmsig,h_msigcom,h_snr,k_m,k_cmsig,k_msigcom,k_snr,ph_qual,rd_flg,bl_flg,cc_flg,ndet,prox,pxpa,pxcntr,gal_contam,mp_flg,pts_key,hemis,date,scan,glon,glat,x_scan,jdate,j_psfchi,h_psfchi,k_psfchi,j_m_stdap,j_msig_stdap,h_m_stdap,h_msig_stdap,k_m_stdap,k_msig_stdap,dist_edge_ns,dist_edge_ew,dist_edge_flg,dup_src,use_src,a,dist_opt,phi_opt,b_m_opt,vr_m_opt,nopt_mchs,ext_key,scan_key,coadd_key,coadd,Norder,Dir,Npix
npartitions=1107,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1
0,double[pyarrow],double[pyarrow],float[pyarrow],float[pyarrow],int16[pyarrow],string[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],string[pyarrow],string[pyarrow],string[pyarrow],string[pyarrow],string[pyarrow],float[pyarrow],int16[pyarrow],int64[pyarrow],int16[pyarrow],int16[pyarrow],int64[pyarrow],string[pyarrow],string[pyarrow],int16[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],double[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],float[pyarrow],double[pyarrow],double[pyarrow],int64[pyarrow],int64[pyarrow],string[pyarrow],int16[pyarrow],int16[pyarrow],string[pyarrow],float[pyarrow],int16[pyarrow],float[pyarrow],float[pyarrow],int16[pyarrow],int64[pyarrow],int64[pyarrow],int64[pyarrow],int16[pyarrow],uint8[pyarrow],uint64[pyarrow],uint64[pyarrow]
18014398509481984,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3454260914193170432,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458764513820540928,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [59]:
# Preview the first ten rows of the imported catalog.
two_mass.head(10)

Unnamed: 0_level_0,ra,decl,err_maj,err_min,err_ang,designation,j_m,j_cmsig,j_msigcom,j_snr,...,b_m_opt,vr_m_opt,nopt_mchs,ext_key,scan_key,coadd_key,coadd,Norder,Dir,Npix
_healpix_29,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2943408919,44.996055,0.005565,0.17,0.16,76,02595905+0000200,16.375999,0.097,0.097,11.3,...,17.9,16.9,1,,69347,1594959,9,2,0,0
29153088558,45.004857,0.019806,0.06,0.06,90,03000116+0001113,12.529,0.021,0.024,389.799988,...,15.0,13.8,1,,69347,1594959,9,2,0,0
29639235405,45.004193,0.020956,0.06,0.06,90,03000100+0001154,14.845,0.055,0.056,46.200001,...,,,0,,69347,1594959,9,2,0,0
162212337124,44.995074,0.038204,0.38,0.31,0,02595881+0002175,16.746,0.133,0.134,8.0,...,,,0,,69347,1594959,9,2,0,0
187874191277,44.963851,0.043587,0.22,0.17,95,02595132+0002369,16.476,0.112,0.113,10.3,...,18.700001,17.700001,1,,69347,1594959,9,2,0,0
282956647929,45.048281,0.048329,0.18,0.07,0,03001158+0002539,13.354,0.022,0.025,153.399994,...,15.5,14.9,1,,61142,1406266,267,2,0,0
425727676968,45.023564,0.068472,0.18,0.07,0,03000565+0004064,14.729,0.036,0.038,43.200001,...,17.4,16.1,1,,61142,1406266,267,2,0,0
643736796518,44.993307,0.076423,0.07,0.06,4,02595839+0004351,14.921,0.034,0.036,43.099998,...,16.5,16.1,1,,69347,1594959,9,2,0,0
681422857682,44.969132,0.084458,0.13,0.12,88,02595259+0005040,16.086,0.085,0.086,14.7,...,18.299999,17.5,1,,69347,1594959,9,2,0,0
687145892547,44.978474,0.092522,0.25,0.2,179,02595483+0005330,16.504999,0.122,0.122,10.0,...,18.9,18.299999,1,,69347,1594959,9,2,0,0


In [54]:
# Expected per-column totals for the verification check.
# Entries are listed in the same order in which `compute_sums` sums the columns.
desired = [
    306810325437475788,  # pts_key
    306815556538478936,  # pxcntr
    16902776758555,  # scan_key
    32666066948,  # scan
    2048692118201,  # ext_key
    388758631396659,  # coadd_key
    64617139213,  # coadd
    16048,  # mp_flg
    729878,  # gal_contam
    464456155,  # use_src
    79798372,  # dup_src
    369187043,  # nopt_mchs
    64916239773,  # phi_opt
    69388217174,  # dist_edge_ew
    2670725813652,  # dist_edge_ns
    29279563815  # err_ang
]

In [110]:
def compute_sums(df):
    """Sum the verification columns of one frame.

    Args:
        df: A DataFrame containing all of the columns listed below.

    Returns:
        A single-column DataFrame with one row per verification column,
        in the order given by ``verification_columns``.
    """
    verification_columns = (
        "pts_key",
        "pxcntr",
        "scan_key",
        "scan",
        "ext_key",
        "coadd_key",
        "coadd",
        "mp_flg",
        "gal_contam",
        "use_src",
        "dup_src",
        "nopt_mchs",
        "phi_opt",
        "dist_edge_ew",
        "dist_edge_ns",
        "err_ang",
    )
    column_totals = [df[column].sum() for column in verification_columns]
    return pd.DataFrame(column_totals)

# Build the per-partition sums; `compute_sums` is applied to each partition's frame.
results = two_mass.map_partitions(compute_sums)

# Evaluate on a local Dask cluster (64 workers, 4 GiB memory limit per worker).
with Client(n_workers=64, memory_limit="4GiB") as client:
    sums = results.compute()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39526 instead


In [111]:
# `sums` stacks one equally sized block of column totals per partition.
# Reshape the underlying array to (n_partitions, n_columns) and add the
# partitions together. Working on the array rather than calling
# np.array_split on the DataFrame avoids the deprecated DataFrame.swapaxes
# path (which emits a FutureWarning), and reshape raises a ValueError if the
# row count is not an exact multiple of the partition count.
n_partitions = len(two_mass.get_healpix_pixels())
splits = sums.to_numpy().reshape(n_partitions, -1)
total_sums = splits.sum(axis=0)
total_sums

  return bound(*args, **kwds)


array([306810325437475788, 306815556538478936,     16902776758555,
              32666066948,      2048692118201,    388758631396659,
              64617139213,              16048,             729878,
                464456155,           79798372,          369187043,
              64916239773,        69388217174,      2670725813652,
              29279563815])

In [112]:
# Raises AssertionError if any computed column total differs from its expected value.
npt.assert_array_equal(total_sums, desired)