# Import TESS light curves into a HATS catalog

## Install dependencies

In [1]:
%pip install -U lsdb hats-import

Note: you may need to restart the kernel to use updated packages.


## Create custom HATS-import reader class

In [2]:
import os
from upath import UPath

# Root S3 prefix that convert_uri prepends to every light-curve object path.
s3_root = "s3://stpubdata/tess/public/tid"

def slashfill(tic_str):
    """Break a 16-character string into four 4-character groups joined by "/".

    Example: ``"0000000123456789"`` becomes ``"0000/0001/2345/6789"``.
    An input whose length is not 16 fails the assertion.
    """
    assert len(tic_str) == 16
    groups = []
    for start in range(0, len(tic_str), 4):
        groups.append(tic_str[start:start + 4])
    return "/".join(groups)

def convert_uri(file_uri, return_id=True):
    """Translate a light-curve file URI into its location under ``s3_root``.

    Only the file name (last path component of ``file_uri``) is used. It is
    split on "-"; the second piece is taken as the sector string and the third
    as the 16-character TIC string. The resulting S3 path has the form
    ``<s3_root>/<sector>/<TIC in groups of four>/<file name>``.

    Parameters
    ----------
    file_uri : str
        URI or path whose last component is the light-curve file name.
    return_id : bool, default True
        When true, also return an identifier string.

    Returns
    -------
    UPath or tuple[UPath, str]
        The anonymous-access S3 path, and, when ``return_id`` is true, the
        identifier ``"tic<TIC>_<sector>"``.
    """
    filename = os.path.basename(file_uri)
    pieces = filename.split("-")
    sector_str, tic_str = pieces[1], pieces[2]

    object_key = "/".join([sector_str, slashfill(tic_str), filename])
    s3_uri = UPath(f"{s3_root}/{object_key}", anon=True)

    if not return_id:
        return s3_uri
    return s3_uri, f"tic{tic_str}_{sector_str}"

def get_uris(lines, return_id):
    """Convert the lines of a download script into S3 URIs.

    The first line is always skipped without being inspected. For every
    remaining line, the last whitespace-separated token is handed to
    ``convert_uri``. Blank or whitespace-only lines are ignored instead of
    raising ``IndexError``.

    Parameters
    ----------
    lines : sequence of str
        Lines of the script, e.g. the result of ``readlines()``.
    return_id : bool
        Forwarded to ``convert_uri``.

    Returns
    -------
    list
        One ``convert_uri`` result per non-blank line after the first.
    """
    uris = []

    for line in lines[1:]:
        tokens = line.split()
        if not tokens:
            # Nothing on this line to convert.
            continue
        uris.append(convert_uri(tokens[-1], return_id=return_id))

    return uris

In [3]:
from collections import defaultdict
from datetime import datetime

import numpy as np
import pyarrow as pa
from astropy.io import fits
from astropy.table import Table
from hats_import.catalog.file_readers import InputReader
from upath import UPath


class TESSLCReader(InputReader):
    """Reader that turns a download script into pyarrow tables of light curves.

    ``read`` receives the path of a script file, converts its lines into S3
    URIs with ``get_uris``, and yields one ``pyarrow.Table`` per chunk of URIs.
    Each row of a full table corresponds to one FITS file and holds selected
    header keywords, the light-curve table of HDU 1, and the aperture array of
    HDU 2.
    """

    # Keywords read from the header of HDU 0, with the Arrow type of each column.
    header0_columns = {
        "RA_OBJ": pa.float64(),
        "DEC_OBJ": pa.float64(),
        "TSTART": pa.float32(),
        "TSTOP": pa.float32(),
        "DATE-OBS": pa.timestamp("ms", tz="utc"),
        "DATE-END": pa.timestamp("ms", tz="utc"),
        "TICID": pa.int64(),
        "SECTOR": pa.int32(),
        "CAMERA": pa.int8(),
        "CCD": pa.int8(),
        "PMRA": pa.float32(),
        "PMDEC": pa.float32(),
        "PMTOTAL": pa.float32(),
        "TEFF": pa.float32(),
        "LOGG": pa.float32(),
        "MH": pa.float32(),
        "RADIUS": pa.float32(),
    }

    # Keywords read from the header of HDU 1, with the Arrow type of each column.
    header1_columns = {
        "EXPOSURE": pa.float32(),
        "TELAPSE": pa.float32(),
        "DEADC": pa.float32(),
        "TIMEPIXR": pa.float32(),
        "TIERRELA": pa.float32(),
        "INT_TIME": pa.float32(),
        "READTIME": pa.float32(),
        "FRAMETIM": pa.float32(),
        "NUM_FRM": pa.int32(),
        "TIMEDEL": pa.float32(),
        "GAINA": pa.float32(),
        "GAINB": pa.float32(),
        "GAINC": pa.float32(),
        "GAIND": pa.float32(),
        "READNOIA": pa.float32(),
        "READNOIB": pa.float32(),
        "READNOIC": pa.float32(),
        "READNOID": pa.float32(),
        "NREADOUT": pa.int32(),
        "CDPP0_5": pa.float32(),
        "CDPP1_0": pa.float32(),
        "CDPP2_0": pa.float32(),
        "CROWDSAP": pa.float32(),
        "FLFRCSAP": pa.float32(),
        "PDCVAR": pa.float32(),
        "PR_GOOD1": pa.float32(),
        "PR_WGHT1": pa.float32(),
        "PR_GOOD2": pa.float32(),
        "PR_WGHT2": pa.float32(),
        "PR_GOOD3": pa.float32(),
        "PR_WGHT3": pa.float32(),
        "PDC_TOT": pa.float32(),
        "PDC_TOTP": pa.float32(),
        "PDC_COR": pa.float32(),
        "PDC_CORP": pa.float32(),
        "PDC_VAR": pa.float32(),
        "PDC_VARP": pa.float32(),
        "PDC_NOI": pa.float32(),
        "PDC_NOIP": pa.float32(),
        "PDC_EPT": pa.float32(),
        "PDC_EPTP": pa.float32(),
    }

    # Keywords read from the header of HDU 2, with the Arrow type of each column.
    header2_columns = {
        "NPIXSAP": pa.int32(),
        "NPIXMISS": pa.int32(),
    }

    # Number of URIs kept per script when ``few_rows_per_sector`` is enabled.
    _FEW_ROWS_LIMIT = 123

    def __init__(self, chunksize: int = 100, few_rows_per_sector: bool = False):
        """Configure the reader.

        Parameters
        ----------
        chunksize : int, default 100
            Target number of files per yielded table; used to derive the
            number of chunks a script's URI list is split into.
        few_rows_per_sector : bool, default False
            When true, only the first 123 URIs of each script are read.

        Raises
        ------
        ValueError
            If ``chunksize`` is smaller than 1.
        """
        super().__init__()
        if chunksize < 1:
            raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")
        self.chunksize = chunksize
        self.few_rows_per_sector = few_rows_per_sector

    @staticmethod
    def fits_to_hats_colname(name: str) -> str:
        """Lower-case a FITS keyword/field name and replace "-" with "_"."""
        return name.lower().replace("-", "_")

    def read(self, input_file: str, read_columns=None):
        """Yield pyarrow tables for the files listed in a download script.

        Parameters
        ----------
        input_file : str
            Path of the script whose lines are converted to URIs.
        read_columns : sequence of str, optional
            When ``None``, full tables are yielded. Otherwise the value is
            used as the names of the two coordinate columns and only
            (RA, Dec) tables are yielded.

        Yields
        ------
        pyarrow.Table
            One table per chunk of URIs. Nothing is yielded when the script
            lists no URIs.
        """
        uris = self.get_uris_from_sh(input_file)

        if self.few_rows_per_sector:
            uris = uris[: self._FEW_ROWS_LIMIT]

        if len(uris) == 0:
            # np.array_split raises ValueError when asked for zero sections.
            return

        n_chunks = int(np.ceil(len(uris) / self.chunksize))

        for chunk in np.array_split(uris, n_chunks):
            if read_columns is None:
                yield self.get_whole_table(chunk)
            else:
                # Just ra and dec are needed
                yield self.get_ra_dec_table(chunk, read_columns)

    def get_ra_dec_table(self, uris, columns):
        """Build a two-column table of RA_OBJ / DEC_OBJ header values.

        ``columns`` supplies the output column names in (RA, Dec) order and
        must contain exactly two entries; ``zip(..., strict=True)`` raises
        ``ValueError`` otherwise.
        """
        ra_, dec_ = [], []
        for upath in uris:
            ra, dec = self.get_radec_from_path(upath)
            ra_.append(ra)
            dec_.append(dec)
        return pa.table(dict(zip(columns, [ra_, dec_], strict=True)))

    def get_whole_table(self, uris):
        """Read every file in ``uris`` and return one table row per file.

        Columns are the configured header keywords of HDUs 0, 1 and 2, plus
        ``lightcurve``, ``aperture``, ``aperture_size_x`` and
        ``aperture_size_y``.
        """
        data = defaultdict(list)
        for path in uris:
            with path.open('rb') as fh, fits.open(fh) as hdul:
                # Adding values from headers
                self.add_header_values(data, hdul[0].header, self.header0_columns)
                self.add_header_values(data, hdul[1].header, self.header1_columns)
                self.add_header_values(data, hdul[2].header, self.header2_columns)

                # Adding light curve
                data['lightcurve'].append(self.fits_table_to_pa_scalar(hdul[1].data))

                # Adding aperture and its shape
                ap, ap_x, ap_y = self.parse_fist_aperture(hdul[2].data)
                data['aperture'].append(ap)
                data['aperture_size_x'].append(ap_x)
                data['aperture_size_y'].append(ap_y)
        return pa.table(data)

    def add_header_values(self, data, header, columns):
        """Append one typed scalar per configured keyword to ``data``.

        Parameters
        ----------
        data : mapping of str to list
            Accumulator keyed by HATS column name; mutated in place.
        header : mapping-like
            FITS header; values are fetched with ``header.get``.
        columns : dict
            Maps FITS keyword to the Arrow type of the output column.

        A keyword that is absent from the header, or a floating-point value
        that is NaN, is stored as a null scalar of the column's type.
        """
        for fits_colname, ty in columns.items():
            hats_colname = self.fits_to_hats_colname(fits_colname)
            value = header.get(fits_colname)
            if value is not None:
                if pa.types.is_timestamp(ty):
                    # Header timestamps are ISO-8601 strings.
                    value = datetime.fromisoformat(value)
                elif pa.types.is_floating(ty) and np.isnan(value):
                    value = None
            data[hats_colname].append(pa.scalar(value, type=ty))

    @staticmethod
    def fits_table_to_pa_scalar(arr):
        """Convert a structured array into a single pyarrow scalar.

        Each field becomes an entry named with ``fits_to_hats_colname`` whose
        value is the field's data converted to native byte order. The scalar
        type is inferred by pyarrow from the resulting dict.
        """
        data = {}
        for fits_field, (fits_dtype, _) in arr.dtype.fields.items():
            hats_field = TESSLCReader.fits_to_hats_colname(fits_field)
            # Swap bytes to "native" order
            hats_dtype = fits_dtype.newbyteorder('=')
            data[hats_field] = np.asarray(arr[fits_field], dtype=hats_dtype)
        return pa.scalar(data)

    def get_radec_from_path(self, path):
        """Return the (RA_OBJ, DEC_OBJ) header values of HDU 0 of ``path``."""
        with path.open('rb') as fh:
            header = fits.getheader(fh, 0)
        return header.get('RA_OBJ'), header.get('DEC_OBJ')

    def get_uris_from_sh(self, sh_file):
        """Read a script file and return its URIs (without identifiers)."""
        with open(sh_file) as fh:
            return get_uris(fh.readlines(), return_id=False)

    @staticmethod
    def parse_fist_aperture(data):
        """Flatten a 2-D aperture array and report its shape.

        The method name is kept as-is so existing callers continue to work.

        Returns
        -------
        tuple
            ``(flattened array scalar, shape[1] as int16, shape[0] as int16)``,
            i.e. the values stored as aperture, x size and y size.
        """
        dtype = data.dtype.newbyteorder("=")
        flat_array = np.asarray(data.flatten(), dtype=dtype)
        return (
            pa.scalar(flat_array),
            pa.scalar(data.shape[1], type=pa.int16()),
            pa.scalar(data.shape[0], type=pa.int16()),
        )

  from .autonotebook import tqdm as notebook_tqdm


## Download all "curl" scripts, one per sector

In [4]:
# Local directory that holds the per-sector download scripts.
SH_ROOT = "./sh_files"
# Sector numbers to process: 1 through 96 inclusive.
SECTORS = [*range(1, 97)]

In [5]:
import os

from tqdm.auto import tqdm
from upath import UPath


# Remote location of the per-sector download scripts; download_sh_files
# appends each script's file name to this path.
curl_root = UPath("https://archive.stsci.edu/missions/tess/download_scripts/sector")


def download_sh_files():
    """Download the light-curve script of every sector in ``SECTORS``.

    Files are named ``tesscurl_sector_<sector>_lc.sh``, fetched from
    ``curl_root`` and written into ``SH_ROOT``, which is created if missing.
    Existing local files are overwritten.
    """
    sh_root_path = UPath(SH_ROOT)
    sh_root_path.mkdir(exist_ok=True, parents=True)

    for sector in tqdm(SECTORS):
        fname = f"tesscurl_sector_{sector}_lc.sh"
        # Fetch before touching the local file: if the download fails, the
        # exception is raised without leaving an empty or truncated script
        # behind for later cells to pick up.
        payload = (curl_root / fname).read_bytes()
        (sh_root_path / fname).write_bytes(payload)


# Fetch the scripts now so that later cells can list them from SH_ROOT.
download_sh_files()

100%|██████████| 96/96 [00:06<00:00, 15.18it/s]


In [6]:
# Paths of every "*.sh" file directly under SH_ROOT, in sorted order.
sh_paths = [script.path for script in UPath(SH_ROOT).glob("*.sh")]
sh_paths.sort()

## Run the import pipeline

In [7]:
# Column names passed to the import as the 'hats_cols_default' property.
default_columns = [
    "ra_obj",
    "dec_obj",
    "ticid",
    "sector",
    # TODO: replace the whole "lightcurve" column with these sub-columns:
    #   "lightcurve.time",
    #   "lightcurve.pdcsap_flux",
    #   "lightcurve.pdcsap_flux_err",
    #   "lightcurve.quality",
    "lightcurve",
]

In [8]:
from hats_import import CollectionArguments

# Arguments for the whole import: the catalog itself, one margin, and an
# index on `ticid`.
# NOTE(review): the settings marked TODO below restrict the run to a subset of
# the data; revisit them before a full import.
args = (
    CollectionArguments(
        output_artifact_name="tess-lightcurve",
        output_path="hats",
    )
    .catalog(
        # TODO: use all sectors; `[::5]` keeps only every fifth script path.
        input_file_list=sh_paths[::5],
        # TODO: remove few_rows_per_sector=True; it makes the reader keep only
        # the first 123 URIs of each script.
        file_reader=TESSLCReader(chunksize=100, few_rows_per_sector=True),
        ra_column="ra_obj",
        dec_column="dec_obj",
        sort_columns="ticid",
        highest_healpix_order=7,
        pixel_threshold=200,
        addl_hats_properties={
            'hats_cols_default': default_columns,
        },
    )
    # TODO: uncomment the additional margin on the next line.
    # .add_margin(margin_threshold=10.0, is_default=True)
    .add_margin(margin_threshold=60.0)
    .add_index(indexing_column="ticid", include_healpix_29=True)
)

In [9]:
from dask.distributed import Client
from hats_import.pipeline import pipeline_with_client, pipeline


# Start a Dask client with 8 single-threaded workers, display its summary,
# and run the import described by `args` on it.
with Client(n_workers=8, threads_per_worker=1) as client:
    display(client)
    pipeline_with_client(args, client)

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /jupyter/user/malanchev/proxy/8787/status,

0,1
Dashboard: /jupyter/user/malanchev/proxy/8787/status,Workers: 8
Total threads: 8,Total memory: 61.77 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40009,Workers: 8
Dashboard: /jupyter/user/malanchev/proxy/8787/status,Total threads: 8
Started: Just now,Total memory: 61.77 GiB

0,1
Comm: tcp://127.0.0.1:43613,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/46747/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:39057,
Local directory: /tmp/dask-scratch-space/worker-vvfwtteo,Local directory: /tmp/dask-scratch-space/worker-vvfwtteo

0,1
Comm: tcp://127.0.0.1:45473,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/42181/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:37463,
Local directory: /tmp/dask-scratch-space/worker-3mj4jrl7,Local directory: /tmp/dask-scratch-space/worker-3mj4jrl7

0,1
Comm: tcp://127.0.0.1:41557,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/39427/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:43759,
Local directory: /tmp/dask-scratch-space/worker-7et4t9z7,Local directory: /tmp/dask-scratch-space/worker-7et4t9z7

0,1
Comm: tcp://127.0.0.1:45763,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/42321/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:40561,
Local directory: /tmp/dask-scratch-space/worker-2604wg2f,Local directory: /tmp/dask-scratch-space/worker-2604wg2f

0,1
Comm: tcp://127.0.0.1:44453,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/44363/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:45751,
Local directory: /tmp/dask-scratch-space/worker-tyj823ev,Local directory: /tmp/dask-scratch-space/worker-tyj823ev

0,1
Comm: tcp://127.0.0.1:41601,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/37381/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:41503,
Local directory: /tmp/dask-scratch-space/worker-888h1_v1,Local directory: /tmp/dask-scratch-space/worker-888h1_v1

0,1
Comm: tcp://127.0.0.1:39017,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/35553/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:40449,
Local directory: /tmp/dask-scratch-space/worker-o7qix4nn,Local directory: /tmp/dask-scratch-space/worker-o7qix4nn

0,1
Comm: tcp://127.0.0.1:33515,Total threads: 1
Dashboard: /jupyter/user/malanchev/proxy/34117/status,Memory: 7.72 GiB
Nanny: tcp://127.0.0.1:43827,
Local directory: /tmp/dask-scratch-space/worker-9o5qbtkt,Local directory: /tmp/dask-scratch-space/worker-9o5qbtkt


Planning  : 100%|██████████| 4/4 [00:00<00:00, 902.00it/s]


tmp_path (hats/tess-lightcurve/intermediate/tess-lightcurve/intermediate) contains intermediate files; resuming prior progress.


Binning   : 100%|██████████| 2/2 [00:00<00:00,  4.25it/s]
Finishing : 100%|██████████| 6/6 [00:00<00:00, 61.49it/s]
Planning  : 100%|██████████| 3/3 [00:05<00:00,  1.99s/it]
Mapping   : 100%|██████████| 35/35 [00:07<00:00,  4.77it/s]
Binning   :   0%|          | 0/1 [00:00<?, ?it/s]
Reducing  : 100%|██████████| 51/51 [00:00<00:00, 291.33it/s]
Finishing : 100%|██████████| 4/4 [00:00<00:00, 109.92it/s]
Finishing : 100%|██████████| 3/3 [00:00<00:00, 478.07it/s]
Finishing : 100%|██████████| 2/2 [00:00<00:00, 313.35it/s]


In [12]:
import lsdb

# Open the imported catalog (output_path "hats", artifact "tess-lightcurve")
# and preview its first rows.
lsdb.open_catalog('hats/tess-lightcurve').head()

Unnamed: 0_level_0,ra_obj,dec_obj,ticid,sector,lightcurve
cadenceno,mom_centr1,...,time,timecorr,Unnamed: 5_level_1
cadenceno,mom_centr1,...,time,timecorr,Unnamed: 5_level_2
cadenceno,mom_centr1,...,time,timecorr,Unnamed: 5_level_3
cadenceno,mom_centr1,...,time,timecorr,Unnamed: 5_level_4
cadenceno,mom_centr1,...,time,timecorr,Unnamed: 5_level_5
214294927578924518,8.906893,60.295848,283929457,24,cadenceno  mom_centr1  ...  time  timecorr  524401  NaN  ...  1955.786298  -0.003609  +19073 rows  ...  ...  ...  ...
cadenceno,mom_centr1,...,time,timecorr,
524401,,...,1955.786298,-0.003609,
+19073 rows,...,...,...,...,
263564873807431839,9.449432,65.186326,284307346,24,cadenceno  mom_centr1  ...  time  timecorr  524401  NaN  ...  1955.786671  -0.003236  +19073 rows  ...  ...  ...  ...
cadenceno,mom_centr1,...,time,timecorr,
524401,,...,1955.786671,-0.003236,
+19073 rows,...,...,...,...,
264189314054313064,2.238490,66.456659,461342461,24,cadenceno  mom_centr1  ...  time  timecorr  524401  NaN  ...  1955.786864  -0.003044  +19073 rows  ...  ...  ...  ...
cadenceno,mom_centr1,...,time,timecorr,

cadenceno,mom_centr1,...,time,timecorr
524401,,...,1955.786298,-0.003609
+19073 rows,...,...,...,...

cadenceno,mom_centr1,...,time,timecorr
524401,,...,1955.786671,-0.003236
+19073 rows,...,...,...,...

cadenceno,mom_centr1,...,time,timecorr
524401,,...,1955.786864,-0.003044
+19073 rows,...,...,...,...

cadenceno,mom_centr1,...,time,timecorr
524401,,...,1955.786917,-0.002991
+19073 rows,...,...,...,...

cadenceno,mom_centr1,...,time,timecorr
524401,,...,1955.787273,-0.002635
+19073 rows,...,...,...,...


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import nested_pandas as npd

# Load only the `lightcurve.time` column from one parquet file of the
# imported dataset.
npd.read_parquet('hats/tess-lightcurve/tess-lightcurve/dataset/Norder=0/Dir=0/Npix=0.parquet', columns=['lightcurve.time'])