In [None]:
import lsdb
import pyarrow.parquet as pq
from nested_pandas import NestedFrame
from nested_pandas.datasets.generation import generate_data

### Nested-pandas

Empty-list HTML representation:

In [None]:
# Before the fix, the empty list in row 1 was rendered as "+-1 rows" instead of None.
base = NestedFrame(
    data={"a": [1, 2], "b": [2, 4], "c": [[1, 2, 3], []]},
    index=[0, 1],
)
# Pack the list column "c" into a nested column called "nested".
base = base.nest_lists(name="nested", columns=["c"])
base

Unnamed: 0_level_0,a,b,nested
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,2.0,c  1  +2 rows
c,,,
1,,,
+2 rows,,,
1,2.0,4.0,

c
1
+2 rows


Display non-existing nested columns in error message:

In [None]:
# Previously we'd get KeyError: "['nested.c', 'nested.g']"
# The lookup below is expected to fail because "nested.g" does not exist.
# Catch the error and print it so the demonstration does not stop "Run All".
try:
    base[["a", "nested.c", "nested.g"]]
except KeyError as err:
    # str() of a KeyError is the repr of its message, so the quotes are kept.
    print(f"KeyError: {err}")

KeyError: "['nested.g'] not in index"

Append sub-columns via map_rows:

In [7]:
# Build a small seeded example frame. Per the output below it has base columns
# "a" and "b" plus a nested column "nested" with sub-columns "t", "flux", "band".
nf = generate_data(5,5, seed=1)

def example_func(row):
    """Subtract the row's "a" value from its "nested.t" values.

    Passed to map_rows, the result is returned as a sub-column of the
    existing 'nested' column.
    """
    nested_t = row["nested.t"]
    offset = row["a"]
    return nested_t - offset

# Before the fix, this created a new column named "nested" holding a "t_a" sub-column
# instead of appending "t_a" to the existing "nested" column.
nf.map_rows(
    example_func,
    columns=["a", "nested.t"],
    output_names=["nested.t_a"],
    append_columns=True,
)

Unnamed: 0_level_0,a,b,nested
t,flux,band,t_a
t,flux,band,t_a
t,flux,band,t_a
t,flux,band,t_a
t,flux,band,t_a
0,0.417022,0.184677,t  flux  band  t_a  8.38389  31.551563  r  7.966868  +4 rows  ...  ...  ...
t,flux,band,t_a
8.38389,31.551563,r,7.966868
+4 rows,...,...,...
1,0.720324,0.372520,t  flux  band  t_a  13.70439  68.650093  g  12.984066  +4 rows  ...  ...  ...
t,flux,band,t_a
13.70439,68.650093,g,12.984066
+4 rows,...,...,...
2,0.000114,0.691121,t  flux  band  t_a  4.089045  83.462567  g  4.088931  +4 rows  ...  ...  ...
t,flux,band,t_a

t,flux,band,t_a
8.38389,31.551563,r,7.966868
+4 rows,...,...,...

t,flux,band,t_a
13.70439,68.650093,g,12.984066
+4 rows,...,...,...

t,flux,band,t_a
4.089045,83.462567,g,4.088931
+4 rows,...,...,...

t,flux,band,t_a
17.562349,1.828828,g,17.260016
+4 rows,...,...,...

t,flux,band,t_a
0.547752,75.014431,g,0.400996
+4 rows,...,...,...


### Estimate catalog size (and pyarrow arrays)

Investigated how pyarrow arrays store data in memory:

|             Arrow Type            |          Data Buffer         |             Offsets Buffer             |    Validity Bitmap   |
|:---------------------------------:|:----------------------------:|:--------------------------------------:|:--------------------:|
| int32 / float32                   | 4 × n                        | None                                   | ceil(n / 8) if nulls |
| int64 / float64                   | 8 × n                        | None                                   | ceil(n / 8) if nulls |
| bool                              | 1 bit × n                    | None                                   | ceil(n / 8) if nulls |
| byte array (variable-length)      | sum(len_i for i in range(n)) | 4 × (n + 1)                            | ceil(n / 8) if nulls |
| byte array (fixed length)         | n_bytes × n                  | None                                   | ceil(n / 8) if nulls |

The data in parquet files is stored encoded and compressed, and the `total_uncompressed_size` provided in the metadata corresponds to the size of the data after decompression, but before decoding.

In [12]:
# Read the dataset-level `_metadata` file of the catalog from S3.
# Its row groups carry the per-column-chunk sizes and statistics inspected below.
meta = pq.read_metadata('s3://ipac-irsa-ztf/contributed/dr23/lc/hats/ztf_dr23_lc-hats/dataset/_metadata')

In [20]:
# Inspect the column-chunk metadata of the first column in the first row group.
meta.row_group(0).column(0)

<pyarrow._parquet.ColumnChunkMetaData object at 0x11d90a9d0>
  file_offset: 0
  file_path: Norder=4/Dir=0/Npix=1320/part0.snappy.parquet
  physical_type: INT64
  num_values: 98304
  path_in_schema: _healpix_29
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x11d90b420>
      has_min_max: True
      min: 1486192130406879076
      max: 1487313771029915543
      null_count: 0
      distinct_count: None
      num_values: 98304
      physical_type: INT64
      logical_type: None
      converted_type (legacy): NONE
  geo_statistics:
    None
  compression: SNAPPY
  encodings: ('PLAIN', 'RLE', 'RLE_DICTIONARY')
  has_dictionary_page: True
  dictionary_page_offset: 4
  data_page_offset: 786136
  total_compressed_size: 995315
  total_uncompressed_size: 995268

In [22]:
# expected = data type size * number of elements
# (8 bytes per INT64 value * num_values from the column-chunk metadata above)
98304 * 8

# total_uncompressed_size (995,268 bytes) is larger than this estimate
# ~1M vs ~800k bytes

786432

In [None]:
# Inspect column index 10 of the first row group
# (per the output, an element column of the nested light curve).
meta.row_group(0).column(10)

<pyarrow._parquet.ColumnChunkMetaData object at 0x3a11430b0>
  file_offset: 0
  file_path: Norder=4/Dir=0/Npix=1320/part0.snappy.parquet
  physical_type: DOUBLE
  num_values: 1664288
  path_in_schema: lightcurve.hmjd.list.element
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x3a11422f0>
      has_min_max: True
      min: 58384.44361
      max: 59951.23182
      null_count: 0
      distinct_count: None
      num_values: 1664288
      physical_type: DOUBLE
      logical_type: None
      converted_type (legacy): NONE
  geo_statistics:
    None
  compression: SNAPPY
  encodings: ('PLAIN', 'RLE', 'RLE_DICTIONARY')
  has_dictionary_page: True
  dictionary_page_offset: 3673096
  data_page_offset: 3695571
  total_compressed_size: 1508725
  total_uncompressed_size: 2701298

In [23]:
# expected = data type size * number of elements
# (8 bytes per DOUBLE value * num_values from the column-chunk metadata above)
1664288 * 8

# much larger than total_uncompressed_size (2,701,298 bytes)
# ~13.3M vs ~2.7M bytes

13314304

We could achieve a precise estimate of the data size in memory for fixed-width types, but not for variable-length byte arrays. For simplicity, we might just stick to the encoded size of each column (with no guarantees regarding the actual amount of memory used).

In [77]:
# Open the catalog from its root path on S3 and list its columns.
ztf = lsdb.open_catalog('s3://ipac-irsa-ztf/contributed/dr23/lc/hats')
ztf.columns

Index(['objectid', 'filterid', 'objra', 'objdec', 'lightcurve'], dtype='object')

In [72]:
# Report the selected columns/pixels and the upper-bound row count and
# MEMORY / DISK sizes for the whole catalog.
ztf.estimate_size()



You selected 5/13 columns.
You selected 9933/9933 pixels.
Expect up to 4,973,896,193 results (100.00% of the full catalog).
Expect up to 7.9 TiB in MEMORY.
Expect up to 7.2 TiB on DISK.


In [None]:
# Same estimate, restricted to two columns and a 5-arcsecond cone search.
(
    ztf[["objectid", "lightcurve"]]
    .cone_search(ra=150.025, dec=2.09, radius_arcsec=5)
    .estimate_size()
)



You selected 2/13 columns.
You selected 1/9933 pixels.
Expect up to 1,044,411 results (0.02% of the full catalog).
Expect up to 676.1 MiB in MEMORY.
Expect up to 624.8 MiB on DISK.


There are some fixes and improvements that still need to be implemented:

In [None]:
# TODO: "You selected 5/13 columns" should maybe say how many are base and how many are nested.
# TODO: check what is happening to rows when only a nested column is selected
#       (the row estimate below exceeds 100% of the full catalog).
# TODO: add a better warning stating that "MEMORY" values are not materialized memory values!
ztf[["lightcurve"]].estimate_size()



You selected 1/13 columns.
You selected 9933/9933 pixels.
Expect up to 741,514,887,202 results (14908.13% of the full catalog).
Expect up to 7.8 TiB in MEMORY.
Expect up to 7.1 TiB on DISK.
