# `pandas-ts`

**Author**: Konstantin Malanchev

- Pack flat source table into a pandas series of light-curves
- If source table is sorted by ID, no copy is needed
- Getting flat representation back is cheap
- Packed series is a `pyarrow` struct-array
- Each field of the struct is `pyarrow` list-array, lists are forced to have the same length
- From the user's perspective, each item is a nested pandas dataframe

In [1]:
! pip install git+https://github.com/lincc-frameworks/pandas-ts@656d3f53131b2870c05beea42709fe439f52f824

Collecting git+https://github.com/lincc-frameworks/pandas-ts@656d3f53131b2870c05beea42709fe439f52f824
  Cloning https://github.com/lincc-frameworks/pandas-ts (to revision 656d3f53131b2870c05beea42709fe439f52f824) to /private/var/folders/w1/lh3h4s7d5g10rdlfj4h0mshw0000gn/T/pip-req-build-ss_5j7se
  Running command git clone --filter=blob:none --quiet https://github.com/lincc-frameworks/pandas-ts /private/var/folders/w1/lh3h4s7d5g10rdlfj4h0mshw0000gn/T/pip-req-build-ss_5j7se
  Running command git rev-parse -q --verify 'sha^656d3f53131b2870c05beea42709fe439f52f824'
  Running command git fetch -q https://github.com/lincc-frameworks/pandas-ts 656d3f53131b2870c05beea42709fe439f52f824
  Resolved https://github.com/lincc-frameworks/pandas-ts to commit 656d3f53131b2870c05beea42709fe439f52f824
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) .

# Create a source table and pack it into nested structures and lists

In [2]:
import numpy as np
import pandas as pd
from pandas_ts.packer import pack_flat, pack_lists


# Adapted from
# https://github.com/lincc-frameworks/tape/blob/6a694c4c138aadb1508c2a96de4fa63f90319331/tests/tape_tests/conftest.py#L15
def create_test_rows(num_points=1000):
    """Build a synthetic flat table of light-curve observations.

    Parameters
    ----------
    num_points : int, default 1000
        Total number of observation rows to generate.

    Returns
    -------
    dict of str to numpy.ndarray
        Column name -> array, every array of length ``num_points``:

        - ``"id"``: object ID cycling through 8000..8004
        - ``"time"``: 0, 1, ..., num_points - 1
        - ``"flux"``: cycles through 0..3
        - ``"band"``: "g", "r", "i", "z" in contiguous runs
        - ``"err"``: 0.0, 0.1, ..., 0.9 repeated
        - ``"count"``: row counter 0, 1, ..., num_points - 1
    """
    all_bands = np.array(["g", "r", "i", "z"])
    row_index = np.arange(num_points)

    # np.repeat expects an integer repeat count, so use integer (ceiling)
    # division instead of true division, then trim to num_points so the column
    # length matches even when num_points is not a multiple of the band count.
    per_band = -(-num_points // len(all_bands))
    band = np.repeat(all_bands, per_band)[:num_points]

    rows = {
        "id": 8000 + (row_index % 5),
        "time": row_index,
        "flux": row_index % len(all_bands),
        "band": band,
        "err": 0.1 * (row_index % 10),
        # Separate copy so "time" and "count" never share a buffer
        "count": row_index.copy(),
        # A column of None values is deliberately left out for now
        # (original author's note: not sure None handling is ready):
        # "something_else": np.full(num_points, None),
    }

    return rows


# Build the flat source table and index it by object ID in one expression
sources = pd.DataFrame(create_test_rows()).set_index("id")
sources

Unnamed: 0_level_0,time,flux,band,err,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8000,0,0,g,0.0,0
8001,1,1,g,0.1,1
8002,2,2,g,0.2,2
8003,3,3,g,0.3,3
8004,4,0,g,0.4,4
...,...,...,...,...,...
8000,995,3,z,0.5,995
8001,996,0,z,0.6,996
8002,997,1,z,0.7,997
8003,998,2,z,0.8,998


In [3]:
# Pack the flat, ID-indexed table into one nested item per object ID;
# the resulting series is named "sources"
packed = pack_flat(sources, name="sources")
packed

8000         time  flux band  err  count
0       0    ...
8001         time  flux band  err  count
0       1    ...
8002         time  flux band  err  count
0       2    ...
8003         time  flux band  err  count
0       3    ...
8004         time  flux band  err  count
0       4    ...
Name: sources, dtype: ts<time: [int64], flux: [int64], band: [string], err: [double], count: [int64]>

### Single item of the packed series is returned as a new DataFrame

In [4]:
# Positional access: the first item (object 8000 here) comes back as a DataFrame
packed.iloc[0]

Unnamed: 0,time,flux,band,err,count
0,0,0,g,0.0,0
1,5,1,g,0.5,5
2,10,2,g,0.0,10
3,15,3,g,0.5,15
4,20,0,g,0.0,20
...,...,...,...,...,...
195,975,3,z,0.5,975
196,980,0,z,0.0,980
197,985,1,z,0.5,985
198,990,2,z,0.0,990


In [5]:
# Get the linearly interpolated flux for time=10.
# The lambda receives each object's light curve as a DataFrame, so the result
# is one float per object ID.
packed.apply(lambda df: np.interp(10.0, df["time"], df["flux"]))

8000    2.0
8001    2.8
8002    1.2
8003    0.4
8004    1.2
Name: sources, dtype: float64

### Get packed sources series and play with `.ts` accessor
This series is a collection of structures; each structure consists of multiple fields, and each field is a "list" of values.

In [6]:
# Unpack the nested series back into a flat table: one row per observation,
# indexed by object ID
packed.ts.to_flat()

Unnamed: 0,time,flux,band,err,count
8000,0,0,g,0.0,0
8000,5,1,g,0.5,5
8000,10,2,g,0.0,10
8000,15,3,g,0.5,15
8000,20,0,g,0.0,20
...,...,...,...,...,...
8004,979,3,z,0.9,979
8004,984,0,z,0.4,984
8004,989,1,z,0.9,989
8004,994,2,z,0.4,994


In [7]:
# One row per object ID, with each field held as a list-valued column
packed.ts.to_lists()

Unnamed: 0,time,flux,band,err,count
8000,[ 0 5 10 15 20 25 30 35 40 45 50 ...,[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2...,['g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' '...,[0. 0.5 0. 0.5 0. 0.5 0. 0.5 0. 0.5 0. 0...,[ 0 5 10 15 20 25 30 35 40 45 50 ...
8001,[ 1 6 11 16 21 26 31 36 41 46 51 ...,[1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3...,['g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' '...,[0.1 0.6 0.1 0.6 0.1 0.6 0.1 0.6 0.1 0.6 0.1 0...,[ 1 6 11 16 21 26 31 36 41 46 51 ...
8002,[ 2 7 12 17 22 27 32 37 42 47 52 ...,[2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0...,['g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' '...,[0.2 0.7 0.2 0.7 0.2 0.7 0.2 0.7 0.2 0.7 0.2 0...,[ 2 7 12 17 22 27 32 37 42 47 52 ...
8003,[ 3 8 13 18 23 28 33 38 43 48 53 ...,[3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1...,['g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' '...,[0.3 0.8 0.3 0.8 0.3 0.8 0.3 0.8 0.3 0.8 0.3 0...,[ 3 8 13 18 23 28 33 38 43 48 53 ...
8004,[ 4 9 14 19 24 29 34 39 44 49 54 ...,[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2...,['g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' 'g' '...,[0.4 0.9 0.4 0.9 0.4 0.9 0.4 0.9 0.4 0.9 0.4 0...,[ 4 9 14 19 24 29 34 39 44 49 54 ...


In [8]:
# Select a single field: a pyarrow-backed series of per-object lists
packed.ts["flux"]

8000    [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2...
8001    [1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3...
8002    [2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0...
8003    [3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1...
8004    [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2...
Name: flux, dtype: list<item: int64>[pyarrow]

In [9]:
# The dtype lists every nested field together with its element type
packed.dtype

ts<time: [int64], flux: [int64], band: [string], err: [double], count: [int64]>

### Modify underlying fields with `.ts` accessor

In [10]:
# Change flux in place with flat arrays
# NOTE: this negates the values stored in `packed`, so re-running the cell
# flips the sign back again
packed.ts["flux"] = -1 * packed.ts["flux"].list.flatten()
packed.ts["flux"]

8000    [ 0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 ...
8001    [-1 -2 -3  0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 -3 ...
8002    [-2 -3  0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 -3  0 ...
8003    [-3  0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 -3  0 -1 ...
8004    [ 0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 -3  0 -1 -2 ...
Name: flux, dtype: list<item: int64>[pyarrow]

In [11]:
# Change errors for object 8003 by replacing the entire nested list series
# We need to convert it to Python lists, so we can change them in-place
err = pd.Series(packed.ts["err"], dtype=object)
# Add a constant offset of 25 to every error value of object 8003
err[8003] = [e + 25 for e in err[8003]]
# Sanity check: the replacement must still have one entry per packed item
# NOTE(review): this reaches into the accessor's private `_series` attribute;
# presumably `len(packed)` is equivalent — confirm
assert len(err) == len(packed.ts._series)
packed.ts["err"] = err
packed.ts["err"]

8000    [0.  0.5 0.  0.5 0.  0.5 0.  0.5 0.  0.5 0.  0...
8001    [0.1 0.6 0.1 0.6 0.1 0.6 0.1 0.6 0.1 0.6 0.1 0...
8002    [0.2 0.7 0.2 0.7 0.2 0.7 0.2 0.7 0.2 0.7 0.2 0...
8003    [25.3 25.8 25.3 25.8 25.3 25.8 25.3 25.8 25.3 ...
8004    [0.4 0.9 0.4 0.9 0.4 0.9 0.4 0.9 0.4 0.9 0.4 0...
Name: err, dtype: list<item: double>[pyarrow]

In [12]:
# Delete field and add new one

# Drop "count" outright
del packed.ts["count"]
# Remove "band" and keep what delete_field returns (used below as a list series)
filters = packed.ts.delete_field("band")
# Flatten the lists and prefix every band name with "lsst_"
filters = "lsst_" + filters.list.flatten()
# Store the flat result back under a new field name
packed.ts["filters"] = filters
# Distinct filter names of the last object
np.unique(packed.ts["filters"].iloc[-1])

array(['lsst_g', 'lsst_i', 'lsst_r', 'lsst_z'], dtype='<U6')

In [13]:
# Show the series after the field edits made in the previous cells
# NOTE(review): in the recorded output the column headers are
# time/flux/err/filters, while the dtype line still lists band and count —
# the reported dtype looks stale; confirm against the pinned pandas-ts commit
packed

8000         time  flux  err filters
0       0     0  ...
8001         time  flux  err filters
0       1    -1  ...
8002         time  flux  err filters
0       2    -2  ...
8003         time  flux   err filters
0       3    -3 ...
8004         time  flux  err filters
0       4     0  ...
Name: sources, dtype: ts<time: [int64], flux: [int64], band: [string], err: [double], count: [int64]>