In [1]:
from datasets import load_dataset, Dataset
from pandas import DataFrame
from lcdc import utils
from lcdc.vars import Variability

In [2]:
MMT_PATH = "/home/k/kyselica12/work/mmt/MMT"
object_list = utils.load_rsos_from_csv(f"{MMT_PATH}/rso.csv")
objects = {obj.norad_id: obj for obj in object_list}
track_list = utils.load_tracks_from_csv(f"{MMT_PATH}/tracks.csv")

# Transform MMT to parquet

In [28]:
from collections import defaultdict
from tqdm import tqdm
def var_to_int(var):
    match var:
        case Variability.APERIODIC:
            return 0
        case Variability.PERIODIC:
            return 1
        case Variability.NONVARIABLE:
            return 2
        case _:
            raise ValueError(f"Unknown variability: {var}")

    
track_list_by_time = sorted(track_list, key=lambda t: t.timestamp)

SIZE = 10_000
for i in range(29*SIZE, len(track_list_by_time), SIZE):
    data = defaultdict(list)
    for track in tqdm(track_list[i:i+SIZE]):
        if track.norad_id not in objects:
            continue
        track.load_data_from_file(f"{MMT_PATH}/data")
        data["norad_id"].append(track.norad_id)
        data["id"].append(track.id)
        data["period"].append(track.period)
        data["timestamp"].append(track.timestamp)
        data["data"].append(track.data)
        data["name"].append(objects[track.norad_id].name)
        data["variability"].append(var_to_int(objects[track.norad_id].variability))
    # print(i, len(track_list_by_time), i, i+SIZE)
    start = track_list_by_time[i].timestamp.split(" ")[0]
    end = track_list_by_time[min(i+SIZE-1, len(track_list_by_time)-1)].timestamp.split(" ")[0]
    ds = Dataset.from_dict(data)
    print(f"Saving {start} - {end}")
    ds.to_parquet(f"../MMT_parquet/{start}_{end}.parquet")
    del ds
    del data


100%|██████████| 2272/2272 [00:01<00:00, 1281.29it/s]


Saving 2024-09-29 - 2024-10-12


Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

# Load parquet files to Dataset

In [3]:
from lcdc import DatasetBuilder
from lcdc import vars
from lcdc import utils
import os
from datasets import Dataset, load_from_disk, concatenate_datasets
from memory_profiler import memory_usage
DATA_PATH = "/home/k/kyselica12/work/mmt/MMT_parquet"

In [3]:


    # load parquet file to Dataset
    # ds = load_from_disk(f"{DATA_PATH}/{parquet_file}")
def load_file():
    i = 0
    DS = None
    for parquet_file in os.listdir(DATA_PATH):
        # if i >= 10:
            # break
        i += 1
        print(parquet_file)
        ds = Dataset.from_parquet(f"{DATA_PATH}/{parquet_file}")
        if DS is None:
            DS = ds
        else:
            DS = concatenate_datasets([DS, ds])

mem_usage = memory_usage(load_file)
print(f"Memory usage: {max(mem_usage) - min(mem_usage)} MiB")



2015-07-25_2016-01-21.parquet
2023-01-20_2023-06-01.parquet
2020-06-29_2020-09-21.parquet
2023-07-20_2023-09-15.parquet
2019-05-23_2019-10-14.parquet
2020-02-19_2020-06-29.parquet
2021-01-20_2021-06-16.parquet
2020-09-21_2021-01-20.parquet
2024-05-04_2024-07-02.parquet
2014-06-04_2014-11-10.parquet
2016-08-23_2017-03-13.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2018-07-18_2018-11-09.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2022-01-09_2022-06-05.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2021-09-05_2022-01-09.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2014-11-10_2015-07-25.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2018-01-26_2018-07-18.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2021-06-16_2021-09-05.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2022-06-05_2022-08-10.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2017-03-13_2017-08-07.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2023-06-01_2023-07-20.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2016-01-21_2016-08-23.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2018-11-09_2019-05-23.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2024-09-29_2024-10-12.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2022-11-08_2023-01-20.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2024-07-02_2024-09-29.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2022-08-10_2022-11-08.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2023-12-31_2024-05-03.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2023-09-15_2023-12-31.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2019-10-14_2020-02-19.parquet


Generating train split: 0 examples [00:00, ? examples/s]

2017-08-07_2018-01-26.parquet


Generating train split: 0 examples [00:00, ? examples/s]

Memory usage: 925.03125 MiB


In [4]:
ds = Dataset.from_parquet(f"{DATA_PATH}/2015-07-25_2016-01-21.parquet")

In [7]:
ds.to_sql("data", "sqlite:///data.db")

Creating SQL from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

9579

In [8]:
# Load data from SQLite
ds2 = Dataset.from_sql("data", "sqlite:///data.db")


Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
len(ds2['data'][0])

4440

In [14]:
len(ds['data'][0])

555

In [15]:
ds = ds.with_format("numpy", columns=["data"])

In [5]:
import pyarrow.parquet as pq

table = pq.read_table(f"{DATA_PATH}/2015-07-25_2016-01-21.parquet")

In [6]:

import pyarrow as pa
i = 0
files= []
tables = []
for parquet_file in os.listdir(DATA_PATH):
    if i >= 10:
        break
    i += 1
    tables.append(pq.read_table(f"{DATA_PATH}/{parquet_file}"))
    files.append(f"{DATA_PATH}/{parquet_file}")

In [8]:
dataset = pq.ParquetDataset(files)
table = dataset.read()

In [9]:
import pyarrow.compute as pc
x = table.filter(pc.field('norad_id') == 25043)

In [11]:
mask = pc.is_in(table['norad_id'], value_set=pa.array([25043, 25772]))
x = table.filter(mask.combine_chunks())
x['norad_id'][10], type(mask.combine_chunks())

(<pyarrow.Int64Scalar: 25772>, pyarrow.lib.BooleanArray)

In [17]:
# Compute the mean, std for 'norad_id' column
mean = pc.mean(table['id'])


In [108]:
z = []
for i  in table['name']:
    z.append(i.as_py().replace("/", "_"))

In [117]:
xx = table.append_column("label", pa.array(z))

In [123]:
xx = xx.filter(pc.field('label') != "SCOUT B-1 R_B")

In [124]:
xx['label']

<pyarrow.lib.ChunkedArray object at 0x7f593072a2a0>
[
  [
    "IRIDIUM 38",
    "ATLAS CENTAUR R_B",
    "TITAN 3A TRANSTAGE R_B",
    "THOR ABLESTAR R_B",
    "YAOGAN 15",
    ...
    "GLOBALSTAR M027",
    "NOVA 1",
    "GLOBALSTAR M035",
    "GLOBALSTAR M086",
    "ATLAS 2AS CENTAUR R_B"
  ],
  [
    "USA 59",
    "CZ-3A R_B",
    "OV1-9",
    "DEBUT (ORIZURU)",
    "OPS 0100 (TRANSIT 15)",
    ...
    "CZ-4B DEB",
    "IRIDIUM 55",
    "IRIDIUM 33 DEB",
    "IRIDIUM 33 DEB",
    "ORBCOMM FM 106"
  ],
...,
  [
    "ORBCOMM FM 24",
    "ORBVIEW 1 (MICROLAB)",
    "DUMMY MASS 2",
    "ORBCOMM FM 27",
    "IRS 1A",
    ...
    "DELTA 2 R_B",
    "DELTA 1 DEB",
    "SCOUT A-1 R_B",
    "PEGASUS R_B",
    "DELTA 1 DEB"
  ],
  [
    "GLOBALSTAR M088",
    "ARIANE 40 R_B",
    "SARAL",
    "GLOBALSTAR M044",
    "EXPLORER 29 (GEOS 1)",
    ...
    "GLOBALSTAR M047",
    "GLOBALSTAR M051",
    "GLOBALSTAR M022",
    "GLOBALSTAR M058",
    "EXPLORER 38 (RAE-A)"
  ]
]

In [128]:
m = list(map(lambda x: x.as_py() != 0,xx['period']))

In [135]:
import numpy as np
np.array(xx['data'][0].as_py()).shape

(177, 5)

In [141]:
xx.set_column(xx.schema.get_field_index("name"), "name", pa.array(list(map(str, m))))

pyarrow.Table
norad_id: int64
id: int64
period: double
timestamp: string
data: list<element: list<element: double>>
  child 0, element: list<element: double>
      child 0, element: double
name: string
variability: int64
label: string
----
norad_id: [[25043,12445,1001,669,38354,...,25884,12458,25851,38045,26906],[20641,31116,2610,20479,2754,...,27433,25272,33886,33773,40088],...,[25478,23547,24926,25481,18960,...,40060,7893,6910,25561,12200],[37740,21610,39086,25678,1726,...,25772,25853,25649,25907,3307]]
id: [[8152773,8152972,8152974,8153099,8153210,...,8327694,8327705,8327715,8327739,8327825],[8327829,8327837,8327886,8327896,8327913,...,8449458,8449464,8449466,8449478,8449492],...,[3147905,3147965,3147990,3148037,3148087,...,3958414,3958440,3958480,3958778,3959076],[3959103,3959385,3959394,3959519,3959691,...,4244216,4244860,4245232,4245500,4247370]]
period: [[0,44.55,0,0,0,...,0,0,0,0,0],[0,113.5984254,0,0,0,...,0,0,0,6.19741088,0],...,[0,0,0,0,0,...,0,0,0,0,0],[0,0,0,0,0,...,0,23.8

In [143]:
len(xx['data'][0])

177

In [157]:
splits = [[0,len(x)] for x in table['data']]
sp = table.append_column("split", pa.array(splits))

In [156]:
# iterate over records
for r in range(len(table)):
    for c in range(len(table.schema)):
        print(table.column(c)[r], table.schema.field(c).name)
    break

5438 norad_id
8152757 id
0.0 period
2015-07-25 21:55:14.998000 timestamp
[[0.0, 9.136, 66.073, 870.767, 1.0], [0.10000014305114746, 9.048, 66.089, 870.494, 1.0], [0.20000004768371582, 9.383, 66.106, 870.221, 1.0], [0.3000001907348633, 9.376, 66.123, 869.948, 1.0], [0.40000009536743164, 8.842, 66.139, 869.676, 1.0], [0.5, 9.117, 66.156, 869.404, 1.0], [0.6000001430511475, 8.598, 66.173, 869.132, 1.0], [0.7000000476837158, 8.715, 66.19, 868.861, 1.0], [0.8000001907348633, 9.015, 66.207, 868.591, 1.0], [0.9000000953674316, 8.592, 66.223, 868.321, 1.0], [1.0, 8.692, 66.24, 868.051, 1.0], [1.1000001430511475, 8.578, 66.257, 867.782, 1.0], [1.3000001907348633, 8.857, 66.291, 867.245, 1.0], [1.4000000953674316, 8.341, 66.308, 866.977, 1.0], [1.5, 8.593, 66.325, 866.709, 1.0], [1.6000001430511475, 8.565, 66.342, 866.442, 1.0], [1.698000192642212, 8.907, 66.359, 866.181, 1.0], [1.7000000476837158, 8.478, 66.359, 866.175, 1.0], [1.7980000972747803, 8.943, 66.376, 865.915, 1.0], [1.80000019073486

In [178]:
sp["data"][0].values[:2].to_numpy(zero_copy_only=False)

array([array([  0.   ,   9.136,  66.073, 870.767,   1.   ]),
       array([1.00000143e-01, 9.04800000e+00, 6.60890000e+01, 8.70494000e+02,
              1.00000000e+00])                                               ],
      dtype=object)

In [166]:
sp.slice(0,1).to_pydict()

{'norad_id': [5438],
 'id': [8152757],
 'period': [0.0],
 'timestamp': ['2015-07-25 21:55:14.998000'],
 'data': [[[0.0, 9.136, 66.073, 870.767, 1.0],
   [0.10000014305114746, 9.048, 66.089, 870.494, 1.0],
   [0.20000004768371582, 9.383, 66.106, 870.221, 1.0],
   [0.3000001907348633, 9.376, 66.123, 869.948, 1.0],
   [0.40000009536743164, 8.842, 66.139, 869.676, 1.0],
   [0.5, 9.117, 66.156, 869.404, 1.0],
   [0.6000001430511475, 8.598, 66.173, 869.132, 1.0],
   [0.7000000476837158, 8.715, 66.19, 868.861, 1.0],
   [0.8000001907348633, 9.015, 66.207, 868.591, 1.0],
   [0.9000000953674316, 8.592, 66.223, 868.321, 1.0],
   [1.0, 8.692, 66.24, 868.051, 1.0],
   [1.1000001430511475, 8.578, 66.257, 867.782, 1.0],
   [1.3000001907348633, 8.857, 66.291, 867.245, 1.0],
   [1.4000000953674316, 8.341, 66.308, 866.977, 1.0],
   [1.5, 8.593, 66.325, 866.709, 1.0],
   [1.6000001430511475, 8.565, 66.342, 866.442, 1.0],
   [1.698000192642212, 8.907, 66.359, 866.181, 1.0],
   [1.7000000476837158, 8.478, 

In [181]:
a = {
    "test": [1,2,3],
    "list": [[1,2,3]]*3,
    "string": "asa"
}
t = pa.table(a)

In [182]:
t

pyarrow.Table
test: int64
list: list<item: int64>
  child 0, item: int64
string: string
----
test: [[1,2,3]]
list: [[[1,2,3],[1,2,3],[1,2,3]]]
string: [["a","s","a"]]