# Step 0: Download the limit order book data from LakeAPI ───────────────────────────────────────────────────────────────────

In [None]:
import datetime as dt
import lakeapi
import pandas as pd
import numpy as np
import os, re, datetime as dt

In [4]:
# Query window for the download (naive datetimes, midnight of each day).
start_date = dt.datetime(year=2024, month=10, day=15)
end_date = dt.datetime(year=2024, month=10, day=31)

# Market to fetch: the BTC-USDT spot pair on the BINANCE exchange.
symbol, exchange = "BTC-USDT", "BINANCE"

In [3]:
# ─── Download the "book" table for the configured symbol, exchange and dates ───
# NOTE(review): whether `end` is inclusive or exclusive is not visible here —
# confirm against the LakeAPI documentation.
df_book = lakeapi.load_data(
    table     = "book",
    start     = start_date,
    end       = end_date,
    symbols   = [symbol],
    exchanges = [exchange],
)

# ─── 3. DYNAMIC FILE NAME ──────────────────────────────────────────────────────
# Built from the lower-cased symbol (dashes → underscores) and the date range,
# e.g. data/book_btc_usdt_20241015_20241031.parquet
file_name = (
    f"data/book_{symbol.lower().replace('-', '_')}_"
    f"{start_date:%Y%m%d}_{end_date:%Y%m%d}.parquet"
)

# The Parquet writer does not create missing parent directories, so make sure
# the output folder exists; otherwise the save fails on a fresh checkout.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# ─── 4. SAVE TO PARQUET ────────────────────────────────────────────────────────
# NOTE(review): the original heading said the save uses cuDF, but `df_book` is
# presumably a pandas DataFrame returned by lakeapi — confirm.
df_book.to_parquet(
    file_name,
    engine="pyarrow",    # original note: cuDF supports the pyarrow and fastparquet engines
    compression="snappy",
)

print(f"Mentés kész: {file_name}")


100%|██████████| 16/16 [00:03<00:00,  5.04it/s]


Mentés kész: data/book_btc_usdt_20241015_20241031.parquet


### What is `table="book"`?  
I’m requesting the **limit-order-book** table, sampled at **6 Hz**, with **20 price levels** on each side (bid/ask). Each snapshot contains **80 features**: 20 bid prices, 20 bid sizes, 20 ask prices, and 20 ask sizes (i.e. 40 price/size pairs), timestamped roughly six times per second.

---

### Why Parquet + PyArrow + Snappy?  

- **Parquet**  
  I choose Parquet because its columnar on-disk format lets me read only the columns I need, speeding up scans on wide tables.

- **PyArrow engine**  
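  In this notebook the file is written through pandas with the PyArrow engine, which is built on Apache Arrow’s columnar memory model. cuDF uses the same Arrow columnar format, so when I work on GPU I can load the resulting Parquet file directly with `cudf.read_parquet` instead of converting from pandas first.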

- **Snappy compression**  
  Snappy compression gives me very fast decompression with a reasonable compression ratio, so I get **fast I/O** and **smaller files**—perfect for iterating on large LOB datasets.
