
Processing: 2025-12-20 00:00
2026-01-28 17:58:05,442 - etl.core.executor - INFO - [54ec31a3] Transform completed in 31.16s
2026-01-28 17:58:05,442 - etl.core.executor - INFO - [54ec31a3]   silver: 44,829 rows -> 2 files
2026-01-28 17:58:05,449 - __main__ - INFO - [eaecebdd] COMPLETE in 33.36s

Processing: 2025-12-20 01:00
2026-01-28 17:58:38,072 - etl.core.executor - INFO - [ca6de960] Transform completed in 29.93s
2026-01-28 17:58:38,072 - etl.core.executor - INFO - [ca6de960]   silver: 42,511 rows -> 1 files
2026-01-28 17:58:38,076 - __main__ - INFO - [bc337bc5] COMPLETE in 32.10s

Processing: 2025-12-20 02:00
2026-01-28 17:59:07,689 - etl.core.executor - INFO - [a60cca3e] Transform completed in 26.82s
2026-01-28 17:59:07,689 - etl.core.executor - INFO - [a60cca3e]   silver: 27,663 rows -> 1 files
2026-01-28 17:59:07,693 - __main__ - INFO - [b199630e] COMPLETE in 29.06s

Processing: 2025-12-20 03:00
2026-01-28 17:59:35,942 - etl.core.executor - INFO - [f40f5922] Transform completed i

In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl

In [None]:
lf = pl.scan_parquet("data/processed/silver/orderbook/**/*.parquet")
lf.collect_schema()


## ETL

### Orderbook Raw -> Features

In [None]:
import subprocess
from datetime import datetime, timedelta

# Configuration
EXCHANGE = "coinbaseadvanced"
SYMBOL = "BTC-USD"
STATE_PATH = "btc-usd_orderbook_state.json"

# Date range (inclusive)
START_DATE = datetime(2025, 12, 20, 0)   # year, month, day, hour
END_DATE = datetime(2026, 1, 26, 23)    # year, month, day, hour

# Track statistics
stats = {"processed": 0, "skipped_no_data": 0, "failed": 0}

# Iterate hour by hour
current = START_DATE
while current <= END_DATE:
    # Build command with venv activation (Windows)
    script_args = (
        f"--trades data/raw/ready/ccxt/trades "
        f"--exchange {EXCHANGE} "
        f"--symbol {SYMBOL} "
        f"--year {current.year} "
        f"--month {current.month} "
        f"--day {current.day} "
        f"--hour {current.hour} "
        f"--state-path {STATE_PATH}"
    )
    cmd = f"venv\\Scripts\\activate && python scripts/etl/run_orderbook_features.py {script_args}"
    
    print(f"\n{'='*60}")
    print(f"Processing: {current.strftime('%Y-%m-%d %H:00')}")
    print(f"{'='*60}")
    
    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
    
    if result.returncode != 0:
        stderr = result.stderr.lower()
        # Check if it's a "no data" error (expected for gaps in ingestion)
        if "no matching files" in stderr or "empty" in stderr or "0 rows" in stderr:
            print(f"SKIPPED: No data available for this hour")
            stats["skipped_no_data"] += 1
            # Continue to next hour - state is preserved for when data resumes
        else:
            print(f"FAILED: {result.stderr}")
            stats["failed"] += 1
            # Uncomment to stop on real errors:
            # break
    else:
        # Print last few lines of stderr (where logs go)
        lines = result.stderr.strip().split('\n')
        for line in lines[-5:]:
            print(line)
        stats["processed"] += 1
    
    current += timedelta(hours=1)

print(f"\n{'='*60}")
print("Batch processing complete!")
print(f"  Processed: {stats['processed']} hours")
print(f"  Skipped (no data): {stats['skipped_no_data']} hours")
print(f"  Failed: {stats['failed']} hours")
print(f"{'='*60}")

In [None]:
# Lazy scan all orderbook files
orderbook_lf = pl.scan_parquet("data/raw/ready/ccxt/orderbook/**/*.parquet")
orderbook_lf.collect_schema()

Schema([('collected_at', Int64),
        ('capture_ts', Datetime(time_unit='us', time_zone='UTC')),
        ('exchange', String),
        ('symbol', String),
        ('year', Int32),
        ('month', Int32),
        ('day', Int32),
        ('hour', Int32),
        ('timestamp', Int64),
        ('nonce', Int64),
        ('bids', List(Struct({'price': Float64, 'size': Float64}))),
        ('asks', List(Struct({'price': Float64, 'size': Float64})))])

In [None]:
# Import the production ASOF resampling from our ETL library
from etl.utils.resampling import resample_orderbook, get_resampling_stats

# The resample_orderbook function implements ASOF semantics:
# - "At time T, what was the most recent known orderbook state?"
# - No lookahead bias - safe for backtesting and ML training
# - Configurable staleness detection and gap filling

# See etl/utils/resampling.py for full implementation and documentation

In [None]:
# Resample orderbook data to 1-minute frequency using ASOF semantics
resampled_orderbook_lf = resample_orderbook(
    orderbook_lf,
    group_cols=["symbol", "exchange"],  # Resample per asset/exchange
    frequency="1m",
    max_staleness="5m",  # Flag data older than 5 minutes as stale
)

# Add basic features on the resampled data
orderbook_features_lf = resampled_orderbook_lf.with_columns([
    # Best bid/ask from nested struct arrays
    pl.col("bids").list.get(0).struct.field("price").alias("best_bid"),
    pl.col("asks").list.get(0).struct.field("price").alias("best_ask"),
]).with_columns([
    # Mid-price, Spread, Spread in bps
    ((pl.col("best_bid") + pl.col("best_ask")) / 2).alias("mid_price"),
    (pl.col("best_ask") - pl.col("best_bid")).alias("spread"),
    ((pl.col("best_ask") - pl.col("best_bid")) / ((pl.col("best_bid") + pl.col("best_ask")) / 2) * 10000).alias("spread_bps"),
])

# Finalize/Cleanup - drop raw orderbook arrays
orderbook_features_lf = orderbook_features_lf.drop("bids", "asks")

In [None]:
# View demonstration on smaller subset (BTC-USD on Coinbase)
btc_features_lf = orderbook_features_lf.filter( 
    (pl.col("symbol") == "BTC-USD")
    & (pl.col("exchange") == "coinbaseadvanced")
)
btc_features_df = btc_features_lf.collect()

# Get resampling quality stats
stats = get_resampling_stats(btc_features_df)

print(f"Resampled rows: {stats['total_rows']}")
print(f"Time range: {btc_features_df['timestamp'].min()} to {btc_features_df['timestamp'].max()}")
print(f"\nData quality:")
print(f"  - Stale rows (>5m old): {stats.get('stale_rows', 'N/A')}")
print(f"  - Stale %: {stats.get('stale_pct', 0):.2f}%")
print(f"  - Avg staleness: {stats.get('avg_staleness', 'N/A')}")
print(f"  - Max staleness: {stats.get('max_staleness', 'N/A')}")
btc_features_df.head(10)

Resampled rows: 10585
Time range: 2025-12-12 17:05:00 to 2025-12-20 02:36:00

Data quality:
  - Stale rows (>5m old): 0
  - Avg staleness: 0:00:00.195000
  - Max staleness: 0:00:55.244000


symbol,exchange,original_timestamp,snapshot_count,timestamp,data_staleness,is_stale,best_bid,best_ask,mid_price,spread,spread_bps
str,str,datetime[ms],u32,datetime[ms],duration[ms],bool,f64,f64,f64,f64,f64
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:04:59.322,633,2025-12-12 17:05:00,678ms,False,90098.0,90098.41,90098.205,0.41,0.045506
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:05:59.971,573,2025-12-12 17:06:00,29ms,False,90155.69,90157.76,90156.725,2.07,0.2296
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:06:59.972,555,2025-12-12 17:07:00,28ms,False,90234.0,90235.17,90234.585,1.17,0.129662
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:07:59.969,460,2025-12-12 17:08:00,31ms,False,90314.05,90314.06,90314.055,0.01,0.001107
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:08:59.871,361,2025-12-12 17:09:00,129ms,False,90300.0,90304.0,90302.0,4.0,0.442958
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:09:15.422,86,2025-12-12 17:10:00,44s 578ms,False,90222.0,90222.01,90222.005,0.01,0.001108
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:30:59.970,216,2025-12-12 17:31:00,30ms,False,89973.36,89973.37,89973.365,0.01,0.001111
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:31:59.870,709,2025-12-12 17:32:00,130ms,False,89907.28,89907.29,89907.285,0.01,0.001112
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:32:59.970,631,2025-12-12 17:33:00,30ms,False,90079.38,90079.39,90079.385,0.01,0.00111
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:33:59.972,668,2025-12-12 17:34:00,28ms,False,90146.44,90146.45,90146.445,0.01,0.001109


In [None]:
btc_resampled_df

symbol,exchange,original_timestamp,snapshot_count,timestamp,data_staleness,is_stale,best_bid,best_ask,mid_price,spread,spread_bps
str,str,datetime[ms],u32,datetime[ms],duration[ms],bool,f64,f64,f64,f64,f64
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:04:59.322,633,2025-12-12 17:05:00,678ms,false,90098.0,90098.41,90098.205,0.41,0.045506
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:05:59.971,573,2025-12-12 17:06:00,29ms,false,90155.69,90157.76,90156.725,2.07,0.2296
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:06:59.972,555,2025-12-12 17:07:00,28ms,false,90234.0,90235.17,90234.585,1.17,0.129662
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:07:59.969,460,2025-12-12 17:08:00,31ms,false,90314.05,90314.06,90314.055,0.01,0.001107
"""BTC-USD""","""coinbaseadvanced""",2025-12-12 17:08:59.871,361,2025-12-12 17:09:00,129ms,false,90300.0,90304.0,90302.0,4.0,0.442958
…,…,…,…,…,…,…,…,…,…,…,…
"""BTC-USD""","""coinbaseadvanced""",2025-12-20 02:31:59.951,991,2025-12-20 02:32:00,49ms,false,88090.03,88090.88,88090.455,0.85,0.096492
"""BTC-USD""","""coinbaseadvanced""",2025-12-20 02:32:59.764,841,2025-12-20 02:33:00,236ms,false,88108.0,88108.64,88108.32,0.64,0.072638
"""BTC-USD""","""coinbaseadvanced""",2025-12-20 02:33:59.951,917,2025-12-20 02:34:00,49ms,false,88128.01,88128.16,88128.085,0.15,0.017021
"""BTC-USD""","""coinbaseadvanced""",2025-12-20 02:34:59.801,865,2025-12-20 02:35:00,199ms,false,88143.99,88144.0,88143.995,0.01,0.001135


---
## Universe Selection

In [None]:
import ccxt.pro as ccxtpro
import time
import pandas as pd
import numpy as np

In [None]:
e1 = ccxtpro.coinbaseadvanced()
e2 = ccxtpro.binanceus()
e3 = ccxtpro.kraken()

In [None]:
# Fetch tickers for all symbols to get volume data
tickers = await e3.fetch_tickers()

# Convert to list of tuples (symbol, volume) and sort by volume
volume_data = [(symbol, ticker.get('quoteVolume', 0)) for symbol, ticker in tickers.items() if ticker.get('quoteVolume')]
volume_data_sorted = sorted(volume_data, key=lambda x: x[1], reverse=True)

# Get top N symbols
N = 10000000
top_symbols = [symbol for symbol, volume in volume_data_sorted[:N]]

print(f"Top {N} symbols by 24h quote volume:")
for i, (symbol, volume) in enumerate(volume_data_sorted[:N], 1):
    print(f"{i}. {symbol}: ${volume:,.0f}")

Top 10000000 symbols by 24h quote volume:
1. BTC/USD: $322,993,564
2. USDT/USD: $267,667,174
3. ETH/USD: $162,346,716
4. USDC/EUR: $106,835,998
5. SOL/USD: $92,497,122
6. BTC/EUR: $66,292,379
7. USDC/USD: $66,033,869
8. EUR/USD: $54,426,888
9. USDT/EUR: $51,089,303
10. XRP/USD: $45,857,165
11. XMR/USD: $32,851,334
12. USDC/USDT: $32,229,453
13. ETH/EUR: $30,238,118
14. BTC/USDC: $28,446,542
15. DASH/USD: $25,317,608
16. SUI/USD: $22,011,569
17. BTC/JPY: $19,697,685
18. XMR/USDT: $19,279,475
19. ZEC/USD: $18,950,293
20. BTC/USDT: $17,646,357
21. LTC/USD: $15,235,481
22. SOL/EUR: $12,609,798
23. DOGE/USD: $12,059,357
24. ETH/USDC: $11,087,691
25. XRP/EUR: $10,547,699
26. ICP/USD: $10,232,376
27. USDC/GBP: $9,980,570
28. ETH/USDT: $9,878,240
29. GBP/USD: $9,797,884
30. SOL/USDT: $9,278,346
31. SOL/USDC: $8,942,754
32. FARTCOIN/USD: $8,924,330
33. ADA/USD: $8,432,785
34. USD/JPY: $8,129,082
35. TAO/USD: $5,881,874
36. BTC/GBP: $5,085,211
37. EUR/JPY: $5,082,479
38. LINK/USD: $4,858,533
39.

In [None]:
# Define universe size
universe_size = 200

# Filter to get symbols ending with /USDC for consistency
usdc_symbols = [(sym, vol) for sym, vol in volume_data_sorted if sym.endswith('/USD') and not sym.startswith("USD")]

# Define group sizes (proportions of universe_size)
tier1_size = 40  # Top 40 most liquid
tier2_size = 80  # Next 40 moderately liquid
tier3_size = 80  # Next 40 less liquid but still tradeable

# Create the three tiers
tier1_institutional = [sym for sym, vol in usdc_symbols[:tier1_size]]
tier2_competitive = [sym for sym, vol in usdc_symbols[tier1_size:tier1_size + tier2_size]]
tier3_niche = [sym for sym, vol in usdc_symbols[tier1_size + tier2_size:universe_size]]

# print(f"Universe Size: {universe_size}")
# print(f"\n{'='*80}")
# print(f"TIER 1 - INSTITUTIONAL (Top {tier1_size} by volume)")
# print(f"{'='*80}")
# print(f"Characteristics: Highest liquidity, tight spreads, heavy competition")
# print(f"Volume range: ${usdc_symbols[0][1]:,.0f} - ${usdc_symbols[tier1_size-1][1]:,.0f}")
# for i, (sym, vol) in enumerate(usdc_symbols[:tier1_size], 1):
#     print(f"{i:2d}. {sym:20s} ${vol:>15,.0f}")

# print(f"\n{'='*80}")
# print(f"TIER 2 - COMPETITIVE (Next {tier2_size} by volume)")
# print(f"{'='*80}")
# print(f"Characteristics: Good liquidity, reasonable spreads, moderate competition")
# print(f"Volume range: ${usdc_symbols[tier1_size][1]:,.0f} - ${usdc_symbols[tier1_size + tier2_size-1][1]:,.0f}")
# for i, (sym, vol) in enumerate(usdc_symbols[tier1_size:tier1_size + tier2_size], 1):
#     print(f"{i:2d}. {sym:20s} ${vol:>15,.0f}")

# print(f"\n{'='*80}")
# print(f"TIER 3 - NICHE/EDGE (Next {tier3_size} by volume)")
# print(f"{'='*80}")
# print(f"Characteristics: Adequate liquidity, wider spreads, potential edge opportunities")
# print(f"Volume range: ${usdc_symbols[tier1_size + tier2_size][1]:,.0f} - ${usdc_symbols[universe_size-1][1]:,.0f}")
# for i, (sym, vol) in enumerate(usdc_symbols[tier1_size + tier2_size:universe_size], 1):
#     print(f"{i:2d}. {sym:20s} ${vol:>15,.0f}")

# Create final symbols list for the universe
symbols = tier1_institutional + tier2_competitive + tier3_niche

print(f"\n{'='*80}")
print(f"SUMMARY")
print(f"{'='*80}")
print(f"Total symbols in universe: {len(symbols)}")
print(f"Tier 1 (Institutional): {len(tier1_institutional)}")
print(f"Tier 2 (Competitive): {len(tier2_competitive)}")
print(f"Tier 3 (Niche/Edge): {len(tier3_niche)}")


SUMMARY
Total symbols in universe: 200
Tier 1 (Institutional): 40
Tier 2 (Competitive): 80
Tier 3 (Niche/Edge): 80


In [None]:
len(symbols)

200

In [None]:
print(symbols[:25])

['BTC/USD', 'ETH/USD', 'SOL/USD', 'EUR/USD', 'XRP/USD', 'XMR/USD', 'DASH/USD', 'SUI/USD', 'ZEC/USD', 'LTC/USD', 'DOGE/USD', 'ICP/USD', 'GBP/USD', 'FARTCOIN/USD', 'ADA/USD', 'TAO/USD', 'LINK/USD', 'BCH/USD', 'PAXG/USD', 'PUMP/USD', 'PEPE/USD', 'CC/USD', 'RENDER/USD', 'XAUT/USD', 'WIF/USD']


In [None]:
import json

# Read NDJSON file
ndjson_data = []
fpath = "C:/Users/longp/FluxForge/data/raw/ready/ccxt_binanceus/segment_20251209T00_00001.ndjson"
with open(fpath, 'r') as f:
    for line in f:
        ndjson_data.append(json.loads(line))

In [None]:
orderbook_data = [entry for entry in ndjson_data if entry['type'] == 'orderbook']

In [None]:
orderbook_entry = orderbook_data[0]
orderbook_entry

{'type': 'orderbook',
 'exchange': 'binanceus',
 'symbol': 'BTC/USDT',
 'method': 'watchOrderBook',
 'data': {'bids': [[90390.05, 2.23167],
   [90390.04, 0.1275],
   [90389.98, 0.10912],
   [90389.82, 0.94748],
   [90368.32, 0.00163],
   [90282.94, 0.1001],
   [90282.93, 0.12979],
   [90282.92, 0.00082],
   [90268.75, 0.00011],
   [90236.1, 0.27416],
   [90202.86, 0.0001],
   [90191.0, 0.32148],
   [90163.7, 0.33093],
   [90129.95, 0.00037],
   [90052.63, 0.0016],
   [90048.7, 0.10849],
   [90000.0, 0.00549],
   [89964.7, 0.11121],
   [89919.82, 0.00234],
   [89892.53, 7e-05],
   [89886.91, 0.00158],
   [89879.1, 0.10731],
   [89842.2, 0.10638],
   [89833.12, 0.00025],
   [89810.37, 0.00011],
   [89780.13, 0.00089],
   [89771.76, 0.00043],
   [89744.45, 0.00083],
   [89714.33, 7e-05],
   [89686.82, 4e-05],
   [89671.73, 0.00127],
   [89580.68, 0.00037],
   [89556.03, 0.00014],
   [89534.74, 0.00073],
   [89516.66, 0.00079],
   [89510.78, 0.00044],
   [89500.0, 0.00138],
   [89457.24, 0

In [None]:
def extract_orderbook_features(orderbook_entry):
    """
    Extract comprehensive orderbook features from a single orderbook snapshot.
    Returns a dictionary of features with predictive power for modeling.
    """
    data = orderbook_entry['data']
    bids = np.array(data['bids'])  # [[price, size], ...]
    asks = np.array(data['asks'])  # [[price, size], ...]
    
    features = {
        'timestamp': orderbook_entry['collected_at'],
        'symbol': orderbook_entry['symbol'],
        'exchange': orderbook_entry['exchange'],
    }
    
    # ===== BASIC FEATURES =====
    # Best bid/ask
    features['best_bid'] = bids[0][0] if len(bids) > 0 else np.nan
    features['best_ask'] = asks[0][0] if len(asks) > 0 else np.nan
    features['mid_price'] = (features['best_bid'] + features['best_ask']) / 2
    features['spread'] = features['best_ask'] - features['best_bid']
    features['spread_bps'] = (features['spread'] / features['mid_price']) * 10000
    
    # Volume at best
    features['bid_size_level_0'] = bids[0][1] if len(bids) > 0 else 0
    features['ask_size_level_0'] = asks[0][1] if len(asks) > 0 else 0
    
    # ===== DEPTH FEATURES =====
    # Cumulative volume across levels
    n_levels = min(len(bids), len(asks), 10)
    for i in range(n_levels):
        if i < len(bids):
            features[f'bid_size_level_{i}'] = bids[i][1]
            features[f'bid_price_level_{i}'] = bids[i][0]
        if i < len(asks):
            features[f'ask_size_level_{i}'] = asks[i][1]
            features[f'ask_price_level_{i}'] = asks[i][0]
    
    # Total volume by depth
    for depth in [5, 10, 20, 50]:
        depth = min(depth, len(bids), len(asks))
        features[f'bid_volume_{depth}'] = np.sum(bids[:depth, 1])
        features[f'ask_volume_{depth}'] = np.sum(asks[:depth, 1])
        features[f'total_volume_{depth}'] = features[f'bid_volume_{depth}'] + features[f'ask_volume_{depth}']
        features[f'volume_imbalance_{depth}'] = (features[f'bid_volume_{depth}'] - features[f'ask_volume_{depth}']) / features[f'total_volume_{depth}']
    
    # ===== IMBALANCE FEATURES =====
    # Order flow imbalance at different levels
    features['ofi_level_0'] = (features['bid_size_level_0'] - features['ask_size_level_0']) / (features['bid_size_level_0'] + features['ask_size_level_0'])
    
    # Weighted order imbalance
    bid_notional_5 = np.sum(bids[:5, 0] * bids[:5, 1])
    ask_notional_5 = np.sum(asks[:5, 0] * asks[:5, 1])
    features['weighted_oi_5'] = (bid_notional_5 - ask_notional_5) / (bid_notional_5 + ask_notional_5)
    
    # ===== PRICE LEVEL FEATURES =====
    # Distance between levels
    features['bid_level_spacing_mean'] = np.mean(np.diff(bids[:10, 0])) if len(bids) >= 2 else 0
    features['ask_level_spacing_mean'] = np.mean(np.diff(asks[:10, 0])) if len(asks) >= 2 else 0
    
    # Price impact (how much price moves per unit volume)
    features['bid_depth_10bps'] = np.sum(bids[bids[:, 0] >= features['best_bid'] * 0.999, 1]) if len(bids) > 0 else 0
    features['ask_depth_10bps'] = np.sum(asks[asks[:, 0] <= features['best_ask'] * 1.001, 1]) if len(asks) > 0 else 0
    
    # ===== LIQUIDITY FEATURES =====
    # Volume-weighted average price
    features['vwap_bid_5'] = np.sum(bids[:5, 0] * bids[:5, 1]) / np.sum(bids[:5, 1]) if len(bids) >= 5 else np.nan
    features['vwap_ask_5'] = np.sum(asks[:5, 0] * asks[:5, 1]) / np.sum(asks[:5, 1]) if len(asks) >= 5 else np.nan
    features['vwap_spread'] = features['vwap_ask_5'] - features['vwap_bid_5']
    
    # Microprice (fair value estimate)
    features['microprice'] = (features['best_bid'] * features['ask_size_level_0'] + features['best_ask'] * features['bid_size_level_0']) / (features['bid_size_level_0'] + features['ask_size_level_0'])
    
    # ===== SHAPE FEATURES =====
    # Distribution of liquidity
    bid_volume_cumsum = np.cumsum(bids[:, 1])
    ask_volume_cumsum = np.cumsum(asks[:, 1])
    
    # Find where 50% and 90% of volume is
    total_bid_vol = bid_volume_cumsum[-1] if len(bid_volume_cumsum) > 0 else 0
    total_ask_vol = ask_volume_cumsum[-1] if len(ask_volume_cumsum) > 0 else 0
    
    features['bid_50pct_depth'] = np.argmax(bid_volume_cumsum >= total_bid_vol * 0.5) if total_bid_vol > 0 else 0
    features['ask_50pct_depth'] = np.argmax(ask_volume_cumsum >= total_ask_vol * 0.5) if total_ask_vol > 0 else 0
    
    # Concentration (Herfindahl index)
    if total_bid_vol > 0:
        bid_shares = bids[:10, 1] / total_bid_vol
        features['bid_concentration'] = np.sum(bid_shares ** 2)
    else:
        features['bid_concentration'] = 0
        
    if total_ask_vol > 0:
        ask_shares = asks[:10, 1] / total_ask_vol
        features['ask_concentration'] = np.sum(ask_shares ** 2)
    else:
        features['ask_concentration'] = 0
    
    # ===== PRESSURE FEATURES =====
    # Volume ratio at different depths
    for depth in [3, 5, 10]:
        bid_vol = np.sum(bids[:depth, 1])
        ask_vol = np.sum(asks[:depth, 1])
        features[f'volume_ratio_{depth}'] = bid_vol / ask_vol if ask_vol > 0 else np.inf
    
    # Notional value pressure
    bid_notional_10 = np.sum(bids[:10, 0] * bids[:10, 1])
    ask_notional_10 = np.sum(asks[:10, 0] * asks[:10, 1])
    features['notional_pressure_10'] = (bid_notional_10 - ask_notional_10) / (bid_notional_10 + ask_notional_10)
    
    # ===== ADVANCED FEATURES =====
    # Smart depth (volume-adjusted by distance from mid)
    bid_distances = (features['mid_price'] - bids[:, 0]) / features['mid_price']
    ask_distances = (asks[:, 0] - features['mid_price']) / features['mid_price']
    
    features['smart_bid_depth'] = np.sum(bids[:, 1] * np.exp(-100 * bid_distances))
    features['smart_ask_depth'] = np.sum(asks[:, 1] * np.exp(-100 * ask_distances))
    features['smart_depth_imbalance'] = (features['smart_bid_depth'] - features['smart_ask_depth']) / (features['smart_bid_depth'] + features['smart_ask_depth'])
    
    # Kyle's lambda (price impact coefficient approximation)
    features['kyle_lambda_bid'] = features['spread'] / (2 * features['bid_volume_5']) if features['bid_volume_5'] > 0 else np.nan
    features['kyle_lambda_ask'] = features['spread'] / (2 * features['ask_volume_5']) if features['ask_volume_5'] > 0 else np.nan
    
    # Amihud illiquidity measure (approximation)
    features['amihud_illiquidity'] = features['spread'] / (features['bid_volume_10'] + features['ask_volume_10'])
    
    # Order book slope (price change per volume)
    if len(bids) >= 5:
        bid_prices_5 = bids[:5, 0]
        bid_volumes_cumsum = np.cumsum(bids[:5, 1])
        features['bid_slope'] = (bid_prices_5[-1] - bid_prices_5[0]) / bid_volumes_cumsum[-1] if bid_volumes_cumsum[-1] > 0 else 0
    else:
        features['bid_slope'] = 0
        
    if len(asks) >= 5:
        ask_prices_5 = asks[:5, 0]
        ask_volumes_cumsum = np.cumsum(asks[:5, 1])
        features['ask_slope'] = (ask_prices_5[-1] - ask_prices_5[0]) / ask_volumes_cumsum[-1] if ask_volumes_cumsum[-1] > 0 else 0
    else:
        features['ask_slope'] = 0
    
    # Volume-weighted spread
    total_vol_10 = features['bid_volume_10'] + features['ask_volume_10']
    features['vw_spread'] = (features['vwap_ask_5'] - features['vwap_bid_5']) if not np.isnan(features['vwap_ask_5']) else features['spread']
    
    # Relative spread (normalized by mid price)
    features['relative_spread'] = features['spread'] / features['mid_price']
    
    # Effective spread (considering volume)
    features['effective_spread'] = 2 * abs(features['microprice'] - features['mid_price'])
    
    return features

In [None]:
extract_orderbook_features(orderbook_data[0])

{'timestamp': 1765241057511,
 'symbol': 'BTC/USDT',
 'exchange': 'binanceus',
 'best_bid': 90390.05,
 'best_ask': 90698.75,
 'mid_price': 90544.4,
 'spread': 308.6999999999971,
 'spread_bps': 34.093770570018364,
 'bid_size_level_0': 2.23167,
 'ask_size_level_0': 0.01966,
 'bid_price_level_0': 90390.05,
 'ask_price_level_0': 90698.75,
 'bid_size_level_1': 0.1275,
 'bid_price_level_1': 90390.04,
 'ask_size_level_1': 0.1001,
 'ask_price_level_1': 90698.76,
 'bid_size_level_2': 0.10912,
 'bid_price_level_2': 90389.98,
 'ask_size_level_2': 2e-05,
 'ask_price_level_2': 90698.82,
 'bid_size_level_3': 0.94748,
 'bid_price_level_3': 90389.82,
 'ask_size_level_3': 0.12896,
 'ask_price_level_3': 90698.85,
 'bid_size_level_4': 0.00163,
 'bid_price_level_4': 90368.32,
 'ask_size_level_4': 0.057,
 'ask_price_level_4': 90699.0,
 'bid_size_level_5': 0.1001,
 'bid_price_level_5': 90282.94,
 'ask_size_level_5': 0.01058,
 'ask_price_level_5': 90699.14,
 'bid_size_level_6': 0.12979,
 'bid_price_level_6': 

In [None]:
# CCXT Implementation Plan
# 1. Add CcxtConfig to config/config.py
# 2. Create ingestion/collectors/ccxt_collector.py
# 3. Update ingestion/orchestrators/ingestion_pipeline.py
# 4. Update config/config.examples.yaml

In [None]:
import sys
import os
import duckdb
import logging
from tqdm import tqdm
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from config import load_config
import pyarrow.parquet as pq
import pyarrow as pa

logger = logging.getLogger(__name__)

config = load_config("config/config.yaml")

logging.basicConfig(
        level=getattr(logging, config.log_level),
        format=config.log_format
    )

  class DatabentoConfig(BaseModel):


In [None]:
pl_df = pl.scan_parquet("F:/processed/coinbase/ticker/**/*.parquet")
df = pl_df.filter(pl.col("product_id") == "BTC-USD").collect()
df

## Compaction

In [None]:
from etl.repartitioner import ParquetCompactor

compactor = ParquetCompactor(
    dataset_dir="F:/processed/coinbase/level2/",  # Directory containing the dataset
    target_file_size_mb=100,  # Target 100MB files
)

stats = compactor.compact(
    min_file_count=2,          # Only compact partitions with 2+ files
    target_file_count=1,       # Consolidate to exactly 1 file per partition
    delete_source_files=True,  # Delete original files after compaction
    dry_run=False,
)
stats

## Syncing

In [1]:
import logging
from config import load_config
from storage.sync import *
from storage.factory import create_sync_source_storage, create_sync_destination_storage

logger = logging.getLogger(__name__)
config = load_config("config/config.yaml")
logging.basicConfig(
        level=getattr(logging, config.log_level),
        format=config.log_format
    )

In [2]:
s3_storage = create_sync_destination_storage(config)  # s3
local_storage = create_sync_source_storage(config)  # local


source_storage = s3_storage # from
destination_storage = local_storage # to

sync = StorageSync(
    source=source_storage,
    destination=destination_storage,
)

2026-01-27 18:50:25,552 - storage.factory - INFO - [sync_destination] Initializing S3 storage: market-data-vault
2026-01-27 18:50:25,643 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2026-01-27 18:50:25,788 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2026-01-27 18:50:25,866 - storage.factory - INFO - [sync_source] Initializing local storage: ./data
2026-01-27 18:50:25,867 - storage.sync - INFO - [StorageSync] Initialized: s3:market-data-vault → local:./data


In [3]:
SYNC_PATHS = [
    "raw/ready/ccxt/orderbook/",
    "raw/ready/ccxt/trades/",
    "raw/ready/ccxt/ticker/",
]
suffix = ""
for sync_path in SYNC_PATHS:
    sync_path = sync_path + "exchange=coinbaseadvanced/"
    print(f"Syncing path: {sync_path}")
    # Sync processed parquet files
    stats = sync.sync(
        source_path=sync_path,
        dest_path=sync_path,
        pattern="**/*.parquet",
        recursive_list_files=False,
        delete_after_transfer=False,
        max_workers=50,
        skip_existing=True,
        dry_run=False,
    )
    print(stats)

2026-01-27 18:50:32,786 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/orderbook/exchange=coinbaseadvanced/ with pattern **/*.parquet


Syncing path: raw/ready/ccxt/orderbook/exchange=coinbaseadvanced/


2026-01-27 18:51:10,548 - storage.sync - INFO - [StorageSync] Found 119751 files to sync
2026-01-27 18:51:10,638 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-01-27 18:51:19,524 - storage.sync - INFO - [StorageSync] Found 112213 existing files at destination
2026-01-27 18:51:19,572 - storage.sync - INFO - [StorageSync] 112213 files already exist, 7538 to transfer
2026-01-27 18:51:19,573 - storage.sync - INFO - [StorageSync] Transferring 7538 files (8215.42 MB)
Transferring files: 100%|██████████| 7538/7538 [03:40<00:00, 34.11file/s, MB=8215.4/8215.4, failed=0]
2026-01-27 18:55:01,098 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=7538, skipped=112213, failed=0, bytes=8215.42MB, duration=268.3s)
2026-01-27 18:55:01,171 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/trades/exchange=coinbaseadvanced/ with pattern **/*.parquet


SyncStats(transferred=7538, skipped=112213, failed=0, bytes=8215.42MB, duration=268.3s)
Syncing path: raw/ready/ccxt/trades/exchange=coinbaseadvanced/


2026-01-27 18:55:16,344 - storage.sync - INFO - [StorageSync] Found 33630 files to sync
2026-01-27 18:55:16,383 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-01-27 18:55:23,023 - storage.sync - INFO - [StorageSync] Found 30386 existing files at destination
2026-01-27 18:55:23,042 - storage.sync - INFO - [StorageSync] 30386 files already exist, 3244 to transfer
2026-01-27 18:55:23,043 - storage.sync - INFO - [StorageSync] Transferring 3244 files (2322.02 MB)
Transferring files: 100%|██████████| 3244/3244 [01:12<00:00, 44.74file/s, MB=2322.0/2322.0, failed=0] 
2026-01-27 18:56:35,803 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=3244, skipped=30386, failed=0, bytes=2322.02MB, duration=94.6s)
2026-01-27 18:56:35,832 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/ticker/exchange=coinbaseadvanced/ with pattern **/*.parquet


SyncStats(transferred=3244, skipped=30386, failed=0, bytes=2322.02MB, duration=94.6s)
Syncing path: raw/ready/ccxt/ticker/exchange=coinbaseadvanced/


2026-01-27 18:56:53,831 - storage.sync - INFO - [StorageSync] Found 36433 files to sync
2026-01-27 18:56:53,869 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-01-27 18:57:01,107 - storage.sync - INFO - [StorageSync] Found 32549 existing files at destination
2026-01-27 18:57:01,131 - storage.sync - INFO - [StorageSync] 32549 files already exist, 3884 to transfer
2026-01-27 18:57:01,133 - storage.sync - INFO - [StorageSync] Transferring 3884 files (7261.31 MB)
Transferring files: 100%|██████████| 3884/3884 [03:12<00:00, 20.20file/s, MB=7261.3/7261.3, failed=0]
2026-01-27 19:00:13,725 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=3884, skipped=32549, failed=0, bytes=7261.31MB, duration=217.9s)


SyncStats(transferred=3884, skipped=32549, failed=0, bytes=7261.31MB, duration=217.9s)


## Deleting

In [None]:
from storage.base import batch_delete_files

### Delete Non-Partitioned Files Only

To delete only the non-partitioned parquet files (in immediate directory) while keeping partitioned ones (in subdirectories), use `recursive=False`:

In [None]:
# Get only non-partitioned files (immediate directory, not recursive)
# This will match: raw/ready/ccxt/ticker/*.parquet
# But NOT: raw/ready/ccxt/ticker/exchange=binanceus/**/*.parquet
non_partitioned_files = storage.list_files(
    path="raw/ready/ccxt/ticker/",
    pattern="*.parquet",
    recursive=False  # Key: only immediate directory
)

print(f"Found {len(non_partitioned_files)} non-partitioned files")

Found 0 non-partitioned files


In [None]:
result = batch_delete_files(
    storage, 
    paths=[f["path"] for f in non_partitioned_files],
    dry_run=False
)
result

2025-12-20 02:28:50,184 - storage.base - INFO - Batch delete complete: 170 deleted, 0 failed out of 170 files


{'deleted': 170,
 'failed': 0,
 'errors': [],
 'files': ['raw/ready/ccxt/trades/segment_20251212T17_00001.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00007.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00010.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00011.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00014.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00002.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00005.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00006.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00007.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00008.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00009.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00010.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00001.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00002.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00003.parquet',
  'raw/ready/ccxt/trades/segme

In [None]:
# To actually delete (remove dry_run=True)
# result = batch_delete_files(
#     storage, 
#     paths=[f["path"] for f in non_partitioned_files],
#     dry_run=False
# )
# print(f"Deleted {result['deleted']} files, {result['failed']} failed")

### Delete Multiple Channels at Once

In [None]:
# Delete non-partitioned files across multiple channels
channels = ["ticker", "trades", "orderbook"]
all_files_to_delete = []

for channel in channels:
    files = storage.list_files(
        path=f"raw/ready/ccxt/{channel}/",
        pattern="*.parquet",
        recursive=False  # Only immediate directory
    )
    all_files_to_delete.extend([f["path"] for f in files])
    print(f"{channel}: {len(files)} non-partitioned files")

print(f"\nTotal: {len(all_files_to_delete)} files to delete")

# Dry run
result = batch_delete_files(storage, paths=all_files_to_delete, dry_run=True)
print(f"Would delete {len(result['files'])} files")

# To actually delete:
# result = batch_delete_files(storage, paths=all_files_to_delete, dry_run=False)
# print(f"Deleted {result['deleted']} files, {result['failed']} failed")