In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl

In [20]:
# Lazy frames over the raw CCXT captures, one per channel.
RAW_CCXT_DIR = "data/raw/ready/ccxt"
raw_ticker_pl = pl.scan_parquet(f"{RAW_CCXT_DIR}/ticker/*.parquet")
raw_trades_pl = pl.scan_parquet(f"{RAW_CCXT_DIR}/trades/*.parquet")
raw_orderbook_pl = pl.scan_parquet(f"{RAW_CCXT_DIR}/orderbook/*.parquet")

In [21]:
# Lazy frames over the processed ("silver") feature files, searched recursively.
# NOTE(review): the ETL cells in this notebook write to "data/silver/...", not
# "data/processed/silver/..." — confirm which root is current.
SILVER_DIR = "data/processed/silver"
ticker_features_pl = pl.scan_parquet(f"{SILVER_DIR}/ticker/**/*.parquet")
trades_features_pl = pl.scan_parquet(f"{SILVER_DIR}/trades/**/*.parquet")
orderbook_features_pl = pl.scan_parquet(f"{SILVER_DIR}/orderbook/**/*.parquet")

## Universe Selection

In [None]:
import ccxt.pro as ccxtpro
import time
import pandas as pd
import numpy as np

In [2]:
# Exchange clients: e1 is Coinbase Advanced, e2 is Binance.US.
e1, e2 = ccxtpro.coinbaseadvanced(), ccxtpro.binanceus()

In [11]:
# Fetch tickers for all symbols to get volume data
tickers = await e2.fetch_tickers()

# Convert to list of tuples (symbol, volume) and sort by volume
volume_data = [(symbol, ticker.get('quoteVolume', 0)) for symbol, ticker in tickers.items() if ticker.get('quoteVolume')]
volume_data_sorted = sorted(volume_data, key=lambda x: x[1], reverse=True)

# Get top N symbols
N = 10000000
top_symbols = [symbol for symbol, volume in volume_data_sorted[:N]]

print(f"Top {N} symbols by 24h quote volume:")
for i, (symbol, volume) in enumerate(volume_data_sorted[:N], 1):
    print(f"{i}. {symbol}: ${volume:,.0f}")

Top 10000000 symbols by 24h quote volume:
1. BTC/USDT: $3,136,391
2. XRP/USDT: $1,549,435
3. BTC/USD: $1,310,401
4. USDT/USD: $1,297,771
5. ETH/USD: $1,223,673
6. ETH/USDT: $829,197
7. BTC/USDC: $353,840
8. SOL/USD: $322,285
9. SOL/USDT: $275,944
10. XRP/USD: $271,699
11. ADA/USDT: $194,298
12. DOGE/USDT: $170,215
13. ZEC/USDT: $160,985
14. USDC/USDT: $148,270
15. ADA/USD: $140,129
16. BNB/USD: $140,076
17. SOL/USDC: $139,471
18. BNB/USDT: $123,596
19. DOGE/USD: $87,728
20. PENGU/USDT: $50,416
21. PEPE/USD: $39,368
22. USDC/USD: $37,982
23. HYPE/USDT: $35,559
24. ENA/USDT: $34,375
25. ASTER/USDT: $34,255
26. FLOKI/USDT: $34,010
27. SUI/USDT: $32,300
28. AVAX/USDT: $22,199
29. LTC/USDT: $20,316
30. LINK/USDT: $18,998
31. AXL/USDT: $17,958
32. HYPE/USD: $17,904
33. XLM/USDT: $16,194
34. HBAR/USDT: $14,842
35. FET/USDT: $14,528
36. DOT/USDT: $13,204
37. DASH/USDT: $13,074
38. ETH/USDC: $12,565
39. BCH/USDT: $12,554
40. TRUMP/USDT: $11,453
41. LINK/USD: $11,450
42. JUP/USDT: $11,315
43. PU

In [None]:
# Tier sizes: how many symbols each liquidity tier takes, in volume-rank order
tier1_size = 40  # Top 40 most liquid
tier2_size = 80  # Next 80 moderately liquid
tier3_size = 80  # Next 80 less liquid but still tradeable

# Define universe size (derived so it cannot drift from the tier sizes)
universe_size = tier1_size + tier2_size + tier3_size

# Keep only /USDC-quoted pairs and drop bases starting with "USD" (e.g. USDT/USDC).
# NOTE(review): the saved outputs of the cells that follow list /USDT pairs, so
# they come from an earlier run — re-run to refresh them.
usdc_symbols = [
    (sym, vol)
    for sym, vol in volume_data_sorted
    if sym.endswith('/USDC') and not sym.startswith("USD")
]

# Create the three tiers. Slicing never raises, so a tier is simply shorter
# (or empty) when fewer symbols pass the filter than the tier sizes ask for.
tier1_end = tier1_size
tier2_end = tier1_size + tier2_size
tier1_institutional = [sym for sym, _ in usdc_symbols[:tier1_end]]
tier2_competitive = [sym for sym, _ in usdc_symbols[tier1_end:tier2_end]]
tier3_niche = [sym for sym, _ in usdc_symbols[tier2_end:universe_size]]

# Create final symbols list for the universe
symbols = tier1_institutional + tier2_competitive + tier3_niche

print(f"\n{'='*80}")
print(f"SUMMARY")
print(f"{'='*80}")
print(f"Total symbols in universe: {len(symbols)}")
print(f"Tier 1 (Institutional): {len(tier1_institutional)}")
print(f"Tier 2 (Competitive): {len(tier2_competitive)}")
print(f"Tier 3 (Niche/Edge): {len(tier3_niche)}")


SUMMARY
Total symbols in universe: 4
Tier 1 (Institutional): 4
Tier 2 (Competitive): 0
Tier 3 (Niche/Edge): 0


In [13]:
len(symbols)

175

In [14]:
print(symbols)

['BTC/USDT', 'XRP/USDT', 'ETH/USDT', 'SOL/USDT', 'ADA/USDT', 'DOGE/USDT', 'ZEC/USDT', 'BNB/USDT', 'PENGU/USDT', 'HYPE/USDT', 'ENA/USDT', 'ASTER/USDT', 'FLOKI/USDT', 'SUI/USDT', 'AVAX/USDT', 'LTC/USDT', 'LINK/USDT', 'AXL/USDT', 'XLM/USDT', 'HBAR/USDT', 'FET/USDT', 'DOT/USDT', 'DASH/USDT', 'BCH/USDT', 'TRUMP/USDT', 'JUP/USDT', 'PURR/USDT', 'DUSK/USDT', 'SHIB/USDT', 'ZEN/USDT', 'IOTA/USDT', 'VIRTUAL/USDT', 'ICP/USDT', 'ONE/USDT', 'KDA/USDT', 'XDC/USDT', 'MAGIC/USDT', 'POL/USDT', 'FIL/USDT', 'ALGO/USDT', 'PEPE/USDT', 'AAVE/USDT', 'REN/USDT', 'ATOM/USDT', 'MKR/USDT', 'RENDER/USDT', 'ONT/USDT', 'BONK/USDT', 'APE/USDT', 'VET/USDT', 'WIF/USDT', 'AVNT/USDT', 'APT/USDT', 'DGB/USDT', 'THETA/USDT', 'ZIL/USDT', 'ETC/USDT', 'SAND/USDT', 'KNC/USDT', 'UNI/USDT', 'NEO/USDT', 'ADX/USDT', 'FTM/USDT', 'PUMP/USDT', '1000MOG/USDT', 'NMR/USDT', 'BAT/USDT', 'LTO/USDT', 'WLFI/USDT', 'PAXG/USDT', 'ICX/USDT', 'GALA/USDT', 'ARB/USDT', 'XNO/USDT', 'G/USDT', 'VTHO/USDT', 'CRV/USDT', 'STMX/USDT', 'MXC/USDT', 'ONG/US

In [3]:
import json

# Read NDJSON file (one JSON document per line).
# The path is relative to the project root, like the other data paths in this
# notebook — assumes the kernel is started from that directory.
fpath = "data/raw/ready/ccxt_binanceus/segment_20251209T00_00001.ndjson"
with open(fpath, 'r', encoding='utf-8') as f:
    # Skip blank lines instead of letting json.loads fail on them.
    ndjson_data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Keep only the records whose 'type' field marks them as order-book snapshots.
orderbook_data = list(filter(lambda record: record['type'] == 'orderbook', ndjson_data))

In [6]:
orderbook_entry = orderbook_data[0]
orderbook_entry

{'type': 'orderbook',
 'exchange': 'binanceus',
 'symbol': 'BTC/USDT',
 'method': 'watchOrderBook',
 'data': {'bids': [[90390.05, 2.23167],
   [90390.04, 0.1275],
   [90389.98, 0.10912],
   [90389.82, 0.94748],
   [90368.32, 0.00163],
   [90282.94, 0.1001],
   [90282.93, 0.12979],
   [90282.92, 0.00082],
   [90268.75, 0.00011],
   [90236.1, 0.27416],
   [90202.86, 0.0001],
   [90191.0, 0.32148],
   [90163.7, 0.33093],
   [90129.95, 0.00037],
   [90052.63, 0.0016],
   [90048.7, 0.10849],
   [90000.0, 0.00549],
   [89964.7, 0.11121],
   [89919.82, 0.00234],
   [89892.53, 7e-05],
   [89886.91, 0.00158],
   [89879.1, 0.10731],
   [89842.2, 0.10638],
   [89833.12, 0.00025],
   [89810.37, 0.00011],
   [89780.13, 0.00089],
   [89771.76, 0.00043],
   [89744.45, 0.00083],
   [89714.33, 7e-05],
   [89686.82, 4e-05],
   [89671.73, 0.00127],
   [89580.68, 0.00037],
   [89556.03, 0.00014],
   [89534.74, 0.00073],
   [89516.66, 0.00079],
   [89510.78, 0.00044],
   [89500.0, 0.00138],
   [89457.24, 0

In [2]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is exactly zero."""
    if denominator == 0:
        return np.nan
    return numerator / denominator


def _as_levels(levels):
    """Convert a list of [price, size] levels to a 2-D float array, even when empty."""
    arr = np.asarray(levels, dtype=float)
    if arr.size == 0:
        # np.asarray([]) is 1-D; keep two columns so `arr[:k, 1]` slicing still works.
        return np.empty((0, 2))
    return arr


def extract_orderbook_features(orderbook_entry):
    """
    Extract orderbook features from a single orderbook snapshot.

    Args:
        orderbook_entry: dict with keys 'collected_at', 'symbol', 'exchange' and
            'data'; 'data' holds 'bids' and 'asks' as lists of [price, size]
            levels, best level first.

    Returns:
        dict mapping feature name to value. The depth-based keys
        ('bid_volume_5', 'bid_volume_10', 'bid_volume_20', 'bid_volume_50', ...)
        are always present, whatever the book depth. Ratios whose denominator is
        zero are NaN. Per-level keys ('bid_price_level_i', ...) are emitted for
        the first min(len(bids), len(asks), 10) levels.

    Raises:
        KeyError: if one of the expected keys is missing from the entry.
    """
    data = orderbook_entry['data']
    bids = _as_levels(data['bids'])  # [[price, size], ...]
    asks = _as_levels(data['asks'])  # [[price, size], ...]
    has_bids = len(bids) > 0
    has_asks = len(asks) > 0

    features = {
        'timestamp': orderbook_entry['collected_at'],
        'symbol': orderbook_entry['symbol'],
        'exchange': orderbook_entry['exchange'],
    }

    # ===== BASIC FEATURES =====
    # Best bid/ask
    features['best_bid'] = bids[0][0] if has_bids else np.nan
    features['best_ask'] = asks[0][0] if has_asks else np.nan
    features['mid_price'] = (features['best_bid'] + features['best_ask']) / 2
    features['spread'] = features['best_ask'] - features['best_bid']
    features['spread_bps'] = _safe_ratio(features['spread'], features['mid_price']) * 10000

    # Volume at best
    features['bid_size_level_0'] = bids[0][1] if has_bids else 0
    features['ask_size_level_0'] = asks[0][1] if has_asks else 0

    # ===== DEPTH FEATURES =====
    # Per-level price and size, for the levels present on both sides (max 10)
    n_levels = min(len(bids), len(asks), 10)
    for i in range(n_levels):
        features[f'bid_size_level_{i}'] = bids[i][1]
        features[f'bid_price_level_{i}'] = bids[i][0]
        features[f'ask_size_level_{i}'] = asks[i][1]
        features[f'ask_price_level_{i}'] = asks[i][0]

    # Total volume by depth. Both sides use the same number of levels (the
    # shallower side caps it), but the key always carries the nominal depth so
    # the feature schema does not change with the book depth.
    for depth in (5, 10, 20, 50):
        levels_used = min(depth, len(bids), len(asks))
        bid_vol = np.sum(bids[:levels_used, 1])
        ask_vol = np.sum(asks[:levels_used, 1])
        features[f'bid_volume_{depth}'] = bid_vol
        features[f'ask_volume_{depth}'] = ask_vol
        features[f'total_volume_{depth}'] = bid_vol + ask_vol
        features[f'volume_imbalance_{depth}'] = _safe_ratio(bid_vol - ask_vol, bid_vol + ask_vol)

    # ===== IMBALANCE FEATURES =====
    # Size imbalance at the best level
    best_bid_size = features['bid_size_level_0']
    best_ask_size = features['ask_size_level_0']
    features['ofi_level_0'] = _safe_ratio(best_bid_size - best_ask_size, best_bid_size + best_ask_size)

    # Notional-weighted imbalance over the top 5 levels
    bid_notional_5 = np.sum(bids[:5, 0] * bids[:5, 1])
    ask_notional_5 = np.sum(asks[:5, 0] * asks[:5, 1])
    features['weighted_oi_5'] = _safe_ratio(bid_notional_5 - ask_notional_5, bid_notional_5 + ask_notional_5)

    # ===== PRICE LEVEL FEATURES =====
    # Mean price step between consecutive levels (top 10)
    features['bid_level_spacing_mean'] = np.mean(np.diff(bids[:10, 0])) if len(bids) >= 2 else 0
    features['ask_level_spacing_mean'] = np.mean(np.diff(asks[:10, 0])) if len(asks) >= 2 else 0

    # Size resting within 0.1% (10 bps) of the best price on each side
    features['bid_depth_10bps'] = np.sum(bids[bids[:, 0] >= features['best_bid'] * 0.999, 1]) if has_bids else 0
    features['ask_depth_10bps'] = np.sum(asks[asks[:, 0] <= features['best_ask'] * 1.001, 1]) if has_asks else 0

    # ===== LIQUIDITY FEATURES =====
    # Volume-weighted average price of the top 5 levels (NaN with fewer than 5)
    features['vwap_bid_5'] = _safe_ratio(bid_notional_5, np.sum(bids[:5, 1])) if len(bids) >= 5 else np.nan
    features['vwap_ask_5'] = _safe_ratio(ask_notional_5, np.sum(asks[:5, 1])) if len(asks) >= 5 else np.nan
    features['vwap_spread'] = features['vwap_ask_5'] - features['vwap_bid_5']

    # Microprice: best prices weighted by the opposite side's size
    features['microprice'] = _safe_ratio(
        features['best_bid'] * best_ask_size + features['best_ask'] * best_bid_size,
        best_bid_size + best_ask_size,
    )

    # ===== SHAPE FEATURES =====
    # Distribution of liquidity
    bid_volume_cumsum = np.cumsum(bids[:, 1])
    ask_volume_cumsum = np.cumsum(asks[:, 1])

    total_bid_vol = bid_volume_cumsum[-1] if len(bid_volume_cumsum) > 0 else 0
    total_ask_vol = ask_volume_cumsum[-1] if len(ask_volume_cumsum) > 0 else 0

    # Index of the first level at which cumulative size reaches 50% of the side
    features['bid_50pct_depth'] = np.argmax(bid_volume_cumsum >= total_bid_vol * 0.5) if total_bid_vol > 0 else 0
    features['ask_50pct_depth'] = np.argmax(ask_volume_cumsum >= total_ask_vol * 0.5) if total_ask_vol > 0 else 0

    # Concentration: sum of squared shares of the top 10 levels.
    # NOTE(review): shares are taken relative to the volume of ALL levels, so they
    # do not sum to 1 when the book has more than 10 levels — confirm intended.
    if total_bid_vol > 0:
        bid_shares = bids[:10, 1] / total_bid_vol
        features['bid_concentration'] = np.sum(bid_shares ** 2)
    else:
        features['bid_concentration'] = 0

    if total_ask_vol > 0:
        ask_shares = asks[:10, 1] / total_ask_vol
        features['ask_concentration'] = np.sum(ask_shares ** 2)
    else:
        features['ask_concentration'] = 0

    # ===== PRESSURE FEATURES =====
    # Bid/ask volume ratio at different depths (inf when the ask side is empty)
    for depth in (3, 5, 10):
        bid_vol = np.sum(bids[:depth, 1])
        ask_vol = np.sum(asks[:depth, 1])
        features[f'volume_ratio_{depth}'] = bid_vol / ask_vol if ask_vol > 0 else np.inf

    # Notional value pressure
    bid_notional_10 = np.sum(bids[:10, 0] * bids[:10, 1])
    ask_notional_10 = np.sum(asks[:10, 0] * asks[:10, 1])
    features['notional_pressure_10'] = _safe_ratio(bid_notional_10 - ask_notional_10, bid_notional_10 + ask_notional_10)

    # ===== ADVANCED FEATURES =====
    # Smart depth: size discounted exponentially by relative distance from mid
    bid_distances = (features['mid_price'] - bids[:, 0]) / features['mid_price']
    ask_distances = (asks[:, 0] - features['mid_price']) / features['mid_price']

    features['smart_bid_depth'] = np.sum(bids[:, 1] * np.exp(-100 * bid_distances))
    features['smart_ask_depth'] = np.sum(asks[:, 1] * np.exp(-100 * ask_distances))
    features['smart_depth_imbalance'] = _safe_ratio(
        features['smart_bid_depth'] - features['smart_ask_depth'],
        features['smart_bid_depth'] + features['smart_ask_depth'],
    )

    # Kyle's lambda (price impact coefficient approximation)
    features['kyle_lambda_bid'] = features['spread'] / (2 * features['bid_volume_5']) if features['bid_volume_5'] > 0 else np.nan
    features['kyle_lambda_ask'] = features['spread'] / (2 * features['ask_volume_5']) if features['ask_volume_5'] > 0 else np.nan

    # Amihud illiquidity measure (approximation); NaN when there is no volume
    features['amihud_illiquidity'] = _safe_ratio(
        features['spread'],
        features['bid_volume_10'] + features['ask_volume_10'],
    )

    # Order book slope: price change across the top 5 levels per unit of size
    if len(bids) >= 5:
        bid_prices_5 = bids[:5, 0]
        bid_size_5 = np.cumsum(bids[:5, 1])[-1]
        features['bid_slope'] = (bid_prices_5[-1] - bid_prices_5[0]) / bid_size_5 if bid_size_5 > 0 else 0
    else:
        features['bid_slope'] = 0

    if len(asks) >= 5:
        ask_prices_5 = asks[:5, 0]
        ask_size_5 = np.cumsum(asks[:5, 1])[-1]
        features['ask_slope'] = (ask_prices_5[-1] - ask_prices_5[0]) / ask_size_5 if ask_size_5 > 0 else 0
    else:
        features['ask_slope'] = 0

    # Volume-weighted spread; falls back to the quoted spread when EITHER side's
    # 5-level VWAP is unavailable
    features['vw_spread'] = features['vwap_spread'] if not np.isnan(features['vwap_spread']) else features['spread']

    # Relative spread (normalized by mid price)
    features['relative_spread'] = _safe_ratio(features['spread'], features['mid_price'])

    # Effective spread (considering volume)
    features['effective_spread'] = 2 * abs(features['microprice'] - features['mid_price'])

    return features

In [5]:
extract_orderbook_features(orderbook_data[0])

{'timestamp': 1765241057511,
 'symbol': 'BTC/USDT',
 'exchange': 'binanceus',
 'best_bid': 90390.05,
 'best_ask': 90698.75,
 'mid_price': 90544.4,
 'spread': 308.6999999999971,
 'spread_bps': 34.093770570018364,
 'bid_size_level_0': 2.23167,
 'ask_size_level_0': 0.01966,
 'bid_price_level_0': 90390.05,
 'ask_price_level_0': 90698.75,
 'bid_size_level_1': 0.1275,
 'bid_price_level_1': 90390.04,
 'ask_size_level_1': 0.1001,
 'ask_price_level_1': 90698.76,
 'bid_size_level_2': 0.10912,
 'bid_price_level_2': 90389.98,
 'ask_size_level_2': 2e-05,
 'ask_price_level_2': 90698.82,
 'bid_size_level_3': 0.94748,
 'bid_price_level_3': 90389.82,
 'ask_size_level_3': 0.12896,
 'ask_price_level_3': 90698.85,
 'bid_size_level_4': 0.00163,
 'bid_price_level_4': 90368.32,
 'ask_size_level_4': 0.057,
 'ask_price_level_4': 90699.0,
 'bid_size_level_5': 0.1001,
 'bid_price_level_5': 90282.94,
 'ask_size_level_5': 0.01058,
 'ask_price_level_5': 90699.14,
 'bid_size_level_6': 0.12979,
 'bid_price_level_6': 

In [None]:
# CCXT Implementation Plan
# 1. Add CcxtConfig to config/config.py
# 2. Create ingestion/collectors/ccxt_collector.py
# 3. Update ingestion/orchestrators/ingestion_pipeline.py
# 4. Update config/config.examples.yaml

In [1]:
import sys
import os
import duckdb
import logging
from tqdm import tqdm
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from config import load_config
import pyarrow.parquet as pq
import pyarrow as pa

# Load the project configuration first, then configure logging from it.
config = load_config("config/config.yaml")
logging.basicConfig(level=getattr(logging, config.log_level), format=config.log_format)
logger = logging.getLogger(__name__)

  class DatabentoConfig(BaseModel):


In [None]:
# Scan the processed Coinbase ticker files and collect only the BTC-USD rows.
pl_df = pl.scan_parquet("F:/processed/coinbase/ticker/**/*.parquet")
is_btc_usd = pl.col("product_id") == "BTC-USD"
df = pl_df.filter(is_btc_usd).collect()
df

## Compaction

In [None]:
from etl.repartitioner import ParquetCompactor

# Compactor over the Coinbase level2 dataset, targeting 100MB output files.
compactor = ParquetCompactor(dataset_dir="F:/processed/coinbase/level2/", target_file_size_mb=100)

# Real run (dry_run=False): partitions with 2+ files are consolidated to a
# single file each, and the original files are deleted afterwards.
compaction_options = dict(
    min_file_count=2,
    target_file_count=1,
    delete_source_files=True,
    dry_run=False,
)
stats = compactor.compact(**compaction_options)
stats

## Syncing

In [9]:
import logging
from config import load_config
from storage.sync import StorageSync
from storage.factory import create_sync_source_storage, create_sync_destination_storage

# Project configuration drives both the log level and the log format.
config = load_config("config/config.yaml")
logging.basicConfig(level=getattr(logging, config.log_level), format=config.log_format)
logger = logging.getLogger(__name__)

In [10]:
# The configured roles are swapped on purpose here: the configured sync
# *destination* (s3) is used as the source and the configured sync *source*
# (local) as the destination, so files are pulled from S3 down to local disk.
remote_storage = create_sync_destination_storage(config)  # s3
local_storage = create_sync_source_storage(config)  # local
sync = StorageSync(source=remote_storage, destination=local_storage)

2025-12-20 11:18:34,696 - storage.factory - INFO - [sync_destination] Initializing S3 storage: market-data-vault
2025-12-20 11:18:34,701 - storage.factory - INFO - [sync_source] Initializing local storage: ./data
2025-12-20 11:18:34,703 - storage.sync - INFO - [StorageSync] Initialized: s3:market-data-vault → local:./data


In [11]:
# Channel folders to sync; the processed/ entries are kept, disabled, for later runs.
SYNC_PATHS = [
    "raw/ready/ccxt/ticker/",
    "raw/ready/ccxt/trades/",
    "raw/ready/ccxt/orderbook/",
    # "processed/ccxt/ticker/",
    # "processed/ccxt/trades/",
    # "processed/ccxt/orderbook/hf/",
    # "processed/ccxt/orderbook/bars/"
]

# Options shared by every path. With dry_run=True the sync only logs what it
# would transfer.
sync_options = dict(
    pattern="*.parquet",
    recursive_list_files=True,
    delete_after_transfer=False,
    max_workers=42,
    skip_existing=True,
    dry_run=True,
)

for sync_path in SYNC_PATHS:
    print(f"Syncing path: {sync_path}")
    # Same relative path on both sides of the sync
    stats = sync.sync(source_path=sync_path, dest_path=sync_path, **sync_options)
    print(stats)

2025-12-20 11:19:09,570 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/ticker/ with pattern *.parquet


Syncing path: raw/ready/ccxt/ticker/


2025-12-20 11:19:17,699 - storage.sync - INFO - [StorageSync] Found 23558 files to sync
2025-12-20 11:19:17,699 - storage.sync - INFO - [StorageSync] DRY RUN - no files will be transferred
2025-12-20 11:19:17,699 - storage.sync - INFO -   Would transfer: raw/ready/ccxt/ticker/exchange=binanceus/symbol=AAVE-USD/year=2025/month=12/day=20/hour=2/segment_20251220T02_00001.parquet (570351 bytes)
2025-12-20 11:19:17,699 - storage.sync - INFO -   Would transfer: raw/ready/ccxt/ticker/exchange=binanceus/symbol=AAVE-USD/year=2025/month=12/day=20/hour=3/segment_20251220T03_00001.parquet (889751 bytes)
2025-12-20 11:19:17,699 - storage.sync - INFO -   Would transfer: raw/ready/ccxt/ticker/exchange=binanceus/symbol=ADA-USD/year=2025/month=12/day=20/hour=2/segment_20251220T02_00001.parquet (440847 bytes)
2025-12-20 11:19:17,699 - storage.sync - INFO -   Would transfer: raw/ready/ccxt/ticker/exchange=binanceus/symbol=ADA-USD/year=2025/month=12/day=20/hour=3/segment_20251220T03_00001.parquet (1004630

KeyboardInterrupt: 

## Deleting

In [4]:
from storage.base import batch_delete_files

In [6]:
storage = sync.destination
storage

<storage.base.S3Storage at 0x273a6382d80>

### Delete Non-Partitioned Files Only

To delete only the non-partitioned parquet files (those directly in the channel directory) while keeping the partitioned ones (in subdirectories), list the files with `recursive=False`:

In [8]:
# List parquet files that sit directly in the channel directory.
#   matches:        raw/ready/ccxt/ticker/*.parquet
#   does NOT match: raw/ready/ccxt/ticker/exchange=binanceus/**/*.parquet
listing_options = {
    "path": "raw/ready/ccxt/ticker/",
    "pattern": "*.parquet",
    "recursive": False,  # Key: only immediate directory
}
non_partitioned_files = storage.list_files(**listing_options)

non_partitioned_count = len(non_partitioned_files)
print(f"Found {non_partitioned_count} non-partitioned files")

Found 0 non-partitioned files


In [36]:
# Real deletion (dry_run=False) of every file in `non_partitioned_files`.
# Make sure that list was produced by the listing you intend to delete.
paths_to_delete = [file_info["path"] for file_info in non_partitioned_files]
result = batch_delete_files(storage, paths=paths_to_delete, dry_run=False)
result

2025-12-20 02:28:50,184 - storage.base - INFO - Batch delete complete: 170 deleted, 0 failed out of 170 files


{'deleted': 170,
 'failed': 0,
 'errors': [],
 'files': ['raw/ready/ccxt/trades/segment_20251212T17_00001.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00007.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00010.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00011.parquet',
  'raw/ready/ccxt/trades/segment_20251212T17_00014.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00002.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00005.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00006.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00007.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00008.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00009.parquet',
  'raw/ready/ccxt/trades/segment_20251212T18_00010.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00001.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00002.parquet',
  'raw/ready/ccxt/trades/segment_20251212T19_00003.parquet',
  'raw/ready/ccxt/trades/segme

In [None]:
# To actually delete (remove dry_run=True)
# result = batch_delete_files(
#     storage, 
#     paths=[f["path"] for f in non_partitioned_files],
#     dry_run=False
# )
# print(f"Deleted {result['deleted']} files, {result['failed']} failed")

### Delete Multiple Channels at Once

In [None]:
# Collect the non-partitioned files of several channels, then dry-run the delete
channels = ["ticker", "trades", "orderbook"]
all_files_to_delete = []

for channel in channels:
    channel_dir = f"raw/ready/ccxt/{channel}/"
    # recursive=False: only the immediate directory
    files = storage.list_files(path=channel_dir, pattern="*.parquet", recursive=False)
    all_files_to_delete += [f["path"] for f in files]
    print(f"{channel}: {len(files)} non-partitioned files")

total_to_delete = len(all_files_to_delete)
print(f"\nTotal: {total_to_delete} files to delete")

# Dry run
result = batch_delete_files(storage, paths=all_files_to_delete, dry_run=True)
would_delete = len(result['files'])
print(f"Would delete {would_delete} files")

# To actually delete:
# result = batch_delete_files(storage, paths=all_files_to_delete, dry_run=False)
# print(f"Deleted {result['deleted']} files, {result['failed']} failed")

## Upload Local & Cloud

In [None]:
import logging
from pathlib import Path
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

# Imported here so the cell does not rely on an import made in an earlier cell
from tqdm import tqdm

from config import load_config
from storage.factory import (
    create_etl_storage_input,
    create_etl_storage_output,
)
from etl.job import ETLJob

logger = logging.getLogger(__name__)

config = load_config("config/config.yaml")

logging.basicConfig(
    level=getattr(logging, config.log_level),
    format=config.log_format
)

# Create storage backends
storage_input = create_etl_storage_input(config)
storage_output = create_etl_storage_output(config)

logger.info(f"Storage Input:  {storage_input.backend_type} @ {storage_input.base_path}")
logger.info(f"Storage Output: {storage_output.backend_type} @ {storage_output.base_path}")


SOURCE_PATHS = [
    "processed/coinbase/market_trades/",
    "processed/coinbase/ticker/",
    "processed/coinbase/level2/",
]

# Parallelize uploads with thread pool
max_workers = 10  # Adjust based on your bandwidth and system


def upload_file(file_info):
    """Upload a single file to the output storage, then delete it from the input storage.

    Args:
        file_info: mapping with a 'path' key, as returned by storage_input.list_files.

    Returns:
        {'success': True, 'path': ...} on success, or
        {'success': False, 'path': ..., 'error': ...} if any step raised.
    """
    fpath = file_info['path']
    try:
        storage_output.write_file(
            local_path=storage_input.get_full_path(fpath),
            remote_path=fpath
        )
        # Only reached when write_file did not raise
        storage_input.delete(fpath)
        return {'success': True, 'path': fpath}
    except Exception as e:
        return {'success': False, 'path': fpath, 'error': str(e)}


for upload_path in SOURCE_PATHS:
    print(f"\nUploading files from {upload_path}...")
    files_to_upload = storage_input.list_files(upload_path, pattern="**/*.parquet")
    print(f"Found {len(files_to_upload)} files to upload.")

    failed_uploads = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(upload_file, f) for f in files_to_upload]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading files"):
            # upload_file catches its own exceptions, so result() returns a dict
            result = future.result()
            if not result['success']:
                failed_uploads.append(result)
                print(f"Failed to upload {result['path']}: {result['error']}")

    if failed_uploads:
        print(f"\nFailed to upload {len(failed_uploads)} files")
    else:
        print(f"\nSuccessfully uploaded all {len(files_to_upload)} files")

## ETL

In [1]:
from etl.parquet_etl_pipeline import ParquetETLPipeline, ParquetETLConfig

# Configure the ETL pipeline; the settings are gathered in one mapping first
etl_settings = {
    "horizons": [5, 15, 60, 300, 900],
    "bar_durations": [60, 300, 900, 3600],
    "max_levels": 20,
    "ofi_levels": 10,
    "mode": 'hybrid',
}
config = ParquetETLConfig(**etl_settings)

pipeline = ParquetETLPipeline(config)


In [None]:
# Process ticker (fully vectorized): raw CCXT ticker files -> silver ticker output
ticker_input_path = "data/raw/ready/ccxt/ticker"
ticker_output_path = "data/silver/ticker"
ticker_df = pipeline.process_ticker(input_path=ticker_input_path, output_path=ticker_output_path)
ticker_row_count = len(ticker_df)
print(f"Processed {ticker_row_count} ticker records")

In [None]:
# Process trades (fully vectorized): raw CCXT trade files -> silver trades output
trades_input_path = "data/raw/ready/ccxt/trades"
trades_output_path = "data/silver/trades"
trades_df = pipeline.process_trades(input_path=trades_input_path, output_path=trades_output_path)
trades_row_count = len(trades_df)
print(f"Processed {trades_row_count} trade records")

In [None]:
# Process orderbook with trades (hybrid): returns a high-frequency frame and a bar frame
orderbook_job_paths = {
    "orderbook_path": "data/raw/ready/ccxt/orderbook",
    "trades_path": "data/raw/ready/ccxt/trades",
    "output_hf_path": "data/silver/orderbook/hf",
    "output_bars_path": "data/silver/orderbook/bars",
}
hf_df, bars_df = pipeline.process_orderbook_with_trades(**orderbook_job_paths)
hf_row_count, bar_row_count = len(hf_df), len(bars_df)
print(f"Produced {hf_row_count} HF feature rows")
print(f"Produced {bar_row_count} bar aggregates")

In [14]:
import logging
import sys
from pathlib import Path
from datetime import datetime, timedelta
import os

from config import load_config
from storage.factory import (
    create_etl_storage_input,
    create_etl_storage_output,
    get_etl_input_path,
    get_etl_output_path,
    get_processing_path
)
from etl.job import ETLJob

# Read the project configuration and apply its logging settings.
config = load_config("config/config.yaml")
logging.basicConfig(level=getattr(logging, config.log_level), format=config.log_format)
logger = logging.getLogger(__name__)

In [15]:
# Storage backends for the ETL input and output sides
storage_input = create_etl_storage_input(config)
storage_output = create_etl_storage_output(config)

# Resolve the input, output and processing paths for the "coinbase" source
input_path, output_path, processing_path = (
    resolve_path(config, "coinbase")
    for resolve_path in (get_etl_input_path, get_etl_output_path, get_processing_path)
)

# Log the resolved setup, one line per item
for setup_line in (
    f"Storage Input:  {storage_input.backend_type} @ {storage_input.base_path}",
    f"Storage Output: {storage_output.backend_type} @ {storage_output.base_path}",
    f"Input path: {input_path}",
    f"Output path: {output_path}",
    f"Processing path: {processing_path}",
):
    logger.info(setup_line)

2025-12-04 10:56:44,371 - storage.factory - INFO - [etl_input] Initializing local storage: F:/
2025-12-04 10:56:44,373 - storage.factory - INFO - [etl_output] Initializing S3 storage: market-data-vault
2025-12-04 10:56:44,455 - __main__ - INFO - Storage Input:  local @ F:/
2025-12-04 10:56:44,457 - __main__ - INFO - Storage Output: s3 @ market-data-vault
2025-12-04 10:56:44,457 - __main__ - INFO - Input path: raw/ready/coinbase
2025-12-04 10:56:44,460 - __main__ - INFO - Output path: processed/coinbase
2025-12-04 10:56:44,461 - __main__ - INFO - Processing path: raw/processing/coinbase


In [16]:
# Build the per-channel settings for enabled channels only; stays None when
# the ETL config has no (or an empty) `channels` attribute.
etl_channels = getattr(config.etl, 'channels', None)
channel_config = (
    {
        name: {
            "partition_cols": cfg.partition_cols,
            "processor_options": cfg.processor_options,
        }
        for name, cfg in etl_channels.items()
        if cfg.enabled
    }
    if etl_channels
    else None
)
channel_config

{'level2': {'partition_cols': ['product_id', 'date'],
  'processor_options': {'reconstruct_lob': False,
   'compute_features': False,
   'add_derived_fields': True}},
 'market_trades': {'partition_cols': ['product_id', 'date'],
  'processor_options': {'add_derived_fields': True, 'infer_aggressor': False}},
 'ticker': {'partition_cols': ['product_id', 'date'],
  'processor_options': {'add_derived_fields': True}}}

In [17]:
# Assemble the ETL job from the storages, paths and channel settings built above
job_settings = dict(
    storage_input=storage_input,
    storage_output=storage_output,
    input_path=input_path,
    output_path=output_path,
    delete_after_processing=config.etl.delete_after_processing,
    processing_path=processing_path,
    channel_config=channel_config,
)
job = ETLJob(**job_settings)

2025-12-04 10:57:14,545 - etl.processors.coinbase.level2_processor - INFO - [CoinbaseLevel2Processor] Initialized: reconstruct_lob=False, compute_features=False
2025-12-04 10:57:14,546 - etl.processors.raw_parser - INFO - [RawParser] Initialized for source=coinbase, channel_filter=level2
2025-12-04 10:57:14,546 - etl.writers.parquet_writer - INFO - [ParquetWriter] Initialized: storage=s3, compression=snappy
2025-12-04 10:57:14,548 - etl.orchestrators.pipeline - INFO - [ETLPipeline] Initialized: reader=NDJSONReader, processors=RawParser → Level2Processor, writer=ParquetWriter
2025-12-04 10:57:14,548 - etl.processors.coinbase.trades_processor - INFO - [CoinbaseTradesProcessor] Initialized: add_derived_fields=True, infer_aggressor=False
2025-12-04 10:57:14,550 - etl.processors.raw_parser - INFO - [RawParser] Initialized for source=coinbase, channel_filter=market_trades
2025-12-04 10:57:14,551 - etl.writers.parquet_writer - INFO - [ParquetWriter] Initialized: storage=s3, compression=snappy

In [18]:
job.process_all()

2025-12-04 10:58:46,494 - etl.job - INFO - [ETLJob] Scanning for segments in raw/ready/coinbase
2025-12-04 10:58:46,496 - etl.job - INFO - [ETLJob] Found 3 segment(s) to process
2025-12-04 10:58:46,509 - etl.job - INFO - [ETLJob] Processing segment: segment_20251204T18_00011.ndjson
2025-12-04 10:58:46,510 - etl.orchestrators.coinbase_segment_pipeline - INFO - [CoinbaseSegmentPipeline] Processing segment: segment_20251204T18_00011.ndjson
2025-12-04 10:58:46,510 - etl.orchestrators.pipeline - INFO - [ETLPipeline] Executing: F:\raw\processing\coinbase\segment_20251204T18_00011.ndjson → processed/coinbase/level2 (partition_cols=['product_id', 'date'])
2025-12-04 10:58:56,576 - etl.readers.ndjson_reader - INFO - [NDJSONReader] Read 107200 records from segment_20251204T18_00011.ndjson (0 errors)
2025-12-04 10:59:00,366 - etl.writers.parquet_writer - INFO - [ParquetWriter] Wrote 10755 records to processed/coinbase/level2/product_id=ADA-USD/date=2025-12-04/part_20251204T10_114a4663.parquet (91

## Test Startup Migration (active/ → ready/)

In [None]:
# Simulate orphan files in active/ directory by creating test files
import shutil
from pathlib import Path
from storage.factory import create_etl_storage_input
from config import load_config

# NOTE(review): load_config() is called without a path here, unlike the other
# cells that pass "config/config.yaml" — confirm the default is the same file.
config = load_config()
storage = create_etl_storage_input(config)

# Create test orphan file in active/
orphan_partition = "raw/active/ccxt/ticker/exchange=test/symbol=TEST-USD/year=2025/month=1/day=19/hour=14"
test_active_path = Path(storage.base_path) / orphan_partition
test_active_path.mkdir(parents=True, exist_ok=True)

# The content is not real parquet; only the file's presence matters for this test
test_file = test_active_path / "test_orphan.parquet"
test_file.write_text("fake parquet content")

orphan_exists = test_file.exists()
print(f"Created test orphan file: {test_file}")
print(f"File exists: {orphan_exists}")

In [None]:
# Now test the StreamingParquetWriter startup migration
from ingestion.writers.parquet_writer import StreamingParquetWriter
import asyncio

# Create writer instance
writer = StreamingParquetWriter(
    storage=storage,
    active_path="raw/active/ccxt/",
    ready_path="raw/ready/ccxt/",
    source_name="test",
    batch_size=100,
    flush_interval_seconds=5,
    queue_maxsize=10000,
    segment_max_mb=100,
    partition_by=["exchange", "symbol"]
)

# Start the writer - this should trigger migration
await writer.start()

# Check if file was migrated
test_ready_path = Path(storage.base_path) / "raw/ready/ccxt/ticker/exchange=test/symbol=TEST-USD/year=2025/month=1/day=19/hour=14"
migrated_file = test_ready_path / "test_orphan.parquet"

print(f"\n--- Migration Test Results ---")
print(f"Original file exists in active/: {test_file.exists()}")
print(f"Migrated file exists in ready/: {migrated_file.exists()}")

if migrated_file.exists():
    print("✅ SUCCESS: Startup migration working!")
else:
    print("❌ FAILED: File was not migrated")

# Cleanup
await writer.stop()
if migrated_file.exists():
    migrated_file.unlink()
    # Clean up empty dirs
    for parent in migrated_file.parents:
        if parent.exists() and not any(parent.iterdir()):
            parent.rmdir()