In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl

---
## Universe Selection

In [None]:
import ccxt.pro as ccxtpro
import time
import pandas as pd
import numpy as np

In [None]:
# Exchange clients: e1 = Coinbase Advanced, e2 = Binance.US, e3 = Kraken.
e1, e2, e3 = (
    ccxtpro.coinbaseadvanced(),
    ccxtpro.binanceus(),
    ccxtpro.kraken(),
)

In [None]:
# Fetch tickers for all symbols to get volume data
tickers = await e3.fetch_tickers()

# Convert to list of tuples (symbol, volume) and sort by volume
volume_data = [(symbol, ticker.get('quoteVolume', 0)) for symbol, ticker in tickers.items() if ticker.get('quoteVolume')]
volume_data_sorted = sorted(volume_data, key=lambda x: x[1], reverse=True)

# Get top N symbols
N = 10000000
top_symbols = [symbol for symbol, volume in volume_data_sorted[:N]]

print(f"Top {N} symbols by 24h quote volume:")
for i, (symbol, volume) in enumerate(volume_data_sorted[:N], 1):
    print(f"{i}. {symbol}: ${volume:,.0f}")

Top 10000000 symbols by 24h quote volume:
1. BTC/USD: $322,993,564
2. USDT/USD: $267,667,174
3. ETH/USD: $162,346,716
4. USDC/EUR: $106,835,998
5. SOL/USD: $92,497,122
6. BTC/EUR: $66,292,379
7. USDC/USD: $66,033,869
8. EUR/USD: $54,426,888
9. USDT/EUR: $51,089,303
10. XRP/USD: $45,857,165
11. XMR/USD: $32,851,334
12. USDC/USDT: $32,229,453
13. ETH/EUR: $30,238,118
14. BTC/USDC: $28,446,542
15. DASH/USD: $25,317,608
16. SUI/USD: $22,011,569
17. BTC/JPY: $19,697,685
18. XMR/USDT: $19,279,475
19. ZEC/USD: $18,950,293
20. BTC/USDT: $17,646,357
21. LTC/USD: $15,235,481
22. SOL/EUR: $12,609,798
23. DOGE/USD: $12,059,357
24. ETH/USDC: $11,087,691
25. XRP/EUR: $10,547,699
26. ICP/USD: $10,232,376
27. USDC/GBP: $9,980,570
28. ETH/USDT: $9,878,240
29. GBP/USD: $9,797,884
30. SOL/USDT: $9,278,346
31. SOL/USDC: $8,942,754
32. FARTCOIN/USD: $8,924,330
33. ADA/USD: $8,432,785
34. USD/JPY: $8,129,082
35. TAO/USD: $5,881,874
36. BTC/GBP: $5,085,211
37. EUR/JPY: $5,082,479
38. LINK/USD: $4,858,533
39.

In [None]:
# Define universe size
universe_size = 200

# Keep USD-quoted pairs only, dropping bases that start with "USD"
# (e.g. USDT/USD, USDC/USD). The variable keeps its existing name `usdc_symbols`
# even though the filter is on the "/USD" quote, not "/USDC".
# NOTE(review): fiat pairs such as EUR/USD and GBP/USD still pass this filter -
# confirm they are meant to be part of the universe.
usdc_symbols = [
    (sym, vol)
    for sym, vol in volume_data_sorted
    if sym.endswith('/USD') and not sym.startswith("USD")
]

# Define group sizes (they are expected to add up to universe_size)
tier1_size = 40  # Top 40 most liquid
tier2_size = 80  # Next 80 moderately liquid
tier3_size = 80  # Next 80 less liquid but still tradeable

# Cumulative slice boundaries. Tier 3 honours tier3_size and is additionally
# capped at universe_size, so editing any tier size changes the matching tier.
tier1_end = tier1_size
tier2_end = tier1_end + tier2_size
tier3_end = min(tier2_end + tier3_size, universe_size)

# (title, characteristics, (symbol, volume) members) for each tier
tier_slices = [
    (
        f"TIER 1 - INSTITUTIONAL (Top {tier1_size} by volume)",
        "Highest liquidity, tight spreads, heavy competition",
        usdc_symbols[:tier1_end],
    ),
    (
        f"TIER 2 - COMPETITIVE (Next {tier2_size} by volume)",
        "Good liquidity, reasonable spreads, moderate competition",
        usdc_symbols[tier1_end:tier2_end],
    ),
    (
        f"TIER 3 - NICHE/EDGE (Next {tier3_size} by volume)",
        "Adequate liquidity, wider spreads, potential edge opportunities",
        usdc_symbols[tier2_end:tier3_end],
    ),
]

# Create the three tiers
tier1_institutional = [sym for sym, _ in tier_slices[0][2]]
tier2_competitive = [sym for sym, _ in tier_slices[1][2]]
tier3_niche = [sym for sym, _ in tier_slices[2][2]]

# Set to True to print every member of every tier (long output)
show_tier_details = False
if show_tier_details:
    print(f"Universe Size: {universe_size}")
    for title, traits, members in tier_slices:
        print(f"\n{'='*80}")
        print(title)
        print(f"{'='*80}")
        print(f"Characteristics: {traits}")
        if members:  # a tier can be empty when the exchange lists few USD pairs
            print(f"Volume range: ${members[0][1]:,.0f} - ${members[-1][1]:,.0f}")
        for i, (sym, vol) in enumerate(members, 1):
            print(f"{i:2d}. {sym:20s} ${vol:>15,.0f}")

# Create final symbols list for the universe
symbols = tier1_institutional + tier2_competitive + tier3_niche

print(f"\n{'='*80}")
print(f"SUMMARY")
print(f"{'='*80}")
print(f"Total symbols in universe: {len(symbols)}")
print(f"Tier 1 (Institutional): {len(tier1_institutional)}")
print(f"Tier 2 (Competitive): {len(tier2_competitive)}")
print(f"Tier 3 (Niche/Edge): {len(tier3_niche)}")


SUMMARY
Total symbols in universe: 200
Tier 1 (Institutional): 40
Tier 2 (Competitive): 80
Tier 3 (Niche/Edge): 80


In [None]:
# Number of symbols that made it into the universe
len(symbols)

200

In [None]:
# Preview the first 25 symbols of the universe
print(symbols[:25])

['BTC/USD', 'ETH/USD', 'SOL/USD', 'EUR/USD', 'XRP/USD', 'XMR/USD', 'DASH/USD', 'SUI/USD', 'ZEC/USD', 'LTC/USD', 'DOGE/USD', 'ICP/USD', 'GBP/USD', 'FARTCOIN/USD', 'ADA/USD', 'TAO/USD', 'LINK/USD', 'BCH/USD', 'PAXG/USD', 'PUMP/USD', 'PEPE/USD', 'CC/USD', 'RENDER/USD', 'XAUT/USD', 'WIF/USD']


## Syncing

In [None]:
import logging
from config import load_config
from storage.sync import *
from storage.factory import create_sync_source_storage, create_sync_destination_storage

# Module logger and project configuration.
logger = logging.getLogger(__name__)
config = load_config("config/config.yaml")

# Logging level name and message format both come from the loaded config.
logging.basicConfig(level=getattr(logging, config.log_level), format=config.log_format)

# In the config the sync *destination* is the S3 bucket and the sync *source*
# is the local data directory (see the factory log lines in this cell's output),
# hence the seemingly inverted factory calls.
s3_storage = create_sync_destination_storage(config)  # s3
local_storage = create_sync_source_storage(config)  # local

2026-02-05 18:34:11,437 - storage.factory - INFO - [sync_destination] Initializing S3 storage: market-data-vault
2026-02-05 18:34:11,634 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2026-02-05 18:34:11,783 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2026-02-05 18:34:11,850 - storage.factory - INFO - [sync_source] Initializing local storage: ./data


In [2]:
# Transfer direction for this notebook: from the S3 bucket to local disk.
sync = StorageSync(source=s3_storage, destination=local_storage)

2026-02-05 18:34:14,046 - storage.sync - INFO - [StorageSync] Initialized: s3:market-data-vault → local:./data


In [3]:
# Dataset roots to synchronise
SYNC_PATHS = [
    "raw/ready/ccxt/orderbook/",
    "raw/ready/ccxt/trades/",
    "raw/ready/ccxt/ticker/",
]
suffix = ""  # NOTE: not referenced anywhere in this cell

# Only this exchange partition of each dataset is synced.
exchange_partition = "exchange=coinbaseadvanced/"

# Keyword arguments shared by every sync.sync() call below.
sync_options = dict(
    pattern="**/*.parquet",
    recursive_list_files=False,
    delete_after_transfer=False,
    max_workers=50,
    skip_existing=True,
    dry_run=False,
)

for dataset_root in SYNC_PATHS:
    sync_path = dataset_root + exchange_partition
    print(f"Syncing path: {sync_path}")
    # Same relative path is used on both the source and the destination side
    stats = sync.sync(source_path=sync_path, dest_path=sync_path, **sync_options)
    print(stats)

2026-02-05 18:34:15,935 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/orderbook/exchange=coinbaseadvanced/ with pattern **/*.parquet


Syncing path: raw/ready/ccxt/orderbook/exchange=coinbaseadvanced/


2026-02-05 18:34:57,343 - storage.sync - INFO - [StorageSync] Found 133107 files to sync
2026-02-05 18:34:57,440 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-02-05 18:35:13,433 - storage.sync - INFO - [StorageSync] Found 131137 existing files at destination
2026-02-05 18:35:13,481 - storage.sync - INFO - [StorageSync] 131137 files already exist, 1970 to transfer
2026-02-05 18:35:13,482 - storage.sync - INFO - [StorageSync] Transferring 1970 files (1214.45 MB)
Transferring files: 100%|██████████| 1970/1970 [00:33<00:00, 58.08file/s, MB=1214.4/1214.4, failed=0] 
2026-02-05 18:35:47,477 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=1970, skipped=131137, failed=0, bytes=1214.45MB, duration=91.5s)
2026-02-05 18:35:47,523 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/trades/exchange=coinbaseadvanced/ with pattern **/*.parquet


SyncStats(transferred=1970, skipped=131137, failed=0, bytes=1214.45MB, duration=91.5s)
Syncing path: raw/ready/ccxt/trades/exchange=coinbaseadvanced/


2026-02-05 18:36:04,773 - storage.sync - INFO - [StorageSync] Found 40222 files to sync
2026-02-05 18:36:04,803 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-02-05 18:36:10,315 - storage.sync - INFO - [StorageSync] Found 38670 existing files at destination
2026-02-05 18:36:10,331 - storage.sync - INFO - [StorageSync] 38670 files already exist, 1552 to transfer
2026-02-05 18:36:10,332 - storage.sync - INFO - [StorageSync] Transferring 1552 files (472.27 MB)
Transferring files: 100%|██████████| 1552/1552 [00:18<00:00, 84.94file/s, MB=472.3/472.3, failed=0] 
2026-02-05 18:36:28,674 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=1552, skipped=38670, failed=0, bytes=472.27MB, duration=41.2s)
2026-02-05 18:36:28,691 - storage.sync - INFO - [StorageSync] Listing files in raw/ready/ccxt/ticker/exchange=coinbaseadvanced/ with pattern **/*.parquet


SyncStats(transferred=1552, skipped=38670, failed=0, bytes=472.27MB, duration=41.2s)
Syncing path: raw/ready/ccxt/ticker/exchange=coinbaseadvanced/


2026-02-05 18:36:49,155 - storage.sync - INFO - [StorageSync] Found 43845 files to sync
2026-02-05 18:36:49,190 - storage.sync - INFO - [StorageSync] Checking for existing files at destination...
2026-02-05 18:36:54,678 - storage.sync - INFO - [StorageSync] Found 42277 existing files at destination
2026-02-05 18:36:54,693 - storage.sync - INFO - [StorageSync] 42277 files already exist, 1568 to transfer
2026-02-05 18:36:54,693 - storage.sync - INFO - [StorageSync] Transferring 1568 files (838.66 MB)
Transferring files: 100%|██████████| 1568/1568 [00:25<00:00, 61.52file/s, MB=838.7/838.7, failed=0] 
2026-02-05 18:37:20,245 - storage.sync - INFO - [StorageSync] Sync complete: SyncStats(transferred=1568, skipped=42277, failed=0, bytes=838.66MB, duration=51.6s)


SyncStats(transferred=1568, skipped=42277, failed=0, bytes=838.66MB, duration=51.6s)


## Scan and Fix Corrupted Files

In [4]:
from pathlib import Path
import traceback
from tqdm import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import pyarrow.parquet as pq
from config import load_config
from storage.sync import *
from storage.factory import create_sync_source_storage, create_sync_destination_storage

# Module logger and project configuration.
logger = logging.getLogger(__name__)
config = load_config("config/config.yaml")

# basicConfig has no effect if the root logger already has handlers
# (e.g. when an earlier cell has already configured logging).
logging.basicConfig(level=getattr(logging, config.log_level), format=config.log_format)

# The config's sync destination is the S3 bucket; its sync source is local disk.
s3_storage = create_sync_destination_storage(config)  # s3
local_storage = create_sync_source_storage(config)  # local

# Repairs copy from S3 to local, so S3 is the source of this StorageSync.
sync = StorageSync(source=s3_storage, destination=local_storage)

2026-02-05 18:37:20,394 - storage.factory - INFO - [sync_destination] Initializing S3 storage: market-data-vault
2026-02-05 18:37:20,401 - storage.factory - INFO - [sync_source] Initializing local storage: ./data
2026-02-05 18:37:20,402 - storage.sync - INFO - [StorageSync] Initialized: s3:market-data-vault → local:./data


In [5]:
# Dataset roots whose local parquet files are validated by the loop in this cell
SYNC_PATHS = [
    "raw/ready/ccxt/orderbook/",
    "raw/ready/ccxt/trades/",
    "raw/ready/ccxt/ticker/",
]

# Tuning for ROG Flow Z13 - high parallelism for I/O bound tasks
# Used as max_workers for the ThreadPoolExecutor that runs the validation.
MAX_WORKERS = 32  # Adjust based on your network/disk bandwidth


def fast_validate_parquet(file_path: str) -> bool:
    """
    Fast validation - parse only the parquet metadata instead of the full file.

    Args:
        file_path: Path of the parquet file to check.

    Returns:
        True if ``pq.read_metadata`` succeeds; False if it raises for any
        reason. Note that a missing or unreadable file is therefore reported
        the same way as a corrupted one.

    The failure reason is logged as a warning, because callers in this notebook
    react to a False result by re-downloading or deleting the file.
    """
    try:
        # Reads the file metadata only, not the row data
        pq.read_metadata(file_path)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Parquet validation failed for %s: %r", file_path, exc
        )
        return False
    return True


def validate_and_repair(parquet_file: dict, local_storage, s3_storage, sync) -> dict | None:
    """
    Validate a single file and attempt repair if corrupted.
    Returns problem info dict if file is still broken after repair, else None.
    """
    file_relative_path = str(parquet_file['path'])
    file_full_path = local_storage.get_full_path(file_relative_path)
    
    # Fast validation first
    if fast_validate_parquet(file_full_path):
        return None  # File is OK
    
    # File is corrupted - try to repair from S3
    if s3_storage.exists(file_relative_path):
        sync_resp = sync._transfer_file(
            src_path=file_relative_path,
            dst_path=file_relative_path,
            delete_after=False
        )
        if sync_resp.get('success'):
            # Re-validate after sync
            if fast_validate_parquet(file_full_path):
                return {"status": "repaired", "path": file_relative_path}
            
            # Still broken - delete from both
            try:
                s3_storage.delete(file_relative_path)
                local_storage.delete(file_relative_path)
            except Exception:
                pass
            return {"status": "deleted", "path": file_relative_path, "error": "corrupted_in_s3"}
    
    # Not in S3, just delete local
    try:
        local_storage.delete(file_relative_path)
    except Exception:
        pass
    return {"status": "deleted_local_only", "path": file_relative_path, "error": "not_in_s3"}


# Main execution with parallel processing
for sync_path in SYNC_PATHS:
    print(f"\n{'='*80}")
    print(f"Checking: {sync_path}")
    print(f"{'='*80}")
    
    parquet_files = local_storage.list_files(sync_path, pattern="**/*.parquet")
    total_files = len(parquet_files)
    print(f"Found {total_files} parquet files to validate")
    
    problem_files = []
    repaired_files = []
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all validation tasks
        futures = {
            executor.submit(validate_and_repair, pf, local_storage, s3_storage, sync): pf 
            for pf in parquet_files
        }
        
        # Process results with progress bar
        for future in tqdm(as_completed(futures), total=total_files, desc="Validating"):
            result = future.result()
            if result:
                if result.get("status") == "repaired":
                    repaired_files.append(result)
                else:
                    problem_files.append(result)
    
    print(f"\n{'='*80}")
    print(f"Summary for {sync_path}:")
    print(f"  Total files checked: {total_files}")
    print(f"  Files repaired from S3: {len(repaired_files)}")
    print(f"  Files deleted (unrecoverable): {len(problem_files)}")
    print(f"{'='*80}")


Checking: raw/ready/ccxt/orderbook/
Found 133107 parquet files to validate


Validating: 100%|██████████| 133107/133107 [01:33<00:00, 1428.00it/s]



Summary for raw/ready/ccxt/orderbook/:
  Total files checked: 133107
  Files repaired from S3: 0
  Files deleted (unrecoverable): 148

Checking: raw/ready/ccxt/trades/
Found 40222 parquet files to validate


Validating: 100%|██████████| 40222/40222 [00:42<00:00, 947.51it/s] 



Summary for raw/ready/ccxt/trades/:
  Total files checked: 40222
  Files repaired from S3: 0
  Files deleted (unrecoverable): 114

Checking: raw/ready/ccxt/ticker/
Found 43845 parquet files to validate


Validating: 100%|██████████| 43845/43845 [01:58<00:00, 371.03it/s] 


Summary for raw/ready/ccxt/ticker/:
  Total files checked: 43845
  Files repaired from S3: 0
  Files deleted (unrecoverable): 125



