In [1]:
import sys
import os
import duckdb
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from config import load_config

  class DatabentoConfig(BaseModel):


In [None]:
# Scan every processed ticker Parquet file, keep only the BTC-USD rows, and
# materialise the result; the bare name on the last line shows it as output.
pl_df = pl.scan_parquet("F:/processed/coinbase/ticker/**/*.parquet")
df = (
    pl_df
    .filter(pl.col("product_id").eq("BTC-USD"))
    .collect()
)
df

# Verify Processed Schemas

Let's inspect the actual column schemas of the processed ticker, level2, and market_trades data:

In [None]:
# Open one scan per processed Coinbase channel; the schema cells below call
# .limit(1).collect() on each of these.
ticker_pl, level2_pl, market_trades_pl = (
    pl.scan_parquet(pattern)
    for pattern in (
        "F:/processed/coinbase/ticker/**/*.parquet",
        "F:/processed/coinbase/level2/**/*.parquet",
        "F:/processed/coinbase/market_trades/**/*.parquet",
    )
)

In [4]:
# Ticker schema
# Collect a single row so column names and dtypes can be listed; the
# ticker_sample frame is reused by the summary comparison cell.
print("=" * 80)
print("TICKER SCHEMA")
print("=" * 80)
ticker_sample = ticker_pl.limit(1).collect()
print(f"Columns ({len(ticker_sample.columns)}):")
# Read dtypes straight from the schema mapping (name -> dtype), sorted by
# column name, instead of pulling each column out as a Series.
for col, dtype in sorted(ticker_sample.schema.items()):
    print(f"  {col:30s} {dtype}")
print("\nSample record:")
# The frame was already limited to one row, so it is printed as-is.
print(ticker_sample)

TICKER SCHEMA
Columns (33):
  __index_level_0__              Int64
  best_ask                       Float64
  best_ask_quantity              Float64
  best_bid                       Float64
  best_bid_quantity              Float64
  capture_timestamp              String
  channel                        String
  client_id                      String
  date                           String
  datetime                       String
  day                            Int64
  event_type                     String
  high_24h                       Float64
  high_52w                       Float64
  hour                           Int64
  low_24h                        Float64
  low_52w                        Float64
  microsecond                    Int64
  mid_price                      Float64
  minute                         Int64
  month                          Int64
  price                          Float64
  price_percent_chg_24h          Float64
  product_id                     String
  range

In [5]:
# Level2 schema
# Collect a single row so column names and dtypes can be listed; the
# level2_sample frame is reused by the summary comparison cell.
print("\n" + "=" * 80)
print("LEVEL2 SCHEMA")
print("=" * 80)
level2_sample = level2_pl.limit(1).collect()
print(f"Columns ({len(level2_sample.columns)}):")
# Read dtypes straight from the schema mapping (name -> dtype), sorted by
# column name, instead of pulling each column out as a Series.
for col, dtype in sorted(level2_sample.schema.items()):
    print(f"  {col:30s} {dtype}")
print("\nSample record:")
# The frame was already limited to one row, so it is printed as-is.
print(level2_sample)


LEVEL2 SCHEMA
Columns (23):
  __index_level_0__              Int64
  capture_timestamp              String
  channel                        String
  client_id                      String
  date                           String
  datetime                       String
  day                            Int64
  event_time                     String
  event_type                     String
  hour                           Int64
  is_snapshot                    Boolean
  microsecond                    Int64
  minute                         Int64
  month                          Int64
  new_quantity                   Float64
  price_level                    Float64
  product_id                     String
  second                         Int64
  sequence_num                   Int64
  server_timestamp               String
  side                           String
  side_normalized                String
  year                           Int64

Sample record:
shape: (1, 23)
┌─────────┬────────────┬──

In [6]:
# Market Trades schema
# Collect a single row so column names and dtypes can be listed; the
# trades_sample frame is reused by the summary comparison cell.
print("\n" + "=" * 80)
print("MARKET_TRADES SCHEMA")
print("=" * 80)
trades_sample = market_trades_pl.limit(1).collect()
print(f"Columns ({len(trades_sample.columns)}):")
# Read dtypes straight from the schema mapping (name -> dtype), sorted by
# column name, instead of pulling each column out as a Series.
for col, dtype in sorted(trades_sample.schema.items()):
    print(f"  {col:30s} {dtype}")
print("\nSample record:")
# The frame was already limited to one row, so it is printed as-is.
print(trades_sample)


MARKET_TRADES SCHEMA
Columns (24):
  __index_level_0__              Int64
  capture_timestamp              String
  channel                        String
  client_id                      String
  date                           String
  datetime                       String
  day                            Int64
  event_type                     String
  hour                           Int64
  microsecond                    Int64
  minute                         Int64
  month                          Int64
  price                          Float64
  product_id                     String
  second                         Int64
  sequence_num                   Int64
  server_timestamp               String
  side                           String
  side_normalized                String
  size                           Float64
  time                           String
  trade_id                       String
  value                          Float64
  year                           Int64

Sample re

In [7]:
# Summary comparison
# One row per channel: (label, one-row sample frame, hand-written list of the
# derived fields to call out in the table).
summary_rows = [
    ("ticker", ticker_sample, "mid_price, spread, spread_bps, range_24h"),
    ("level2", level2_sample, "is_snapshot, side_normalized"),
    ("market_trades", trades_sample, "value, side_normalized"),
]

print("\n" + "=" * 80)
print("SCHEMA COMPARISON SUMMARY")
print("=" * 80)
print(f"{'Channel':<20} {'Total Columns':<15} {'Key Derived Fields'}")
print("-" * 80)
for channel, sample, derived in summary_rows:
    print(f"{channel:<20} {len(sample.columns):<15} {derived}")
print("=" * 80)

# Check for time partitioning fields
common_fields = ['date', 'year', 'month', 'day', 'hour']
print("\nTime partitioning fields (common to all):")
for field in common_fields:
    # A field gets a check mark only when every channel's sample has it.
    present_everywhere = all(
        field in sample.columns for _, sample, _ in summary_rows
    )
    status = "✓" if present_everywhere else "✗"
    print(f"  {status} {field}")


SCHEMA COMPARISON SUMMARY
Channel              Total Columns   Key Derived Fields
--------------------------------------------------------------------------------
ticker               33              mid_price, spread, spread_bps, range_24h
level2               23              is_snapshot, side_normalized
market_trades        24              value, side_normalized

Time partitioning fields (common to all):
  ✓ date
  ✓ year
  ✓ month
  ✓ day
  ✓ hour
