In [6]:
import sys
import os
import duckdb
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from config import load_config

In [2]:
import glob
import logging
from tqdm import tqdm
from etl.processors.time_utils import add_time_fields, parse_timestamp_fields
import pyarrow as pa
import pyarrow.parquet as pq
from etl.writers.parquet_writer import ParquetWriter

In [3]:
# Search for all files matching the pattern
source = "coinbase"
channel = "ticker"
timefields_params = {"timestamp_field": 'server_timestamp', "fallback_field": "capture_timestamp"}

nan_files = glob.glob(f'F://processed/{source}/{channel}/*/date=nan/*.parquet')
print(f"Found {len(nan_files)} files with date=nan\n")


config = load_config("config/config.yaml")

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.log_level),
    format=config.log_format
)

# Convert channel config from Pydantic to dict format
channel_config = None
if hasattr(config.etl, 'channels') and config.etl.channels:
    channel_config = {
        channel_name: {
            "partition_cols": channel_cfg.partition_cols,
            "processor_options": channel_cfg.processor_options,
        }
        for channel_name, channel_cfg in config.etl.channels.items()
        if channel_cfg.enabled
    }


partition_cols = channel_config[channel]['partition_cols']
output_path = Path(config.etl.output_dir) / "coinbase" / channel
writer = ParquetWriter()


2025-11-28 15:32:50,435 - etl.writers.parquet_writer - INFO - [ParquetWriter] Initialized: output_dir=None, compression=snappy


Found 9692 files with date=nan



In [4]:
unprocessed_files = []
for fpath in tqdm(nan_files, desc="Processing files"):
    try:
        df = pd.read_parquet(fpath)
        records = df.to_dict(orient='records')
        for record in records:
            record = add_time_fields(record, **timefields_params)
        new_df = pd.DataFrame.from_records(records)
        writer.write(new_df, output_path=output_path, partition_cols=partition_cols)
        os.remove(fpath)
    except Exception as e:
        unprocessed_files.append(fpath)
fpath

Processing files:   0%|          | 0/9692 [00:00<?, ?it/s]

Processing files:   3%|▎         | 306/9692 [00:00<00:11, 784.88it/s]2025-11-28 15:33:36,180 - etl.writers.parquet_writer - INFO - [ParquetWriter] Wrote 1 records to ticker\product_id=AERO-USD\date=2025-11-27\part_20251128T15_def82d5d.parquet (0.6 KB)
2025-11-28 15:33:36,181 - etl.writers.parquet_writer - INFO - [ParquetWriter] Total: 1 records written across 1 files
2025-11-28 15:33:36,208 - etl.writers.parquet_writer - INFO - [ParquetWriter] Wrote 1 records to ticker\product_id=AERO-USD\date=2025-11-27\part_20251128T15_1a6f7a27.parquet (0.6 KB)
2025-11-28 15:33:36,208 - etl.writers.parquet_writer - INFO - [ParquetWriter] Total: 2 records written across 2 files
2025-11-28 15:33:36,270 - etl.writers.parquet_writer - INFO - [ParquetWriter] Wrote 1 records to ticker\product_id=AERO-USD\date=2025-11-27\part_20251128T15_b5c28523.parquet (0.6 KB)
2025-11-28 15:33:36,270 - etl.writers.parquet_writer - INFO - [ParquetWriter] Total: 3 records written across 3 files
2025-11-28 15:33:36,306 - et

'F://processed/coinbase/ticker\\product_id=ZRO-USD\\date=nan\\part_20251127T05_a88c1502.parquet'

In [17]:
writer.write(new_df, output_path=output_path, partition_cols=partition_cols)

2025-11-28 15:29:08,099 - etl.writers.parquet_writer - INFO - [ParquetWriter] Wrote 1 records to ticker\product_id=ADA-USD\date=2025-11-27\part_20251128T15_273e48c4.parquet (0.6 KB)
2025-11-28 15:29:08,100 - etl.writers.parquet_writer - INFO - [ParquetWriter] Total: 1 records written across 1 files


In [18]:
os.remove(fpath)