## Data Preprocessing
#### Data loading & Hold-out split at 2025-01-01


In [1]:
import os
import pandas as pd

class CFG:
    """Column names used by the loading and splitting helpers in this cell."""

    # Column holding the coin identifier; primary sort key in load_data.
    COIN_ID_COLUMN: str = 'coin_id'
    # Column holding the row timestamp; converted to datetime, used as the
    # secondary sort key and as the split key.
    TIMESTAMP_COLUMN: str = 'timestamp'
    # Not referenced by the code in this cell; presumably the label column
    # consumed by later modelling steps -- TODO confirm.
    TARGET_COLUMN: str = 'target_direction'

def load_data(path: str) -> pd.DataFrame:
    """
    Read the multi-coin Parquet file at `path` and return it in canonical order.

    The timestamp column (CFG.TIMESTAMP_COLUMN) is converted to a datetime
    dtype when it is not one already, and the rows are ordered by coin id,
    then by timestamp, with the index renumbered from 0.

    Raises FileNotFoundError when `path` does not exist.
    """
    if os.path.exists(path):
        frame = pd.read_parquet(path, engine='pyarrow')
    else:
        raise FileNotFoundError(f"Data file not found: {path}")

    ts_col = CFG.TIMESTAMP_COLUMN

    # Only convert when needed; an existing datetime column is left untouched.
    needs_conversion = not pd.api.types.is_datetime64_any_dtype(frame[ts_col])
    if needs_conversion:
        frame[ts_col] = pd.to_datetime(frame[ts_col])

    # Coin id is the primary key, timestamp the secondary one.
    sort_keys = [CFG.COIN_ID_COLUMN, ts_col]
    return frame.sort_values(sort_keys, ignore_index=True)

def split_by_date(df: pd.DataFrame, date_str: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Partition `df` into a train part and a test part around a fixed date.

    - date_str: ISO format date (e.g. '2025-01-01'), parsed with pd.to_datetime.
    Rows with a timestamp strictly before the date form the train part; rows
    at or after it form the test part. Rows whose timestamp is missing (NaT)
    satisfy neither comparison and therefore land in neither part.

    Both parts are independent copies. The covered period of each part is
    printed. Returns (train_df, test_df).
    """
    boundary = pd.to_datetime(date_str)
    stamps = df[CFG.TIMESTAMP_COLUMN]

    is_before = stamps < boundary
    is_on_or_after = stamps >= boundary

    train = df.loc[is_before].copy()
    test = df.loc[is_on_or_after].copy()

    print(f"[Date Split @ {date_str}]")
    print(f"  train period: {train[CFG.TIMESTAMP_COLUMN].min()} → {train[CFG.TIMESTAMP_COLUMN].max()}")
    print(f"  test  period: {test[CFG.TIMESTAMP_COLUMN].min()} → {test[CFG.TIMESTAMP_COLUMN].max()}")
    return train, test

if __name__ == "__main__":
    # Script entry point: load the dataset, split it at a fixed date and
    # print the resulting shapes.

    # The Parquet file is looked up in the current working directory;
    # load_data raises FileNotFoundError if it is not there.
    data_path = os.path.join(os.getcwd(), "OHLCV_ffill.parquet")
    df = load_data(data_path)

    # Fixed-date split: rows before 2025-01-01 go to train, rows at or after
    # it go to test. split_by_date also prints the period of each part.
    train, test = split_by_date(df, "2025-01-01")

    # Print (rows, columns) of both parts as a quick sanity check.
    print("=== Dataset Shapes ===")
    print(f"Split @2025-01-01: train={train.shape}, test={test.shape}")

[Date Split @ 2025-01-01]
  train period: 2021-01-01 00:00:00 → 2024-12-31 23:59:00
  test  period: 2025-01-01 00:00:00 → 2025-04-27 03:00:00
=== Dataset Shapes ===
Split @2025-01-01: train=(10519200, 14), test=(836105, 14)


## Resampling
#### Time intervals: 1-min, 10-min, 1-hour, 1-day 

In [2]:
import os
import pandas as pd

class CFG:
    """Column names used by the loading and splitting helpers in this cell."""

    # Column holding the coin identifier; primary sort key in load_data.
    COIN_ID_COLUMN: str = 'coin_id'
    # Column holding the row timestamp; secondary sort key and split key.
    TIMESTAMP_COLUMN: str = 'timestamp'

def load_data(path: str) -> pd.DataFrame:
    """
    Read the full Parquet dataset at `path` and return it sorted.

    The timestamp column is converted to a datetime dtype if it is not one
    already; rows are then ordered by coin id and timestamp and the index is
    renumbered from 0. Raises FileNotFoundError when `path` does not exist.
    """
    file_is_present = os.path.exists(path)
    if not file_is_present:
        raise FileNotFoundError(f"Data file not found: {path}")

    ts_col = CFG.TIMESTAMP_COLUMN
    frame = pd.read_parquet(path, engine='pyarrow')

    # Leave the column alone when it already has a datetime dtype.
    if not pd.api.types.is_datetime64_any_dtype(frame[ts_col]):
        frame[ts_col] = pd.to_datetime(frame[ts_col])

    ordered = frame.sort_values([CFG.COIN_ID_COLUMN, ts_col], ignore_index=True)
    return ordered

def split_by_date(df: pd.DataFrame, date_str: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Hold-out split of `df` at the date given by `date_str`.

    Rows strictly before the date are returned as train; rows at or after it
    (i.e. the date itself included) are returned as test. Both are copies.
    Rows with a missing (NaT) timestamp match neither condition and are left
    out of both. The time span of each part is printed.
    """
    boundary = pd.to_datetime(date_str)
    stamps = df[CFG.TIMESTAMP_COLUMN]

    train = df.loc[stamps < boundary].copy()
    test = df.loc[stamps >= boundary].copy()

    print(f"[Date Split @ {date_str}]")
    print(f"  train: {train[CFG.TIMESTAMP_COLUMN].min()} → {train[CFG.TIMESTAMP_COLUMN].max()}")
    print(f"  test : {test[CFG.TIMESTAMP_COLUMN].min()} → {test[CFG.TIMESTAMP_COLUMN].max()}")
    return train, test

def resample_ohlcv(df: pd.DataFrame, freq: str) -> pd.DataFrame:
    """
    Aggregate OHLCV rows into bars of width `freq`, separately per coin_id.

    `df` must carry "coin_id" and "timestamp" columns plus the nine value
    columns aggregated below. Within each bar: open is the first value, high
    the maximum, low the minimum, close the last value, and every volume /
    trade-count column is summed. The input frame is not modified.

    Returns a frame with "coin_id" and "timestamp" as regular columns
    followed by the aggregated columns.
    """
    summed_columns = (
        "volume",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
    )
    rules = {"open": "first", "high": "max", "low": "min", "close": "last"}
    rules.update(dict.fromkeys(summed_columns, "sum"))

    # Resampling needs a datetime index; grouping keeps coins apart.
    per_coin = df.set_index("timestamp").groupby("coin_id", observed=True)
    bars = per_coin.resample(freq).agg(rules)

    # Drop bars whose open is NaN (e.g. bins that contain no source rows).
    bars = bars.dropna(subset=["open"])
    return bars.reset_index()

if __name__ == "__main__":
    # Step 1 -- read the full dataset from the current working directory.
    data_path = os.path.join(os.getcwd(), "OHLCV_ffill.parquet")
    df = load_data(data_path)

    # Step 2 -- one hold-out split: rows before 2025-01-01 are train,
    # rows at or after it are test.
    train_df, test_df = split_by_date(df, "2025-01-01")

    # Step 3 -- resample each part at every frequency and write one Parquet
    # file per (part, frequency) pair into the output directory.
    base_out = os.path.join("resampled")
    os.makedirs(base_out, exist_ok=True)

    freq_map = ["1min", "10min", "1h", "1d"]
    splits = {"train": train_df, "test": test_df}
    for label, subset in splits.items():
        for freq in freq_map:
            print(f"Resampling {label} at {freq} …")
            out_file = f"{label}_{freq}.parquet"
            out_path = os.path.join(base_out, out_file)
            rs = resample_ohlcv(subset, freq)
            rs.to_parquet(out_path)
            print(f"  → saved {out_path} (rows={rs.shape[0]})")

[Date Split @ 2025-01-01]
  train: 2021-01-01 00:00:00 → 2024-12-31 23:59:00
  test : 2025-01-01 00:00:00 → 2025-04-27 03:00:00
Resampling train at 1min …
  → saved resampled/train_1min.parquet (rows=10519200)
Resampling train at 10min …
  → saved resampled/train_10min.parquet (rows=1051920)
Resampling train at 1h …
  → saved resampled/train_1h.parquet (rows=175320)
Resampling train at 1d …
  → saved resampled/train_1d.parquet (rows=7305)
Resampling test at 1min …
  → saved resampled/test_1min.parquet (rows=836105)
Resampling test at 10min …
  → saved resampled/test_10min.parquet (rows=83615)
Resampling test at 1h …
  → saved resampled/test_1h.parquet (rows=13940)
Resampling test at 1d …
  → saved resampled/test_1d.parquet (rows=585)
