In [None]:
import polars as pl
from pathlib import Path

In [None]:
# Directory holding the processed parquet files, relative to the notebook's working directory.
data_path = Path("..") / "data" / "processed_data"

In [None]:
# Sort the matches: glob yields entries in filesystem order, which is not
# guaranteed to be stable, and everything downstream iterates in this order.
parquet_files = sorted(data_path.glob("*.parquet"))

In [None]:
# Eagerly load every parquet file, keyed by its file name without the extension.
dfs = {
    parquet_file.stem: pl.read_parquet(parquet_file)
    for parquet_file in parquet_files
}

In [None]:
def overview(df: pl.DataFrame) -> None:
    """Print a quick plain-text profile of a DataFrame.

    Emits, in order: the shape, every column name paired with its dtype,
    the first five rows, and the ``describe()`` summary, followed by a
    40-character dashed rule so consecutive reports stay visually separated.

    Args:
        df: The frame to profile. Only ``shape``, ``columns``, ``dtypes``,
            ``head`` and ``describe`` are used.

    Returns:
        None. Everything is written to stdout.
    """
    print("Shape:", df.shape)
    print("\nColumns and types:")
    # Pair each dtype with its column name; a bare dtype list cannot be
    # mapped back to columns once a frame has more than a few of them.
    for column, dtype in zip(df.columns, df.dtypes):
        print(f"  {column}: {dtype}")
    print("\nFirst 5 rows:")
    print(df.head(5))
    print("\nSummary statistics:")
    print(df.describe())
    print("-" * 40)

In [None]:
# Profile each loaded frame in turn, labelled with the file stem it came from.
for name, df in dfs.items():
    print(name)
    overview(df)

df_long
Shape: (19800, 3)

Columns and types:
[Date, String, Int64]

First 5 rows:
shape: (5, 3)
┌────────────┬──────────┬───────┐
│ Date       ┆ Drink    ┆ Views │
│ ---        ┆ ---      ┆ ---   │
│ date       ┆ str      ┆ i64   │
╞════════════╪══════════╪═══════╡
│ 2020-07-01 ┆ "Matcha" ┆ 1969  │
│ 2020-07-02 ┆ "Matcha" ┆ 1960  │
│ 2020-07-03 ┆ "Matcha" ┆ 1920  │
│ 2020-07-04 ┆ "Matcha" ┆ 1822  │
│ 2020-07-05 ┆ "Matcha" ┆ 1954  │
└────────────┴──────────┴───────┘

Summary statistics:
shape: (9, 4)
┌────────────┬─────────────────────┬──────────────┬─────────────┐
│ statistic  ┆ Date                ┆ Drink        ┆ Views       │
│ ---        ┆ ---                 ┆ ---          ┆ ---         │
│ str        ┆ str                 ┆ str          ┆ f64         │
╞════════════╪═════════════════════╪══════════════╪═════════════╡
│ count      ┆ 19800               ┆ 19800        ┆ 19800.0     │
│ null_count ┆ 0                   ┆ 0            ┆ 0.0         │
│ mean       ┆ 2023-03-17 12:00:

In [None]:
def check_nixtla_compatibility(df: pl.DataFrame) -> None:
    """Report whether a frame has the long-format columns Nixtla expects.

    Prints which of ``ds``, ``unique_id`` and ``y`` are missing (or that all
    are present), then the frame's dtypes, then how many distinct series the
    identifier column holds.

    Args:
        df: Frame to inspect. Only ``columns``, ``dtypes`` and column access
            by name are used.

    Returns:
        None. The report is written to stdout.
    """
    required_cols = ["ds", "unique_id", "y"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        print("Missing required columns:", missing)
    else:
        print("Required columns present.")

    # Check types
    print("Column types:")
    print(df.dtypes)

    # Series are keyed by "unique_id" (the same name required above), so that
    # is the column to count. "id" is still honoured as a fallback for frames
    # that have not been renamed yet.
    id_col = next((c for c in ("unique_id", "id") if c in df.columns), None)
    if id_col is not None:
        n_series = df[id_col].n_unique()
        if n_series > 1:
            print("Multiple time series detected. Number of unique series:", n_series)
        else:
            # Do not claim "multiple" when the identifier holds a single value.
            print("Number of unique series:", n_series)

# Run the Nixtla column check on every loaded frame, one delimited section per file.
for name, df in dfs.items():
    print(f"=== Checking {name} for Nixtla compatibility ===")
    check_nixtla_compatibility(df)
    print(40 * "-")


=== Checking df_long for Nixtla compatibility ===
Missing required columns: ['ds', 'unique_id', 'y']
Column types:
[Date, String, Int64]
----------------------------------------
=== Checking df_outliers_capped for Nixtla compatibility ===
Required columns present.
Column types:
[Date, String, Float64]
----------------------------------------
=== Checking fourier+trend+holidays_test for Nixtla compatibility ===
Required columns present.
Column types:
[Date, String, Float64, Boolean, String, Int64, Int64, Int64, Float32, Float32, Float32, Float32, Float32, Float32, Float32]
----------------------------------------
=== Checking fourier+trend+holidays_test_future for Nixtla compatibility ===
Missing required columns: ['y']
Column types:
[String, Date, Boolean, String, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
----------------------------------------
=== Checking fourier+trend+holidays_train for Nixtla compatibility ===
Required columns present.
Col