# Data Validation Checklist

Use this notebook to run schema checks and compute baseline statistics for each processed CSV drop before training.

In [None]:
from __future__ import annotations

import pandas as pd
from src.data.schema import PRIMARY_KEY, validate_dataframe
from src.utils.config import load_config

# Read the dataset location from the base config, load the CSV into a
# DataFrame, and hand it to the project's schema validator.
config = load_config("configs/base.yaml")
df = pd.read_csv(config.data_path)
errors = validate_dataframe(df)

# A falsy result is treated as "no violations"; otherwise each reported
# item is printed on its own bullet line. Execution continues either way,
# so the statistics below are still produced for a failing drop.
if not errors:
    print("Schema validation passed. OK")
else:
    print("Schema violations detected:")
    for error in errors:
        print("-", error)

# Compare the total number of rows with the number of rows that remain
# after de-duplicating on the primary-key columns; a gap between the two
# figures means some key values occur more than once.
key_columns = list(PRIMARY_KEY)
row_count = df.shape[0]
unique_rows = len(df.drop_duplicates(subset=key_columns))

print(f"Rows: {row_count:,}")
print(f"Unique {PRIMARY_KEY}: {unique_rows:,}")

# Baseline statistics over every column (numeric and non-numeric alike),
# reporting the 1st, 50th and 99th percentiles instead of the default
# quartiles.
tail_percentiles = [0.01, 0.5, 0.99]
summary = df.describe(percentiles=tail_percentiles, include="all")

# Keep this expression last so the notebook displays it as the cell's
# output; only the first five rows of the summary table are shown.
summary.head(n=5)