In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

# Locate the project root: walk upward from the current working directory
# until a directory containing `data` is found or the filesystem root is hit.
ROOT = Path.cwd().resolve()
while ROOT != ROOT.parent and not (ROOT / "data").exists():
    ROOT = ROOT.parent

SAMPLE = ROOT / "data" / "gold" / "samples" / "bicing_gold_final_plus_sample_1M_strat_holidays.parquet"

# Fail fast with an actionable message. Without this check, a notebook started
# outside the project tree ends the search at the filesystem root and the
# failure only surfaces later as a reader error about a path under "/".
if not SAMPLE.exists():
    raise FileNotFoundError(
        f"Sample parquet not found at {SAMPLE} "
        f"(searched upward from {Path.cwd()} for a 'data' directory)"
    )

df = pq.read_table(SAMPLE).to_pandas()

# Normalize both temporal columns to pandas datetimes.
df["time_hour"] = pd.to_datetime(df["time_hour"])
df["date"] = pd.to_datetime(df["date"])

# Consistent holiday_any flag: 1 when any of the three scope-specific flags
# equals 1 (deliberately not relying on the legacy is_holiday column).
df["holiday_any"] = (
    df[["is_holiday_barcelona", "is_holiday_catalunya", "is_holiday_spain"]]
    .eq(1)
    .any(axis=1)
    .astype(int)
)

# Rows with a missing holiday_scope are labelled "none".
df["holiday_scope_final"] = df["holiday_scope"].fillna("none")

print("shape:", df.shape)
print("range:", df["time_hour"].min(), "->", df["time_hour"].max())
print(df["holiday_scope_final"].value_counts())


In [None]:
# Capacity is taken as the sum of the mean available bikes and mean free docks.
bikes_avg = df["bikes_available_mean"]
df["capacity_mean"] = bikes_avg + df["docks_available_mean"]
cap_total = df["capacity_mean"]

# Sanity counters: rows with non-positive capacity, and rows where the bike
# count exceeds the derived capacity.
zero_cap = cap_total.le(0).sum()
bad = bikes_avg.gt(cap_total).sum()

print("capacity <=0:", zero_cap)
print("bikes > capacity:", bad)

# Occupancy ratio = bikes / capacity; NaN wherever capacity is not strictly positive.
df["occ_ratio"] = np.where(cap_total.gt(0), bikes_avg.div(cap_total), np.nan)
print(df["occ_ratio"].describe())
