In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------------------------------------------
# 0) Paths
# ------------------------------------------------------------
# Use ./data when it exists; otherwise assume the code runs from a
# sub-folder of the project and use ../data.
DATA = Path.cwd() / "data"
DATA = DATA if DATA.exists() else Path.cwd().parent / "data"

# Stage folders of the data pipeline.
RAW, INTERIM, PROCESSED = (DATA / stage for stage in ("1_raw", "2_interim", "3_processed"))

# Only the folders this script may write into are created here.
PROCESSED.mkdir(parents=True, exist_ok=True)
INTERIM.mkdir(parents=True, exist_ok=True)

# Input files.
MASTER_DATA_CSV        = INTERIM / "master_data.csv"
PREDICTION_MAPPING_CSV = RAW / "prediction_mapping.csv"
PURCHASE_ORDERS_CSV    = RAW / "kernel_purchase_orders.csv"

# ------------------------------------------------------------
# 1) Helper functions
# ------------------------------------------------------------
def _prep_asof_keys(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    if "rm_id" in out.columns:
        out["rm_id"] = pd.to_numeric(out["rm_id"], errors="coerce").astype("int64")
    if "date" in out.columns:
        out["date"] = pd.to_datetime(out["date"], utc=False, errors="coerce")
    out = out.dropna(subset=[c for c in ["rm_id","date"] if c in out.columns])
    out = out.sort_values(["rm_id","date"]).reset_index(drop=True)
    return out

def read_mapping(mapping_path: Path) -> pd.DataFrame:
    """Load the prediction mapping and keep only the columns used downstream.

    The two forecast-window columns are parsed as datetimes. The returned
    frame has exactly the columns ``ID``, ``rm_id``, ``forecast_start_date``
    and ``forecast_end_date``, in that order.
    """
    wanted = ["ID", "rm_id", "forecast_start_date", "forecast_end_date"]
    window_cols = wanted[2:]
    mapping = pd.read_csv(mapping_path, parse_dates=window_cols)
    return mapping.loc[:, wanted].copy()

def read_master(master_path: Path) -> pd.DataFrame:
    """Load the master receival table and normalise it.

    Steps:
      * ``date_arrival`` is parsed as UTC and then made timezone-naive.
      * ``net_weight`` is coerced to numeric; unparseable values become 0.0.
      * The identifier columns that exist are cast to nullable ``Int64``.
      * Duplicate receival rows are removed (first occurrence wins).
      * ``date`` is added as the calendar day of ``date_arrival``.
    """
    id_cols = ["rm_id", "purchase_order_id", "purchase_order_item_no", "receival_item_no"]

    frame = pd.read_csv(master_path, low_memory=False)

    arrival_utc = pd.to_datetime(frame["date_arrival"], utc=True, errors="coerce")
    frame["date_arrival"] = arrival_utc.dt.tz_convert(None)

    weight = pd.to_numeric(frame.get("net_weight"), errors="coerce")
    frame["net_weight"] = weight.fillna(0.0)

    for col in id_cols:
        if col not in frame.columns:
            continue
        frame[col] = pd.to_numeric(frame[col], errors="coerce").astype("Int64")

    # De-duplicate on the full receival key when every part of it is available,
    # otherwise fall back to (material, arrival timestamp, weight).
    # NOTE(review): rows whose key columns are all missing compare equal here
    # and collapse to one row per rm_id -- confirm that is intended.
    if set(id_cols).issubset(frame.columns):
        dedup_key = id_cols
    else:
        dedup_key = ["rm_id", "date_arrival", "net_weight"]
    frame = frame.drop_duplicates(subset=dedup_key, keep="first").copy()

    frame["date"] = pd.to_datetime(frame["date_arrival"].dt.date)
    return frame

def complete_calendar(daily_rm: pd.DataFrame) -> pd.DataFrame:
    """Expand one material's daily totals to a gap-free daily calendar.

    *daily_rm* holds the rows of a single ``rm_id`` with at least ``date``
    and ``net_weight``. Every day between its first and last date gets a
    row; days without a delivery get ``net_weight`` 0.0.

    The material id is read from the ``rm_id`` column when it is present.
    Otherwise it is taken from the ``name`` attribute that
    ``groupby(...).apply`` attaches to each group. The function therefore
    works both when called directly and when the grouping column has been
    excluded from the group.

    Returns a frame with columns ``date``, ``rm_id`` and ``net_weight``.
    """
    has_rm_col = "rm_id" in daily_rm.columns
    if has_rm_col and not daily_rm.empty:
        rm_id = daily_rm["rm_id"].iloc[0]
    else:
        rm_id = daily_rm.name

    calendar = pd.DataFrame(
        {"date": pd.date_range(daily_rm["date"].min(), daily_rm["date"].max(), freq="D")}
    )
    calendar["rm_id"] = rm_id

    # Join on rm_id only when the group still carries that column.
    join_keys = ["rm_id", "date"] if has_rm_col else ["date"]
    out = calendar.merge(daily_rm, on=join_keys, how="left")
    out["net_weight"] = out["net_weight"].fillna(0.0)
    return out

def build_daily(master_df: pd.DataFrame) -> pd.DataFrame:
    """Build the per-material daily history table with rolling features.

    *master_df* needs the columns ``rm_id``, ``date`` and ``net_weight``.
    Rows without a usable ``rm_id`` or ``date`` are ignored.

    The result has one row per (``rm_id``, calendar day) between each
    material's first and last delivery date, with these columns:

      * ``net_weight``        -- kg received that day (0.0 on empty days)
      * ``cum_kg``            -- running total of ``net_weight`` per material
      * ``deliveries_flag``   -- 1 when ``net_weight`` > 0, else 0
      * ``r{w}_kg``           -- trailing ``w``-day sum of ``net_weight``
      * ``r{w}_days``         -- trailing ``w``-day count of delivery days
      * ``r28_mean_kg_per_day`` / ``r56_mean_kg_per_day`` -- ``r{w}_kg``
        divided by ``r{w}_days`` (i.e. per delivery day); 0.0 when there
        were no delivery days
      * ``days_since_last``   -- days since the latest delivery day up to and
        including the current day; 10_000 when there has been none yet

    for ``w`` in 7, 14, 28, 56 and 365. Rolling windows include the current day.
    """
    master_df = master_df.copy()
    # Coerce first, drop unusable keys, then cast: casting NaN/NA to int64 raises.
    master_df["rm_id"] = pd.to_numeric(master_df["rm_id"], errors="coerce")
    master_df = master_df.dropna(subset=["rm_id", "date"])
    master_df["rm_id"] = master_df["rm_id"].astype("int64")

    daily = (master_df.groupby(["rm_id", "date"], as_index=False)["net_weight"].sum()
             .sort_values(["rm_id", "date"]))
    # Selecting the columns explicitly (grouping column included) keeps rm_id
    # available inside complete_calendar without relying on the deprecated
    # implicit "apply operates on the grouping columns" behaviour.
    daily = (daily.groupby("rm_id", group_keys=False)[["rm_id", "date", "net_weight"]]
                  .apply(complete_calendar)
                  .sort_values(["rm_id", "date"])
                  .reset_index(drop=True))

    daily["cum_kg"] = daily.groupby("rm_id")["net_weight"].cumsum()
    daily["deliveries_flag"] = (daily["net_weight"] > 0).astype(int)

    for w in [7, 14, 28, 56, 365]:
        # reset_index drops the rm_id level so the result aligns with `daily`.
        daily[f"r{w}_kg"] = (daily.groupby("rm_id")["net_weight"]
                             .rolling(w, min_periods=1).sum()
                             .reset_index(level=0, drop=True))
        daily[f"r{w}_days"] = (daily.groupby("rm_id")["deliveries_flag"]
                               .rolling(w, min_periods=1).sum()
                               .reset_index(level=0, drop=True))

    def safe_div(a, b):
        # Division that yields 0.0 instead of inf/NaN when the divisor is 0.
        b = b.replace(0, np.nan)
        return (a / b).fillna(0.0)

    daily["r28_mean_kg_per_day"] = safe_div(daily["r28_kg"], daily["r28_days"])
    daily["r56_mean_kg_per_day"] = safe_div(daily["r56_kg"], daily["r56_days"])

    # Latest delivery date seen so far, forward-filled within each material.
    tmp = daily[["rm_id", "date", "net_weight"]].copy()
    tmp["last_deliv_date"] = tmp["date"].where(tmp["net_weight"] > 0)
    tmp["last_deliv_date"] = tmp.groupby("rm_id")["last_deliv_date"].ffill()
    daily["days_since_last"] = ((daily["date"] - tmp["last_deliv_date"])
                                .dt.days.fillna(10_000).astype(int))
    return daily

def read_orders_with_rm(orders_path: Path | None, master_df: pd.DataFrame) -> pd.DataFrame | None:
    if orders_path is None or not orders_path.exists():
        return None
    po = pd.read_csv(
        orders_path,
        low_memory=False,
        parse_dates=["delivery_date","created_date_time","modified_date_time"]
    )
    for c in ["purchase_order_id","purchase_order_item_no"]:
        if c in po.columns:
            po[c] = pd.to_numeric(po[c], errors="coerce").astype("Int64")
    po["quantity"] = pd.to_numeric(po.get("quantity"), errors="coerce").fillna(0.0)
    rm_lookup = (master_df[["rm_id","purchase_order_id","purchase_order_item_no"]]
                 .dropna()
                 .drop_duplicates())
    po = po.merge(rm_lookup, on=["purchase_order_id","purchase_order_item_no"], how="left")
    if "delivery_date" in po.columns:
        po["delivery_date"] = pd.to_datetime(po["delivery_date"], utc=True, errors="coerce").dt.tz_convert(None)
    if "created_date_time" in po.columns:
        po["created_date_time"] = pd.to_datetime(po["created_date_time"], utc=True, errors="coerce").dt.tz_convert(None)
    if "modified_date_time" in po.columns:
        po["modified_date_time"] = pd.to_datetime(po["modified_date_time"], utc=True, errors="coerce").dt.tz_convert(None)
    keep_cols = [c for c in ["rm_id","quantity","delivery_date","created_date_time","modified_date_time","purchase_order_id","purchase_order_item_no"] if c in po.columns]
    po = po[keep_cols].dropna(subset=["rm_id","delivery_date"]).copy()
    po["rm_id"] = pd.to_numeric(po["rm_id"], errors="coerce").astype("int64")
    return po

def same_period_last_year(df_map: pd.DataFrame, csum: pd.DataFrame) -> pd.DataFrame:
    """Compute the total weight received in the same window one year earlier.

    For each row of *df_map* the forecast window is shifted back 365 days
    and the received weight is derived from the cumulative series in *csum*::

        cum_kg(as of end - 365d) - cum_kg(as of start - 366d)

    "As of" means the latest *csum* row of that ``rm_id`` dated on or before
    the target day; if there is none the cumulative value is taken as 0.0.
    Negative differences are clipped to 0.0.

    A per-material sorted lookup is used instead of ``merge_asof`` so no
    global sort order of the inputs is required.

    Parameters:
        df_map: needs ``ID``, ``rm_id``, ``forecast_start_date`` and
            ``forecast_end_date``.
        csum: needs ``rm_id``, ``date`` and ``cum_kg``.

    Returns:
        A frame with columns ``ID`` and ``same_period_last_year_kg``, one row
        per row of *df_map*, grouped by ``rm_id``. The columns are present
        even when *df_map* is empty.
    """
    out_cols = ["ID", "same_period_last_year_kg"]

    windows = df_map[["ID", "rm_id", "forecast_start_date", "forecast_end_date"]].copy()
    windows["rm_id"] = pd.to_numeric(windows["rm_id"], errors="coerce").astype("int64")

    # Clean the cumulative history; drop unusable keys before the integer cast.
    hist = csum[["rm_id", "date", "cum_kg"]].copy()
    hist["rm_id"] = pd.to_numeric(hist["rm_id"], errors="coerce")
    hist["date"] = pd.to_datetime(hist["date"], errors="coerce")
    hist = hist.dropna(subset=["rm_id", "date"])
    hist["rm_id"] = hist["rm_id"].astype("int64")
    hist = hist.sort_values(["rm_id", "date"])

    # Per material: sorted dates and the matching cumulative values.
    history_by_rm = {
        rm_id: (g["date"].to_numpy(dtype="datetime64[ns]"), g["cum_kg"].to_numpy(dtype=float))
        for rm_id, g in hist.groupby("rm_id")
    }

    def _cum_as_of(dates, cum, targets):
        # Index of the last history date <= target; -1 when there is none.
        pos = np.searchsorted(dates, targets, side="right") - 1
        found = (pos >= 0) & ~np.isnat(targets)
        return np.where(found, cum[np.clip(pos, 0, None)], 0.0)

    pieces = []
    for rm_id, group in windows.groupby("rm_id"):
        if rm_id not in history_by_rm:
            # No history at all for this material.
            kg = np.zeros(len(group), dtype=float)
        else:
            dates, cum = history_by_rm[rm_id]
            prev_start = (group["forecast_start_date"] - pd.Timedelta(days=366)).to_numpy(dtype="datetime64[ns]")
            prev_end = (group["forecast_end_date"] - pd.Timedelta(days=365)).to_numpy(dtype="datetime64[ns]")
            diff = _cum_as_of(dates, cum, prev_end) - _cum_as_of(dates, cum, prev_start)
            # fmax ignores NaN, so a NaN difference becomes 0.0 rather than propagating.
            kg = np.fmax(0.0, diff)
        pieces.append(pd.DataFrame({"ID": group["ID"].to_numpy(), "same_period_last_year_kg": kg}))

    if not pieces:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(pieces, ignore_index=True)[out_cols]

def add_orders_features(df_map: pd.DataFrame, orders: pd.DataFrame | None) -> pd.DataFrame:
    if orders is None or orders.empty:
        df_map["orders_qty_in_window"] = 0.0
        df_map["orders_lines_in_window"] = 0
        return df_map
    ordr = df_map[["ID","rm_id","forecast_start_date","forecast_end_date","cutoff_date"]].merge(orders, on="rm_id", how="left")
    known_mask = True
    if "created_date_time" in ordr.columns:
        known_mask = ordr["created_date_time"] <= ordr["cutoff_date"]
    if "modified_date_time" in ordr.columns:
        known_mask &= (ordr["modified_date_time"].isna() | (ordr["modified_date_time"] <= ordr["cutoff_date"]))
    ordr = ordr[known_mask]
    in_win = (ordr["delivery_date"] >= ordr["forecast_start_date"]) & (ordr["delivery_date"] <= ordr["forecast_end_date"])
    if {"purchase_order_id","purchase_order_item_no"}.issubset(ordr.columns):
        grp = ordr[in_win].groupby("ID", as_index=False).agg(
            orders_qty_in_window=("quantity","sum"),
            orders_lines_in_window=("purchase_order_id","nunique")
        )
    else:
        grp = ordr[in_win].groupby("ID", as_index=False).agg(
            orders_qty_in_window=("quantity","sum"),
            orders_lines_in_window=("quantity","size")
        )
    out = df_map.merge(grp, on="ID", how="left")
    out["orders_qty_in_window"] = out["orders_qty_in_window"].fillna(0.0)
    out["orders_lines_in_window"] = out["orders_lines_in_window"].fillna(0).astype(int)
    return out

def make_features_from_mapping(daily: pd.DataFrame, df_map_in: pd.DataFrame, orders: pd.DataFrame | None) -> pd.DataFrame:
    """Build the feature matrix for a set of forecast windows.

    Parameters:
        daily: per-material daily history as produced by ``build_daily``.
        df_map_in: one row per window with ``ID``, ``rm_id``,
            ``forecast_start_date`` and ``forecast_end_date``. Not modified.
        orders: purchase orders with ``rm_id``, or ``None``.

    Returns:
        A copy of the mapping with these additions:

          * ``cutoff_date`` (the day before the window starts) and
            ``window_days`` (inclusive window length)
          * the history features of ``daily`` as of ``cutoff_date``
          * ``same_period_last_year_kg``
          * ``start_month``, ``start_dow``, ``end_month``
          * ``orders_qty_in_window`` and ``orders_lines_in_window``

    History features are joined on an exact (``rm_id``, ``cutoff_date``)
    match. Windows without a matching history row get 0.0 for the weight and
    count features and 10_000 for ``days_since_last``.
    """
    df_map = df_map_in.copy()
    df_map["rm_id"] = pd.to_numeric(df_map["rm_id"], errors="coerce").astype("int64")
    # Features may only use information available the day before the window opens.
    df_map["cutoff_date"] = df_map["forecast_start_date"] - pd.Timedelta(days=1)
    df_map["window_days"] = (df_map["forecast_end_date"] - df_map["forecast_start_date"]).dt.days + 1

    feat_date = daily.rename(columns={"date": "cutoff_date"}).copy()
    feat_date["rm_id"] = pd.to_numeric(feat_date["rm_id"], errors="coerce").astype("int64")
    feat_date = feat_date.sort_values(["rm_id", "cutoff_date"])

    zero_fill_cols = ["cum_kg", "r7_kg", "r14_kg", "r28_kg", "r56_kg", "r365_kg",
                      "r7_days", "r14_days", "r28_days", "r56_days", "r365_days",
                      "r28_mean_kg_per_day", "r56_mean_kg_per_day"]
    hist_cols = ["rm_id", "cutoff_date", *zero_fill_cols, "days_since_last"]

    # NOTE(review): this is an exact-date join, so a cutoff outside a material's
    # calendar in `daily` finds no row and every history feature falls back to
    # its default below -- confirm that is acceptable for stale materials.
    X = df_map.merge(feat_date[hist_cols], on=["rm_id", "cutoff_date"], how="left")

    csum = daily[["rm_id", "date", "cum_kg"]].copy()
    X = X.merge(same_period_last_year(df_map, csum), on="ID", how="left")

    X["start_month"] = X["forecast_start_date"].dt.month
    X["start_dow"]   = X["forecast_start_date"].dt.dayofweek
    X["end_month"]   = X["forecast_end_date"].dt.month

    X = add_orders_features(X, orders)

    for c in [*zero_fill_cols, "same_period_last_year_kg", "orders_qty_in_window"]:
        if c in X.columns:
            X[c] = X[c].fillna(0.0)
    # "No known delivery" must not look like "delivered today": use the same
    # 10_000-day sentinel that build_daily uses instead of 0.
    if "days_since_last" in X.columns:
        X["days_since_last"] = X["days_since_last"].fillna(10_000)
    return X

def build_training_windows_template(daily: pd.DataFrame, mapping_like: pd.DataFrame,
                                    step_days: int = 7, min_history_days: int = 400) -> pd.DataFrame:
    """Generate historical forecast windows shaped like those in *mapping_like*.

    Every distinct inclusive window length found in *mapping_like* is slid
    over each material's history in ``daily``:

      * the first window starts ``min_history_days`` after the material's
        first date,
      * start dates advance by ``step_days``,
      * the last start is no later than the material's last date minus the
        window length.

    Returns a frame with ``ID`` (1..n after sorting by ``rm_id`` and
    ``forecast_start_date``), ``rm_id``, ``forecast_start_date`` and
    ``forecast_end_date``. The frame is empty, with the same columns, when no
    window fits.
    """
    columns = ["ID", "rm_id", "forecast_start_date", "forecast_end_date"]

    span_days = (mapping_like["forecast_end_date"] - mapping_like["forecast_start_date"]).dt.days + 1
    lengths = sorted(span_days.unique())
    warmup = pd.Timedelta(days=min_history_days)
    step = f"{step_days}D"

    frames = []
    for rm, history in daily.groupby("rm_id"):
        earliest_start = history["date"].min() + warmup
        history_end = history["date"].max()
        for n_days in lengths:
            latest_start = history_end - pd.Timedelta(days=n_days)
            if earliest_start > latest_start:
                # History too short for a window of this length.
                continue
            window_starts = pd.date_range(earliest_start, latest_start, freq=step)
            window_ends = window_starts + pd.to_timedelta(n_days - 1, unit="D")
            frames.append(pd.DataFrame({
                "rm_id": rm,
                "forecast_start_date": window_starts,
                "forecast_end_date": window_ends,
            }))

    if len(frames) == 0:
        return pd.DataFrame(columns=columns)

    tmpl = pd.concat(frames, ignore_index=True)
    tmpl = tmpl.sort_values(["rm_id", "forecast_start_date"]).reset_index(drop=True)
    tmpl["ID"] = np.arange(1, len(tmpl) + 1)
    return tmpl[columns]

def label_from_daily(daily: pd.DataFrame, df_map: pd.DataFrame) -> pd.DataFrame:
    """Compute the training label: kg received inside each forecast window.

    The label is the per-material cumulative weight as of
    ``forecast_end_date`` minus the cumulative weight as of the day before
    ``forecast_start_date``. Both window ends are therefore inclusive, which
    matches ``window_days`` and the order-window features. "As of" means the
    latest row of ``daily`` on or before that day; if there is none, the
    cumulative weight is taken as 0.0.

    Parameters:
        daily: needs ``rm_id``, ``date`` and ``net_weight``.
        df_map: needs ``ID``, ``rm_id``, ``forecast_start_date`` and
            ``forecast_end_date``.

    Returns:
        A frame with columns ``ID`` and ``y_window_kg``, ordered by ``ID``.
    """
    history = _prep_asof_keys(daily[["rm_id", "date", "net_weight"]].copy())
    history["cum"] = history.groupby("rm_id")["net_weight"].cumsum()
    # merge_asof needs the `on` key globally sorted, whatever order the helper
    # returned; a stable sort keeps each material's rows in date order.
    history = history.sort_values("date", kind="stable")[["rm_id", "date", "cum"]]

    def _cum_as_of(anchor: pd.Series, out_name: str) -> pd.DataFrame:
        # Cumulative weight per window as of the given anchor dates.
        keys = df_map[["ID", "rm_id"]].copy()
        keys["date"] = anchor
        keys = _prep_asof_keys(keys).sort_values("date", kind="stable")
        hit = pd.merge_asof(keys, history, by="rm_id", on="date", direction="backward")
        return hit[["ID", "cum"]].rename(columns={"cum": out_name})

    # Anchor the lower bound one day before the window so deliveries on the
    # start day itself are counted.
    before_window = _cum_as_of(df_map["forecast_start_date"] - pd.Timedelta(days=1), "cum_a")
    end_of_window = _cum_as_of(df_map["forecast_end_date"], "cum_b")

    y = end_of_window.merge(before_window, on="ID", how="left")
    # Fill each side separately: a missing lower anchor means "nothing received
    # yet" and must not turn the whole label into 0.
    y["y_window_kg"] = y["cum_b"].fillna(0.0) - y["cum_a"].fillna(0.0)
    return y[["ID", "y_window_kg"]].sort_values("ID").reset_index(drop=True)

# ------------------------------------------------------------
# 2) Run
# ------------------------------------------------------------
# Load the inputs and build the shared daily history table.
df_map_raw = read_mapping(PREDICTION_MAPPING_CSV)
df_master  = read_master(MASTER_DATA_CSV)
daily      = build_daily(df_master)
orders     = read_orders_with_rm(PURCHASE_ORDERS_CSV, df_master)

# Keep an untouched copy of the mapping next to the generated features.
df_map_raw.to_csv(PROCESSED / "prediction_mapping_base_copy.csv", index=False)

# Features for the windows that have to be predicted.
X_submit = make_features_from_mapping(daily, df_map_raw, orders)

# Column order of the exported feature files.
out_cols = ["ID","rm_id","forecast_start_date","forecast_end_date","cutoff_date","window_days",
            "cum_kg","r7_kg","r14_kg","r28_kg","r56_kg","r365_kg",
            "r7_days","r14_days","r28_days","r56_days","r365_days",
            "r28_mean_kg_per_day","r56_mean_kg_per_day","days_since_last",
            "same_period_last_year_kg",
            "orders_qty_in_window","orders_lines_in_window",
            "start_month","start_dow","end_month"]
X_submit[out_cols].to_csv(PROCESSED / "features_for_submission.csv", index=False)
print(f"Wrote {len(X_submit)} rows -> {PROCESSED/'features_for_submission.csv'}")

# The training set is best-effort: a failure here must not invalidate the
# submission features written above.
try:
    train_template = build_training_windows_template(daily, df_map_raw, step_days=7, min_history_days=400)
    X_train = make_features_from_mapping(daily, train_template, orders)
    y_train = label_from_daily(daily, train_template)
    train = X_train.merge(y_train, on="ID", how="left")
    train[out_cols + ["y_window_kg"]].to_csv(PROCESSED / "training_features_and_labels.csv", index=False)
    print(f"Wrote {len(train)} rows -> {PROCESSED/'training_features_and_labels.csv'}")
except Exception as e:
    # Still non-fatal, but show where and why it failed so a real bug is not
    # mistaken for a harmless skip.
    import traceback
    print("Training set build skipped or failed:", e)
    traceback.print_exc()

  .apply(complete_calendar)


Wrote 30450 rows -> c:\Users\TIMJ\tdt4173-course-project\data\3_processed\features_for_submission.csv
