# In [None]:

# transform_pipeline_unified.py
# Budget Core Schema (BCS) — robust for Jupyter & CLI, with 0/1 one-hot encoding

import os
import re
import sys
import json
from zipfile import ZipFile

import numpy as np
import pandas as pd

# -------- PDF (optional, graceful fallback) --------
# reportlab is optional: REPORT_ENABLED records whether both imports succeeded,
# and any import-time failure switches PDF generation off.
try:
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas
except Exception:
    REPORT_ENABLED = False
else:
    REPORT_ENABLED = True

# ----------------------------
# Helpers
# ----------------------------

def normalize_col(name: str) -> str:
    """Turn an arbitrary column label into an identifier-like token.

    Every run of characters outside ``0-9A-Za-z`` becomes a single underscore,
    and underscores at either end are removed. The label is passed through
    ``str`` first, so non-string labels are accepted.
    """
    underscored = re.sub(r'[^0-9A-Za-z]+', '_', str(name))
    collapsed = re.sub(r'_+', '_', underscored)
    return collapsed.strip('_')

def zscore(series: pd.Series):
    """Standardize *series* and report the statistics that were used.

    Returns a 3-tuple ``(values, mean, std)`` where ``std`` is the population
    standard deviation (``ddof=0``). When ``std`` is exactly zero the values
    are only mean-centered, so no division by zero takes place.
    """
    mu = float(series.mean())
    sigma = float(series.std(ddof=0))
    centered = series - mu
    # A zero spread leaves nothing to scale by; hand back the centered values.
    scaled = centered if sigma == 0 else centered / sigma
    return scaled, mu, sigma

def compute_vif(X: pd.DataFrame) -> pd.DataFrame:
    """
    Variance inflation factor (VIF) for every column of X.

    Rows containing NaN are dropped first. Each column is then fitted by
    least squares on an intercept plus all remaining columns, and its VIF is
    1 / (1 - R^2) of that fit. A column with zero total variation is given
    R^2 = 0; an R^2 of exactly 1 gives an infinite VIF.

    Returns a DataFrame with the columns "feature" and "VIF".
    """
    frame = X.copy().dropna()
    # The intercept column is the same for every regression, so build it once.
    intercept = np.ones(frame.shape[0])
    vifs = {}
    for col in frame.columns:
        target = frame[col].values
        predictors = frame.drop(columns=[col]).values
        design = np.column_stack([intercept, predictors])
        coef, *_ = np.linalg.lstsq(design, target, rcond=None)
        residual = target - design @ coef

        ss_res = np.sum(residual ** 2)
        ss_tot = np.sum((target - target.mean()) ** 2)
        if ss_tot != 0:
            r_squared = 1 - (ss_res / ss_tot)
        else:
            r_squared = 0.0

        tolerance = 1.0 - r_squared
        if tolerance != 0:
            vifs[col] = float(1.0 / tolerance)
        else:
            vifs[col] = float(np.inf)
    return pd.DataFrame({"feature": list(vifs), "VIF": list(vifs.values())})

def write_pdf_report(path: str, df: pd.DataFrame, scale_stats: dict,
                     vif_orig: pd.DataFrame, vif_reduced: pd.DataFrame,
                     outputs: list):
    """Write the transform report as a PDF, or as a TXT note without reportlab.

    Args:
        path: Destination of the PDF. When REPORT_ENABLED is false the report
            is written as plain text to ``path.replace(".pdf", ".txt")``.
        df: Processed dataset; used for the row/column counts and, when a
            "State" column exists, the sorted list of distinct states.
        scale_stats: Mapping ``column -> {"mean": float, "std": float}``.
        vif_orig: Table with "feature" and "VIF" columns (original schema).
        vif_reduced: Table with "feature" and "VIF" columns (reduced schema).
        outputs: Output file paths to list at the end of the report.

    The PDF normally fits on one page. If the content runs past the bottom
    margin, a new page is started instead of drawing off the page.
    """
    if not REPORT_ENABLED:
        # If reportlab isn't available, write a light TXT note instead
        with open(path.replace(".pdf", ".txt"), "w", encoding="utf-8") as f:
            f.write("Data Transform Report — Budget Core Schema (BCS)\n")
            f.write("(PDF creation skipped; 'reportlab' not installed)\n\n")
            f.write("Summary:\n")
            f.write("1) Loaded input CSV; normalized column names.\n")
            f.write("2) Feature Engineering: Total_Spend = R_D_Spend + Marketing_Spend.\n")
            f.write("3) One-Hot (0/1): State_California, State_Florida, State_New York.\n")
            f.write("4) Scaling: z-score for Total_Spend & Administration; saved scaled_features.csv.\n")
            f.write("5) VIF: original vs reduced schema; saved vif_table.csv.\n")
            f.write("6) Final BCS: saved as Processed_BCS.csv.\n\n")
            f.write(f"Rows: {len(df)} | Columns: {df.shape[1]}\n")
            if "State" in df.columns:
                states = ", ".join(sorted(df["State"].astype(str).unique()))
                f.write(f"States present: {states}\n\n")
            f.write("Scaling stats (mean, std):\n")
            for col, st in scale_stats.items():
                f.write(f" - {col}: mean={st['mean']:.2f}, std={st['std']:.2f}\n")
            f.write("\nVIF — Original:\n")
            for _, r in vif_orig.iterrows():
                f.write(f" - {r['feature']}: VIF={r['VIF']:.2f}\n")
            f.write("\nVIF — Reduced:\n")
            for _, r in vif_reduced.iterrows():
                f.write(f" - {r['feature']}: VIF={r['VIF']:.2f}\n")
            f.write("\nOutputs:\n")
            for p in outputs:
                f.write(f" - {p}\n")
        return

    # PDF path
    c = canvas.Canvas(path, pagesize=A4)
    _, height = A4
    margin = 50
    line_height = 14
    cursor_y = height - margin

    def write_line(text, size=11):
        """Draw one line at the cursor and move the cursor down one line."""
        nonlocal cursor_y
        if cursor_y < margin:
            # Out of room: continue at the top of a fresh page rather than
            # drawing below the bottom margin.
            c.showPage()
            cursor_y = height - margin
        # The font is set on every call, so it is re-applied after a page break.
        c.setFont("Helvetica", size)
        c.drawString(margin, cursor_y, text)
        cursor_y -= line_height

    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, cursor_y, "Data Transform Report — Budget Core Schema (BCS)")
    cursor_y -= 24

    # Summary
    lines = [
        "1) Loaded input CSV; normalized column names.",
        "2) Feature Engineering: Total_Spend = R_D_Spend + Marketing_Spend.",
        "3) One-Hot (0/1): State_California, State_Florida, State_New York.",
        "4) Scaling: z-score for Total_Spend & Administration; saved scaled_features.csv.",
        "5) VIF: original schema vs reduced schema; saved vif_table.csv.",
        "6) Final BCS: saved as Processed_BCS.csv.",
    ]
    for line in lines:
        write_line(line)

    # Basic stats
    write_line("")
    if "State" in df.columns:
        states = ", ".join(sorted(df["State"].astype(str).unique()))
        write_line(f"Rows: {len(df):,} | Columns: {df.shape[1]} | States present: {states}")
    else:
        write_line(f"Rows: {len(df):,} | Columns: {df.shape[1]}")

    # Scaling stats
    write_line("")
    write_line("Scaling stats (mean, std):")
    for col, st in scale_stats.items():
        write_line(f" - {col}: mean={st['mean']:.2f}, std={st['std']:.2f}")

    # VIF Original
    write_line("")
    write_line("VIF — Original (R_D_Spend, Administration, Marketing_Spend):")
    for _, r in vif_orig.iterrows():
        write_line(f" - {r['feature']}: VIF={r['VIF']:.2f}")

    # VIF Reduced
    write_line("")
    write_line("VIF — Reduced (Total_Spend, Administration):")
    for _, r in vif_reduced.iterrows():
        write_line(f" - {r['feature']}: VIF={r['VIF']:.2f}")

    # Outputs
    write_line("")
    write_line("Outputs:")
    for p in outputs:
        write_line(f" - {p}")

    c.showPage()
    c.save()

# ----------------------------
# Main pipeline
# ----------------------------

def run_pipeline(input_csv: str,
                 processed_out: str = "Processed_BCS.csv",
                 scaled_out: str = "scaled_features.csv",
                 vif_out: str = "vif_table.csv",
                 report_out: str = "Data_Transform_Report.pdf",
                 zip_out: str = "data_transform_bundle.zip",
                 add_alias_scaled: bool = False):
    """Execute the end-to-end BCS pipeline.

    Steps: load the CSV and normalize its column names, coerce the core
    numeric columns, derive Total_Spend, build 0/1 state indicator columns,
    z-score Total_Spend and Administration, compute VIF tables for the
    original and reduced schemas, write the CSV outputs and the report, bundle
    them into a ZIP, and print a JSON summary.

    Args:
        input_csv: Path of the input CSV.
        processed_out: Path of the final BCS CSV.
        scaled_out: Path of the scaled-features CSV.
        vif_out: Path of the VIF table CSV.
        report_out: Path of the PDF report. When REPORT_ENABLED is false, the
            ".pdf" part is replaced by ".txt" in the listed and bundled path.
        zip_out: Path of the ZIP bundle.
        add_alias_scaled: Also emit R_D_Marketing_Spend_scaled, a copy of
            Total_Spend_scaled.

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist.
        KeyError: If a required numeric column is missing after column names
            have been normalized.
    """
    # 0) Validate input
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input file not found: {input_csv}")

    # 1) Load
    df = pd.read_csv(input_csv, skipinitialspace=True)

    # Normalize columns
    df.columns = [normalize_col(c) for c in df.columns]
    # Expect: R_D_Spend, Administration, Marketing_Spend, State, Profit

    # Fail early with an explicit message: every later step needs these columns.
    numeric_cols = ["R_D_Spend", "Administration", "Marketing_Spend", "Profit"]
    missing = [col for col in numeric_cols if col not in df.columns]
    if missing:
        raise KeyError(
            "Input CSV is missing required column(s) after name normalization: "
            + ", ".join(missing)
        )

    # Convert numerics; unparseable values become NaN and are dropped below
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop rows missing core numerics
    df = df.dropna(subset=numeric_cols)

    # 2) Feature engineering (both operands are NaN-free after the dropna above)
    df["Total_Spend"] = df["R_D_Spend"] + df["Marketing_Spend"]

    # 3) One-Hot (stable schema, 0/1): all three columns always exist
    state_values = None
    if "State" in df.columns:
        # Strip surrounding whitespace so values like "Florida " still match.
        state_values = df["State"].astype(str).str.strip()
    for label in ("California", "Florida", "New York"):
        st_col = f"State_{label}"
        df[st_col] = 0  # default 0
        if state_values is not None:
            df.loc[state_values == label, st_col] = 1

    # 4) Scaling (z-score) — save separately and attach to processed dataset
    scale_stats = {}
    df["Total_Spend_scaled"], m_ts, s_ts = zscore(df["Total_Spend"])
    df["Administration_scaled"], m_ad, s_ad = zscore(df["Administration"])
    scale_stats["Total_Spend"] = {"mean": m_ts, "std": s_ts}
    scale_stats["Administration"] = {"mean": m_ad, "std": s_ad}

    # Optional alias for continuity (same as Total_Spend_scaled)
    if add_alias_scaled:
        df["R_D_Marketing_Spend_scaled"] = df["Total_Spend_scaled"]

    # Save scaled-only file
    scaled_cols = ["Total_Spend_scaled", "Administration_scaled"]
    if add_alias_scaled:
        scaled_cols.append("R_D_Marketing_Spend_scaled")
    df[scaled_cols].to_csv(scaled_out, index=False)

    # 5) VIF
    vif_orig = compute_vif(df[["R_D_Spend", "Administration", "Marketing_Spend"]])
    vif_reduced = compute_vif(df[["Total_Spend", "Administration"]])
    pd.concat([
        vif_orig.assign(schema="Original (R&D, Admin, Marketing)"),
        vif_reduced.assign(schema="Reduced (Total_Spend, Admin)")
    ]).to_csv(vif_out, index=False)

    # 6) Final BCS
    bcs_cols = [
        "Total_Spend", "Administration", "Profit",
        "State_California", "State_Florida", "State_New York",
        "Total_Spend_scaled", "Administration_scaled",
    ]
    if add_alias_scaled:
        bcs_cols.append("R_D_Marketing_Spend_scaled")
    df[bcs_cols].to_csv(processed_out, index=False)

    # 7) Report (PDF or TXT fallback)
    # The file actually produced depends on whether reportlab imported;
    # resolve its path once and reuse it below.
    report_path = report_out if REPORT_ENABLED else report_out.replace(".pdf", ".txt")
    write_pdf_report(
        report_out, df, scale_stats, vif_orig, vif_reduced,
        outputs=[processed_out, scaled_out, vif_out, report_path]
    )

    # 8) ZIP Bundle (only files that were actually written)
    with ZipFile(zip_out, "w") as zf:
        for p in [processed_out, scaled_out, vif_out, report_path]:
            if os.path.exists(p):
                zf.write(p)

    # Console summary
    summary = {
        "input": input_csv,
        "rows": int(df.shape[0]),
        "cols": int(df.shape[1]),
        "scale_stats": scale_stats,
        "vif_original": vif_orig.to_dict(orient="records"),
        "vif_reduced": vif_reduced.to_dict(orient="records"),
        "outputs": [processed_out, scaled_out, vif_out, report_path, zip_out],
    }
    print(json.dumps(summary, indent=2))

# ----------------------------
# CLI entry (Jupyter-safe)
# ----------------------------

def main(arg_list=None):
    """
    Command-line entry point that is also safe to call from a notebook.

    When arg_list is None:
    - under a Jupyter kernel, sys.argv is ignored and the defaults are used;
    - otherwise sys.argv[1:] is parsed.
    Unrecognized arguments are ignored in every case. Any exception raised by
    the pipeline is caught and printed together with a usage tip.
    """
    import argparse

    cli = argparse.ArgumentParser(
        add_help=True,
        description="BCS Data Transform Pipeline (0/1 OHE)",
    )
    cli.add_argument(
        "--in",
        dest="input_csv",
        default="Cleaned Dataset - DTD.csv",
        required=False,
        help="Path to input CSV (default: Cleaned Dataset - DTD.csv)",
    )
    # Output locations: (flag, destination attribute, default path)
    output_options = (
        ("--processed", "processed_out", "Processed_BCS.csv"),
        ("--scaled", "scaled_out", "scaled_features.csv"),
        ("--vif", "vif_out", "vif_table.csv"),
        ("--report", "report_out", "Data_Transform_Report.pdf"),
        ("--zip", "zip_out", "data_transform_bundle.zip"),
    )
    for flag, dest, default in output_options:
        cli.add_argument(flag, dest=dest, default=default)
    cli.add_argument(
        "--alias-scaled",
        dest="alias_scaled",
        action="store_true",
        help="Also add R_D_Marketing_Spend_scaled as alias of Total_Spend_scaled",
    )
    # Accept-and-ignore common notebook injections (e.g., --f, -f)
    for flag, dest in (("--f", "jupyter_kernel_file"),
                       ("-f", "jupyter_kernel_file_short")):
        cli.add_argument(flag, dest=dest, default=None,
                         help="(Ignored) Jupyter kernel connection file")

    # Decide which args to parse
    if arg_list is None:
        if "ipykernel" in sys.modules or "JPY_PARENT_PID" in os.environ:
            arg_list = []
        else:
            arg_list = sys.argv[1:]

    # Unknown args are collected separately and discarded
    opts, _ = cli.parse_known_args(arg_list)

    try:
        run_pipeline(
            input_csv=opts.input_csv,
            processed_out=opts.processed_out,
            scaled_out=opts.scaled_out,
            vif_out=opts.vif_out,
            report_out=opts.report_out,
            zip_out=opts.zip_out,
            add_alias_scaled=opts.alias_scaled,
        )
    except Exception as exc:
        print(f"[Error] {type(exc).__name__}: {exc}")
        print("Tip: Check that the input CSV path is correct (use --in \"your_file.csv\") or call run_pipeline(...) directly.")

# Run the pipeline with default argument handling only when this file is
# executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()

# ----------------------------
# Notebook usage examples
# ----------------------------
# 1) Direct function call (no CLI, safest in notebooks):
# run_pipeline(input_csv="Cleaned Dataset - DTD.csv", add_alias_scaled=True)
#
# 2) Simulate CLI flags inside a notebook:
# main([
#   "--in", "Cleaned Dataset - DTD.csv",
#   "--processed", "Processed_BCS.csv",
#   "--scaled", "scaled_features.csv",
#   "--vif", "vif_table.csv",
#   "--report", "Data_Transform_Report.pdf",
#   "--zip", "data_transform_bundle.zip",
#   "--alias-scaled"
# ])
