In [None]:
# ==========================================================
# CHUNK 1 — IMPORTS & GLOBAL PATHS
# ==========================================================

import gc
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs

# Project root: one level up when the kernel was started somewhere under a
# "notebooks" folder, otherwise the current working directory itself.
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if "notebooks" in _cwd.parts else _cwd

DATA_RAW = BASE_DIR / "data" / "raw"
DATA_PROCESSED = BASE_DIR / "data" / "processed"

# Make sure both data folders exist before any cell reads or writes them.
for _data_dir in (DATA_RAW, DATA_PROCESSED):
    _data_dir.mkdir(parents=True, exist_ok=True)

# BASE is the alias used by the later cells for the processed-data folder.
BASE = DATA_PROCESSED

print("Environment ready.")

In [None]:
# ==========================================================
# CHUNK 2 — CSV → PARQUET (speed optimization)
# ==========================================================

# Every CSV sitting in the processed folder gets a sibling .parquet file.
csv_names = [entry for entry in os.listdir(DATA_PROCESSED) if entry.endswith(".csv")]
for csv_name in csv_names:
    source_csv = DATA_PROCESSED / csv_name
    print(f"Converting → {csv_name}")
    pl.read_csv(source_csv).write_parquet(source_csv.with_suffix('.parquet'))

# The raw bureau table is converted as well; the aggregation cell reads it
# from the processed folder.
print("Converting raw bureau.csv")
bureau_csv = DATA_RAW / "bureau.csv"
bureau_parquet = DATA_PROCESSED / "bureau.parquet"
pl.read_csv(bureau_csv).write_parquet(bureau_parquet)

In [None]:
# ==========================================================
# CHUNK 3 — Bureau Balance Aggregation
# ==========================================================

# Identifier columns are numeric but are keys, not measurements: aggregating
# them would only produce meaningless (and, for SK_ID_CURR, key-duplicating)
# features.
BB_ID_COLS = ("SK_ID_BUREAU", "SK_ID_CURR")

bureau = pl.read_parquet(f"{DATA_PROCESSED}/bureau.parquet")
balance = pl.read_parquet(f"{DATA_PROCESSED}/bureau_bal_loan.parquet")

if "SK_ID_BUREAU" not in balance.columns and "BB_COUNT" in balance.columns:
    # This cell writes its result over its own input file. On a re-run the
    # file is therefore already aggregated per SK_ID_CURR (BB_COUNT is the
    # marker column written below) and the join key is gone — reuse it as is.
    bb_agg = balance
    print("✓ bureau_bal_loan.parquet is already aggregated; skipping.")
else:
    # Attach SK_ID_CURR to every balance row via its bureau loan id.
    bb = balance.join(
        bureau.select(["SK_ID_BUREAU", "SK_ID_CURR"]),
        on="SK_ID_BUREAU",
        how="left"
    )

    # Balance rows whose loan has no match in `bureau` carry a null key; they
    # cannot be attributed to any applicant, so keep them out of the groups.
    bb = bb.filter(pl.col("SK_ID_CURR").is_not_null())

    numeric_cols = [c for c in bb.select(cs.numeric()).columns if c not in BB_ID_COLS]

    # MEAN / MIN / MAX / SUM for every numeric measurement column.
    agg_exprs = []
    for col in numeric_cols:
        agg_exprs.extend([
            pl.col(col).mean().alias(f"{col}_MEAN"),
            pl.col(col).min().alias(f"{col}_MIN"),
            pl.col(col).max().alias(f"{col}_MAX"),
            pl.col(col).sum().alias(f"{col}_SUM"),
        ])

    bb_agg = (
        bb.group_by("SK_ID_CURR")
          .agg(agg_exprs + [pl.len().alias("BB_COUNT")])
    )

    bb_agg.write_parquet(f"{BASE}/bureau_bal_loan.parquet")
    print("✓ Bureau balance aggregation saved.")

In [None]:
import os
# Quick sanity check: print the raw directory listing of the processed-data folder.
print(os.listdir(BASE))

In [None]:
# ==========================================================
# DIAGNOSTIC - Check which files exist
# ==========================================================

import os

print("Files in BASE:", BASE)
files = os.listdir(BASE)

# Bucket the directory entries by extension in a single pass.
parquet_files, csv_files = [], []
for entry in files:
    if entry.endswith('.parquet'):
        parquet_files.append(entry)
    if entry.endswith('.csv'):
        csv_files.append(entry)

print("\nParquet files found:")
for name in sorted(parquet_files):
    print(f"  ✓ {name}")

print("\nCSV files found (need conversion):")
for name in sorted(csv_files):
    print(f"  ✗ {name}")

# Inputs the merge cell expects to find as parquet.
required = [
    "application_train.parquet",
    "bureau_agg.parquet",
    "previous_agg.parquet",
    "pos_agg.parquet",
    "installments_agg.parquet",
    "cc_agg.parquet",
    "bureau_bal_loan.parquet"
]

print("\nStatus of required files:")
available = set(parquet_files)
for req in required:
    if req in available:
        marker = "✓"
    else:
        marker = "✗ MISSING"
    print(f"  {marker} {req}")


In [None]:
# ==========================================================
# FIX — Convert missing application files to parquet
# ==========================================================

print("Converting missing application CSV files to parquet...")

# Parquet outputs that are absent after this cell ran (no CSV to convert
# from and no parquet left by an earlier run).
still_missing = []

# Same conversion for both application tables: raw CSV -> processed parquet.
for dataset in ("application_train", "application_test"):
    raw_csv = DATA_RAW / f"{dataset}.csv"
    out_parquet = DATA_PROCESSED / f"{dataset}.parquet"

    if raw_csv.exists():
        print(f"Converting {dataset}.csv...")
        pl.read_csv(raw_csv).write_parquet(out_parquet)
        print(f"✓ {dataset}.parquet created")
    else:
        print(f"✗ {dataset}.csv not found in raw data")
        if not out_parquet.exists():
            still_missing.append(out_parquet.name)

# Only claim success when it is actually true.
if still_missing:
    print(f"\nStill missing after conversion: {still_missing}")
else:
    print("\nAll required files should now be ready!")


In [None]:
# ==========================================================
# CHUNK 4 — Merge All Aggregates (TRAIN)
# ==========================================================

# Verify all required files exist before proceeding
# Inputs of the merge; the cell refuses to run if any of them is absent.
required_files = [
    "application_train.parquet",
    "bureau_agg.parquet",
    "previous_agg.parquet",
    "pos_agg.parquet",
    "installments_agg.parquet",
    "cc_agg.parquet",
    "bureau_bal_loan.parquet"
]

missing = [name for name in required_files if not os.path.exists(f"{BASE}/{name}")]

if missing:
    raise FileNotFoundError(f"Missing required parquet files: {missing}\nRun the FIX cell above first.")

print("✓ All required files exist. Proceeding with merge...\n")

# Lazy scans: nothing is read until .collect() below.
train        = pl.scan_parquet(f"{BASE}/application_train.parquet")
bureau_agg   = pl.scan_parquet(f"{BASE}/bureau_agg.parquet")
prev_agg     = pl.scan_parquet(f"{BASE}/previous_agg.parquet")
pos_agg      = pl.scan_parquet(f"{BASE}/pos_agg.parquet")
inst_agg     = pl.scan_parquet(f"{BASE}/installments_agg.parquet")
cc_agg       = pl.scan_parquet(f"{BASE}/cc_agg.parquet")
bb_agg       = pl.scan_parquet(f"{BASE}/bureau_bal_loan.parquet")

# Left-join every aggregate onto the application table in this fixed order;
# the suffix is what a clashing right-hand column name gets appended.
train_merged = train
for lazy_agg, clash_suffix in (
    (bureau_agg, "_bur"),
    (prev_agg,   "_prev"),
    (pos_agg,    "_pos"),
    (inst_agg,   "_inst"),
    (cc_agg,     "_cc"),
    (bb_agg,     "_bb"),
):
    train_merged = train_merged.join(
        lazy_agg, on="SK_ID_CURR", how="left", suffix=clash_suffix
    )

train_final = train_merged.collect()
train_final.write_parquet(f"{BASE}/train_full.parquet")
print("TRAIN SHAPE:", train_final.shape)


In [None]:
# ==========================================================
# CHUNK 4b — TEST MERGE PIPELINE
# ==========================================================

test = pl.scan_parquet(f"{BASE}/application_test.parquet")

# The join ORDER and SUFFIXES must mirror the TRAIN merge exactly
# (bureau → prev → pos → inst → cc → bb with _bur/_prev/_pos/_inst/_cc/_bb).
# Any column name that clashes during a join is renamed with the suffix of
# that join, so a different order or suffix would give the same feature a
# different name in test than in train.
test_merged = (
    test
    .join(bureau_agg, on="SK_ID_CURR", how="left", suffix="_bur")
    .join(prev_agg,   on="SK_ID_CURR", how="left", suffix="_prev")
    .join(pos_agg,    on="SK_ID_CURR", how="left", suffix="_pos")
    .join(inst_agg,   on="SK_ID_CURR", how="left", suffix="_inst")
    .join(cc_agg,     on="SK_ID_CURR", how="left", suffix="_cc")
    .join(bb_agg,     on="SK_ID_CURR", how="left", suffix="_bb")
)

test_final = test_merged.collect()
test_final.write_parquet(f"{BASE}/test_full.parquet")

print("TEST SHAPE:", test_final.shape)


In [None]:
# ==========================================================
# CHUNK 5 — Missing Value Strategy
# ==========================================================

train = train_final
test  = test_final
TARGET_COL = "TARGET"

# Percentage of nulls per column, measured on TRAIN only.
missing = (
    train.select([
        ((pl.col(c).is_null().sum() / train.height) * 100).alias(c)
        for c in train.columns
    ])
)

missing_t = missing.transpose(include_header=True)
missing_t.columns = ["column", "missing_percent"]
missing_sorted = missing_t.sort("missing_percent", descending=True)

# > 80 % missing → drop; 10–80 % → median-fill candidates; < 10 % → left as is.
drop_cols = missing_sorted.filter(pl.col("missing_percent") > 80)["column"].to_list()
median_candidates = missing_sorted.filter(
    (pl.col("missing_percent") >= 10) &
    (pl.col("missing_percent") <= 80)
)["column"].to_list()
native_cols = missing_sorted.filter(
    pl.col("missing_percent") < 10
)["column"].to_list()

# The label must never be dropped or imputed.
drop_cols = [c for c in drop_cols if c != TARGET_COL]
median_candidates = [c for c in median_candidates if c != TARGET_COL]

numeric_cols = train.select(cs.numeric()).columns
_numeric_set = set(numeric_cols)
median_fill_cols = [c for c in median_candidates if c in _numeric_set]

# Imputation values are learned on TRAIN only and then applied unchanged to
# every dataset, so train and test are filled with the same numbers.
train_medians = {c: train[c].median() for c in median_fill_cols}

def clean_dataset(df):
    """Drop the high-missing columns and median-fill the mid-missing ones.

    Uses the module-level ``drop_cols``, ``median_fill_cols`` and
    ``train_medians``. Columns that are not present in ``df`` are skipped,
    so the same function works for frames with a slightly different schema
    (for example the test set, which has no TARGET column).

    Parameters
    ----------
    df : polars.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    polars.DataFrame
        ``df`` without the dropped columns and with nulls in the median-fill
        columns replaced by the corresponding TRAIN median.
    """
    present = set(df.columns)
    df = df.drop([c for c in drop_cols if c in present])

    fill_exprs = [
        pl.col(c).fill_null(train_medians[c])
        for c in median_fill_cols
        if c in present and train_medians[c] is not None
    ]
    return df.with_columns(fill_exprs) if fill_exprs else df

train_clean = clean_dataset(train)
test_clean  = clean_dataset(test)

# Put the label first in the training frame.
train_clean = train_clean.select([TARGET_COL] + [c for c in train_clean.columns if c != TARGET_COL])

train_clean.write_parquet(f"{BASE}/train_clean.parquet")
test_clean.write_parquet(f"{BASE}/test_clean.parquet")

print(train_clean.shape, test_clean.shape)


In [None]:
# ==========================================================
# CHUNK 6 — Correlation-Based Feature Pruning (≥ 0.95)
# ==========================================================

import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np
from pathlib import Path

# Use the BASE from earlier chunks, or set it if not available
if 'BASE' in locals():
    # Normalise a string BASE left by another cell into a Path.
    if not isinstance(BASE, Path):
        BASE = Path(BASE)
else:
    BASE_DIR = Path.cwd().parent if "notebooks" in Path.cwd().parts else Path.cwd()
    BASE = BASE_DIR / "data" / "processed"

TARGET_COL = "TARGET"

# Cleaned datasets written by the missing-value cell.
train = pl.read_parquet(f"{BASE}/train_clean.parquet")
test  = pl.read_parquet(f"{BASE}/test_clean.parquet")

print("Input shapes:", train.shape, test.shape)

# Only numeric features take part in the correlation analysis; the label is excluded.
numeric_cols = [
    name for name in train.select(cs.numeric()).columns
    if name != TARGET_COL
]

# pandas is used for the correlation matrix itself.
df_pd = train.select(numeric_cols).to_pandas()
corr_matrix = df_pd.corr().abs()

# Keep only the strict upper triangle so every pair is inspected once;
# everything else becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any
# column that comes before it.
corr_threshold = 0.95
corr_drop_cols = []
for col in upper.columns:
    if (upper[col] > corr_threshold).any():
        corr_drop_cols.append(col)

print(f"Correlation DROP count: {len(corr_drop_cols)}")

# The same columns are removed from train and test.
train_corr_reduced = train.drop(corr_drop_cols)
test_corr_reduced  = test.drop(corr_drop_cols)

print("Output shapes:", train_corr_reduced.shape, test_corr_reduced.shape)

train_corr_reduced.write_parquet(f"{BASE}/train_corr_reduced.parquet")
test_corr_reduced.write_parquet(f"{BASE}/test_corr_reduced.parquet")

print("✓ Correlation-based pruning completed.")


In [None]:
# ==========================================================
# SETUP — Ensure BASE path is correct for all chunks
# ==========================================================
from pathlib import Path

# Re-derive the project root with the same rule as the first cell: one level
# up when the working directory is under a "notebooks" folder, else the cwd.
BASE_DIR = Path.cwd().parent if "notebooks" in Path.cwd().parts else Path.cwd()
# BASE is rebound to a freshly built Path for the processed-data folder.
BASE = BASE_DIR / "data" / "processed"

# Show the resolved folder and how many parquet files it currently contains.
print(f"BASE path set to: {BASE}")
print(f"Files available: {len(list(BASE.glob('*.parquet')))} parquet files")


In [None]:
# ==========================================================
# CHUNK 7 — FEATURE ENGINEERING (SAFE VERSION)
# ==========================================================

import pandas as pd
from pathlib import Path

_cwd = Path.cwd()
BASE_DIR = _cwd.parent if "notebooks" in _cwd.parts else _cwd
BASE = BASE_DIR / "data" / "processed"

TARGET_COL = "TARGET"

# Inputs: the correlation-reduced parquet files, loaded as pandas frames.
train, test = (
    pd.read_parquet(f"{BASE}/train_corr_reduced.parquet"),
    pd.read_parquet(f"{BASE}/test_corr_reduced.parquet"),
)

print("Loaded:", train.shape, test.shape)

# ----------------------------------------------------------
# SAFE FEATURE ENGINEERING FUNCTION
# ----------------------------------------------------------
def add_feature_engineering(df, target_col="TARGET"):
    """Return a new frame with hand-crafted ratio / interaction features added.

    Each feature is created only when all of its source columns are present,
    so the function also works on frames from which earlier pruning steps
    removed some inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is never modified; a new frame is returned.
    target_col : str, default "TARGET"
        Name of the label column. When present it is set aside while the
        features are built and re-inserted as the first column of the result.

    Returns
    -------
    pandas.DataFrame
        The input columns (label first, if any) followed by the new features.
    """
    eps = 1e-6  # added to denominators so an exact zero does not divide by zero

    target = df[target_col] if target_col in df.columns else None
    # drop() returns a new frame even when nothing is removed, so every
    # assignment below goes to our own copy and never to the caller's frame.
    df = df.drop(columns=[target_col], errors="ignore")

    def safe(*cols):
        """True when every named column exists in the working frame."""
        return all(c in df.columns for c in cols)

    def add_ratio(name, numerator, denominator):
        """Add ``numerator / (denominator + eps)`` if both columns exist."""
        if safe(numerator, denominator):
            df[name] = df[numerator] / (df[denominator] + eps)

    # -------------------------
    # 1) Core Financial Ratios
    # -------------------------
    add_ratio("CREDIT_TO_INCOME", "AMT_CREDIT", "AMT_INCOME_TOTAL")
    add_ratio("ANNUITY_TO_INCOME", "AMT_ANNUITY", "AMT_INCOME_TOTAL")
    add_ratio("CREDIT_TO_ANNUITY", "AMT_CREDIT", "AMT_ANNUITY")

    # -------------------------
    # 2) Employment Ratios
    # -------------------------
    add_ratio("DAYS_EMPLOYED_PERC", "DAYS_EMPLOYED", "DAYS_BIRTH")

    # -------------------------
    # 3) Household Income
    # -------------------------
    if safe("CNT_FAM_MEMBERS", "AMT_INCOME_TOTAL"):
        df["INC_PER_PERSON"] = df["AMT_INCOME_TOTAL"] / (df["CNT_FAM_MEMBERS"] + eps)

    # -------------------------
    # 4) Age Groups
    # -------------------------
    if safe("DAYS_BIRTH"):
        # DAYS_BIRTH is negated before being converted to years; ages outside
        # (0, 120] fall in no bin and become NaN.
        df["AGE_SEGMENTS"] = pd.cut(
            -df["DAYS_BIRTH"] / 365,
            bins=[0, 25, 35, 45, 55, 65, 120],
            labels=[1, 2, 3, 4, 5, 6]
        ).astype("float32")

    # -------------------------
    # 5) EXT_SOURCE Interactions
    # -------------------------
    ext = [c for c in ("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3") if safe(c)]

    if len(ext) == 3:
        df["EXT_SOURCE_1_2"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"]
        df["EXT_SOURCE_1_3"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_3"]
        df["EXT_SOURCE_2_3"] = df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]

        df["EXT_SOURCES_SUM"] = df[ext].sum(axis=1)
        # Row-wise mean over the scores that are actually present. Dividing
        # the NaN-skipping sum by a fixed 3 would understate the mean of any
        # row with a missing score.
        df["EXT_SOURCES_MEAN"] = df[ext].mean(axis=1)

    # -------------------------
    # 6) Document Count
    # -------------------------
    doc_cols = [c for c in df.columns if "FLAG_DOCUMENT" in c]
    if len(doc_cols) > 0:
        df["FLAG_DOCUMENT_SUM"] = df[doc_cols].sum(axis=1)

    # -------------------------
    # 7) Payment Ratios
    # -------------------------
    add_ratio("PAYMENT_RATE", "AMT_ANNUITY", "AMT_CREDIT")
    # NOTE(review): PAYMENT_TO_INCOME is computed exactly like
    # ANNUITY_TO_INCOME above; kept so the output schema stays unchanged.
    add_ratio("PAYMENT_TO_INCOME", "AMT_ANNUITY", "AMT_INCOME_TOTAL")

    # -------------------------
    # 8) Time-based Ratios
    # -------------------------
    add_ratio("DAYS_REGISTRATION_TO_BIRTH", "DAYS_REGISTRATION", "DAYS_BIRTH")
    add_ratio("DAYS_ID_CHANGE_TO_BIRTH", "DAYS_ID_PUBLISH", "DAYS_BIRTH")

    # -------------------------
    # 9) Emergency State
    # -------------------------
    if safe("EMERGENCYSTATE_MODE"):
        # Anything other than the literal "Yes" (including missing) maps to 0.
        df["EMERGENCY_STATE_FLAG"] = (df["EMERGENCYSTATE_MODE"] == "Yes").astype("float32")

    # -------------------------
    # Reattach TARGET column
    # -------------------------
    if target is not None:
        df.insert(0, target_col, target)

    return df

# ----------------------------------------------------------
# Apply FE to both train and test
# ----------------------------------------------------------
# Run the feature builder on private copies so `train` / `test` stay untouched.
train_fe, test_fe = (
    add_feature_engineering(frame.copy()) for frame in (train, test)
)

print("Feature Engineering Completed!")
print("New train shape:", train_fe.shape)
print("New test shape:", test_fe.shape)

# Persist both engineered frames for the next stage.
train_fe.to_parquet(f"{BASE}/train_fe.parquet")
test_fe.to_parquet(f"{BASE}/test_fe.parquet")

print("Saved FE datasets.")


In [None]:
# ==========================================================
# CHUNK 8A — Column Cleanup + Alignment
# ==========================================================
import pandas as pd
import numpy as np

TARGET_COL = "TARGET"

train_pd = pd.read_parquet(f"{BASE}/train_fe.parquet")
test_pd  = pd.read_parquet(f"{BASE}/test_fe.parquet")

print("Loaded:", train_pd.shape, test_pd.shape)

# 1) Column sanitization: every run of characters outside [A-Za-z0-9_]
#    becomes a single underscore.
train_pd.columns = train_pd.columns.str.replace("[^A-Za-z0-9_]+", "_", regex=True)
test_pd.columns  = test_pd.columns.str.replace("[^A-Za-z0-9_]+", "_", regex=True)

# Two different original names can collapse to the same sanitized name.
# Duplicate labels break the alignment below, so fail with a clear message.
for _label, _frame in (("train", train_pd), ("test", test_pd)):
    _dupes = _frame.columns[_frame.columns.duplicated()].tolist()
    if _dupes:
        raise ValueError(
            f"Column sanitization produced duplicate names in {_label}: {_dupes}"
        )

# 2) Feature column alignment: test gets exactly train's feature columns,
#    in train's order.
feature_cols = [c for c in train_pd.columns if c != TARGET_COL]

# reindex() silently creates an all-NaN column for every feature that test
# does not have; report them so a train/test schema mismatch is visible.
missing_in_test = [c for c in feature_cols if c not in test_pd.columns]
if missing_in_test:
    print(
        f"WARNING: {len(missing_in_test)} feature(s) are absent from test and "
        f"will be all-NaN there: {missing_in_test[:20]}"
    )

test_pd = test_pd.reindex(columns=feature_cols)

train_pd = train_pd[[TARGET_COL] + feature_cols]

print("Columns aligned!")
print(train_pd.shape, test_pd.shape)

train_pd.to_parquet(f"{BASE}/train_aligned.parquet")
test_pd.to_parquet(f"{BASE}/test_aligned.parquet")

print("Saved CHUNK 8A.")

In [None]:
# ==========================================================
# CHUNK 8B — Column-by-Column Label Encoding (Safe RAM)
# ==========================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

TARGET_COL = "TARGET"

train_pd = pd.read_parquet(f"{BASE}/train_aligned.parquet")
test_pd  = pd.read_parquet(f"{BASE}/test_aligned.parquet")

feature_cols = [c for c in train_pd.columns if c != TARGET_COL]

for col in feature_cols:
    # Encode every non-numeric column, not only dtype "object": string or
    # categorical columns would otherwise reach the float32 cast in the
    # next cell unencoded.
    if pd.api.types.is_numeric_dtype(train_pd[col]):
        continue

    print("Encoding:", col)

    # Stringify once per frame; missing values become their string form and
    # so get a regular class of their own.
    train_str = train_pd[col].astype(str)
    test_str  = test_pd[col].astype(str)

    # Fit on train + test together so no test value is unseen by the encoder.
    le = LabelEncoder()
    le.fit(pd.concat([train_str, test_str], axis=0))

    train_pd[col] = le.transform(train_str)
    test_pd[col]  = le.transform(test_str)

print("Label encoding done.")
train_pd.to_parquet(f"{BASE}/train_encoded.parquet")
test_pd.to_parquet(f"{BASE}/test_encoded.parquet")


In [None]:
# ==========================================================
# CHUNK 8C — Missing Fill + Float32 Cast (Safe)
# ==========================================================
import pandas as pd
import numpy as np

TARGET_COL = "TARGET"

train_pd = pd.read_parquet(f"{BASE}/train_encoded.parquet")
test_pd  = pd.read_parquet(f"{BASE}/test_encoded.parquet")

feature_cols = [name for name in train_pd.columns if name != TARGET_COL]

# Remaining NaNs in the feature columns become 0 (the label is left alone).
for frame in (train_pd, test_pd):
    frame[feature_cols] = frame[feature_cols].fillna(0)

# Cast one column at a time rather than the whole frame at once.
for feature in feature_cols:
    for frame in (train_pd, test_pd):
        frame[feature] = frame[feature].astype(np.float32)

print("Float32 conversion done!")

train_pd.to_parquet(f"{BASE}/train_fe_clean.parquet")
test_pd.to_parquet(f"{BASE}/test_fe_clean.parquet")

print("CHUNK 8 COMPLETED.")
print(train_pd.shape, test_pd.shape)


In [None]:
# ==========================================================
# CHUNK 9 — Feature Importance Pruning (LightGBM CV)
# ==========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc

TARGET_COL = "TARGET"

print("=== Loading FE-clean datasets ===")

train_pd = pd.read_parquet(f"{BASE}/train_fe_clean.parquet")
test_pd  = pd.read_parquet(f"{BASE}/test_fe_clean.parquet")

print("Train shape:", train_pd.shape)
print("Test shape :", test_pd.shape)

# Features / label split.
X = train_pd.drop(columns=[TARGET_COL])
y = train_pd[TARGET_COL].values

feature_names = list(X.columns)

print(f"Total features: {len(feature_names)}")

# Baseline LightGBM settings used only to rank features.
params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=1,
    verbosity=-1,
    seed=42,
)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Gain importance is summed over folds here and averaged afterwards.
importances = np.zeros(len(feature_names))
fold_scores = []

print("\n=== Starting Feature Importance CV ===")

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), start=1):

    print(f"\n----- FOLD {fold} -----")

    dtrain = lgb.Dataset(X.iloc[tr_idx], y[tr_idx])
    dvalid = lgb.Dataset(X.iloc[val_idx], y[val_idx])

    # Up to 300 rounds, stopping early after 30 rounds without improvement
    # on the validation fold.
    stopper = lgb.early_stopping(stopping_rounds=30, verbose=False)
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=300,
        valid_sets=[dvalid],
        callbacks=[stopper],
    )

    fold_auc = roc_auc_score(y[val_idx], model.predict(X.iloc[val_idx]))
    fold_scores.append(fold_auc)

    importances += model.feature_importance(importance_type="gain")

    # Release the fold's datasets and booster before the next fold.
    del dtrain, dvalid, model
    gc.collect()

print("\nFold AUCs:", fold_scores)
print("Mean AUC:", np.mean(fold_scores))

# ----------------------------------------------------------
# 4) Normalize importance over folds
# ----------------------------------------------------------
# Average the accumulated gain over the number of folds the splitter
# actually produced, instead of a hard-coded 5.
importances /= kf.get_n_splits()

importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=True)

print("\nLowest 20 features:")
print(importance_df.head(20))

# ----------------------------------------------------------
# 5) Prune bottom 35%
# ----------------------------------------------------------
drop_fraction = 0.35
drop_count = int(len(feature_names) * drop_fraction)

# The applicant id must survive pruning whatever its importance: the final
# training cell reads SK_ID_CURR from the reduced test file to build the
# submission and falls back to row numbers when it is missing.
protected_cols = {"SK_ID_CURR"}
prunable = importance_df[~importance_df["feature"].isin(protected_cols)]

drop_features = prunable.head(drop_count)["feature"].tolist()

print(f"\nDropping {len(drop_features)} weak features...")

# ----------------------------------------------------------
# 6) Reduce datasets
# ----------------------------------------------------------
train_reduced = X.drop(columns=drop_features)
test_reduced  = test_pd.drop(columns=drop_features)

print("New shapes:")
print("Train reduced:", train_reduced.shape)
print("Test reduced :", test_reduced.shape)

# Reattach TARGET as the first column of the training frame.
train_reduced.insert(0, TARGET_COL, y)

# ----------------------------------------------------------
# 7) Save reduced feature sets
# ----------------------------------------------------------
train_reduced.to_parquet(f"{BASE}/final_train_reduced.parquet")
test_reduced.to_parquet(f"{BASE}/final_test_reduced.parquet")

print("\nSaved: final_train_reduced.parquet")
print("Saved: final_test_reduced.parquet")
print("=== CHUNK 9 COMPLETE ===")


In [None]:
# Installs Optuna via the %pip magic; no version is pinned.
%pip install optuna

In [None]:
# ==========================================================
# CHUNK 10 — OPTUNA TUNING (Fast Version, 3 Trials)
# ==========================================================

import optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

TARGET_COL = "TARGET"

# ----------------------------------------------------------
# Load FI-pruned dataset
# ----------------------------------------------------------
print("=== Loading final reduced dataset ===")

train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL].values

print("Train:", X.shape)

# ----------------------------------------------------------
# OPTUNA OBJECTIVE FUNCTION
# ----------------------------------------------------------
def objective(trial):
    """Score one Optuna trial: mean validation AUC over a 3-fold stratified CV.

    The hyper-parameters are sampled from ``trial``; the data comes from the
    module-level ``X`` and ``y``.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",

        # Sampled hyper-parameters
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.08),
        "num_leaves": trial.suggest_int("num_leaves", 31, 160),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "max_depth": trial.suggest_int("max_depth", -1, 12),

        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 6),

        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 3.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 3.0),
    }

    def fold_auc(fit_idx, holdout_idx):
        """Train on one split and return the AUC on its held-out part."""
        fit_set = lgb.Dataset(X.iloc[fit_idx], y[fit_idx])
        holdout_set = lgb.Dataset(X.iloc[holdout_idx], y[holdout_idx])

        # Up to 1000 rounds, stopping after 60 rounds without improvement.
        booster = lgb.train(
            params,
            fit_set,
            num_boost_round=1000,
            valid_sets=[holdout_set],
            callbacks=[lgb.early_stopping(stopping_rounds=60, verbose=False)],
        )
        return roc_auc_score(y[holdout_idx], booster.predict(X.iloc[holdout_idx]))

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    aucs = [fold_auc(fit_idx, holdout_idx) for fit_idx, holdout_idx in splitter.split(X, y)]

    return np.mean(aucs)

# ----------------------------------------------------------
# RUN OPTUNA (3 TRIALS)
# ----------------------------------------------------------
# A seeded sampler makes the sequence of sampled hyper-parameters repeatable
# from one run of the notebook to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3, show_progress_bar=True)

print("\nBEST AUC:", study.best_value)
print("BEST PARAMS:", study.best_params)

# ----------------------------------------------------------
# SAVE RESULTS
# ----------------------------------------------------------
import json

# The best hyper-parameters are stored as JSON so the final training cell can
# load them; the best score is stored alongside as plain text.
with open(f"{BASE}/best_optuna_params.json", "w") as f:
    json.dump(study.best_params, f, indent=4)

with open(f"{BASE}/best_optuna_auc.txt", "w") as f:
    f.write(str(study.best_value))

print("\nSaved: best_optuna_params.json and best_optuna_auc.txt")


In [None]:
# study.optimize(objective, n_trials=5) #  try it later if you want to reach better results.

In [None]:
# ==========================================================
# CHUNK 11 — FINAL LIGHTGBM TRAINING (OOF + TEST PREDICTIONS)
# ==========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib
import json

TARGET_COL = "TARGET"

print("=== Loading final reduced datasets ===")

train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
test_df  = pd.read_parquet(f"{BASE}/final_test_reduced.parquet")

X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL].values
X_test = test_df.copy()

feature_names = list(X.columns)

print("Train:", X.shape)
print("Test :", X_test.shape)

# Tuned hyper-parameters saved by the Optuna cell.
with open(f"{BASE}/best_optuna_params.json", "r") as f:
    best_params = json.load(f)

# The fixed (non-tuned) settings are layered on top of the loaded values.
best_params = {
    **best_params,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
}

print("\nUsing best params:")
print(best_params)

# Out-of-fold predictions for train, fold-averaged predictions for test.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []

print("\n=== Training FINAL MODEL with 5-FOLD CV ===")

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), start=1):

    print(f"\n----- FOLD {fold} -----")

    X_train, y_train = X.iloc[tr_idx], y[tr_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_val, y_val)

    # Up to 5000 rounds; stop after 100 rounds without validation
    # improvement and log the metric every 200 rounds.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        lgb.log_evaluation(period=200),
    ]
    model = lgb.train(
        best_params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dvalid],
        callbacks=fold_callbacks,
    )

    # Each training row is predicted by the one model that did not see it.
    val_pred = model.predict(X_val)
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc)

    print(f"Fold {fold} AUC: {auc:.6f}")

    # Every fold contributes an equal share to the test prediction.
    test_preds += model.predict(X_test) / kf.n_splits

    # Keep the fold's booster on disk.
    joblib.dump(model, f"{BASE}/lgbm_fold{fold}.pkl")

print("\nCV AUC scores:", fold_scores)
print("Mean AUC:", np.mean(fold_scores))

# ----------------------------------------------------------
# Save OOF predictions
# ----------------------------------------------------------
oof_df = pd.DataFrame({
    "TARGET": y,
    "oof_lgb": oof_preds
})
oof_df.to_csv(f"{BASE}/oof_lgb.csv", index=False)

print("Saved OOF predictions → oof_lgb.csv")

# ----------------------------------------------------------
# Save test predictions for submission
# ----------------------------------------------------------
if "SK_ID_CURR" in test_df.columns:
    submission_ids = test_df["SK_ID_CURR"]
else:
    # The id was pruned as a weak feature. final_test_reduced.parquet is
    # test_fe_clean.parquet with columns removed, so the rows line up and
    # the id can be recovered from there.
    submission_ids = None
    try:
        recovered = pd.read_parquet(
            f"{BASE}/test_fe_clean.parquet", columns=["SK_ID_CURR"]
        )
    except (OSError, KeyError, ValueError):
        recovered = None

    if recovered is not None and len(recovered) == len(test_df):
        submission_ids = recovered["SK_ID_CURR"]
    else:
        # Last resort, kept from the original behaviour — but say so loudly.
        print(
            "WARNING: SK_ID_CURR not available; using row positions instead. "
            "The submission ids will NOT be real applicant ids."
        )
        submission_ids = test_df.index

# The feature files store every column as float32; write the id as an integer.
# NOTE(review): float32 is exact only for integers up to 2**24 — confirm the
# id range fits.
submission_ids = np.rint(np.asarray(submission_ids)).astype("int64")

submission = pd.DataFrame({
    "SK_ID_CURR": submission_ids,
    "TARGET": test_preds
})

submission.to_csv(f"{BASE}/submission_lgbm.csv", index=False)

print("Saved submission → submission_lgbm.csv")

# ----------------------------------------------------------
# Save the parameters used for the final models (the fold boosters
# themselves were saved inside the CV loop)
# ----------------------------------------------------------
joblib.dump(best_params, f"{BASE}/lgbm_final_params.pkl")

print("\n=== CHUNK 11 COMPLETE ===")


In [None]:
# Installs CatBoost via the %pip magic; no version is pinned.
%pip install catboost

In [None]:
# ==========================================================
# CHUNK 12 — CatBoost FINAL Model (OOF + Test Predictions)
# ==========================================================

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib

# Purpose: train CatBoost with stratified 2-fold CV on the FI-pruned dataset,
# collect out-of-fold (OOF) predictions for the training rows, average the
# fold models' predictions on the test set, and persist both plus the models.
TARGET_COL = "TARGET"

print("=== Loading final reduced dataset ===")

train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
test_df  = pd.read_parquet(f"{BASE}/final_test_reduced.parquet")

y = train_df[TARGET_COL].values
X = train_df.drop(columns=[TARGET_COL])

feature_names = X.columns.tolist()

# Score the test set on exactly the training features, in training order.
# A plain copy of test_df would silently depend on both parquet files having
# identical column sets and ordering.
missing_features = [col for col in feature_names if col not in test_df.columns]
if missing_features:
    raise ValueError(
        f"Test set is missing {len(missing_features)} training feature(s), "
        f"e.g. {missing_features[:10]}"
    )
X_test = test_df[feature_names].copy()

print("Train:", X.shape)
print("Test :", X_test.shape)

# ----------------------------------------------------------
# CatBoost base parameters
# ----------------------------------------------------------
cat_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 3.0,
    "iterations": 1000,
    "random_seed": 42,
    "verbose": False,
    "task_type": "CPU"
}

# ----------------------------------------------------------
# Prepare OOF + Test buffers
# ----------------------------------------------------------
oof_preds = np.zeros(len(X))         # one slot per training row, filled fold by fold
test_preds = np.zeros(len(X_test))   # running mean of the fold models' test predictions

kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
fold_scores = []

print("\n=== Training CatBoost with 2-FOLD CV ===")

# ----------------------------------------------------------
# Main CV loop
# ----------------------------------------------------------
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):

    print(f"\n----- FOLD {fold} -----")

    X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_train, y_val = y[tr_idx], y[val_idx]

    model = CatBoostClassifier(**cat_params)

    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val)
    )

    # OOF prediction: positive-class probability for the held-out rows only
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc)

    print(f"Fold {fold} AUC: {auc:.6f}")

    # Add test prediction (dividing by n_splits yields the mean over folds)
    test_preds += model.predict_proba(X_test)[:, 1] / kf.n_splits

    # Save fold model
    model.save_model(f"{BASE}/catboost_fold{fold}.cbm")

print("\nCV AUC scores:", fold_scores)
print("Mean AUC:", np.mean(fold_scores))

# ----------------------------------------------------------
# Save OOF predictions
# ----------------------------------------------------------
oof_df = pd.DataFrame({
    "TARGET": y,
    "oof_cat": oof_preds
})
oof_df.to_csv(f"{BASE}/oof_cat.csv", index=False)

print("Saved OOF predictions → oof_cat.csv")

# ----------------------------------------------------------
# Save TEST predictions
# ----------------------------------------------------------
# NOTE(review): when test_df has no SK_ID_CURR column the positional index is
# written instead — confirm that this is a valid ID for the submission.
submission = pd.DataFrame({
    "SK_ID_CURR": test_df.index if "SK_ID_CURR" not in test_df else test_df["SK_ID_CURR"],
    "TARGET": test_preds
})

submission.to_csv(f"{BASE}/submission_cat.csv", index=False)

print("Saved submission → submission_cat.csv") # Submit to Kaggle!

print("\n=== CHUNK 12 COMPLETE ===")


In [None]:
# ==========================================================
# CHUNK 13 — XGBoost FINAL Model (OOF + Test Predictions)
# ==========================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Purpose: train XGBoost (native API) with stratified 2-fold CV on the
# FI-pruned dataset, collect OOF predictions, average the fold models'
# test predictions, and persist predictions and fold models.
TARGET_COL = "TARGET"

print("=== Loading final reduced dataset ===")

train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
test_df  = pd.read_parquet(f"{BASE}/final_test_reduced.parquet")

y = train_df[TARGET_COL].values
X = train_df.drop(columns=[TARGET_COL])

# Score the test set on exactly the training features, in training order.
missing_features = [col for col in X.columns if col not in test_df.columns]
if missing_features:
    raise ValueError(
        f"Test set is missing {len(missing_features)} training feature(s), "
        f"e.g. {missing_features[:10]}"
    )
X_test = test_df[X.columns.tolist()].copy()

print("Train:", X.shape)
print("Test :", X_test.shape)

# ----------------------------------------------------------
# XGBoost Parameters (tuned for speed + stability)
# ----------------------------------------------------------
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.03,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 2.0,
    "alpha": 0.0,
    "tree_method": "hist",      # fastest for CPU
    "random_state": 42
}

# ----------------------------------------------------------
# Prepare OOF and Test buffers
# ----------------------------------------------------------
oof_preds = np.zeros(len(X))         # one slot per training row, filled fold by fold
test_preds = np.zeros(len(X_test))   # running mean of the fold models' test predictions

kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
fold_scores = []

# The test matrix does not depend on the fold, so build it once instead of
# re-converting the whole test frame on every iteration.
dtest = xgb.DMatrix(X_test)

print("\n=== Training XGBoost with 2-FOLD CV ===")

# ----------------------------------------------------------
# Cross-validation loop
# ----------------------------------------------------------
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):

    print(f"\n----- FOLD {fold} -----")

    X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_train, y_val = y[tr_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=400,
        evals=[(dvalid, "valid")],
        early_stopping_rounds=100,
        verbose_eval=200
    )

    # OOF prediction for the held-out rows only
    val_pred = model.predict(dvalid)
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc)

    print(f"Fold {fold} AUC: {auc:.6f}")

    # Test prediction (dividing by n_splits yields the mean over folds)
    test_preds += model.predict(dtest) / kf.n_splits

    # Save fold model
    model.save_model(f"{BASE}/xgb_fold{fold}.json")

print("\nCV AUC scores:", fold_scores)
print("Mean AUC:", np.mean(fold_scores))

# ----------------------------------------------------------
# Save OOF predictions
# ----------------------------------------------------------
oof_df = pd.DataFrame({
    "TARGET": y,
    "oof_xgb": oof_preds
})
oof_df.to_csv(f"{BASE}/oof_xgb.csv", index=False)

print("Saved OOF predictions → oof_xgb.csv")

# ----------------------------------------------------------
# Save test predictions for submission
# ----------------------------------------------------------
# NOTE(review): when test_df has no SK_ID_CURR column the positional index is
# written instead — confirm that this is a valid ID for the submission.
submission = pd.DataFrame({
    "SK_ID_CURR": test_df.index if "SK_ID_CURR" not in test_df else test_df["SK_ID_CURR"],
    "TARGET": test_preds
})

submission.to_csv(f"{BASE}/submission_xgb.csv", index=False)

print("Saved submission → submission_xgb.csv")

print("\n=== CHUNK 13 COMPLETE ===")

In [None]:
# ==========================================================
# CHUNK 14 — Blending Weight Optimization (OOF)
# ==========================================================

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Purpose: grid-search convex blending weights for the three models' OOF
# predictions, report the best OOF AUC, and persist the search results and
# the winning weights (the text file is parsed by later cells).
print("Loading OOF files...")

oof_lgb = pd.read_csv(f"{BASE}/oof_lgb.csv")
oof_cat = pd.read_csv(f"{BASE}/oof_cat.csv")
oof_xgb = pd.read_csv(f"{BASE}/oof_xgb.csv")

# The files are combined positionally below, so they must describe the same
# rows in the same order. Matching TARGET columns is the check available here.
for _oof_name, _oof_frame in (("oof_cat", oof_cat), ("oof_xgb", oof_xgb)):
    if len(_oof_frame) != len(oof_lgb):
        raise ValueError(
            f"{_oof_name}.csv has {len(_oof_frame)} rows but oof_lgb.csv has {len(oof_lgb)}"
        )
    if not np.array_equal(_oof_frame["TARGET"].values, oof_lgb["TARGET"].values):
        raise ValueError(f"TARGET column of {_oof_name}.csv does not match oof_lgb.csv")

# Merge all OOF files side-by-side
df = pd.DataFrame({
    "TARGET": oof_lgb["TARGET"],
    "oof_lgb": oof_lgb["oof_lgb"],
    "oof_cat": oof_cat["oof_cat"],
    "oof_xgb": oof_xgb["oof_xgb"]
})

print("OOF combined shape:", df.shape)

y_true = df["TARGET"].values

# ----------------------------------------------------------
# Define weight search grid
# ----------------------------------------------------------
cat_weights = [0.40, 0.50, 0.60, 0.70]
lgb_weights = [0.10, 0.15, 0.20, 0.25]

# XGB weight = 1 - (cat + lgb)

best_auc = -1
best_weights = None
results = []

print("\n=== Starting Blending Weight Grid-Search ===\n")

for w_cat in cat_weights:
    for w_lgb in lgb_weights:
        # Round away binary floating-point noise (e.g. 0.30000000000000004).
        # Otherwise it would be printed and written to the weights file.
        w_xgb = round(1 - (w_cat + w_lgb), 6)

        if w_xgb < 0:
            continue

        pred = (
            w_cat * df["oof_cat"] +
            w_lgb * df["oof_lgb"] +
            w_xgb * df["oof_xgb"]
        )

        auc = roc_auc_score(y_true, pred)
        results.append((w_cat, w_lgb, w_xgb, auc))

        print(f"Weights → Cat={w_cat}, LGB={w_lgb}, XGB={w_xgb:.2f} | AUC={auc:.6f}")

        if auc > best_auc:
            best_auc = auc
            best_weights = (w_cat, w_lgb, w_xgb)

if best_weights is None:
    # Only reachable if every grid combination was skipped.
    raise RuntimeError("No valid weight combination in the search grid (all XGB weights < 0).")

# ----------------------------------------------------------
# Show best blending results
# ----------------------------------------------------------
print("\n=============================")
print(" BEST BLENDING RESULTS")
print("=============================")
print(f"Best AUC:     {best_auc:.6f}")
print(f"Best Weights: Cat={best_weights[0]}, LGB={best_weights[1]}, XGB={best_weights[2]}")
print("=============================\n")

# ----------------------------------------------------------
# Save weight search results & best weights
# ----------------------------------------------------------
pd.DataFrame(results, columns=["cat", "lgb", "xgb", "auc"]).to_csv(
    f"{BASE}/blending_weight_results.csv", index=False
)

# Keep the "Cat=…, LGB=…, XGB=…" layout: later cells extract the weights from
# this file with regular expressions.
with open(f"{BASE}/best_blending_weights.txt", "w") as f:
    f.write(f"Best AUC={best_auc}\n")
    f.write(f"Weights: Cat={best_weights[0]}, LGB={best_weights[1]}, XGB={best_weights[2]}\n")

print("Saved blending results and best weights.")


In [None]:
# ==========================================================
# CHUNK 15 — STACKING MODEL (Logistic Regression)
# ==========================================================

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import joblib

# Purpose: fit a logistic-regression meta-model on the three base models' OOF
# predictions, estimate its AUC with cross-validation, apply it to the base
# models' test predictions, and save the model, submission and OOF table.
print("=== Loading OOF predictions for stacking ===")

oof_lgb = pd.read_csv(f"{BASE}/oof_lgb.csv")
oof_cat = pd.read_csv(f"{BASE}/oof_cat.csv")
oof_xgb = pd.read_csv(f"{BASE}/oof_xgb.csv")

# Merge into a single meta-feature dataframe (files are combined by row position)
df = pd.DataFrame({
    "TARGET": oof_lgb["TARGET"],
    "oof_cat": oof_cat["oof_cat"],
    "oof_lgb": oof_lgb["oof_lgb"],
    "oof_xgb": oof_xgb["oof_xgb"]
})

print("Stacking DF shape:", df.shape)
print(df.head())

y_true = df["TARGET"].values

# ----------------------------------------------------------
# META-FEATURE Matrix
# ----------------------------------------------------------
X_meta = df[["oof_cat", "oof_lgb", "oof_xgb"]].values

# ----------------------------------------------------------
# Stacking OOF prediction
# ----------------------------------------------------------
# Scoring the meta-model on the rows it was fitted on would report an
# in-sample AUC. Instead, each row is predicted by a meta-model that did not
# see it, so the reported figure is a genuine out-of-fold estimate.
stack_model = LogisticRegression(max_iter=400)
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stack_oof_pred = cross_val_predict(
    stack_model, X_meta, y_true, cv=meta_cv, method="predict_proba"
)[:, 1]
auc_stack = roc_auc_score(y_true, stack_oof_pred)

print("\nSTACKING OOF AUC:", auc_stack)

# ----------------------------------------------------------
# Train Logistic Regression Meta-Model on all OOF rows (used for the test set)
# ----------------------------------------------------------
stack_model.fit(X_meta, y_true)

# ----------------------------------------------------------
# SAVE stacking model
# ----------------------------------------------------------
model_path = f"{BASE}/stacking_lr_model.pkl"
joblib.dump(stack_model, model_path)

print("Saved stacking model →", model_path)

# ----------------------------------------------------------
# Load test predictions from all 3 models
# These are submission_xxx.csv files produced earlier
# ----------------------------------------------------------
test_lgb = pd.read_csv(f"{BASE}/submission_lgbm.csv")
test_cat = pd.read_csv(f"{BASE}/submission_cat.csv")
test_xgb = pd.read_csv(f"{BASE}/submission_xgb.csv")

# Align by SK_ID_CURR with a keyed merge rather than by row position.
# The left join keeps the row order of the LightGBM submission.
base = test_lgb[["SK_ID_CURR", "TARGET"]].rename(columns={"TARGET": "pred_lgb"})
for _pred_col, _pred_frame in (("pred_cat", test_cat), ("pred_xgb", test_xgb)):
    base = base.merge(
        _pred_frame[["SK_ID_CURR", "TARGET"]].rename(columns={"TARGET": _pred_col}),
        on="SK_ID_CURR",
        how="left",
        validate="one_to_one",
    )

if base[["pred_cat", "pred_lgb", "pred_xgb"]].isna().any().any():
    raise ValueError("Submission files do not cover the same SK_ID_CURR values.")

# Apply stacking model on test predictions (same column order as X_meta)
X_test_meta = base[["pred_cat", "pred_lgb", "pred_xgb"]].values
stack_test_pred = stack_model.predict_proba(X_test_meta)[:, 1]

# ----------------------------------------------------------
# Final stacking submission
# ----------------------------------------------------------
submission = pd.DataFrame({
    "SK_ID_CURR": base["SK_ID_CURR"],
    "TARGET": stack_test_pred
})

submission.to_csv(f"{BASE}/submission_stacking.csv", index=False)

print("Saved final stacking submission → submission_stacking.csv")

# ----------------------------------------------------------
# Save OOF + stacked predictions for analysis
# ----------------------------------------------------------
df_out = df.copy()
df_out["stack_pred"] = stack_oof_pred
df_out.to_csv(f"{BASE}/stacking_oof_predictions.csv", index=False)

print("Saved OOF stacking predictions → stacking_oof_predictions.csv")

print("\n=== CHUNK 15 COMPLETE ===")


In [None]:
%pip install reportlab

In [None]:
# ==========================================================
# CHUNK 16 — FINAL STACKING REPORT + SUBMISSION CHECK
# ==========================================================



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, roc_auc_score
)
from sklearn.calibration import calibration_curve
from reportlab.platypus import SimpleDocTemplate, Image, Paragraph, Spacer
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet

# Purpose: load the stacked OOF table and build a blended prediction for
# comparison, using the weights found by the blending grid-search when the
# weights file is available.
import re

print("=== Loading stacking OOF data ===")
df = pd.read_csv(f"{BASE}/stacking_oof_predictions.csv")

y_true = df["TARGET"].values
stack_pred = df["stack_pred"].values
oof_cat = df["oof_cat"].values
oof_lgb = df["oof_lgb"].values
oof_xgb = df["oof_xgb"].values

# Best blending weights (optional for comparison).
# The literals below are only a fallback; the weights file written by the
# grid-search cell takes precedence so the comparison uses the real optimum.
w_cat, w_lgb, w_xgb = 0.50, 0.25, 0.25
try:
    with open(f"{BASE}/best_blending_weights.txt", "r") as f:
        _weights_text = f.read()
except FileNotFoundError:
    print("best_blending_weights.txt not found, using default blending weights.")
else:
    _weight_matches = [
        re.search(r"Cat=([\d.]+)", _weights_text),
        re.search(r"LGB=([\d.]+)", _weights_text),
        re.search(r"XGB=([\d.]+)", _weights_text),
    ]
    if all(_weight_matches):
        w_cat, w_lgb, w_xgb = (float(m.group(1)) for m in _weight_matches)
    else:
        print("Could not parse best_blending_weights.txt, using default blending weights.")

blend_pred = w_cat * oof_cat + w_lgb * oof_lgb + w_xgb * oof_xgb


# ==========================================================
# Helper: Save figure
# ==========================================================
def save_fig(path):
    """Write the current matplotlib figure to ``path`` and close it.

    The figure is saved at 250 dpi with a tight bounding box.
    """
    current_fig = plt.gcf()
    current_fig.savefig(path, dpi=250, bbox_inches="tight")
    plt.close(current_fig)


# ==========================================================
# 1) ROC CURVE
# ==========================================================
# ROC of the stacked predictions; `auc` here is sklearn's trapezoidal-area helper.
fpr, tpr, _ = roc_curve(y_true, stack_pred)
roc_auc = auc(fpr, tpr)
roc_path = f"{BASE}/ROC_STACKING.png"

roc_fig, roc_ax = plt.subplots(figsize=(7, 5))
roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}", linewidth=2.3)
roc_ax.plot([0, 1], [0, 1], "k--")  # chance diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve – Stacking Model")
roc_ax.legend()
roc_ax.grid(alpha=0.3)

save_fig(roc_path)
print("Saved ROC curve.")


# ==========================================================
# 2) PRECISION–RECALL CURVE
# ==========================================================
precision, recall, _ = precision_recall_curve(y_true, stack_pred)
pr_path = f"{BASE}/PR_STACKING.png"

pr_fig, pr_ax = plt.subplots(figsize=(7, 5))
pr_ax.plot(recall, precision, linewidth=2.3)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision–Recall Curve – Stacking")
pr_ax.grid(alpha=0.3)

save_fig(pr_path)
print("Saved PR curve.")


# ==========================================================
# 3) CALIBRATION CURVE
# ==========================================================
prob_true, prob_pred = calibration_curve(y_true, stack_pred, n_bins=20)
cal_path = f"{BASE}/CALIBRATION_STACKING.png"

cal_fig, cal_ax = plt.subplots(figsize=(7, 5))
cal_ax.plot(prob_pred, prob_true, "o-", linewidth=2)
cal_ax.plot([0, 1], [0, 1], "k--")  # perfect-calibration diagonal
cal_ax.set_xlabel("Predicted Probability")
cal_ax.set_ylabel("Observed Default Rate")
cal_ax.set_title("Calibration Curve – Stacking")
cal_ax.grid(alpha=0.3)

save_fig(cal_path)
print("Saved calibration curve.")


# ==========================================================
# 4) MODEL COMPARISON (OOF AUC)
# ==========================================================
auc_cat = roc_auc_score(y_true, oof_cat)
auc_lgb = roc_auc_score(y_true, oof_lgb)
auc_xgb = roc_auc_score(y_true, oof_xgb)
auc_blend = roc_auc_score(y_true, blend_pred)
auc_stack = roc_auc_score(y_true, stack_pred)

# Labels and scores are kept in matching order for the chart below.
model_names = ["CatBoost", "LightGBM", "XGBoost", "Blending", "Stacking"]
scores = [auc_cat, auc_lgb, auc_xgb, auc_blend, auc_stack]
mc_path = f"{BASE}/MODEL_COMPARISON.png"

mc_fig, mc_ax = plt.subplots(figsize=(10, 5))
mc_ax.plot(model_names, scores, marker="o", linewidth=2.5)
mc_ax.set_title("Model Comparison – OOF AUC Scores")
mc_ax.set_ylabel("AUC")
mc_ax.grid(alpha=0.3)

save_fig(mc_path)
print("Saved model comparison chart.")


# ==========================================================
# 5) PDF REPORT
# ==========================================================
# Purpose: assemble the saved figures into a PDF report, then print a quick
# look at the stacking submission file.
pdf_path = f"{BASE}/stacking_final_report.pdf"
doc = SimpleDocTemplate(pdf_path, pagesize=A4)
styles = getSampleStyleSheet()

story = []
story.append(Paragraph("<b>Stacking Model Evaluation Report</b>", styles["Title"]))
story.append(Spacer(1, 20))

def add_image(title, path):
    """Append a headed figure to the report story.

    Parameters
    ----------
    title : str
        Heading shown above the figure.
    path : str
        Path of the image file to embed.

    The figure is scaled to fit inside a 480 x 320 point box while keeping
    its aspect ratio. A fixed width and height would stretch figures that
    were saved with a different shape, such as the 10 x 5 comparison chart.
    """
    story.append(Paragraph(f"<b>{title}</b>", styles["Heading2"]))
    story.append(Spacer(1, 10))
    story.append(Image(path, width=480, height=320, kind="proportional"))
    story.append(Spacer(1, 30))

add_image("ROC Curve", roc_path)
add_image("Precision–Recall Curve", pr_path)
add_image("Calibration Curve", cal_path)
add_image("Model Comparison", mc_path)

doc.build(story)
print("PDF created:", pdf_path)


# ==========================================================
# 6) FINAL SUBMISSION CHECK
# ==========================================================
sub = pd.read_csv(f"{BASE}/submission_stacking.csv")
print("\nSUBMISSION SHAPE →", sub.shape)
print(sub.head())

print("\n=== CHUNK 16 COMPLETED ===")

In [None]:
# ==========================================================
# CHUNK 17 — FINAL LGBM TRAINING (AFTER FI PRUNING) + SHAP
# (LightGBM >=4.0 compatible, no verbose_eval argument)
# ==========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import gc

# Purpose: train one LightGBM model on the full FI-pruned training set with
# the stored Optuna parameters, save it, and export a SHAP summary plot
# computed on a row sample.
print("=== Loading final reduced train set ===")

# Load FI-pruned dataset
train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")

y = train_df["TARGET"]
X = train_df.drop(columns=["TARGET"])

print("Train shape:", X.shape)

# ----------------------------------------------------------
# LOAD BEST OPTUNA PARAMS
# ----------------------------------------------------------
import json
with open(f"{BASE}/best_optuna_params.json", "r") as f:
    best_params = json.load(f)

# Add LightGBM-required params
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt"
})

print("\nUsing params:")
print(best_params)

# ----------------------------------------------------------
# FINAL MODEL TRAINING
# ----------------------------------------------------------
print("\n=== Training final LightGBM model (after FI pruning) ===")

dtrain = lgb.Dataset(X, y)

# valid_sets is the training data itself, so the logged AUC is a training
# metric and not a validation score.
final_lgb = lgb.train(
    best_params,
    dtrain,
    num_boost_round=200,
    valid_sets=[dtrain],
    callbacks=[lgb.log_evaluation(period=200)]
)

# Save model
model_path = f"{BASE}/lgbm_final_after_pruning.txt"
final_lgb.save_model(model_path)

print(f"\nModel saved → {model_path}")

# ----------------------------------------------------------
# SHAP EXPLAINABILITY
# ----------------------------------------------------------
print("\n=== Building SHAP Explainer ===")

# Sample rows for SHAP (makes it faster)
sample_size = min(20000, len(X))
X_sample = X.sample(sample_size, random_state=42)

explainer = shap.TreeExplainer(final_lgb)
shap_values = explainer.shap_values(X_sample)

# Depending on the shap version, a binary classifier yields either a single
# (rows, features) array, a per-class list, or a (rows, features, classes)
# array. The dot plot below needs the 2-D positive-class attributions.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, -1]

print("SHAP values computed.")

# ----------------------------------------------------------
# SAVE SHAP SUMMARY PLOT
# ----------------------------------------------------------
print("\n=== Plotting SHAP summary ===")

# show=False keeps the figure open so it can be written to disk.
shap.summary_plot(shap_values, X_sample, plot_type="dot", show=False)

plt.savefig(f"{BASE}/SHAP_SUMMARY.png", dpi=300, bbox_inches="tight")
plt.close()

print(f"SHAP summary saved → {BASE}/SHAP_SUMMARY.png")

# Clean memory
del explainer, shap_values
gc.collect()

print("\n=== CHUNK 17 COMPLETED ===")


In [None]:
# ==========================================================
# CHUNK 17.5 — FINAL XGBOOST TRAINING (AFTER FI PRUNING)
# ==========================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Purpose: estimate XGBoost performance with 2-fold CV on the FI-pruned
# dataset, then train and save a final model on all rows.
print("=== Loading FI-pruned final dataset ===")

train_df = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
y = train_df["TARGET"].values
X = train_df.drop(columns=["TARGET"])

print("Train shape:", X.shape)

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 2.0,
    "alpha": 1.0,
    "tree_method": "hist",
    "seed": 42
}

kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
auc_scores = []
best_rounds = []  # number of boosting rounds at each fold's best iteration

print("\n=== XGBoost CV Training ===")

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    print(f"\n----- FOLD {fold} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_val = y[train_idx], y[valid_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=200,
        early_stopping_rounds=80,
        evals=[(dval, "valid")],
        verbose_eval=200
    )

    preds = model.predict(dval)
    auc = roc_auc_score(y_val, preds)
    auc_scores.append(auc)

    # best_iteration is a 0-based index, so the round count is index + 1.
    best_rounds.append(model.best_iteration + 1)

print("\nCV AUC Scores:", auc_scores)
print("Mean CV AUC:", np.mean(auc_scores))

# Train final XGB model on FULL DATA
print("\n=== Training final XGB model on FULL DATA ===")

# Use the mean best round count over all folds, not only the last fold's
# model, scaled by 1.3 as before. Never drop below one round.
final_rounds = max(1, int(np.mean(best_rounds) * 1.3))
print("Boosting rounds for final model:", final_rounds)

dtrain_full = xgb.DMatrix(X, label=y)
final_xgb = xgb.train(
    params,
    dtrain_full,
    num_boost_round=final_rounds
)

# SAVE MODEL
xgb_path = f"{BASE}/xgb_final_after_pruning.json"
final_xgb.save_model(xgb_path)

print("\nSaved final XGBoost model →", xgb_path)


In [None]:
# ==========================================================
# CHUNK 18 — FINAL ENSEMBLE (OOF + BLEND + STACK + SUBMISSION)
# FIXED: Load correct fold models (2 folds for Cat/XGB, 5 folds for LGBM)
# ==========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import joblib
import gc

# Purpose: rebuild the averaged test predictions from the saved fold models
# and collect the three models' out-of-fold (OOF) predictions into one table
# for blending and stacking.
print("=== Loading FI-pruned train & test ===")

train = pd.read_parquet(f"{BASE}/final_train_reduced.parquet")
test  = pd.read_parquet(f"{BASE}/final_test_reduced.parquet")

y = train["TARGET"].values
X = train.drop(columns=["TARGET"])
X_test = test.copy()

print("Train:", X.shape)
print("Test :", X_test.shape)

# ==========================================================
# TEST PREDICTIONS FROM FOLD MODELS
# ==========================================================

print("\n=== Generating test predictions from fold models ===")

# Kept for notebook-namespace compatibility; no split is re-created in this cell.
from sklearn.model_selection import StratifiedKFold

test_cat = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))

# CatBoost: 2 folds (use fold1 and fold2)
print("\n--- CatBoost (2 folds) ---")
n_folds_cat = 2
for fold_num in range(1, n_folds_cat + 1):
    model_path = f"{BASE}/catboost_fold{fold_num}.cbm"
    print(f"Loading: {model_path}")

    cat = CatBoostClassifier()
    cat.load_model(model_path)

    # Average test predictions across folds
    test_cat += cat.predict_proba(X_test)[:, 1] / n_folds_cat

# XGBoost: 2 folds (use fold1 and fold2)
print("\n--- XGBoost (2 folds) ---")
n_folds_xgb = 2
dtest = xgb.DMatrix(X_test)  # identical for every fold, so build it once
for fold_num in range(1, n_folds_xgb + 1):
    model_path = f"{BASE}/xgb_fold{fold_num}.json"
    print(f"Loading: {model_path}")

    xgb_model = xgb.Booster()
    xgb_model.load_model(model_path)

    test_xgb += xgb_model.predict(dtest) / n_folds_xgb

# LightGBM: 5 folds (use fold1 to fold5)
print("\n--- LightGBM (5 folds) ---")
n_folds_lgb = 5
for fold_num in range(1, n_folds_lgb + 1):
    model_path = f"{BASE}/lgbm_fold{fold_num}.pkl"
    print(f"Loading: {model_path}")

    # NOTE(review): joblib.load unpickles the file; only load fold models
    # produced by this notebook.
    lgbm_model = joblib.load(model_path)
    test_lgb += lgbm_model.predict(X_test) / n_folds_lgb

print("\n=== Test predictions computed (average of fold models) ===")

# ==========================================================
# LOAD GENUINE OOF PREDICTIONS
# ==========================================================
# Predicting the training rows with models that were trained on those rows
# gives in-sample scores, not OOF predictions. That inflates every AUC and
# fits the stacker on features unlike the fold-averaged test predictions.
# The per-model CV cells already saved true OOF predictions (oof_cat.csv,
# oof_lgb.csv, oof_xgb.csv), so those files are used instead.

print("\n--- Loading OOF predictions saved by the CV cells ---")

_oof_columns = {}
for _model_key in ("cat", "lgb", "xgb"):
    _oof_file = f"{BASE}/oof_{_model_key}.csv"
    print(f"Loading: {_oof_file}")
    _oof_frame = pd.read_csv(_oof_file)

    # The OOF rows must be the rows of `train`, in the same order.
    if len(_oof_frame) != len(X):
        raise ValueError(f"{_oof_file} has {len(_oof_frame)} rows, expected {len(X)}")
    if not np.array_equal(_oof_frame["TARGET"].values, y):
        raise ValueError(f"TARGET column of {_oof_file} does not match the training set")

    _oof_columns[_model_key] = _oof_frame[f"oof_{_model_key}"].values

oof_cat = _oof_columns["cat"]
oof_lgb = _oof_columns["lgb"]
oof_xgb = _oof_columns["xgb"]

print("OOF shapes:", oof_cat.shape, oof_lgb.shape, oof_xgb.shape)

# ==========================================================
# SAVE OOF PREDICTIONS
# ==========================================================

oof_df = pd.DataFrame({
    "TARGET": y,
    "oof_cat": oof_cat,
    "oof_lgb": oof_lgb,
    "oof_xgb": oof_xgb
})

oof_path = f"{BASE}/oof_predictions_final.csv"
oof_df.to_csv(oof_path, index=False)
print(f"\nSaved OOF predictions → {oof_path}")

# ==========================================================
# BLENDING
# ==========================================================

# Blend the three models with the weights stored by the grid-search cell and
# report the blended OOF AUC.
import re

print("\n=== Blending Ensemble ===")

# Read the stored weights file and echo it
with open(f"{BASE}/best_blending_weights.txt", "r") as f:
    content = f.read()
print(content)

# Each pattern captures the number that follows its "<Model>=" label
_cat_match = re.search(r"Cat=([\d.]+)", content)
_lgb_match = re.search(r"LGB=([\d.]+)", content)
_xgb_match = re.search(r"XGB=([\d.]+)", content)

w_cat = float(_cat_match.group(1))
w_lgb = float(_lgb_match.group(1))
w_xgb = float(_xgb_match.group(1))

print(f"\nBlending weights: Cat={w_cat}, LGB={w_lgb}, XGB={w_xgb}")

# Weighted sums, with the same weights on the OOF columns and test predictions
blend_oof = w_cat * oof_df["oof_cat"] + w_lgb * oof_df["oof_lgb"] + w_xgb * oof_df["oof_xgb"]
blend_test = w_cat * test_cat + w_lgb * test_lgb + w_xgb * test_xgb

auc_blend = roc_auc_score(y, blend_oof)
print(f"Blending OOF AUC: {auc_blend:.6f}")

# ==========================================================
# STACKING META-MODEL
# ==========================================================

# Purpose: fit the logistic-regression stacker on the base models' OOF
# columns, report a cross-validated AUC for it, score the test set and write
# the final submission.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

print("\n=== Training Stacking Meta-Model ===")

X_meta = oof_df[["oof_cat", "oof_lgb", "oof_xgb"]]

stack = LogisticRegression(max_iter=20000, random_state=42)

# Scoring the stacker on its own training rows would report an in-sample
# AUC. Each row is therefore predicted by a stacker that did not see it.
_meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stack_oof = cross_val_predict(stack, X_meta, y, cv=_meta_cv, method="predict_proba")[:, 1]
auc_stack = roc_auc_score(y, stack_oof)

print(f"Stacking OOF AUC: {auc_stack:.6f}")

# Final stacker used for the test set: fitted on all OOF rows
stack.fit(X_meta, y)

# Save stacking model
joblib.dump(stack, f"{BASE}/stacking_model_final.pkl")

# ==========================================================
# FINAL TEST PREDICTIONS (STACKING)
# ==========================================================

# Column names match X_meta so the fitted model sees the same feature names.
X_meta_test = pd.DataFrame({
    "oof_cat": test_cat,
    "oof_lgb": test_lgb,
    "oof_xgb": test_xgb
})

stack_test = stack.predict_proba(X_meta_test)[:, 1]

# Get SK_ID_CURR from original application_test
app_test = pd.read_parquet(f"{BASE}/application_test.parquet")

# IDs are paired with predictions by row position, so the row counts must agree.
# NOTE(review): this also assumes application_test and final_test_reduced share
# the same row order — confirm against the feature-engineering cells.
if len(app_test) != len(stack_test):
    raise ValueError(
        f"application_test has {len(app_test)} rows but {len(stack_test)} predictions were produced"
    )

submission = pd.DataFrame({
    "SK_ID_CURR": app_test["SK_ID_CURR"].values,
    "TARGET": stack_test
})

sub_path = f"{BASE}/submission_stacking_final.csv"
submission.to_csv(sub_path, index=False)

print(f"\nFinal submission shape: {submission.shape}")
print(f"Saved submission → {sub_path}")
print("\n=== CHUNK 18 COMPLETED SUCCESSFULLY ===")


In [None]:
# ==========================================================
# CHUNK 19 — FINAL REPORT (ROC, PR, KS, CALIBRATION, PDF)
# WITH STACKING RESULTS
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from sklearn.calibration import calibration_curve
from reportlab.platypus import SimpleDocTemplate, Image, Paragraph, Spacer, PageBreak, Table, TableStyle
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
from pathlib import Path

# Ensure outputs/reports exists
REPORTS_DIR = Path(BASE).parent.parent / "outputs" / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("=== LOADING OOF PREDICTIONS ===")

df = pd.read_csv(f"{BASE}/oof_predictions_final.csv")

y_true = df["TARGET"].values
oof_cat = df["oof_cat"].values
oof_lgb = df["oof_lgb"].values
oof_xgb = df["oof_xgb"].values

# Load stacking OOF predictions (reconstruct them)
print("Computing stacking OOF predictions...")

import joblib
stack_model = joblib.load(f"{BASE}/stacking_model_final.pkl")

X_meta_oof = pd.DataFrame({
    "oof_cat": oof_cat,
    "oof_lgb": oof_lgb,
    "oof_xgb": oof_xgb
})

stack_oof = stack_model.predict_proba(X_meta_oof)[:, 1]

# Blending for comparison
with open(f"{BASE}/best_blending_weights.txt", "r") as f:
    content = f.read()

import re
w_cat = float(re.search(r"Cat=([\d.]+)", content).group(1))
w_lgb = float(re.search(r"LGB=([\d.]+)", content).group(1))
w_xgb = float(re.search(r"XGB=([\d.]+)", content).group(1))

blend_oof = (
    w_cat * oof_cat +
    w_lgb * oof_lgb +
    w_xgb * oof_xgb
)

# ==========================================================
# COMPUTE ALL METRICS
# ==========================================================

print("\n=== Computing Metrics ===")

# Individual model AUCs
auc_cat = roc_auc_score(y_true, oof_cat)
auc_lgb = roc_auc_score(y_true, oof_lgb)
auc_xgb = roc_auc_score(y_true, oof_xgb)
auc_blend = roc_auc_score(y_true, blend_oof)
auc_stack = roc_auc_score(y_true, stack_oof)

print(f"CatBoost AUC:  {auc_cat:.6f}")
print(f"LightGBM AUC:  {auc_lgb:.6f}")
print(f"XGBoost AUC:   {auc_xgb:.6f}")
print(f"Blending AUC:  {auc_blend:.6f}")
print(f"Stacking AUC:  {auc_stack:.6f}")

# KS Score
def ks_score(y, pred):
    """Kolmogorov–Smirnov statistic between the score distributions of the two classes.

    The statistic is the largest absolute gap between the cumulative share of
    positives (``y == 1``) and of negatives (``y == 0``) when rows are ordered
    by `pred`.

    Args:
        y: array-like of binary labels (1 = bad/positive, 0 = good/negative).
        pred: array-like of scores, same length as `y`.

    Returns:
        float in [0, 1]; ``nan`` when either class is absent (including empty
        input), because the class CDFs are then undefined.
    """
    labels = np.asarray(y)
    scores = np.asarray(pred, dtype=float)

    n_bad = int((labels == 1).sum())
    n_good = int((labels == 0).sum())
    if n_bad == 0 or n_good == 0:
        return float("nan")

    order = np.argsort(scores, kind="stable")
    labels = labels[order]
    scores = scores[order]

    cum_bad = np.cumsum(labels == 1) / n_bad
    cum_good = np.cumsum(labels == 0) / n_good

    # Only compare the CDFs at the end of each run of tied scores: inside a run
    # the gap depends on the arbitrary row order and would overstate KS
    # (e.g. constant predictions must give 0, not a positive value).
    is_group_end = np.append(scores[1:] != scores[:-1], True)
    return float(np.abs(cum_bad - cum_good)[is_group_end].max())

# KS is reported for the two ensembles only.
ks_blend, ks_stack = (
    ks_score(y_true, ensemble_scores) for ensemble_scores in (blend_oof, stack_oof)
)

print(
    f"\nBlending KS Score: {ks_blend:.6f}",
    f"Stacking KS Score: {ks_stack:.6f}",
    sep="\n",
)

# ==========================================================
# 1) ROC CURVE - STACKING vs BLENDING
# ==========================================================

print("\nPlotting ROC Curves (Stacking vs Blending)...")

# The third return value (thresholds) is not needed for the plot.
fpr_blend, tpr_blend, _ = roc_curve(y_true, blend_oof)
fpr_stack, tpr_stack, _ = roc_curve(y_true, stack_oof)

roc_path = str(REPORTS_DIR / "ROC_COMPARISON.png")

fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(fpr_blend, tpr_blend, linewidth=2.5, label=f"Blending (AUC={auc_blend:.4f})", color="#6CC24A")
ax.plot(fpr_stack, tpr_stack, linewidth=2.5, label=f"Stacking (AUC={auc_stack:.4f})", color="#FF6B6B")
# Diagonal = chance-level classifier.
ax.plot([0, 1], [0, 1], "--", color="gray", linewidth=1)
ax.set_title("ROC Curve — Blending vs Stacking", fontsize=14, fontweight="bold")
ax.set_xlabel("False Positive Rate", fontsize=12)
ax.set_ylabel("True Positive Rate", fontsize=12)
ax.grid(alpha=0.3)
ax.legend(fontsize=11, loc="lower right")

fig.savefig(roc_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# ==========================================================
# 2) PRECISION–RECALL CURVE
# ==========================================================

print("Plotting Precision–Recall Curves...")

# The third return value (thresholds) is not needed for the plot.
prec_blend, rec_blend, _ = precision_recall_curve(y_true, blend_oof)
prec_stack, rec_stack, _ = precision_recall_curve(y_true, stack_oof)

pr_path = str(REPORTS_DIR / "PR_COMPARISON.png")

fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(rec_blend, prec_blend, linewidth=2.5, label="Blending", color="#6CC24A")
ax.plot(rec_stack, prec_stack, linewidth=2.5, label="Stacking", color="#FF6B6B")
ax.set_title("Precision–Recall Curve — Blending vs Stacking", fontsize=14, fontweight="bold")
ax.set_xlabel("Recall", fontsize=12)
ax.set_ylabel("Precision", fontsize=12)
ax.grid(alpha=0.3)
ax.legend(fontsize=11)

fig.savefig(pr_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# ==========================================================
# 3) CALIBRATION CURVES
# ==========================================================

print("Plotting Calibration Curves...")

# calibration_curve returns (observed positive fraction, mean predicted probability) per bin.
prob_true_blend, prob_pred_blend = calibration_curve(y_true, blend_oof, n_bins=20)
prob_true_stack, prob_pred_stack = calibration_curve(y_true, stack_oof, n_bins=20)

calib_path = str(REPORTS_DIR / "CALIBRATION_COMPARISON.png")

fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(prob_pred_blend, prob_true_blend, "o-", linewidth=2.5, markersize=6, label="Blending", color="#6CC24A")
ax.plot(prob_pred_stack, prob_true_stack, "s-", linewidth=2.5, markersize=6, label="Stacking", color="#FF6B6B")
# Diagonal = perfectly calibrated model.
ax.plot([0, 1], [0, 1], "--", color="gray", linewidth=1)
ax.set_title("Calibration Curve — Blending vs Stacking", fontsize=14, fontweight="bold")
ax.set_xlabel("Predicted Probability", fontsize=12)
ax.set_ylabel("Observed Frequency", fontsize=12)
ax.grid(alpha=0.3)
ax.legend(fontsize=11)

fig.savefig(calib_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# ==========================================================
# 4) MODEL AUC COMPARISON (All models)
# ==========================================================

print("Plotting Model Comparison Chart...")

# Parallel lists: one entry per model, in plotting order.
model_names = ["CatBoost", "LightGBM", "XGBoost", "Blending", "Stacking"]
auc_scores = [auc_cat, auc_lgb, auc_xgb, auc_blend, auc_stack]
colors_list = ["#FF6B6B", "#1E90FF", "#FFA534", "#6CC24A", "#FF1493"]
markers = ["o", "s", "D", "^", "*"]

x = np.arange(len(model_names))
comp_path = str(REPORTS_DIR / "MODEL_COMPARISON.png")

fig, ax = plt.subplots(figsize=(12, 7))
# Faint connector line behind the per-model markers.
ax.plot(x, auc_scores, "--", color="#555", linewidth=1.5, alpha=0.5)

for x_pos, score, point_color, point_marker in zip(x, auc_scores, colors_list, markers):
    ax.scatter(x_pos, score, color=point_color, s=200, marker=point_marker, edgecolor="black", linewidth=1.5)
    # Value label sits slightly above its marker.
    ax.text(x_pos, score + 0.0025, f"{score:.4f}",
            ha="center", fontsize=11, fontweight="bold")

ax.set_xticks(x)
ax.set_xticklabels(model_names, fontsize=11)
ax.set_ylabel("AUC Score", fontsize=12)
ax.set_title("Model Performance Comparison (OOF AUC)", fontsize=14, fontweight="bold")
ax.grid(alpha=0.3, axis="y")
ax.set_ylim([max(auc_scores) - 0.1, 1.0])

fig.savefig(comp_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# ==========================================================
# 5) BUILD PDF REPORT WITH STACKING RESULTS
# ==========================================================

print("\n=== Building PDF Report ===")

pdf_path = str(REPORTS_DIR / "FINAL_MODEL_REPORT.pdf")
doc = SimpleDocTemplate(pdf_path, pagesize=A4)
styles = getSampleStyleSheet()
story = []  # flowables are appended in page order and rendered by doc.build()

# Title
story.append(Paragraph("<b>Home Credit Default Risk<br/>Final Model Evaluation Report</b>", styles["Title"]))
story.append(Spacer(1, 20))

# Performance Summary Table
story.append(Paragraph("<b>Performance Summary</b>", styles["Heading2"]))
story.append(Spacer(1, 10))

# KS is only computed for the two ensembles, hence the dashes for base learners.
perf_data = [
    ["Model", "AUC Score", "KS Score"],
    ["CatBoost", f"{auc_cat:.6f}", "—"],
    ["LightGBM", f"{auc_lgb:.6f}", "—"],
    ["XGBoost", f"{auc_xgb:.6f}", "—"],
    ["Blending", f"{auc_blend:.6f}", f"{ks_blend:.6f}"],
    ["Stacking", f"{auc_stack:.6f}", f"{ks_stack:.6f}"],
]

perf_table = Table(perf_data, colWidths=[150, 150, 150])
perf_table.setStyle(TableStyle([
    # Header row
    ("BACKGROUND", (0, 0), (-1, 0), colors.grey),
    ("TEXTCOLOR", (0, 0), (-1, 0), colors.whitesmoke),
    ("ALIGN", (0, 0), (-1, -1), "CENTER"),
    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
    ("FONTSIZE", (0, 0), (-1, 0), 11),
    ("BOTTOMPADDING", (0, 0), (-1, 0), 12),
    # Body rows
    ("BACKGROUND", (0, 1), (-1, -1), colors.beige),
    ("GRID", (0, 0), (-1, -1), 1, colors.black),
    ("FONTNAME", (0, 1), (-1, -1), "Helvetica"),
    ("FONTSIZE", (0, 1), (-1, -1), 10),
]))

story.append(perf_table)
story.append(Spacer(1, 20))

# Key Findings
story.append(Paragraph("<b>Key Findings</b>", styles["Heading2"]))
story.append(Spacer(1, 10))

# Derive the "best" statements from the measured AUCs instead of hard-coding a
# model name, so the text and the numbers can never disagree.
individual_aucs = {"CatBoost": auc_cat, "LightGBM": auc_lgb, "XGBoost": auc_xgb}
best_single_name = max(individual_aucs, key=individual_aucs.get)
best_single_auc = individual_aucs[best_single_name]
top_auc = max(best_single_auc, auc_blend, auc_stack)

# ReportLab paragraph markup has no CSS-style attribute on <b>; colour is
# applied with <font color="...">.
best_tag = ' <font color="green"><b>(BEST PERFORMANCE)</b></font>'
single_tag = best_tag if best_single_auc == top_auc else ""
blend_tag = best_tag if auc_blend == top_auc else ""
stack_tag = best_tag if auc_stack == top_auc else ""

findings_text = f"""
<font size=10>
• <b>Best Individual Model:</b> {best_single_name} with AUC = {best_single_auc:.4f}{single_tag}<br/>
• <b>Blending Ensemble:</b> AUC = {auc_blend:.4f}, KS = {ks_blend:.4f} (weights: Cat={w_cat}, LGB={w_lgb}, XGB={w_xgb}){blend_tag}<br/>
• <b>Stacking Meta-Model:</b> AUC = {auc_stack:.4f}, KS = {ks_stack:.4f}{stack_tag}<br/>
• Stacking improves over blending by {(auc_stack - auc_blend)*100:.2f}% in AUC<br/>
• Stacking improves over best individual model by {(auc_stack - best_single_auc)*100:.2f}% in AUC
</font>
"""

story.append(Paragraph(findings_text, styles["Normal"]))
story.append(Spacer(1, 20))

# Helper to add images to PDF
def add_image(title, path):
    """Append a new page with a heading and the image at `path` to the global `story`.

    Args:
        title: heading text; inserted into ReportLab paragraph markup as-is.
        path: filesystem path of the image to embed.

    The image is scaled to fit inside a 500x350 pt box while keeping its aspect
    ratio. If the image cannot be opened, a red error paragraph is added instead
    so the rest of the report still builds.
    """
    from xml.sax.saxutils import escape  # stdlib; local import keeps this helper self-contained

    story.append(PageBreak())
    story.append(Paragraph(f"<b>{title}</b>", styles["Heading2"]))
    story.append(Spacer(1, 10))
    try:
        # lazy=0 opens the file immediately, so a missing/corrupt image fails
        # here (and is handled below) rather than later inside doc.build().
        # kind="proportional" fits the box without stretching the figure.
        story.append(Image(path, width=500, height=350, kind="proportional", lazy=0))
    except Exception as e:
        # Escape the message: characters such as '<' or '&' would otherwise
        # break the paragraph markup parser.
        story.append(Paragraph(f"<font color=red>Error loading image: {escape(str(e))}</font>", styles["Normal"]))
    story.append(Spacer(1, 20))

# One page per saved figure, in the order the figures were produced.
for section_title, image_path in (
    ("ROC Curve Comparison", roc_path),
    ("Precision–Recall Curve Comparison", pr_path),
    ("Calibration Curve Comparison", calib_path),
    ("Model Performance Comparison", comp_path),
):
    add_image(section_title, image_path)

# Conclusion
# NOTE(review): the wording below states that stacking is the best model
# regardless of the computed metrics — confirm it matches the table.
conclusion_text = f"""
<font size=10>
The stacking ensemble achieved the best performance with an OOF AUC of <b>{auc_stack:.6f}</b>,
outperforming individual models and the blending ensemble. The stacking meta-model (Logistic Regression)
effectively learned the optimal combination of the three base learners (CatBoost, LightGBM, XGBoost),
resulting in superior predictive performance on the Home Credit Default Risk dataset.
<br/><br/>
The final submission has been generated from the stacking model predictions.
</font>
"""

story.extend([
    PageBreak(),
    Paragraph("<b>Conclusion</b>", styles["Heading2"]),
    Spacer(1, 10),
    Paragraph(conclusion_text, styles["Normal"]),
])

# Build PDF
doc.build(story)

print(f"PDF Report created → {pdf_path}")
print("\n=== CHUNK 19 COMPLETED ===")
print(f"\n✓ Report saved to: {REPORTS_DIR}")
