# In [None]:  (Jupyter cell marker left over from the notebook export)
"""
03_NGBoost_real_vs_RGC_augmented.ipynb

NGBoost models for geothermal reservoir temperature prediction from hydrogeochemical data:
- Real-only NGBoost baseline
- NGBoost with RGC-augmented training data

Expected inputs (CSV files) in the `data/` directory:
- training_dataset.csv          : real training wells (features + target)
- testing_dataset.csv           : real test wells (features + target)
- synthetic_rgc_train_only.csv  : RGC-synthetic samples (features + target)
"""

# If needed, install ngboost in your environment (uncomment the next line):
# !pip install ngboost

# =========================
# 0) Imports & config
# =========================
import os
import zipfile
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error

from ngboost import NGBRegressor
from ngboost.distns import Normal

# -------------------------
# Feature and target definitions
# -------------------------
# Raw hydrogeochemical columns read from the CSV files; their order here is
# the order of the leading columns of the feature matrix.
FEATURES_BASE = [
    "pH", "EC (microS/cm)",
    "K (mg/l)", "Na (mg/l)", "Boron (mg/l)",
    "SiO2 (mg/l)", "Cl (mg/l)",
]
# Name of the regression target column.
TARGET = "Reservoir temperature (°C)"

# Input CSV locations, all resolved relative to DATA_DIR.
DATA_DIR = "data"
TRAIN_CSV, TEST_CSV, SYNTH_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "training_dataset.csv",
        "testing_dataset.csv",
        "synthetic_rgc_train_only.csv",
    )
)

# Every metric file, figure and prediction CSV is written below OUT_DIR.
OUT_DIR = "ngb_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# One seed for NumPy's global RNG and for the NGBoost models.
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

# =========================
# 1) Load datasets
# =========================
# Real wells: one CSV for the training split, one for the held-out test split.
real_train, real_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)
)

# IMPORTANT:
# synthetic_rgc_train_only.csv must have been generated by the RGC notebook
# using ONLY the training_dataset wells (no information from the test set).
synthetic_train = pd.read_csv(SYNTH_CSV)

# Keep just the model inputs and the target; any extra columns are dropped.
cols_needed = [*FEATURES_BASE, TARGET]
synthetic_train = synthetic_train.loc[:, cols_needed].copy()

for label, frame in (
    ("Real train shape:", real_train),
    ("Real test shape :", real_test),
    ("Synthetic train shape:", synthetic_train),
):
    print(label, frame.shape)

# =========================
# 2) Feature-engineering function
# =========================
def build_feature_matrix(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Build the NGBoost design matrix from a frame of raw measurements.

    The returned frame holds, in this order:
    - the FEATURES_BASE columns, copied unchanged,
    - ``log_<column>`` = log1p of EC, Na, Cl and SiO2 after clipping
      negative values to zero,
    - the ratios (Na + K)/Cl, Na/Cl, B/Cl, B/EC and SiO2/EC.

    ``df_in`` itself is not modified.
    """
    eps = 1e-6  # added to every denominator so a zero Cl or EC cannot divide by zero
    log_columns = ("EC (microS/cm)", "Na (mg/l)", "Cl (mg/l)", "SiO2 (mg/l)")

    features = df_in[FEATURES_BASE].copy()

    # log1p rather than log for numerical stability near zero.
    for name in log_columns:
        non_negative = np.clip(df_in[name].values, a_min=0, a_max=None)
        features[f"log_{name}"] = np.log1p(non_negative)

    sodium = df_in["Na (mg/l)"]
    potassium = df_in["K (mg/l)"]
    boron = df_in["Boron (mg/l)"]
    silica = df_in["SiO2 (mg/l)"]
    chloride = df_in["Cl (mg/l)"] + eps
    conductivity = df_in["EC (microS/cm)"] + eps

    # Ratios normalised by Cl and EC (Section 4.3.2); dict order fixes the
    # column order of the result.
    ratios = {
        "Na_plus_K_over_Cl": (sodium + potassium) / chloride,
        "Na_over_Cl": sodium / chloride,
        "B_over_Cl": boron / chloride,
        "B_over_EC": boron / conductivity,
        "SiO2_over_EC": silica / conductivity,
    }
    for name, values in ratios.items():
        features[name] = values

    return features

# =========================
# 3) Real-only and RGC-augmented designs
# =========================
# --- Real-only design ---
# Features and targets of the real wells, for the training and test splits.
X_train_real = build_feature_matrix(real_train)
y_train_real = real_train[TARGET].values

X_test_real = build_feature_matrix(real_test)
y_test_real = real_test[TARGET].values

# --- RGC-augmented design ---
# Only the training subset is augmented with RGC-synthetic samples.
# The test set is never augmented (no leakage).
X_synthetic = build_feature_matrix(synthetic_train)
X_train_rgc = pd.concat([X_train_real, X_synthetic], axis=0, ignore_index=True)

# Targets are stacked in the same order as the rows above: real, then synthetic.
y_synthetic = synthetic_train[TARGET].values
y_train_rgc = np.concatenate((y_train_real, y_synthetic), axis=0)

print("\nFeature-matrix shapes:")
for label, matrix in (
    ("Real-only train   :", X_train_real),
    ("RGC-augmented train:", X_train_rgc),
    ("Test              :", X_test_real),
):
    print(label, matrix.shape)

# =========================
# 4) NGBoost models
# =========================
def fit_ngb(
    X_train,
    y_train,
    random_state=0,
    *,
    n_estimators=500,
    learning_rate=0.03,
):
    """
    Fit an NGBoost regressor with a Normal predictive distribution.

    No base learner is passed, so NGBoost's default one is used. Every row
    and every column takes part in each boosting step
    (``minibatch_frac=1.0``, ``col_sample=1.0``) and progress output is off.

    Parameters
    ----------
    X_train : feature matrix, handed unchanged to ``NGBRegressor.fit``.
    y_train : target values, handed unchanged to ``NGBRegressor.fit``.
    random_state : seed forwarded to ``NGBRegressor``.
    n_estimators : number of boosting iterations (keyword-only, default 500).
    learning_rate : boosting learning rate (keyword-only, default 0.03).

    Returns
    -------
    The fitted ``NGBRegressor`` instance.
    """
    ngb = NGBRegressor(
        Dist=Normal,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        minibatch_frac=1.0,
        col_sample=1.0,
        verbose=False,
        random_state=random_state,
    )
    ngb.fit(X_train, y_train)
    return ngb


def rmse(y_true, y_hat):
    """Return the root-mean-squared error between `y_true` and `y_hat`."""
    squared_error = mean_squared_error(y_true, y_hat)
    return np.sqrt(squared_error)


# --- Model 1: real-only NGBoost ---
ngb_real = fit_ngb(X_train_real, y_train_real, random_state=GLOBAL_SEED)
# The point prediction is the predictive mean, i.e. the location parameter
# of the Normal distribution predicted for each test well.
dist_real_only = ngb_real.pred_dist(X_test_real)
y_pred_real_only = dist_real_only.loc

# --- Model 2: RGC-augmented NGBoost ---
# Same seed and same test wells as Model 1; only the training data differs.
ngb_rgc = fit_ngb(X_train_rgc, y_train_rgc, random_state=GLOBAL_SEED)
dist_rgc = ngb_rgc.pred_dist(X_test_real)
y_pred_rgc = dist_rgc.loc

# =========================
# 5) Metrics
# =========================
def _score_on_test(y_pred):
    """Return test-set RMSE and MAE of `y_pred` against `y_test_real`."""
    return {
        "rmse_test": rmse(y_test_real, y_pred),
        "mae_test": mean_absolute_error(y_test_real, y_pred),
    }


metrics_real = _score_on_test(y_pred_real_only)
metrics_rgc = _score_on_test(y_pred_rgc)

for header, scores in (
    ("\n=== NGBoost (real-only) ===", metrics_real),
    ("\n=== NGBoost (RGC-augmented) ===", metrics_rgc),
):
    print(header)
    for k, v in scores.items():
        print(f"{k}: {v:.4f}")

# Comparison table (used for Table 6b): one row per model.
compare_df = pd.DataFrame(
    [
        {"model": "NGBoost_real_only", **metrics_real},
        {"model": "NGBoost_RGC_augmented", **metrics_rgc},
    ]
)
compare_path = os.path.join(OUT_DIR, "metrics_ngb_compare.csv")
compare_df.to_csv(compare_path, index=False)

# Optional separate metric files, one single-row CSV per model.
for scores, metrics_fname in (
    (metrics_real, "metrics_ngb_real_only.csv"),
    (metrics_rgc, "metrics_ngb_rgc_augmented.csv"),
):
    pd.DataFrame([scores]).to_csv(
        os.path.join(OUT_DIR, metrics_fname),
        index=False,
    )

# =========================
# 6) Scatter plots
# =========================
def scatter_plot(y_true, y_pred, title, fname):
    """
    Save a true-vs-predicted scatter plot as `fname` inside OUT_DIR.

    A dashed red 1:1 reference line is drawn across the range of `y_true`.
    The figure is closed after saving. Returns the path of the written file.
    """
    out_path = os.path.join(OUT_DIR, fname)
    lo, hi = y_true.min(), y_true.max()

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_true, y_pred, alpha=0.8)
    ax.plot([lo, hi], [lo, hi], "r--", label="1:1 line")
    ax.set_xlabel("True reservoir temperature (°C)")
    ax.set_ylabel("Predicted reservoir temperature (°C)")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=160)
    plt.close(fig)
    return out_path


# One scatter plot per model, both evaluated on the same real test wells.
path_real = scatter_plot(
    y_test_real,
    y_pred_real_only,
    title="True vs predicted (test) — NGBoost (real-only)",
    fname="scatter_test_ngb_real_only.png",
)
path_rgc = scatter_plot(
    y_test_real,
    y_pred_rgc,
    title="True vs predicted (test) — NGBoost (RGC-augmented)",
    fname="scatter_test_ngb_rgc_augmented.png",
)

# Prediction CSVs (useful for reproducing Fig. 12–13)
preds_real_only = pd.DataFrame(
    {"True_T": y_test_real, "Pred_T_real_only": y_pred_real_only}
)
preds_real_only.to_csv(
    os.path.join(OUT_DIR, "preds_ngb_real_only.csv"), index=False
)

preds_rgc = pd.DataFrame(
    {"True_T": y_test_real, "Pred_T_rgc_aug": y_pred_rgc}
)
preds_rgc.to_csv(
    os.path.join(OUT_DIR, "preds_ngb_rgc_augmented.csv"), index=False
)

# Summary of where the outputs were written.
print("\nFigures and CSVs have been written to:", OUT_DIR)
for label, written_path in (
    ("Comparison metrics ->", compare_path),
    ("Real-only scatter  ->", path_real),
    ("RGC-augmented scatter ->", path_rgc),
):
    print(label, written_path)

# =========================
# 7) Optional: ZIP archive of outputs
# =========================
# Timestamped name so repeated runs do not overwrite an earlier archive.
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
zip_name = f"ngboost_results_{ts}.zip"
# The archive is written to the parent of OUT_DIR, so it is not one of the
# files being archived. normpath collapses the "<OUT_DIR>/.." hop so the
# path that is used and printed is the plain location of the archive.
zip_path = os.path.normpath(os.path.join(OUT_DIR, os.pardir, zip_name))

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # member order of the archive stable from run to run.
    for fn in sorted(os.listdir(OUT_DIR)):
        full_path = os.path.join(OUT_DIR, fn)
        # arcname=fn stores each entry at the archive root, without OUT_DIR.
        zf.write(full_path, arcname=fn)

print(f"\nZIP archive created: {zip_path}")