# In [None]:
"""
02_DNN_RGC_pretraining_and_finetuning.ipynb

Deep neural network (DNN) ensemble for geothermal reservoir temperature prediction
with RGC-based synthetic pre-training and fine-tuning on real Western Anatolia data.

Expected inputs (CSV files) in the `data/` directory:
- training_dataset.csv  : real training wells (features + target)
- testing_dataset.csv   : real test wells (features + target)
- synthetic_rgc_train_only.csv : RGC-synthetic samples (features + target)
"""

# =========================
# 0) Imports & config
# =========================
import os
import zipfile
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Reproducibility: NumPy and TensorFlow are seeded with the same base value
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
tf.random.set_seed(GLOBAL_SEED)

# --- Target variable and input feature set (Section 4.1) ---
TARGET = "Reservoir temperature (°C)"
FEATURES = [
    "pH", "EC (microS/cm)", "K (mg/l)", "Na (mg/l)",
    "Boron (mg/l)", "SiO2 (mg/l)", "Cl (mg/l)",
]

# Training hyper-parameters shared by every ensemble member
CONFIG = dict(
    n_ensembles=10,    # number of ensemble members
    pre_epochs=120,    # max epochs during synthetic pre-training
    fine_epochs=200,   # max epochs during fine-tuning on real data
    batch_size=16,     # mini-batch size used in both training stages
    lr_pre=1e-3,       # Adam learning rate for the pre-training stage
    lr_fine=3e-4,      # Adam learning rate for the fine-tuning stage
)

# Input CSV locations (all files live in DATA_DIR)
DATA_DIR = "data"
TRAIN_CSV, TEST_CSV, SYNTH_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "training_dataset.csv",
        "testing_dataset.csv",
        "synthetic_rgc_train_only.csv",
    )
)

# Output directory for metrics, predictions and figures; created if absent
OUT_DIR = "dnn_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# =========================
# 1) Load data (no data leakage)
# =========================
real_train = pd.read_csv(TRAIN_CSV)
real_test  = pd.read_csv(TEST_CSV)
synthetic  = pd.read_csv(SYNTH_CSV)

print("Real TRAIN columns:", list(real_train.columns))
print("Real TEST columns :", list(real_test.columns))
print("Synthetic columns :", list(synthetic.columns))

needed_cols = FEATURES + [TARGET]

if TARGET not in synthetic.columns:
    raise ValueError(
        f"Target column ('{TARGET}') is missing in the synthetic file. "
        "The RGC generator must also produce reservoir temperature."
    )

# The synthetic feature matrix is later built from *all* FEATURES, so a missing
# column cannot be "ignored": stop here with an explicit message instead of
# failing further down with a bare KeyError.
missing_in_synth = [c for c in FEATURES if c not in synthetic.columns]
if missing_in_synth:
    raise ValueError(
        "The following feature columns are missing in the synthetic "
        f"dataset: {missing_in_synth}. Every column in FEATURES is required "
        "for pre-training."
    )

# All needed columns are guaranteed to be present at this point; keep only
# those (in FEATURES + TARGET order) and drop any extra synthetic columns.
available_in_synth = list(needed_cols)
synthetic = synthetic[needed_cols].copy()

# =========================
# 2) Feature/target arrays and scaling
# =========================
def _features_and_target(frame):
    """Return the (feature matrix, target vector) pair of a data frame."""
    return frame[FEATURES].values, frame[TARGET].values


X_train_real, y_train_real = _features_and_target(real_train)
X_test_real, y_test_real = _features_and_target(real_test)
X_synth_full, y_synth_full = _features_and_target(synthetic)

# IMPORTANT: the scaler statistics come ONLY from the real training data; the
# test and synthetic matrices are merely transformed with them (no leakage).
scaler = StandardScaler().fit(X_train_real)
X_train_real_scaled = scaler.transform(X_train_real)
X_test_real_scaled = scaler.transform(X_test_real)
X_synth_scaled = scaler.transform(X_synth_full)

print("\nShapes:")
for shape_label, scaled_matrix in (
    ("X_train_real_scaled:", X_train_real_scaled),
    ("X_test_real_scaled :", X_test_real_scaled),
    ("X_synth_scaled     :", X_synth_scaled),
):
    print(shape_label, scaled_matrix.shape)

# =========================
# 3) DNN model builder
# =========================
def build_dnn(input_dim: int, lr: float) -> tf.keras.Model:
    """
    Build and compile a fully connected network for scalar regression.

    The network has three ReLU hidden blocks (64, 64 and 32 units), each
    followed by batch normalisation; the first two blocks also apply dropout
    with rate 0.2. A single linear unit produces the output.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    lr : float
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Model compiled with MSE loss and MAE as an additional metric.
    """
    l2_penalty = regularizers.l2(1e-4)

    # (units, dropout rate) per hidden block; None means "no dropout layer".
    hidden_blocks = ((64, 0.2), (64, 0.2), (32, None))

    layer_stack = [Input(shape=(input_dim,))]
    for units, drop_rate in hidden_blocks:
        layer_stack.append(
            Dense(units, activation="relu", kernel_regularizer=l2_penalty)
        )
        layer_stack.append(BatchNormalization())
        if drop_rate is not None:
            layer_stack.append(Dropout(drop_rate))
    layer_stack.append(Dense(1))  # linear output for regression

    net = Sequential(layer_stack)
    net.compile(optimizer=Adam(learning_rate=lr), loss="mse", metrics=["mae"])
    return net


def train_with_pretrain(seed_offset: int = 0, val_fraction: float = 0.2):
    """
    Train a single ensemble member in two stages.

    (1) Pre-training: a fresh DNN is fitted on the RGC-synthetic samples.
    (2) Fine-tuning: the same network is re-compiled with the fine-tuning
        learning rate and trained further on the real training wells.

    Early stopping and learning-rate reduction during fine-tuning are driven
    by a validation subset held out from the real TRAINING data. The real test
    set is used only for the final predictions, so test metrics computed from
    the returned predictions are not biased by model selection on test wells.

    Parameters
    ----------
    seed_offset : int
        Added to GLOBAL_SEED to seed NumPy, TensorFlow and the hold-out split
        of this ensemble member.
    val_fraction : float
        Fraction (0 <= val_fraction < 1) of the real training samples held out
        for validation during fine-tuning. If the resulting subset would be
        empty, no validation data is used and the callbacks monitor the
        training loss instead.

    Returns
    -------
    tuple
        ``(model, y_pred_test)``: the fine-tuned Keras model and its flattened
        predictions on the scaled real test features.

    Raises
    ------
    ValueError
        If ``val_fraction`` is outside ``[0, 1)``.
    """
    if not 0.0 <= val_fraction < 1.0:
        raise ValueError(
            f"val_fraction must satisfy 0 <= val_fraction < 1, got {val_fraction}"
        )

    # Per-ensemble-member seeding
    member_seed = GLOBAL_SEED + seed_offset
    np.random.seed(member_seed)
    tf.random.set_seed(member_seed)

    # ---------- Stage 1: Pre-training on synthetic data ----------
    model = build_dnn(
        input_dim=X_train_real_scaled.shape[1],
        lr=CONFIG["lr_pre"],
    )

    # No validation data in this stage, so early stopping watches the
    # training loss.
    es_pre = EarlyStopping(
        monitor="loss",
        patience=15,
        restore_best_weights=True,
        verbose=0,
    )

    model.fit(
        X_synth_scaled,
        y_synth_full,
        epochs=CONFIG["pre_epochs"],
        batch_size=CONFIG["batch_size"],
        verbose=0,
        callbacks=[es_pre],
    )

    # ---------- Stage 2: Fine-tuning on real data ----------
    # Seeded hold-out split of the real training data. The test set must not
    # steer early stopping / LR scheduling, otherwise it leaks into training.
    n_real = X_train_real_scaled.shape[0]
    n_val = int(round(val_fraction * n_real))
    shuffled = np.random.default_rng(member_seed).permutation(n_real)

    if 0 < n_val < n_real:
        val_idx, fit_idx = shuffled[:n_val], shuffled[n_val:]
        validation_data = (X_train_real_scaled[val_idx], y_train_real[val_idx])
        monitor = "val_loss"
    else:
        # Too few samples for a hold-out: train on everything and fall back
        # to the training loss as the monitored quantity.
        fit_idx = shuffled
        validation_data = None
        monitor = "loss"

    # Re-compiling swaps in a fresh optimizer with the fine-tuning learning
    # rate; the pre-trained weights are kept.
    model.compile(
        optimizer=Adam(learning_rate=CONFIG["lr_fine"]),
        loss="mse",
        metrics=["mae"],
    )

    es_fine = EarlyStopping(
        monitor=monitor,
        patience=25,
        restore_best_weights=True,
        verbose=0,
    )
    rlr = ReduceLROnPlateau(
        monitor=monitor,
        factor=0.5,
        patience=10,
        verbose=0,
        min_lr=1e-5,
    )

    model.fit(
        X_train_real_scaled[fit_idx],
        y_train_real[fit_idx],
        validation_data=validation_data,
        epochs=CONFIG["fine_epochs"],
        batch_size=CONFIG["batch_size"],
        verbose=0,
        callbacks=[es_fine, rlr],
    )

    # Test predictions for this ensemble member (test data is touched only here)
    y_pred_test = model.predict(X_test_real_scaled, verbose=0).ravel()
    return model, y_pred_test


# =========================
# 4) Train ensemble
# =========================
n_members = CONFIG["n_ensembles"]
member_predictions = []

# Each member gets its own seed offset (0, 100, 200, ...); only its test
# predictions are kept, the trained model itself is discarded.
for member_idx in range(n_members):
    print(f"=== Training ensemble member {member_idx+1}/{n_members} ===")
    member_pred = train_with_pretrain(seed_offset=100 * member_idx)[1]
    member_predictions.append(member_pred)

# Rows are ensemble members, columns are test samples.
all_test_preds = np.stack(member_predictions, axis=0)  # (n_ensembles, n_test)

# Ensemble prediction = mean across members; the spread across members is
# kept as a per-sample standard deviation.
y_pred_ensemble = np.mean(all_test_preds, axis=0)
y_pred_std = np.std(all_test_preds, axis=0)

# =========================
# 5) Performance metrics
# =========================
def rmse(y_true, y_hat):
    """Return the root-mean-squared error between `y_true` and `y_hat`."""
    squared_error_mean = mean_squared_error(y_true, y_hat)
    return np.sqrt(squared_error_mean)


# Per-member test errors: one value per row of `all_test_preds`.
mse_single_list = [
    mean_squared_error(y_test_real, member_pred) for member_pred in all_test_preds
]
# RMSE is the square root of the MSE values computed just above.
rmse_single_list = [np.sqrt(member_mse) for member_mse in mse_single_list]
mae_single_list = [
    mean_absolute_error(y_test_real, member_pred) for member_pred in all_test_preds
]

# Errors of the ensemble-mean prediction, followed by the mean/std of the
# individual members' errors. Key order defines print and CSV column order.
metrics_ensemble = dict(
    rmse_ensemble=np.sqrt(mean_squared_error(y_test_real, y_pred_ensemble)),
    mae_ensemble=mean_absolute_error(y_test_real, y_pred_ensemble),
    rmse_single_mean=np.mean(rmse_single_list),
    rmse_single_std=np.std(rmse_single_list),
    mae_single_mean=np.mean(mae_single_list),
    mae_single_std=np.std(mae_single_list),
)

print("\n=== DNN ensemble performance (test set) ===")
for metric_name, metric_value in metrics_ensemble.items():
    print(f"{metric_name}: {metric_value:.4f}")

# =========================
# 6) Figures and CSV outputs
# =========================

# Scatter plot of true vs predicted temperature (ensemble mean), drawn with
# the object-oriented Matplotlib API and a dashed 1:1 reference line.
scatter_path = os.path.join(OUT_DIR, "scatter_test_rgc_ensemble.png")
min_t = y_test_real.min()
max_t = y_test_real.max()

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test_real, y_pred_ensemble, alpha=0.8)
ax.plot([min_t, max_t], [min_t, max_t], "r--", label="1:1 line")
ax.set_xlabel("True reservoir temperature (°C)")
ax.set_ylabel("Predicted reservoir temperature (°C)")
ax.set_title("True vs predicted (test set) — RGC-pretrained DNN ensemble")
ax.legend()
fig.tight_layout()
fig.savefig(scatter_path, dpi=160)
plt.close(fig)

# Per-member metrics table: one row per ensemble member.
metrics_df = pd.DataFrame(
    {"rmse_single": rmse_single_list, "mae_single": mae_single_list}
)
# Ensemble-level summaries are scalars; each goes into the first row of its
# own column and the remaining rows are padded with NaN.
for summary_name, summary_value in metrics_ensemble.items():
    nan_padding = [np.nan] * (len(metrics_df) - 1)
    metrics_df[summary_name] = [summary_value, *nan_padding]

metrics_path = os.path.join(OUT_DIR, "metrics_rgc_pretrain_ensemble.csv")
metrics_df.to_csv(metrics_path, index=False)

# Test predictions CSV (useful for reproducing figures / error analysis):
# true value, ensemble mean and across-member standard deviation per sample.
preds_path = os.path.join(OUT_DIR, "test_predictions_rgc_ensemble.csv")
preds_df = pd.DataFrame(
    {
        "True_T": y_test_real,
        "Pred_T_ensemble": y_pred_ensemble,
        "Pred_T_std": y_pred_std,
    }
)
preds_df.to_csv(preds_path, index=False)

print(f"\nMetrics saved to     -> {metrics_path}")
print(f"Test predictions to  -> {preds_path}")
print(f"Scatter figure saved -> {scatter_path}")

# =========================
# 7) Optional: ZIP archive of outputs
# =========================
# Timestamped archive name; the ZIP is written next to OUT_DIR (one level up),
# so the archive is never among the files being packed.
ts = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
zip_name = "dnn_rgc_ensemble_results_" + ts + ".zip"
zip_path = os.path.join(OUT_DIR, "..", zip_name)

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    # Entries are stored under their bare file name (no directory prefix).
    for entry_name in os.listdir(OUT_DIR):
        archive.write(os.path.join(OUT_DIR, entry_name), arcname=entry_name)

print(f"\nZIP archive created: {zip_path}")