In [51]:
if 'libraries_imported' not in globals():
    libraries_imported = False

if not libraries_imported:
    # --- Core Libraries ---
    import os
    import sys
    import json
    import random
    from datetime import datetime, timedelta

    # --- Data Manipulation ---
    import numpy as np
    import pandas as pd

    # --- Visualization ---
    import matplotlib.pyplot as plt
    import seaborn as sns
    import plotly.graph_objects as go

    # --- Scientific Computing ---
    from scipy.interpolate import make_interp_spline
    from scipy.signal import butter, filtfilt

    # --- Machine Learning (scikit-learn) ---
    from sklearn.model_selection import train_test_split, StratifiedKFold
    from sklearn.preprocessing import StandardScaler, RobustScaler
    from sklearn.metrics import (
        mean_squared_error, r2_score, mean_absolute_error,
        accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,
        precision_score, recall_score, f1_score
    )
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.utils.class_weight import compute_class_weight

    # --- Imbalanced Data ---
    from imblearn.over_sampling import SMOTE

    # --- Deep Learning (TensorFlow / Keras) ---
    import tensorflow as tf
    from tensorflow.keras import Sequential
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import (
        Input, Dense, LSTM, Dropout, BatchNormalization
    )
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

    # --- Hyperparameter Tuning ---
    from keras_tuner import RandomSearch, HyperModel

    # --- Gradient Boosting ---
    import xgboost as xgb

    # --- Utilities ---
    import joblib

    # Mark libraries as imported
    libraries_imported = True


In [52]:
def run_pipeline_for_dataset(df_raw, dataset_tag):
    """
    Trains XGB on binned TARGET & saves only final model + scaler.

    Pipeline: parse/sort by ``date`` -> keep the later half of the rows ->
    forward-fill gaps -> bin TARGET into N_BINS classes -> stratified 80/20
    split -> random hyper-parameter search inside StratifiedKFold -> refit on
    the full training split -> dump model and scaler with joblib.

    Relies on names defined outside this cell: BASE_OUTDIR, TARGET, N_BINS,
    BIN_BASE, N_SPLITS, N_RANDOM_PARAM_ITER, PARAM_GRID, RANDOM_STATE,
    round_to_nearest and random_param_sample.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain a ``date`` column and the TARGET column. It is copied,
        never modified.
    dataset_tag : str
        Sub-directory of BASE_OUTDIR and file-name prefix for the artifacts.

    Returns
    -------
    None
        Side effects only: ``<tag>_best_model.pkl`` and ``<tag>_scaler.pkl``
        are written, and the scaler's feature names are printed.

    Raises
    ------
    ValueError
        If the TARGET column is absent.
    """
    outdir = os.path.join(BASE_OUTDIR, dataset_tag)
    os.makedirs(outdir, exist_ok=True)

    df = df_raw.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df.sort_values('date', inplace=True)

    # Keep only the 2nd half of the time-ordered rows.
    df = df.iloc[len(df)//2:].copy()

    # Forward-fill gaps. `interpolate(method='ffill')` is deprecated in
    # pandas 2.x and a plain ffill produces the same result.
    df = df.ffill()

    if TARGET not in df.columns:
        raise ValueError(f"[{dataset_tag}] Target column '{TARGET}' not found.")
    df.dropna(subset=[TARGET], inplace=True)

    # --- bin target -------------------------------------------------------
    target_min = df[TARGET].min()
    target_max = df[TARGET].max()
    bins = [round_to_nearest(edge, BIN_BASE)
            for edge in np.linspace(target_min, target_max, N_BINS + 1)]
    if len(set(bins)) < len(bins):  # degenerate case: rounding collapsed edges
        base = round_to_nearest(target_min, BIN_BASE)
        bins = [base + i * BIN_BASE for i in range(N_BINS + 1)]
    # Labels are built from the rounded edges so their text stays tidy.
    labels = [f"{low}-{high}" for low, high in zip(bins[:-1], bins[1:])]
    # Rounding can pull the outer edges inside the observed range, which would
    # turn the extreme rows into NaN categories and silently drop them below.
    # Widen the outer edges so every row with a target value gets a class.
    bins[0] = min(bins[0], target_min)
    bins[-1] = max(bins[-1], target_max)
    df['fostac_category'] = pd.cut(df[TARGET], bins=bins, labels=labels, include_lowest=True)
    df.dropna(subset=['fostac_category'], inplace=True)

    # features & labels (the raw target is excluded from the inputs)
    X = df.select_dtypes(include=[np.number]).drop(columns=[TARGET], errors="ignore")
    y = df['fostac_category'].cat.codes

    # The held-out 20% is intentionally unused here: no metrics are reported.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    def _resample(X_scaled, y_codes):
        """Best-effort SMOTE; fall back to the untouched data if it cannot run."""
        try:
            sm = SMOTE(random_state=RANDOM_STATE, k_neighbors=2)
            return sm.fit_resample(X_scaled, y_codes)
        except ValueError:
            # e.g. a class with too few samples for k_neighbors=2
            return X_scaled, y_codes

    def _new_classifier(params):
        """Classifier configuration shared by the search and the final fit."""
        return xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric='mlogloss',
            objective='multi:softmax',
            num_class=len(labels),
            random_state=RANDOM_STATE,
            **params
        )

    # --- pick hyperparams via CV, but don't save/print metrics -------------
    # NOTE(review): every fold draws its own random candidates and the winner
    # is the best single-fold accuracy, not a mean across folds.
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    best_acc = -np.inf
    best_params = None
    for tr_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # Scaler is fitted on the fold-train part only to avoid leakage.
        scaler = RobustScaler()
        X_tr_scaled = scaler.fit_transform(X_tr)
        X_val_scaled = scaler.transform(X_val)

        X_tr_sm, y_tr_sm = _resample(X_tr_scaled, y_tr)

        for _ in range(N_RANDOM_PARAM_ITER):
            params = random_param_sample(PARAM_GRID)
            model = _new_classifier(params)
            model.fit(X_tr_sm, y_tr_sm)
            acc = accuracy_score(y_val, model.predict(X_val_scaled))
            if acc > best_acc:
                best_acc = acc
                best_params = params  # kept in-memory only

    # --- retrain with best params on full train (still not saving metrics) --
    # Fitting on the DataFrame (not an array) is what records
    # `feature_names_in_` on the scaler.
    final_scaler = RobustScaler()
    X_train_scaled_full = final_scaler.fit_transform(X_train)
    X_train_full_sm, y_train_full_sm = _resample(X_train_scaled_full, y_train)

    final_model = _new_classifier(best_params or random_param_sample(PARAM_GRID))
    final_model.fit(X_train_full_sm, y_train_full_sm)
    print(final_scaler.feature_names_in_)

    # Save ONLY model & scaler (as requested)
    joblib.dump(final_model, os.path.join(outdir, f"{dataset_tag}_best_model.pkl"))
    joblib.dump(final_scaler, os.path.join(outdir, f"{dataset_tag}_scaler.pkl"))


In [None]:
# ===== Warm-start both ways: D1→D2 and D2→D1 =====
# Updated to report performance in the SAME STYLE as the baseline script:
#  - prints Best CV accuracy + Best hyperparameters
#  - fits final model
#  - evaluates once on held-out validation split
#  - plots confusion matrix

import os, gc, random
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# ------------------ Config ------------------
# Root folder for warm-start artifacts; each direction gets its own
# "<src>_to_<tgt>" sub-folder beneath it. Created eagerly when the cell runs.
OUTDIR_BASE = "transfer_learning_model_params/warm_start"
os.makedirs(OUTDIR_BASE, exist_ok=True)

# Seed shared by the train/hold-out permutation, StratifiedKFold, SMOTE and XGBoost.
RANDOM_STATE = 42
# Column that is binned into classes and predicted.
TARGET = "eq_cod"
# Number of target bins; must equal len(CLASS_LABELS) because pd.cut needs one label per bin.
N_BINS = 4
# Bin edges are rounded to multiples of this value.
BIN_BASE = 500
# Fraction of the prepared target rows used for fine-tuning (TRAIN);
# the remaining rows form the hold-out validation split.
SPLIT_RATIO = 0.30
# Labels assigned to the bins in ascending edge order (codes 0..3).
CLASS_LABELS = ["LL", "ML", "MH", "HH"]

# ---- tuning budget ----
# Number of random hyper-parameter draws evaluated by the CV search.
N_ITER = 50

# ------------------ Helpers ------------------
def round_to_nearest(x, base=500):
    """Snap ``x`` to the closest integer multiple of ``base`` and return an int.

    Ties are resolved by Python's built-in ``round`` applied to ``x / base``
    (half-to-even), so with the default base 250 -> 0 while 750 -> 1000.
    """
    quotient = float(x) / base
    whole_steps = round(quotient)
    return int(base * whole_steps)

def random_param_sample(grid):
    """Draw one candidate value per hyper-parameter from ``grid``.

    ``grid`` maps parameter names to sequences of candidates. Keys are
    visited in the mapping's iteration order and each value is picked with
    ``random.choice`` from the module-level RNG.
    """
    sample = {}
    for name, candidates in grid.items():
        sample[name] = random.choice(candidates)
    return sample

def build_design_matrix(df, feature_names):
    """Return ``df`` restricted to ``feature_names`` in exactly that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    feature_names : iterable of str
        Column order expected by the scaler/model.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``df`` and one column per requested
        feature. Features absent from ``df`` are filled with 0.0. An empty
        ``feature_names`` yields a frame with zero columns instead of raising.
    """
    # reindex selects, orders and zero-fills in one step; fill_value applies
    # only to columns that did not exist, so NaNs already in df are kept.
    return df.reindex(columns=list(feature_names), fill_value=0.0)

def bin_and_encode(df, target_col, n_bins=4, base=500, class_labels=None):
    """Bin ``target_col`` into ``n_bins`` classes and return integer codes.

    Bin edges are ``n_bins + 1`` evenly spaced points between the column's
    min and max, each rounded to a multiple of ``base`` via
    ``round_to_nearest``. If rounding collapses two edges onto the same value
    (narrow target range), edges fall back to consecutive multiples of
    ``base`` starting at the rounded minimum, so ``pd.cut`` never receives
    duplicate edges. The outer edges are then widened to the true min/max so
    no row is lost to rounding.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is returned, the argument is not modified.
    target_col : str
        Numeric column to bin.
    n_bins : int
        Number of classes.
    base : int
        Rounding granularity for the bin edges.
    class_labels : list of str, optional
        One label per bin in ascending order. Defaults to
        ``["LL", "ML", "MH", "HH"]``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series, list)
        The copy with an ordered categorical ``target_label`` column, the
        integer codes (0..n_bins-1) aligned with that frame, and the bin edges.

    Raises
    ------
    ValueError
        If the number of labels differs from ``n_bins`` or the target column
        holds no non-null value.
    """
    if class_labels is None:
        class_labels = ["LL", "ML", "MH", "HH"]
    class_labels = list(class_labels)
    if len(class_labels) != n_bins:
        raise ValueError(
            f"Expected {n_bins} class labels (one per bin), got {len(class_labels)}."
        )

    vmin, vmax = df[target_col].min(), df[target_col].max()
    if pd.isna(vmin) or pd.isna(vmax):
        raise ValueError(f"Column '{target_col}' has no non-null values to bin.")

    bins = [round_to_nearest(edge, base) for edge in np.linspace(vmin, vmax, n_bins + 1)]
    if len(set(bins)) < len(bins):
        # Rounding merged neighbouring edges; use evenly spaced multiples instead.
        start = round_to_nearest(vmin, base)
        bins = [start + i * base for i in range(n_bins + 1)]
    # Make sure the extreme observations fall inside the outer bins.
    bins[0] = min(bins[0], vmin)
    bins[-1] = max(bins[-1], vmax)

    df = df.copy()
    df["target_label"] = pd.cut(
        df[target_col], bins=bins, labels=class_labels, include_lowest=True
    )
    # Explicit copy so the assignment below never writes into a slice.
    df = df.dropna(subset=["target_label"]).copy()
    df["target_label"] = (
        df["target_label"]
        .astype("category")
        .cat.set_categories(class_labels, ordered=True)
    )
    y_codes = df["target_label"].cat.codes
    return df, y_codes, bins

def prepare_df_for_transfer(raw_df, dayfirst=True, target_col=None):
    """Sort by date, forward-fill gaps and drop rows that still lack the target.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Input data; copied, never modified.
    dayfirst : bool
        Passed to ``pd.to_datetime`` when a ``date`` column exists.
        Unparseable dates become NaT.
    target_col : str, optional
        Column whose missing rows are dropped after filling. Defaults to the
        module-level ``TARGET``.

    Returns
    -------
    pandas.DataFrame
        The prepared copy. When a ``date`` column exists the rows are ordered
        by it; the original index labels are kept.
    """
    if target_col is None:
        target_col = TARGET
    df = raw_df.copy()
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=dayfirst)
        df = df.sort_values("date")
    # `interpolate(method="ffill")` is deprecated in pandas 2.x; a plain
    # forward fill yields the same values.
    df = df.ffill()
    # Only leading rows (before the first observed target) can still be NaN.
    return df.dropna(subset=[target_col])

def smote_safe(X, y, random_state=42):
    """SMOTE with safety around tiny classes; returns possibly-resampled X, y.

    Classes with at least two samples are oversampled up to the size of the
    largest class. Singleton classes are left as they are because SMOTE needs
    a neighbour to interpolate with. ``k_neighbors`` is capped at 3 and at
    (smallest eligible class size - 1).

    The data is returned unchanged when ``y`` is empty, contains fewer than
    two distinct classes (nothing to balance, and SMOTE rejects single-class
    targets), or has no class with more than one sample.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``X``.
    random_state : int
        Seed forwarded to SMOTE.

    Returns
    -------
    tuple
        ``(X, y.values)`` when no resampling happens, otherwise whatever
        ``SMOTE.fit_resample`` returns.
    """
    y = pd.Series(y)
    counts = Counter(y)
    # Only classes with >1 sample can be synthesised from.
    eligible = {cls: cnt for cls, cnt in counts.items() if cnt > 1}
    if len(counts) < 2 or not eligible:
        return X, y.values

    majority = max(counts.values())
    # Smallest eligible class has >= 2 samples, so this is always >= 1.
    k_neighbors = min(3, min(eligible.values()) - 1)
    sampler = SMOTE(
        random_state=random_state,
        k_neighbors=k_neighbors,
        sampling_strategy={cls: majority for cls in eligible},
    )
    return sampler.fit_resample(X, y)

def _new_warm_start_classifier(num_class, params):
    """Build the multi-class XGBoost classifier used by both the CV search and the final fit."""
    return xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric="mlogloss",
        objective="multi:softmax",
        num_class=num_class,
        random_state=RANDOM_STATE,
        nthread=1,
        **params
    )


def warm_start_direction(src_tag, src_df, src_model_path, src_scaler_path,
                         tgt_tag, tgt_df, outdir_base=OUTDIR_BASE):
    """Fine-tune a source-domain XGBoost model on a slice of the target data.

    Workflow: prepare and bin the target frame, load the source scaler/model,
    build the feature matrix in the scaler's column order, randomly split the
    rows (SPLIT_RATIO for training, the rest held out), run a random
    hyper-parameter search with 3-fold CV on the training part, refit on the
    whole training part, evaluate once on the hold-out, then save the
    hold-out predictions (CSV) and the fine-tuned model (pickle).

    Every fit continues boosting from the source model's booster
    (``xgb_model=``), which is what makes this a warm start.

    Parameters
    ----------
    src_tag, tgt_tag : str
        Short dataset names used for logging and output file names.
    src_df : pandas.DataFrame
        Source dataset. Not used by this function; kept for call symmetry.
    src_model_path, src_scaler_path : str
        joblib pickles of the source model and its fitted scaler. Pickles
        execute code on load, so only pass files produced by this project.
    tgt_df : pandas.DataFrame
        Target dataset; must contain ``date`` and the TARGET column.
    outdir_base : str
        Root folder; results go to ``<outdir_base>/<src_tag>_to_<tgt_tag>``.

    Returns
    -------
    None
        Prints metrics, shows a confusion matrix and writes two files.

    Raises
    ------
    ValueError
        If the loaded scaler carries no ``feature_names_in_``.
    """
    print("\n====================")
    print(f"Direction: {src_tag} → {tgt_tag}")
    print("====================")

    outdir = os.path.join(outdir_base, f"{src_tag}_to_{tgt_tag}")
    os.makedirs(outdir, exist_ok=True)
    finetuned_model_path = os.path.join(outdir, f"{tgt_tag}_warmstart_from_{src_tag}.pkl")
    sensor_csv_path      = os.path.join(outdir, f"{tgt_tag}_sensor_predictions_from_{src_tag}.csv")

    # 1) Prepare target df
    df_tgt = prepare_df_for_transfer(tgt_df, dayfirst=True)

    # 2) Bin & encode
    df_tgt, y_all, _ = bin_and_encode(
        df_tgt, TARGET, n_bins=N_BINS, base=BIN_BASE, class_labels=CLASS_LABELS
    )

    # 3) Load src scaler/model
    scaler = joblib.load(src_scaler_path)
    base_model = joblib.load(src_model_path)

    feature_order = list(getattr(scaler, "feature_names_in_", []))
    if not feature_order:
        raise ValueError(f"Scaler at {src_scaler_path} missing feature_names_in_.")

    X_all = build_design_matrix(df_tgt, feature_order)

    # ---- RANDOM SPLIT (not stratified, not time-ordered) ----
    rng = np.random.default_rng(RANDOM_STATE)
    idx = rng.permutation(len(df_tgt))
    train_size = int(len(df_tgt) * SPLIT_RATIO)

    train_idx = idx[:train_size]
    val_idx   = idx[train_size:]

    X_train = X_all.iloc[train_idx].copy()
    X_val   = X_all.iloc[val_idx].copy()
    y_train = y_all.iloc[train_idx].copy()
    y_val   = y_all.iloc[val_idx].copy()

    val_dates = df_tgt.iloc[val_idx]["date"]

    # ---- Neutralize eq_cod (training mean) ----
    # Only has an effect when the source scaler was fitted with an "eq_cod"
    # feature column; the constant keeps the target from leaking into inputs.
    if "eq_cod" in X_train.columns:
        neutral = X_train["eq_cod"].mean()
        X_train.loc[:, "eq_cod"] = neutral
        X_val.loc[:, "eq_cod"] = neutral

    # Scale with the SOURCE scaler (not refitted) so inputs match what the
    # source model's existing trees were trained on.
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    # -----------------------------
    # Hyperparam search: 3-fold CV on TRAIN ONLY (same style as baseline)
    # -----------------------------
    param_grid = {
        "max_depth": [3, 4, 5, 6, 7, 8],
        "learning_rate": [0.005, 0.01, 0.05, 0.1, 0.2],
        "n_estimators": [100, 200, 400, 800],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "gamma": [0.0, 0.1, 0.2, 0.5],
        "min_child_weight": [1, 3, 5, 10],
        "reg_lambda": [0.5, 1.0, 2.0, 5.0],
    }

    NUM_CLASS = len(CLASS_LABELS)
    # Integer class codes derived from the labels instead of a literal list,
    # so the reporting below stays correct if CLASS_LABELS changes length.
    class_codes = list(range(NUM_CLASS))
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

    best_params = None
    best_cv_acc = -np.inf

    # random_param_sample draws from the global `random` RNG. Seed it so the
    # search is reproducible, and restore the caller's RNG state afterwards so
    # the rest of the notebook is unaffected.
    rng_state = random.getstate()
    random.seed(RANDOM_STATE)
    try:
        for _ in range(N_ITER):
            params = random_param_sample(param_grid)
            fold_accs = []

            for tr_idx, va_idx in skf.split(X_train_scaled, y_train):
                X_tr = X_train_scaled[tr_idx]
                y_tr = y_train.iloc[tr_idx]
                X_va = X_train_scaled[va_idx]
                y_va = y_train.iloc[va_idx]

                # SMOTE on fold-train only, never on the fold used for scoring
                X_tr_sm, y_tr_sm = smote_safe(X_tr, y_tr, random_state=RANDOM_STATE)

                model = _new_warm_start_classifier(NUM_CLASS, params)
                model.fit(X_tr_sm, y_tr_sm, xgb_model=base_model.get_booster())
                fold_accs.append(accuracy_score(y_va, model.predict(X_va)))

            mean_acc = float(np.mean(fold_accs)) if fold_accs else -np.inf
            if mean_acc > best_cv_acc:
                best_cv_acc = mean_acc
                best_params = params
    finally:
        random.setstate(rng_state)

    print(f"Best CV accuracy: {best_cv_acc:.4f}")
    print(f"Best hyperparameters: {best_params}")

    # -----------------------------
    # Final train on full TRAIN, evaluate once on held-out VAL (same style as baseline)
    # -----------------------------
    X_train_sm, y_train_sm = smote_safe(X_train_scaled, y_train, random_state=RANDOM_STATE)

    final_model = _new_warm_start_classifier(NUM_CLASS, best_params or {})
    final_model.fit(X_train_sm, y_train_sm, xgb_model=base_model.get_booster())

    y_pred = final_model.predict(X_val_scaled)
    acc = accuracy_score(y_val, y_pred)
    bacc = balanced_accuracy_score(y_val, y_pred)
    mf1 = f1_score(y_val, y_pred, average="macro")
    cm = confusion_matrix(y_val, y_pred, labels=class_codes)

    print(f"Hold-out accuracy:         {acc:.4f}")
    print(f"Hold-out balanced accuracy:{bacc:.4f}")
    print(f"Hold-out macro-F1:         {mf1:.4f}")

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=CLASS_LABELS)
    disp.plot(cmap=plt.cm.Blues)
    plt.show()

    # -----------------------------
    # Save predictions + model (same outputs as before)
    # -----------------------------
    sensor_pred_codes = y_pred
    sensor_pred_labels = pd.Categorical(
        sensor_pred_codes, categories=class_codes
    ).rename_categories(CLASS_LABELS).astype(str)

    # Rows follow the shuffled hold-out order, not chronological order.
    sensor_df = pd.DataFrame({
        "date": val_dates,
        "pred_bin_code": sensor_pred_codes,
        "pred_bin_label": sensor_pred_labels
    }).set_index("date")

    sensor_df.to_csv(sensor_csv_path)
    joblib.dump(final_model, finetuned_model_path)

    print(f"Saved model → {finetuned_model_path}")
    print(f"Saved predictions → {sensor_csv_path}")

    # Release the boosters before the next direction runs.
    del base_model, final_model
    gc.collect()

# ------------------ Run both directions ------------------
# Dataset tags used in log lines and output file names.
D1_TAG, D2_TAG = "D1", "D2"

# Source-model artifacts (model pickle, scaler pickle) for each dataset.
D1_MODEL, D1_SCALER = (
    "transfer_learning_model_params/D1/D1_best_model.pkl",
    "transfer_learning_model_params/D1/D1_scaler.pkl",
)
D2_MODEL, D2_SCALER = (
    "transfer_learning_model_params/D2/D2_best_model.pkl",
    "transfer_learning_model_params/D2/D2_scaler.pkl",
)

# D1 → D2: fine-tune the D1 model on dataset 2.
warm_start_direction(
    src_tag=D1_TAG, src_df=dataset1_df,
    src_model_path=D1_MODEL, src_scaler_path=D1_SCALER,
    tgt_tag=D2_TAG, tgt_df=dataset2_df,
)
# D2 → D1: fine-tune the D2 model on dataset 1.
warm_start_direction(
    src_tag=D2_TAG, src_df=dataset2_df,
    src_model_path=D2_MODEL, src_scaler_path=D2_SCALER,
    tgt_tag=D1_TAG, tgt_df=dataset1_df,
)



Direction: D1 → D2
