In [None]:

!pip install -q xgboost scikit-learn boruta_shap boruta matplotlib seaborn tensorflow joblib


import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
# Single seed reused by the NumPy/TensorFlow global RNGs and by every
# `random_state=` argument further down, so reruns are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seed NumPy's legacy global RNG
tf.random.set_seed(RANDOM_SEED)  # seed TensorFlow's global RNG

#
def load_data(path=None, target_col="target", date_col=None):
    """
    Return the modelling table as a DataFrame.

    With a ``path``, the CSV at that location is read; ``date_col`` (when
    given) is parsed as dates. Without a ``path``, a reproducible synthetic
    regression set is generated instead: 5000 rows, 20 normally distributed
    features ``f0``..``f19`` and a target built from ``f1``, ``f5`` and
    ``f10`` plus Gaussian noise, stored under ``target_col``.
    """
    # Real data requested: defer entirely to pandas.
    if path is not None:
        date_columns = [date_col] if date_col else None
        return pd.read_csv(path, parse_dates=date_columns)

    # Synthetic fallback; the generator is seeded so the table is repeatable.
    n_rows, n_features = 5000, 20
    generator = np.random.default_rng(RANDOM_SEED)
    column_names = [f"f{i}" for i in range(n_features)]
    features = pd.DataFrame(
        generator.normal(size=(n_rows, n_features)), columns=column_names
    )
    noise = generator.normal(scale=0.5, size=n_rows)
    # Only three of the twenty features carry signal; the rest are distractors.
    signal = 3*features["f1"] - 2*features["f5"] + 0.5*features["f10"] + noise

    frame = features.copy()
    frame[target_col] = signal
    return frame


# Run configuration. Leaving DATA_PATH as None makes load_data() generate its
# synthetic dataset; set it to a CSV path to load real data instead.
DATA_PATH = None  
TARGET_COL = "target"  # column to predict
DATE_COL = None     # optional column parsed as dates when a CSV is read

df = load_data(DATA_PATH, target_col=TARGET_COL, date_col=DATE_COL)
print("data shape:", df.shape)
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin, so this line needs a notebook kernel.
display(df.head())


def prepare_X_y(df, target_col):
    """
    Split ``df`` into a numeric feature frame and a target array.

    The target column is removed from the features, any remaining non-numeric
    columns are discarded, and the result is copied so later edits do not
    touch ``df``. The target is returned as the column's underlying array.
    """
    numeric_features = (
        df.drop(columns=[target_col])
        .select_dtypes(include=[np.number])
        .copy()
    )
    target = df[target_col].values
    return numeric_features, target

X, y = prepare_X_y(df, TARGET_COL)

# Split BEFORE scaling: fitting the scaler on every row would let the test
# rows' mean/variance leak into the training features. The row indices ride
# along so each split can be traced back to `df`.
X_train_raw, X_test_raw, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, np.arange(len(y)), test_size=0.2, random_state=RANDOM_SEED
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole table under the train-fitted scaler, kept for code that reads X_scaled.
X_scaled = scaler.transform(X)


# Persist the fitted scaler so inference can reuse the exact same transform.
joblib.dump(scaler, "scaler.joblib")

# Cell 5: Hybrid Feature Selection (HFS) combining RFE + Boruta
def hybrid_feature_selection(X_train, y_train, feature_names,
                             rfe_estimator=None, n_features_to_select=None,
                             boruta_estimator=None, boruta_iter=100):
    """
    Run RFE and Boruta on the same data and report how their picks overlap.

    Parameters
    ----------
    X_train : 2-D feature matrix whose columns line up with ``feature_names``.
    y_train : target values for the rows of ``X_train``.
    feature_names : sequence of column names, one per column of ``X_train``.
    rfe_estimator : estimator ranked by RFE; defaults to a 200-tree
        ``XGBRegressor``.
    n_features_to_select : number of features RFE keeps. When omitted,
        ``max(5, int(2 * sqrt(n_features)))`` is used, capped at the number
        of available features.
    boruta_estimator : estimator used by Boruta; defaults to a 500-tree
        ``RandomForestRegressor``.
    boruta_iter : ``max_iter`` passed to ``BorutaPy``.

    Returns
    -------
    dict
        ``"rfe"`` and ``"boruta"`` hold each method's selected names,
        ``"intersection"`` the names both agree on and ``"union"`` the names
        either one picked. Every list is sorted and contains the caller's
        original name objects.

    Raises
    ------
    IndexError
        If a fitted support mask does not have one entry per feature name.
    """
    feature_names = list(feature_names)
    n_total = len(feature_names)

    def _names_for(mask):
        # Index the caller's own list instead of np.array(feature_names):
        # the array route coerces names to NumPy scalars (and mixed-type
        # names to strings), which breaks later lookups by name.
        if len(mask) != n_total:
            raise IndexError(
                "support mask has {} entries but {} feature names were given".format(
                    len(mask), n_total
                )
            )
        return [name for name, keep in zip(feature_names, mask) if keep]

    def _ordered(names):
        # key=str matches plain sorting for string names and still works
        # when names are not mutually comparable.
        return sorted(names, key=str)

    # RFE with XGBoost
    if rfe_estimator is None:
        rfe_estimator = XGBRegressor(n_estimators=200, random_state=RANDOM_SEED, verbosity=0)
    if n_features_to_select is None:
        # heuristic: twice the square root of the feature count, at least 5,
        # but never more than the number of features actually available
        n_features_to_select = min(n_total, max(5, int(np.sqrt(n_total)*2)))
    # step=0.1 drops 10% of the features per elimination round.
    rfe = RFE(estimator=rfe_estimator, n_features_to_select=n_features_to_select, step=0.1)
    rfe.fit(X_train, y_train)
    rfe_support = _names_for(rfe.support_)

    # Boruta with RandomForest
    if boruta_estimator is None:
        boruta_estimator = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED, n_estimators=500)
    boruta = BorutaPy(estimator=boruta_estimator, n_estimators='auto', verbose=0, random_state=RANDOM_SEED, max_iter=boruta_iter)
    boruta.fit(X_train, y_train)
    boruta_support = _names_for(boruta.support_)

    rfe_set, boruta_set = set(rfe_support), set(boruta_support)

    return {
        "rfe": _ordered(rfe_support),
        "boruta": _ordered(boruta_support),
        # HFS: intersection (only features both methods agree on)
        "intersection": _ordered(rfe_set & boruta_set),
        # Looser alternative: anything either method selected
        "union": _ordered(rfe_set | boruta_set),
    }

# Column names in the same order as the columns of X_train / X_test.
feature_names = list(X.columns)
# boruta_iter=50 overrides the function's default of 100 iterations.
hfs = hybrid_feature_selection(X_train, y_train, feature_names, boruta_iter=50)
print("RFE selected:", hfs["rfe"])
print("Boruta selected:", hfs["boruta"])
print("HFS intersection selected:", hfs["intersection"])
print("HFS union selected:", hfs["union"])

# Choose final set (use intersection for stricter selection; switch to "union" if desired)
# Falls back to the union automatically when the intersection is empty.
final_features = hfs["intersection"] if len(hfs["intersection"])>0 else hfs["union"]
print("Final features used for modeling:", final_features)

# Map back indices for train/test using selected features
# Names are translated to column positions, then used to slice the arrays.
selected_idx = [feature_names.index(f) for f in final_features]
X_train_sel = X_train[:, selected_idx]
X_test_sel = X_test[:, selected_idx]

# Cell 6: Train XGBoost on selected features
# Early stopping is monitored on a validation split carved out of the
# training rows. Monitoring the test set would leak it into model selection
# and make the test metrics reported later optimistic.
X_fit_sel, X_val_sel, y_fit, y_val = train_test_split(
    X_train_sel, y_train, test_size=0.2, random_state=RANDOM_SEED
)
# `early_stopping_rounds` is an estimator parameter: XGBoost 2.0 removed the
# `fit()` keyword of the same name, so passing it to fit() raises TypeError.
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    random_state=RANDOM_SEED,
    verbosity=0,
    early_stopping_rounds=20,
)
xgb_model.fit(X_fit_sel, y_fit, eval_set=[(X_val_sel, y_val)], verbose=False)
joblib.dump(xgb_model, "hfs_xgb_model.joblib")

# Cell 7: Build and train a Deep Neural Network for comparison
def build_dnn(input_shape):
    """
    Build and compile a small fully connected regression network.

    ``input_shape`` is the number of input features. The network stacks
    Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dropout(0.1) ->
    Dense(1) and is compiled with Adam (learning rate 1e-3), MSE loss and an
    MAE metric. The compiled model is returned untrained.
    """
    stack = [layers.Input(shape=(input_shape,))]
    # Hidden blocks: (units, dropout rate) pairs, widest first.
    for units, drop_rate in ((128, 0.2), (64, 0.1)):
        stack.append(layers.Dense(units, activation="relu"))
        stack.append(layers.Dropout(drop_rate))
    # Single linear output unit for the regression target.
    stack.append(layers.Dense(1))

    network = models.Sequential(stack)
    adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
    network.compile(optimizer=adam, loss="mse", metrics=["mae"])
    return network

dnn = build_dnn(X_train_sel.shape[1])
# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
es = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
# Validate on a slice of the training data (Keras takes the last 20% of the
# rows passed in) rather than on the test set: letting the test set pick the
# stopping epoch and the restored weights would bias the test metrics
# reported afterwards.
history = dnn.fit(X_train_sel, y_train, validation_split=0.2,
                  epochs=200, batch_size=64, callbacks=[es], verbose=0)
dnn.save("hfs_dnn_model.keras")

# Cell 8: Evaluation
def evaluate_model(model, X_test, y_test, model_type="xgb"):
    """
    Score ``model`` on held-out data.

    Predictions come from ``model.predict(X_test)``; when ``model_type`` is
    ``"dnn"`` they are flattened with ``ravel()`` first, otherwise they are
    used as returned. The result is a dict with the mean squared error
    (``"mse"``), mean absolute error (``"mae"``), root mean squared error
    (``"rmse"``) and the predictions themselves (``"preds"``).
    """
    raw_output = model.predict(X_test)
    # Flatten only for the "dnn" model type; other outputs are used as-is.
    preds = raw_output.ravel() if model_type == "dnn" else raw_output

    squared_error = mean_squared_error(y_test, preds)
    absolute_error = mean_absolute_error(y_test, preds)
    return {
        "mse": squared_error,
        "mae": absolute_error,
        "rmse": math.sqrt(squared_error),
        "preds": preds,
    }

# Score both models on the same feature-selected test matrix so the numbers
# are directly comparable.
res_xgb = evaluate_model(xgb_model, X_test_sel, y_test, model_type="xgb")
res_dnn = evaluate_model(dnn, X_test_sel, y_test, model_type="dnn")

# One line per model, each metric printed to five decimal places.
print("XGBoost (HFS) -> MSE: {:.5f}, MAE: {:.5f}, RMSE: {:.5f}".format(res_xgb["mse"], res_xgb["mae"], res_xgb["rmse"]))
print("DNN    (HFS) -> MSE: {:.5f}, MAE: {:.5f}, RMSE: {:.5f}".format(res_dnn["mse"], res_dnn["mae"], res_dnn["rmse"]))

# Cell 9: Quick plots
# Overlay the true targets and both models' predictions for the first 200
# test rows as a quick visual sanity check.
plt.figure(figsize=(10,4))
plt.plot(y_test[:200], label="true")
plt.plot(res_xgb["preds"][:200], label="xgb_pred")
plt.plot(res_dnn["preds"][:200], label="dnn_pred", alpha=0.7)  # semi-transparent so the lines underneath stay visible
plt.legend()
plt.title("Test predictions (first 200 samples)")
plt.show()


# The string literal below contains only newlines, so this prints blank lines.
print("""

""")
