In [19]:
import numpy as np
import pandas as pd
import time, os, pickle
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor

# ============================================================
# âœ… STEP 1: Load full pipeline and extract regressor
# ============================================================
rf_model = model_results['RandomForest_ManualTune']['pipeline']
rf_estimator = rf_model.named_steps['regressor']

# Handle multioutput wrapper
if not hasattr(rf_estimator, "n_outputs_"):
    rf_export = MultiOutputRegressor(rf_estimator.estimator)
    rf_export.estimators_ = rf_estimator.estimators_
else:
    rf_export = rf_estimator

# ============================================================
# âœ… STEP 2: Convert to ONNX (numeric only)
# ============================================================
# Transform input data first
X_transformed = rf_model.named_steps['preprocess'].transform(X_train)
n_features = X_transformed.shape[1]
initial_type = [("float_input", FloatTensorType([None, n_features]))]

onnx_model = convert_sklearn(rf_export, initial_types=initial_type, target_opset=15)

onnx_path = "random_forest_manualtune.onnx"
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())
print(f"âœ… ONNX model saved at: {onnx_path}")

# ============================================================
# âœ… STEP 3: Save Pickle model
# ============================================================
pkl_path = "random_forest_manualtune.pkl"
with open(pkl_path, "wb") as f:
    pickle.dump(rf_estimator, f)

# ============================================================
# âœ… STEP 4: Inference and accuracy comparison
# ============================================================
X_sample = X_test[:200]
y_true = y_test[:200]

# --- Preprocess sample ---
X_sample_transformed = rf_model.named_steps['preprocess'].transform(X_sample).astype(np.float32)

# --- Sklearn inference (full pipeline) ---
start = time.time()
y_pred_sklearn = rf_model.predict(X_sample)
t_sklearn = time.time() - start

# --- ONNX inference ---
sess = InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
start = time.time()
y_pred_onnx = sess.run(None, {input_name: X_sample_transformed})[0]
t_onnx = time.time() - start

# ============================================================
# âœ… STEP 5: Accuracy metrics
# ============================================================
r2_sklearn = r2_score(y_true, y_pred_sklearn)
r2_onnx = r2_score(y_true, y_pred_onnx)
rmse_sklearn = mean_squared_error(y_true, y_pred_sklearn, squared=False)
rmse_onnx = mean_squared_error(y_true, y_pred_onnx, squared=False)
mae_sklearn = mean_absolute_error(y_true, y_pred_sklearn)
mae_onnx = mean_absolute_error(y_true, y_pred_onnx)

# ============================================================
# âœ… STEP 6: Benchmark summary
# ============================================================
size_pkl = os.path.getsize(pkl_path) / 1024
size_onnx = os.path.getsize(onnx_path) / 1024
speedup = (t_sklearn / t_onnx) if t_onnx > 0 else np.nan

summary = pd.DataFrame({
    "Format": ["Pickle (.pkl)", "ONNX (.onnx)"],
    "Size (KB)": [size_pkl, size_onnx],
    "Inference Time (s)": [t_sklearn, t_onnx],
    "Speed-up Factor": [1.0, speedup],
    "RÂ² Score": [r2_sklearn, r2_onnx],
    "RMSE": [rmse_sklearn, rmse_onnx],
    "MAE": [mae_sklearn, mae_onnx]
})

print("\nðŸ“Š ONNX vs Pickle Benchmark Summary:")
display(summary)



âœ… ONNX model saved at: random_forest_manualtune.onnx

ðŸ“Š ONNX vs Pickle Benchmark Summary:


Unnamed: 0,Format,Size (KB),Inference Time (s),Speed-up Factor,RÂ² Score,RMSE,MAE
0,Pickle (.pkl),38746.785156,0.143065,1.0,0.50062,1.0568,0.862435
1,ONNX (.onnx),20928.135742,0.0,,0.50062,1.0568,0.862435


In [28]:
import numpy as np
import pandas as pd
import time, os, pickle
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ============================================================
# âœ… STEP 1: Load trained HGB model from model_results
# ============================================================
hgb_model = model_results['HistGradientBoosting_Optuna']['pipeline']
hgb_estimator = hgb_model[-1]  # MultiOutputRegressor

print("âœ… Loaded Optuna-tuned HGB pipeline.")


# ============================================================
# âœ… STEP 2: Convert to ONNX (using transformed numeric features)
# ============================================================
# Transform training data using the pipelineâ€™s preprocessors (exclude final estimator)
X_train_transformed = hgb_model[:-1].transform(X_train).astype(np.float32)
n_features = X_train_transformed.shape[1]

initial_type = [("float_input", FloatTensorType([None, n_features]))]

onnx_model = convert_sklearn(hgb_estimator, initial_types=initial_type, target_opset=15)

onnx_path = "hist_gradient_boosting_optuna.onnx"
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"âœ… ONNX model saved at: {onnx_path}")


# ============================================================
# âœ… STEP 3: Save Pickle version for comparison
# ============================================================
pkl_path = "hist_gradient_boosting_optuna.pkl"
with open(pkl_path, "wb") as f:
    pickle.dump(hgb_estimator, f)
print(f"âœ… Pickle model saved at: {pkl_path}")


# ============================================================
# âœ… STEP 4: Prepare test samples and preprocess
# ============================================================
X_sample = X_test[:200]
y_true = y_test[:200]

X_sample_transformed = hgb_model[:-1].transform(X_sample).astype(np.float32)


# ============================================================
# âœ… STEP 5: Inference timing comparison
# ============================================================
# --- Sklearn inference ---
start = time.time()
y_pred_sklearn = hgb_model.predict(X_sample)
t_sklearn = time.time() - start

# --- ONNX inference ---
sess = InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name

start = time.time()
y_pred_onnx = sess.run(None, {input_name: X_sample_transformed})[0]
t_onnx = time.time() - start

if t_onnx == 0:
    t_onnx = 1e-9  # avoid division by zero in speed-up calc


# ============================================================
# âœ… STEP 6: Metrics comparison
# ============================================================
r2_sklearn = r2_score(y_true, y_pred_sklearn)
r2_onnx = r2_score(y_true, y_pred_onnx)
rmse_sklearn = mean_squared_error(y_true, y_pred_sklearn, squared=False)
rmse_onnx = mean_squared_error(y_true, y_pred_onnx, squared=False)
mae_sklearn = mean_absolute_error(y_true, y_pred_sklearn)
mae_onnx = mean_absolute_error(y_true, y_pred_onnx)


# ============================================================
# âœ… STEP 7: Benchmark summary
# ============================================================
size_pkl = os.path.getsize(pkl_path) / 1024
size_onnx = os.path.getsize(onnx_path) / 1024
speedup = t_sklearn / t_onnx

summary = pd.DataFrame({
    "Format": ["Pickle (.pkl)", "ONNX (.onnx)"],
    "Size (KB)": [size_pkl, size_onnx],
    "Inference Time (s)": [t_sklearn, t_onnx],
    "Speed-up Factor": [1.0, speedup],
    "RÂ² Score": [r2_sklearn, r2_onnx],
    "RMSE": [rmse_sklearn, rmse_onnx],
    "MAE": [mae_sklearn, mae_onnx]
})

print("\nðŸ“Š ONNX vs Pickle Benchmark Summary:")
display(summary)


âœ… Loaded Optuna-tuned HGB pipeline.
âœ… ONNX model saved at: hist_gradient_boosting_optuna.onnx
âœ… Pickle model saved at: hist_gradient_boosting_optuna.pkl

ðŸ“Š ONNX vs Pickle Benchmark Summary:


Unnamed: 0,Format,Size (KB),Inference Time (s),Speed-up Factor,RÂ² Score,RMSE,MAE
0,Pickle (.pkl),3196.573242,0.03962779,1.0,0.594508,0.952904,0.753722
1,ONNX (.onnx),1198.697266,1e-09,39627790.0,0.593202,0.954418,0.755488
