# Helper Notebook

This notebook provides utility code used across the project.

Its primary purpose is to demonstrate how preprocessed model inputs can be converted back into their original, human-readable form. This is especially important for interpretability, debugging, and explainability analyses, where standardized or one-hot-encoded values are not meaningful on their own.

In addition, the notebook defines a consistent procedure for loading trained models, preprocessing pipelines, and feature metadata, ensuring that predictions and explanations are generated using the exact artifacts produced during training.


In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, ClassifierMixin
# Display settings: let wide frames print without truncating columns.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

# Seed NumPy's legacy global RNG so any random draws in this notebook repeat.
np.random.seed(42)

# This class must be defined in the notebook so joblib can deserialize the model
class SklearnCompatibleWrapper(ClassifierMixin, BaseEstimator):
    """Thin classifier facade that forwards calls to a wrapped model.

    The wrapper stores the model it is given and delegates ``predict`` and
    ``predict_proba`` to it unchanged.

    Parameters
    ----------
    model : object
        The wrapped model. It must provide ``predict`` and ``predict_proba``;
        a ``classes_`` attribute is optional.
    """

    def __init__(self, model):
        self.model = model
        # Instance-level marker identifying this object as a classifier.
        self._estimator_type = "classifier"

    @property
    def classes_(self):
        """Class labels of the wrapped model, or ``[0, 1]`` if it has none."""
        wrapped = self.model
        try:
            return wrapped.classes_
        except AttributeError:
            # The wrapped model exposes no labels: fall back to binary 0/1.
            return np.array([0, 1])

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        wrapped = self.model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        wrapped = self.model
        return wrapped.predict_proba(X)


In [7]:
# Model loading is intentionally disabled until a trained bundle is available.
#bundle = joblib.load("best_model.joblib") # uncomment when there is a model
#estimator = bundle["model"]

# Test features and labels stored under processed/.
X_test = joblib.load("processed/X_test_ready.joblib")
y_test = joblib.load("processed/y_test_ready.joblib")

# Feature names are optional: use None when the artifact is not on disk.
feature_names_path = "processed/feature_names.joblib"
if os.path.exists(feature_names_path):
    feature_names = joblib.load(feature_names_path)
else:
    feature_names = None

# Quick sanity report on what was loaded.
#print("estimator:", type(estimator))
print("X_test type:", type(X_test), "shape:", getattr(X_test, "shape", None))
print("y_test type:", type(y_test), "len:", len(y_test))
print("feature_names loaded:", feature_names is not None)

if feature_names is not None:
    print("n features:", len(feature_names))
    print("feature names:", feature_names)


X_test type: <class 'numpy.ndarray'> shape: (61771, 29)
y_test type: <class 'pandas.Series'> len: 61771
feature_names loaded: True
n features: 29
feature names: ['num__Height_(cm)', 'num__Weight_(kg)', 'num__BMI', 'num__Alcohol_Consumption', 'num__Fruit_Consumption', 'num__Green_Vegetables_Consumption', 'num__FriedPotato_Consumption', 'num__Age_num', 'cat__General_Health_Excellent', 'cat__General_Health_Fair', 'cat__General_Health_Good', 'cat__General_Health_Poor', 'cat__General_Health_Very Good', 'cat__Checkup_5 or more years ago', 'cat__Checkup_Never', 'cat__Checkup_Within the past 2 years', 'cat__Checkup_Within the past 5 years', 'cat__Checkup_Within the past year', 'cat__Diabetes_No', 'cat__Diabetes_No, pre-diabetes or borderline diabetes', 'cat__Diabetes_Yes', 'cat__Diabetes_Yes, but female told only during pregnancy', 'bin__Exercise', 'bin__Skin_Cancer', 'bin__Other_Cancer', 'bin__Depression', 'bin__Arthritis', 'bin__Sex', 'bin__Smoking_History']


In [8]:
import joblib
import pandas as pd
import numpy as np

# Preprocessing object and feature-name list stored under processed/.
# NOTE(review): unlike the guarded load in the earlier cell, feature_names is
# loaded unconditionally here and will raise if the file is missing.
preprocessor = joblib.load("processed/preprocessor.joblib")
feature_names = joblib.load("processed/feature_names.joblib")

# X_test_ready reads the same file the earlier cell loaded as X_test.
# X_test_raw is presumably the untransformed counterpart — TODO confirm that
# its rows align one-to-one with X_test_ready.
X_test_ready = joblib.load("processed/X_test_ready.joblib")
X_test_raw = pd.read_csv("processed/X_test_raw.csv.gz")


In [9]:
# Fitted sub-transformers, fetched by the names they were registered under.
# The "bin" transformer is optional and resolves to None when absent.
num_transformer, cat_transformer = (
    preprocessor.named_transformers_[name] for name in ("num", "cat")
)
bin_transformer = preprocessor.named_transformers_.get("bin")

# Input columns of each transformer; entry [2] of a transformers_ tuple holds
# the column selection. Positions 0/1/2 are taken to be num/cat/bin.
num_cols, cat_cols = (spec[2] for spec in preprocessor.transformers_[:2])
if bin_transformer:
    bin_cols = preprocessor.transformers_[2][2]
else:
    bin_cols = []


In [10]:
def inverse_preprocess(X_ready, preprocessor):
    """
    Convert preprocessed (ready) data back to human-readable form.

    Parameters
    ----------
    X_ready : array-like of shape (n_samples, n_features) or (n_features,)
        Model-ready data. A dense array, a DataFrame, a sparse matrix
        (anything exposing ``toarray``) or a single 1-D row are accepted.
        Columns are assumed to be laid out as: numeric block, one-hot block,
        then (optionally) binary block.
    preprocessor : fitted ColumnTransformer-like object
        Must expose ``named_transformers_`` with a ``"num"`` pipeline holding
        a ``"scaler"`` step and a ``"cat"`` pipeline holding an ``"onehot"``
        step, plus ``transformers_`` as ``(name, transformer, columns)``
        entries. A ``"bin"`` transformer is optional.

    Returns
    -------
    pandas.DataFrame
        One row per input sample with the numeric columns un-scaled, one
        column per original categorical feature, and the binary columns
        copied through unchanged. The index is a fresh ``RangeIndex``.

    Notes
    -----
    Decoding assumes the one-hot encoder emits one output column per entry of
    ``categories_`` (i.e. no dropped categories). Rows whose one-hot block is
    all zeros (an unknown category) are decoded as ``None`` rather than
    being reported as the first category.
    """
    # Densify sparse input and promote a single 1-D row to a 1 x n matrix so
    # that positional column slicing below works for every accepted input.
    if hasattr(X_ready, "toarray"):
        X_ready = X_ready.toarray()
    if np.ndim(X_ready) == 1:
        X_ready = np.asarray(X_ready).reshape(1, -1)
    X_ready = pd.DataFrame(X_ready)

    # Look input columns up by transformer name (consistent with the
    # named_transformers_ lookups) instead of by position in transformers_.
    cols_by_name = {entry[0]: entry[2] for entry in preprocessor.transformers_}

    # --- numeric ---
    num_pipe = preprocessor.named_transformers_["num"]
    scaler = num_pipe.named_steps["scaler"]
    num_cols = cols_by_name["num"]

    n_num = len(num_cols)
    X_num = scaler.inverse_transform(X_ready.iloc[:, :n_num].to_numpy())
    df_num = pd.DataFrame(X_num, columns=num_cols)

    # --- categorical ---
    cat_pipe = preprocessor.named_transformers_["cat"]
    ohe = cat_pipe.named_steps["onehot"]
    cat_cols = cols_by_name["cat"]

    cat_start = n_num
    # Width of the one-hot block comes straight from the fitted categories;
    # no dummy transform is needed, so this neither depends on the encoder
    # tolerating unknown values nor triggers feature-name warnings.
    cat_end = cat_start + sum(len(cats) for cats in ohe.categories_)
    X_cat = X_ready.iloc[:, cat_start:cat_end].to_numpy()

    cat_values = {}
    idx = 0
    for col, cats in zip(cat_cols, ohe.categories_):
        width = len(cats)
        block = X_cat[:, idx:idx + width]
        decoded = np.asarray(cats)[np.argmax(block, axis=1)]

        # An all-zero block carries no category; argmax would silently
        # report the first one, so mark those rows as missing instead.
        unknown = ~(block != 0).any(axis=1)
        if unknown.any():
            decoded = decoded.astype(object)
            decoded[unknown] = None

        cat_values[col] = decoded
        idx += width

    df_cat = pd.DataFrame(cat_values)

    # --- binary ---
    df_bin = pd.DataFrame()
    if "bin" in cols_by_name:
        bin_cols = cols_by_name["bin"]
        X_bin = X_ready.iloc[:, cat_end:cat_end + len(bin_cols)]
        df_bin = pd.DataFrame(X_bin.to_numpy(), columns=bin_cols)

    return pd.concat([df_num, df_cat, df_bin], axis=1)


This warning indicates that the preprocessing pipeline was fitted on array-based inputs and is later applied to data that carries feature names. Because the column order is preserved, the transformation should still be correct, but this has not been verified independently.

In [14]:
import numpy as np
import pandas as pd
from scipy import sparse
# Take the first test row as a 1 x n_features matrix (2-D shape is kept on
# purpose) and map it back to readable values.
x0_ready = X_test_ready[0:1, :]
x0_human = inverse_preprocess(x0_ready, preprocessor)
x0_human



Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Age_num,General_Health,Checkup,Diabetes,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Smoking_History
0,170.0,63.5,21.93,30.0,12.0,4.0,30.0,52.0,Good,Within the past year,No,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Same round trip for the first ten test rows. The x0_* names are reused
# here even though they now hold more than one row.
x0_ready = X_test_ready[:10]
x0_human = inverse_preprocess(x0_ready, preprocessor)
x0_human



Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Age_num,General_Health,Checkup,Diabetes,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Smoking_History
0,170.0,63.5,21.93,30.0,12.0,4.0,30.0,52.0,Good,Within the past year,No,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,163.0,55.79,21.11,20.0,90.0,12.0,0.0,62.0,Excellent,Within the past 2 years,No,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,170.0,89.81,31.01,0.0,120.0,90.0,8.0,32.0,Excellent,Within the past 2 years,No,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,165.0,70.76,25.96,20.0,3.552714e-15,3.0,0.0,72.0,Fair,Within the past year,Yes,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,183.0,72.57,21.7,1.0,90.0,12.0,4.0,67.0,Very Good,Within the past year,No,1.0,0.0,0.0,0.0,1.0,1.0,1.0
5,185.0,120.2,34.96,15.0,1.0,0.0,4.0,47.0,Good,Within the past 5 years,Yes,1.0,0.0,0.0,1.0,0.0,1.0,0.0
6,163.0,52.16,19.74,0.0,60.0,8.0,4.0,72.0,Good,Within the past year,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,183.0,124.74,37.3,8.0,20.0,60.0,8.0,37.0,Good,5 or more years ago,No,1.0,0.0,0.0,0.0,0.0,1.0,1.0
8,157.0,70.31,28.35,6.0,24.0,12.0,12.0,37.0,Good,Within the past 5 years,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,183.0,88.0,26.31,30.0,8.0,0.0,3.0,72.0,Good,Within the past year,No,1.0,0.0,0.0,0.0,1.0,1.0,1.0
