
# Classical ML Model Development Pipeline (with MLflow)
>  **`column_3C_processed.csv`** the scipt works with this file

## 1) Setup & Imports

In [71]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split, cross_val_score, 
    RandomizedSearchCV, StratifiedKFold
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import warnings
warnings.filterwarnings("ignore")

import json
import pickle
from typing import Dict, List, Tuple, Any
import shap
from sklearn.model_selection import learning_curve
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from scipy import stats
from sklearn.pipeline import Pipeline

# --- ENSURE OUTPUT DIRECTORY EXISTS ---
os.makedirs('outputs_ml', exist_ok=True)

print("Imports complete.")

Imports complete.


## 2) Config

In [72]:
# create a configuration class for the model development pipeline
class Config:
    """Configuration class for model development pipeline"""
    EXPERIMENT_NAME = "Orthopedic_Patients_Classification"
    MODEL_REGISTRY_NAME = "orthopedic_classifier"
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    VALIDATION_SIZE = 0.2  # From training set
    CV_FOLDS = 5
    MAX_EVALS = 50  # For hyperparameter tuning
    
    # Class imbalance handling
    IMBALANCE_STRATEGY = "SMOTE"  # Options: "SMOTE", "UNDERSAMPLING", "SMOTEENN", "WEIGHTED"
    
    # Linear model preprocessing
    POWER_TRANSFORM = True  # Apply Yeo-Johnson transformation
    VIF_THRESHOLD = 5.0  # Variance Inflation Factor threshold
    OUTLIER_REMOVAL = True  # Remove outliers for linear models
    
    # MLflow tracking
    TRACKING_URI = "sqlite:///mlflow.db"  # Use SQLite for local tracking
    ARTIFACT_ROOT = "./mlruns"

cfg = Config()
cfg.__dict__

{}

## 3) Pipeline Class — Initialization & MLflow Setup

In [73]:

class ModelDevelopmentPipeline:
    """Comprehensive model development pipeline with MLflow tracking"""
    
    def __init__(self, config: Config):
        self.config = config
        self.df = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None
        self.scaler = None
        self.standard_scaler = None
        self.power_transformer = None
        self.label_encoder = None
        self.feature_names = None
        self.selected_features = None
        self.models_performance = {}
        self.smote = None
        self.outlier_mask = None
        
        # Setup MLflow
        self.setup_mlflow()
    
    def setup_mlflow(self):
        """Initialize MLflow tracking"""
        mlflow.set_tracking_uri(self.config.TRACKING_URI)
        
        # Create experiment if it doesn't exist
        try:
            experiment_id = mlflow.create_experiment(
                name=self.config.EXPERIMENT_NAME,
                artifact_location=self.config.ARTIFACT_ROOT
            )
        except mlflow.exceptions.MlflowException:
            experiment = mlflow.get_experiment_by_name(self.config.EXPERIMENT_NAME)
            experiment_id = experiment.experiment_id if experiment else None
        
        mlflow.set_experiment(self.config.EXPERIMENT_NAME)
        print(f"MLflow experiment: {self.config.EXPERIMENT_NAME}")
        print(f"Experiment ID: {experiment_id}")

pipeline = ModelDevelopmentPipeline(cfg)

MLflow experiment: Orthopedic_Patients_Classification
Experiment ID: 1


### 3.1 Load & Prepare Data

In [74]:
# Load and prepare the dataset
def _mdp_load_and_prepare_data(self):
    """Load and prepare the dataset"""
    print("Loading and preparing data...")
    self.df = pd.read_csv('column_3C_processed.csv')
    print(f"Dataset shape: {self.df.shape}")
    print(f"Target distribution:\n{self.df['binary_class'].value_counts()}")
    
    
    with mlflow.start_run(run_name="data_preparation"):
        mlflow.log_param("dataset_shape", self.df.shape)
        mlflow.log_param("n_features", len(self.df.select_dtypes(include=[np.number]).columns))
        mlflow.log_param("target_classes", list(self.df['binary_class'].unique()))
        mlflow.log_param("class_distribution", dict(self.df['binary_class'].value_counts()))
        mlflow.log_metric("missing_values", self.df.isnull().sum().sum())
        mlflow.log_metric("duplicate_rows", self.df.duplicated().sum())
        
        dataset_info = {
            "shape": self.df.shape,
            "columns": list(self.df.columns),
            "dtypes": {col: str(dtype) for col, dtype in self.df.dtypes.items()},
            "missing_values": self.df.isnull().sum().to_dict(),
            "class_distribution": self.df['binary_class'].value_counts().to_dict()
        }
        with open("outputs_ml/dataset_info.json", "w") as f:
            json.dump(dataset_info, f, indent=2)
        mlflow.log_artifact("outputs_ml/dataset_info.json")
        os.remove("outputs_ml/dataset_info.json")
ModelDevelopmentPipeline.load_and_prepare_data = _mdp_load_and_prepare_data

pipeline.load_and_prepare_data()

pipeline.df.head()



Loading and preparing data...
Dataset shape: (310, 8)
Target distribution:
binary_class
Abnormal    210
Normal      100
Name: count, dtype: int64


Unnamed: 0,pelvic_tilt,sacral_slope,lumbar_lordosis_angle,pelvic_radius,pi_ss_ratio,class,binary_class,degree_spondylolisthesis_PowerTransformer
0,22.552586,40.475232,39.609117,98.672917,1.557195,Hernia,Abnormal,-0.267585
1,10.060991,28.99596,25.015378,114.405425,1.346979,Hernia,Abnormal,2.922868
2,22.218482,46.613539,50.092194,105.985135,1.476653,Hernia,Abnormal,-5.347396
3,24.652878,44.64413,44.311238,101.868495,1.552209,Hernia,Abnormal,5.581202
4,9.652075,40.060784,28.317406,108.168725,1.240936,Hernia,Abnormal,4.373008


### 3.2 Check Linear Model Assumptions

In [75]:
# check linear model assumptions: multicollinearity - VIF , normality - shapiro test, outliers - Quantile method
def _mdp_check_linear_model_assumptions(self, X, y):
    print("Checking linear model assumptions...")
    assumptions_results = {}
    # VIF
    print("  Checking multicollinearity...")
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    high_vif_features = vif_data[vif_data["VIF"] > self.config.VIF_THRESHOLD]["Feature"].tolist()
    assumptions_results["high_vif_features"] = high_vif_features
    assumptions_results["vif_data"] = vif_data
    print(f"    Features with VIF > {self.config.VIF_THRESHOLD}: {len(high_vif_features)}")
    # Normality
    print("  Checking feature normality...")
    normality_results = {}
    for col in X.columns:
        stat, p_value = stats.shapiro(X[col])
        normality_results[col] = {"statistic": float(stat), "p_value": float(p_value), "is_normal": bool(p_value > 0.05)}
    non_normal_features = [col for col, r in normality_results.items() if not r["is_normal"]]
    assumptions_results["non_normal_features"] = non_normal_features
    assumptions_results["normality_results"] = normality_results
    print(f"    Non-normal features: {len(non_normal_features)}")
    # Outliers
    print("  Checking for outliers...")
    outlier_counts = {}
    for col in X.columns:
        Q1, Q3 = X[col].quantile(0.25), X[col].quantile(0.75)
        IQR = Q3 - Q1
        lb, ub = Q1 - 1.5*IQR, Q3 + 1.5*IQR
        outliers = ((X[col] < lb) | (X[col] > ub)).sum()
        outlier_counts[col] = int(outliers)
    assumptions_results["outlier_counts"] = outlier_counts
    total_outliers = sum(outlier_counts.values())
    print(f"    Total outliers detected: {total_outliers}")
    # Save summaries
    vif_data.to_csv("outputs_ml/vif_data.csv", index=False)
    with open("outputs_ml/normality_results.json", "w") as f: json.dump(normality_results, f, indent=2)
    with open("outputs_ml/outlier_counts.json", "w") as f: json.dump(outlier_counts, f, indent=2)
    return assumptions_results
ModelDevelopmentPipeline.check_linear_model_assumptions = _mdp_check_linear_model_assumptions
X = pipeline.df.drop(columns=['binary_class', 'class'])
y = pipeline.df['binary_class']
pipeline.check_linear_model_assumptions(X,y)

Checking linear model assumptions...
  Checking multicollinearity...
    Features with VIF > 5.0: 5
  Checking feature normality...
    Non-normal features: 6
  Checking for outliers...
    Total outliers detected: 48


{'high_vif_features': ['pelvic_tilt',
  'sacral_slope',
  'lumbar_lordosis_angle',
  'pelvic_radius',
  'pi_ss_ratio'],
 'vif_data':                                      Feature        VIF
 0                                pelvic_tilt  15.641706
 1                               sacral_slope  17.965248
 2                      lumbar_lordosis_angle  20.399892
 3                              pelvic_radius  54.438131
 4                                pi_ss_ratio  65.882886
 5  degree_spondylolisthesis_PowerTransformer   2.925936,
 'non_normal_features': ['pelvic_tilt',
  'sacral_slope',
  'lumbar_lordosis_angle',
  'pelvic_radius',
  'pi_ss_ratio',
  'degree_spondylolisthesis_PowerTransformer'],
 'normality_results': {'pelvic_tilt': {'statistic': 0.9663857998124333,
   'p_value': 1.320714460490842e-06,
   'is_normal': False},
  'sacral_slope': {'statistic': 0.9639800182045095,
   'p_value': 5.886692035435651e-07,
   'is_normal': False},
  'lumbar_lordosis_angle': {'statistic': 0.9718123872

### 3.3 Apply Transformations

In [76]:

def _mdp_apply_transformations(self, X_train, X_val, X_test, for_linear_models=False):
    print("Applying data transformations...")
    X_train_t = X_train.copy(); X_val_t = X_val.copy(); X_test_t = X_test.copy()

    if for_linear_models:
        print("  Applying standard scaling to all columns...")
        self.standard_scaler = StandardScaler() # For linear models, we use StandardScaler
        X_train_t = pd.DataFrame(self.standard_scaler.fit_transform(X_train_t), columns=X_train_t.columns, index=X_train_t.index)
        X_val_t   = pd.DataFrame(self.standard_scaler.transform(X_val_t), columns=X_val_t.columns, index=X_val_t.index)
        X_test_t  = pd.DataFrame(self.standard_scaler.transform(X_test_t), columns=X_test_t.columns, index=X_test_t.index)
        print("  Applying feature selection...")
        k_best = min(len(X_train_t.columns) - 1, 6) # Limit to 4 features for linear models
        selector = SelectKBest(score_func=f_classif, k=k_best) # Select top k features based on ANOVA F-value
        X_train_t = selector.fit_transform(X_train_t, self.y_train)
        X_val_t   = selector.transform(X_val_t)
        X_test_t  = selector.transform(X_test_t)
        self.selected_features = [self.feature_names[i] for i in selector.get_support(indices=True)]
        print(f"    Selected features: {self.selected_features}")
    else:
        print("  Applying standard scaling to all columns...")
        self.scaler = StandardScaler()
        X_train_t = self.scaler.fit_transform(X_train_t)
        X_val_t   = self.scaler.transform(X_val_t)
        X_test_t  = self.scaler.transform(X_test_t)
    return X_train_t, X_val_t, X_test_t
ModelDevelopmentPipeline.apply_transformations = _mdp_apply_transformations

print(pipeline.selected_features)


None


### 3.4 Handle Class Imbalance

In [77]:

def _mdp_handle_class_imbalance(self, X_train, y_train):
    print(f"Handling class imbalance using {self.config.IMBALANCE_STRATEGY}...")
    print("  Original distribution:", dict(pd.Series(y_train).value_counts().sort_index()))
    X_res, y_res = X_train, y_train
    if self.config.IMBALANCE_STRATEGY == "SMOTE":
        self.smote = SMOTE(random_state=self.config.RANDOM_STATE)
        X_res, y_res = self.smote.fit_resample(X_train, y_train)
    elif self.config.IMBALANCE_STRATEGY == "UNDERSAMPLING":
        rus = RandomUnderSampler(random_state=self.config.RANDOM_STATE)
        X_res, y_res = rus.fit_resample(X_train, y_train)
    elif self.config.IMBALANCE_STRATEGY == "SMOTEENN":
        smote_enn = SMOTEENN(random_state=self.config.RANDOM_STATE)
        X_res, y_res = smote_enn.fit_resample(X_train, y_train)
    elif self.config.IMBALANCE_STRATEGY == "WEIGHTED":
        print("  Using class weights in models...")
        return X_train, y_train
    print("  New distribution:", dict(pd.Series(y_res).value_counts().sort_index()))
    return X_res, y_res
ModelDevelopmentPipeline.handle_class_imbalance = _mdp_handle_class_imbalance


### 3.5 Split & Scale Data

In [78]:

def _mdp_split_and_scale_data(self, target_col='binary_class'):
    print("Splitting and scaling data...")
    numerical_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
    X = self.df[numerical_cols]; y = self.df[target_col]
    self.label_encoder = LabelEncoder(); y_enc = self.label_encoder.fit_transform(y)
    self.feature_names = numerical_cols
    X_temp, self.X_test, y_temp, self.y_test = train_test_split(
        X, y_enc, test_size=self.config.TEST_SIZE, random_state=self.config.RANDOM_STATE, stratify=y_enc
    )
    val_size_adj = self.config.VALIDATION_SIZE / (1 - self.config.TEST_SIZE)
    self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
        X_temp, y_temp, test_size=val_size_adj, random_state=self.config.RANDOM_STATE, stratify=y_temp
    )
    print(f"Training set: {self.X_train.shape}")
    print(f"Validation set: {self.X_val.shape}")
    print(f"Test set: {self.X_test.shape}")
    assumptions = self.check_linear_model_assumptions(self.X_train, self.y_train)
    self.X_train_linear, self.X_val_linear, self.X_test_linear = self.apply_transformations(self.X_train, self.X_val, self.X_test, for_linear_models=True)
    self.X_train_scaled, self.X_val_scaled, self.X_test_scaled = self.apply_transformations(self.X_train, self.X_val, self.X_test, for_linear_models=False)
    self.X_train_balanced, self.y_train_balanced = self.handle_class_imbalance(self.X_train_scaled, self.y_train)
    self.X_train_linear_balanced, self.y_train_linear_balanced = self.handle_class_imbalance(self.X_train_linear, self.y_train)
    with mlflow.start_run(run_name="data_splitting"):
        mlflow.log_param("train_size", len(self.X_train))
        mlflow.log_param("val_size", len(self.X_val))
        mlflow.log_param("test_size", len(self.X_test))
        mlflow.log_param("n_features", len(self.feature_names))
        mlflow.log_param("feature_names", self.feature_names)
        mlflow.log_param("target_encoding", dict(zip(self.label_encoder.classes_, self.label_encoder.transform(self.label_encoder.classes_))))
        mlflow.log_param("imbalance_strategy", self.config.IMBALANCE_STRATEGY)
        mlflow.log_param("power_transform", self.config.POWER_TRANSFORM)
        mlflow.log_param("vif_threshold", self.config.VIF_THRESHOLD)
        mlflow.log_param("high_vif_features", assumptions["high_vif_features"])
        mlflow.log_param("non_normal_features", assumptions["non_normal_features"])
        mlflow.log_metric("total_outliers", sum(assumptions["outlier_counts"].values()))
        mlflow.log_metric("balanced_train_size", len(self.X_train_balanced))
        if self.selected_features:
            mlflow.log_param("selected_features", self.selected_features)
ModelDevelopmentPipeline.split_and_scale_data = _mdp_split_and_scale_data

### 3.6 Model Configurations

In [79]:

def _mdp_get_model_configurations(self) -> Dict[str, Dict]:
    use_class_weights = self.config.IMBALANCE_STRATEGY == "WEIGHTED"
    return {
        'logistic_regression': {
            'model': LogisticRegression(random_state=self.config.RANDOM_STATE, max_iter=2000), #  increased max_iter for convergence
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 10, 100], # regularization strength
                'penalty': ['l1', 'l2', 'elasticnet'], # regularization type
                'solver': ['liblinear', 'saga'], #  solvers that support l1 and elasticnet
                'class_weight': ['balanced'] if use_class_weights else [None, 'balanced'] # class weights
            },
            'is_linear': True
        },
        'random_forest': {
            'model': RandomForestClassifier(random_state=self.config.RANDOM_STATE),
            'params': {
                'n_estimators': [100, 200, 300], # number of trees
                'max_depth': [None, 10, 20, 30], # maximum depth of trees
                'min_samples_split': [2, 5, 10], # minimum samples required to split an internal node
                'min_samples_leaf': [1, 2, 4], # minimum samples required to be at a leaf node
                'max_features': ['sqrt', 'log2', None], # maximum number of features to consider when looking for the best split
                'class_weight': ['balanced'] if use_class_weights else [None, 'balanced'] # class weights
            },
            'is_linear': False
        },
        'gradient_boosting': {
            'model': GradientBoostingClassifier(random_state=self.config.RANDOM_STATE),
            'params': {
                'n_estimators': [100, 200, 300], # number of boosting stages to be run
                'learning_rate': [0.01, 0.1, 0.2], # learning rate shrinks the contribution of each tree
                'max_depth': [3, 5, 7], # maximum depth of the individual regression estimators
                'min_samples_split': [2, 5, 10], # minimum number of samples required to split an internal node
                'min_samples_leaf': [1, 2, 4], # minimum number of samples required to be at a leaf node
                'subsample': [0.8, 0.9, 1.0] # fraction of samples to be used for fitting the individual base learners
            },
            'is_linear': False # Gradient Boosting is not a linear model
        },
        'svm': {
            'model': SVC(random_state=self.config.RANDOM_STATE, probability=True), # probability=True for ROC/AUC
            'params': {
                'C': [0.1, 1, 10, 100], # regularization parameter
                'kernel': ['rbf', 'poly', 'linear'], #  
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
                'class_weight': ['balanced'] if use_class_weights else [None, 'balanced'] # 
            },
            'is_linear': True
        },
        'naive_bayes': {
            'model': GaussianNB(), # Gaussian Naive Bayes
            'params': {
                'var_smoothing': np.logspace(-10, -6, 10) #  variance smoothing parameter
            },
            'is_linear': True
        },
        'decision_tree': {
            'model': DecisionTreeClassifier(random_state=self.config.RANDOM_STATE),
            'params': {
                'max_depth': [None, 5, 10, 15, 20], #  maximum depth of the tree
                'min_samples_split': [2, 5, 10, 20], # minimum number of samples required to split an internal node
                'min_samples_leaf': [1, 2, 5, 10], # minimum number of samples required to be at a leaf node
                'max_features': ['sqrt', 'log2', None], # maximum number of features to consider when looking for the best split
                'class_weight': ['balanced'] if use_class_weights else [None, 'balanced'] # class weights
            },
            'is_linear': False
        },
        'knn': {
            'model': KNeighborsClassifier(),
            'params': {
                'n_neighbors': [3, 5, 7, 9, 11, 15], # number of neighbors to use
                'weights': ['uniform', 'distance'], #   weighting function used in prediction
                'metric': ['euclidean', 'manhattan', 'minkowski'] # distance metric
            },
            'is_linear': False
        },
    }
ModelDevelopmentPipeline.get_model_configurations = _mdp_get_model_configurations

### 3.7 Evaluate Model

In [80]:

def _mdp_evaluate_model(self, model, X_test, y_test, model_name: str) -> Dict[str, float]:
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1_score': f1_score(y_test, y_pred, average='weighted'),
        'precision_macro': precision_score(y_test, y_pred, average='macro'),
        'recall_macro': recall_score(y_test, y_pred, average='macro'),
        'f1_macro': f1_score(y_test, y_pred, average='macro'),
    }
    if y_prob is not None:
        metrics['roc_auc'] = roc_auc_score(y_test, y_prob)
        metrics['avg_precision'] = average_precision_score(y_test, y_prob)
    cv_scores = cross_val_score(model, X_test, y_test, cv=self.config.CV_FOLDS, scoring='accuracy')
    metrics['cv_accuracy_mean'] = cv_scores.mean()
    metrics['cv_accuracy_std'] = cv_scores.std()
    return metrics
ModelDevelopmentPipeline.evaluate_model = _mdp_evaluate_model

### 3.8 Create Evaluation Plots

In [81]:

def _mdp_create_evaluation_plots(self, model, X_test, y_test, model_name: str, feature_names: List[str]):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
    axes[0, 0].set_title(f'Confusion Matrix - {model_name}')
    axes[0, 0].set_ylabel('True Label')
    axes[0, 0].set_xlabel('Predicted Label')
    if y_prob is not None:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        auc_score = roc_auc_score(y_test, y_prob)
        axes[0, 1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
        axes[0, 1].plot([0, 1], [0, 1], 'k--')
        axes[0, 1].set_xlabel('False Positive Rate')
        axes[0, 1].set_ylabel('True Positive Rate')
        axes[0, 1].set_title(f'ROC Curve - {model_name}')
        axes[0, 1].legend()
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        avg_precision = average_precision_score(y_test, y_prob)
        axes[1, 0].plot(recall, precision, label=f'PR Curve (AP = {avg_precision:.3f})')
        axes[1, 0].set_xlabel('Recall')
        axes[1, 0].set_ylabel('Precision')
        axes[1, 0].set_title(f'Precision-Recall Curve - {model_name}')
        axes[1, 0].legend()
    if hasattr(model, 'feature_importances_'):
        feature_importance = pd.DataFrame({'feature': feature_names,'importance': model.feature_importances_}).sort_values('importance', ascending=True)
        axes[1, 1].barh(feature_importance['feature'], feature_importance['importance'])
        axes[1, 1].set_title(f'Feature Importance - {model_name}'); axes[1, 1].set_xlabel('Importance')
    elif hasattr(model, 'coef_'):
        coef_importance = pd.DataFrame({'feature': feature_names,'coefficient': np.abs(model.coef_[0])}).sort_values('coefficient', ascending=True)
        axes[1, 1].barh(coef_importance['feature'], coef_importance['coefficient'])
        axes[1, 1].set_title(f'Feature Coefficients - {model_name}'); axes[1, 1].set_xlabel('Absolute Coefficient')
    plt.tight_layout()
    out_path = f'outputs_ml/{model_name}_evaluation.png'
    plt.savefig(out_path, dpi=300, bbox_inches='tight'); plt.close()
    return out_path
ModelDevelopmentPipeline.create_evaluation_plots = _mdp_create_evaluation_plots

### 3.9 Train & Evaluate Models

In [82]:

def _mdp_train_and_evaluate_models(self):
    print("Training and evaluating models...")
    model_configs = self.get_model_configurations()
    for model_name, config in model_configs.items():
        print(f"\n{'='*50}\nTraining {model_name.upper()}\n{'='*50}")
        with mlflow.start_run(run_name=f"{model_name}_training"):
            try:
                mlflow.log_param("model_type", model_name)
                mlflow.log_param("random_state", self.config.RANDOM_STATE)
                mlflow.log_param("is_linear_model", config.get('is_linear', False))
                is_linear = config.get('is_linear', False)
                if is_linear:
                    if self.config.IMBALANCE_STRATEGY == "WEIGHTED":
                        X_train_use, y_train_use = self.X_train_linear, self.y_train
                    else:
                        X_train_use, y_train_use = self.X_train_linear_balanced, self.y_train_linear_balanced
                    X_val_use, X_test_use = self.X_val_linear, self.X_test_linear
                    feature_names = self.selected_features if self.selected_features else self.feature_names
                else:
                    if self.config.IMBALANCE_STRATEGY == "WEIGHTED":
                        X_train_use, y_train_use = self.X_train_scaled, self.y_train
                    else:
                        X_train_use, y_train_use = self.X_train_balanced, self.y_train_balanced
                    X_val_use, X_test_use = self.X_val_scaled, self.X_test_scaled
                    feature_names = self.feature_names
                mlflow.log_param("data_preprocessing", "linear_transformed" if is_linear else "standard_scaled")
                mlflow.log_param("train_samples", len(X_train_use))
                mlflow.log_param("features_used", feature_names)
                print("Performing hyperparameter tuning...")
                search = RandomizedSearchCV(
                    estimator=config['model'],
                    param_distributions=config['params'],
                    n_iter=min(self.config.MAX_EVALS, np.prod([len(v) if isinstance(v, list) else 1 for v in config['params'].values()])), 
                    cv=StratifiedKFold(n_splits=self.config.CV_FOLDS, shuffle=True, random_state=self.config.RANDOM_STATE),
                    scoring='f1_weighted', n_jobs=-1, random_state=self.config.RANDOM_STATE, verbose=1
                )
                search.fit(X_train_use, y_train_use)
                best_model = search.best_estimator_
                mlflow.log_params(search.best_params_); mlflow.log_metric("best_cv_score", search.best_score_)
                val_metrics = self.evaluate_model(best_model, X_val_use, self.y_val, model_name)
                for k,v in val_metrics.items(): mlflow.log_metric(f"val_{k}", v)
                test_metrics = self.evaluate_model(best_model, X_test_use, self.y_test, model_name)
                for k,v in test_metrics.items(): mlflow.log_metric(f"test_{k}", v)
                self.models_performance[model_name] = {
                    'model': best_model, 'best_params': search.best_params_,
                    'val_metrics': val_metrics, 'test_metrics': test_metrics,
                    'is_linear': is_linear, 'X_test_use': X_test_use, 'feature_names': feature_names
                }
                plot_path = self.create_evaluation_plots(best_model, X_test_use, self.y_test, model_name, feature_names)
                mlflow.log_artifact(plot_path); os.remove(plot_path)
                signature = infer_signature(X_train_use, y_train_use)
                mlflow.sklearn.log_model(sk_model=best_model, artifact_path=f"model_{model_name}",
                                         signature=signature, input_example=X_train_use[:5])
                if hasattr(best_model, 'feature_importances_'):
                    fi = pd.DataFrame({'feature': feature_names, 'importance': best_model.feature_importances_}).sort_values('importance', ascending=False)
                    top_features = fi.head(5)['feature'].tolist()
                    mlflow.log_param("top_5_features", top_features)
                    fi_path = f'outputs_ml/{model_name}_feature_importance.csv'
                    fi.to_csv(fi_path, index=False); mlflow.log_artifact(fi_path); os.remove(fi_path)
                print(f"✓ {model_name} training completed")
                print(f"  Best validation F1: {val_metrics['f1_score']:.4f}")
                print(f"  Test F1: {test_metrics['f1_score']:.4f}")
            except Exception as e:
                print(f"✗ Error training {model_name}: {str(e)}")
                mlflow.log_param("error", str(e))
                continue
ModelDevelopmentPipeline.train_and_evaluate_models = _mdp_train_and_evaluate_models

### 3.10 Compare Models

In [83]:

def _mdp_compare_models(self):
    print("\n" + "="*60); print("MODEL COMPARISON AND SELECTION"); print("="*60)
    with mlflow.start_run(run_name="model_comparison"):
        comparison_data = []
        for model_name, perf in self.models_performance.items():
            row = {
                'Model': model_name,
                'Val_Accuracy': perf['val_metrics']['accuracy'],
                'Val_Precision': perf['val_metrics']['precision'],
                'Val_Recall': perf['val_metrics']['recall'],
                'Val_F1': perf['val_metrics']['f1_score'],
                'Test_Accuracy': perf['test_metrics']['accuracy'],
                'Test_Precision': perf['test_metrics']['precision'],
                'Test_Recall': perf['test_metrics']['recall'],
                'Test_F1': perf['test_metrics']['f1_score'],
            }
            if 'roc_auc' in perf['test_metrics']:
                row['Test_ROC_AUC'] = perf['test_metrics']['roc_auc']
            comparison_data.append(row)
        comparison_df = pd.DataFrame(comparison_data).sort_values('Test_F1', ascending=False)
        print("Model Performance Comparison:"); print(comparison_df.round(4))
        comp_csv_path = 'outputs_ml/model_comparison.csv'; comparison_df.to_csv(comp_csv_path, index=False); mlflow.log_artifact(comp_csv_path)
        best_model_name = comparison_df.iloc[0]['Model']; best_model_f1 = comparison_df.iloc[0]['Test_F1']
        mlflow.log_param("best_model", best_model_name); mlflow.log_metric("best_model_f1", best_model_f1)
        print(f"\n🏆 Best Model: {best_model_name} (Test F1: {best_model_f1:.4f})")
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        metrics_to_plot = ['Test_Accuracy', 'Test_Precision', 'Test_Recall', 'Test_F1']
        for i, metric in enumerate(metrics_to_plot):
            ax = axes[i//2, i%2]; bars = ax.bar(comparison_df['Model'], comparison_df[metric])
            ax.set_title(f'{metric.replace("_", " ")} Comparison'); ax.set_ylabel(metric.replace("_", " ")); ax.tick_params(axis='x', rotation=45)
            best_idx = comparison_df[metric].idxmax(); bars[list(comparison_df.index).index(best_idx)].set_color('gold')
            for j, v in enumerate(comparison_df[metric]): ax.text(j, v + 0.005, f'{v:.3f}', ha='center', va='bottom')
        plt.tight_layout(); comp_png_path = 'outputs_ml/model_comparison_chart.png'
        plt.savefig(comp_png_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(comp_png_path); plt.close()
        os.remove(comp_csv_path); os.remove(comp_png_path)
        return best_model_name
ModelDevelopmentPipeline.compare_models = _mdp_compare_models

### 3.11 Advanced Model Analysis

In [84]:

def _mdp_advanced_model_analysis(self, best_model_name: str):
    print("\n" + "="*60); print(f"ADVANCED ANALYSIS - {best_model_name.upper()}"); print("="*60)
    best = self.models_performance[best_model_name]
    best_model = best['model']; is_linear = best['is_linear']; X_test_use = best['X_test_use']; feature_names = best['feature_names']
    with mlflow.start_run(run_name=f"{best_model_name}_advanced_analysis"):
        print("Generating learning curves...")
        train_sizes, train_scores, val_scores = learning_curve(best_model, X_test_use, self.y_test, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='Training Score')
        plt.plot(train_sizes, val_scores.mean(axis=1), 'o-', label='Validation Score')
        plt.fill_between(train_sizes, train_scores.mean(axis=1) - train_scores.std(axis=1),
                         train_scores.mean(axis=1) + train_scores.std(axis=1), alpha=0.1)
        plt.fill_between(train_sizes, val_scores.mean(axis=1) - val_scores.std(axis=1),
                         val_scores.mean(axis=1) + val_scores.std(axis=1), alpha=0.1)
        plt.xlabel('Training Set Size'); plt.ylabel('Score'); plt.title(f'Learning Curves - {best_model_name}'); plt.legend(); plt.grid(True, alpha=0.3)
        lc_path = 'outputs_ml/learning_curves.png'; plt.savefig(lc_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(lc_path); plt.close()
        print("Calculating permutation importance...")
        perm = permutation_importance(best_model, X_test_use, self.y_test, n_repeats=10, random_state=self.config.RANDOM_STATE)
        perm_df = pd.DataFrame({'feature': feature_names, 'importance_mean': perm.importances_mean, 'importance_std': perm.importances_std}).sort_values('importance_mean', ascending=False)
        plt.figure(figsize=(10, 6))
        plt.barh(perm_df['feature'], perm_df['importance_mean'], xerr=perm_df['importance_std'])
        plt.xlabel('Permutation Importance'); plt.title(f'Permutation Importance - {best_model_name}'); plt.tight_layout()
        perm_path = 'outputs_ml/permutation_importance.png'; plt.savefig(perm_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(perm_path); plt.close()
        print("Analyzing model calibration...")
        if hasattr(best_model, 'predict_proba'):
            from sklearn.calibration import calibration_curve
            y_prob = best_model.predict_proba(X_test_use)[:, 1]
            fop, mpv = calibration_curve(self.y_test, y_prob, n_bins=10)
            plt.figure(figsize=(10, 6))
            plt.plot(mpv, fop, "s-", label=f"{best_model_name}"); plt.plot([0,1],[0,1],"k:",label="Perfectly calibrated")
            plt.xlabel("Mean Predicted Probability"); plt.ylabel("Fraction of Positives"); plt.title(f'Calibration Plot - {best_model_name}')
            plt.legend(); plt.grid(True, alpha=0.3)
            calib_path = 'outputs_ml/calibration_plot.png'; plt.savefig(calib_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(calib_path); plt.close()
        try:
            print("Computing SHAP values...")
            explainer = shap.Explainer(best_model, X_test_use)
            shap_values = explainer(X_test_use)
            shap.summary_plot(shap_values, X_test_use, feature_names=feature_names, show=False)
            plt.tight_layout(); shap_path = 'outputs_ml/shap_summary.png'; plt.savefig(shap_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(shap_path); plt.close()
        except Exception as e:
            print(f"SHAP analysis skipped: {e}")
        if is_linear:
            print("Validating linear model assumptions...")
            try:
                y_pred = best_model.predict(X_test_use); residuals = self.y_test - y_pred
                plt.figure(figsize=(12, 8))
                plt.subplot(2,2,1); plt.scatter(y_pred, residuals, alpha=0.6); plt.axhline(0, color='r', ls='--'); plt.title('Residuals vs Fitted'); plt.xlabel('Fitted'); plt.ylabel('Residuals')
                plt.subplot(2,2,2); stats.probplot(residuals, dist="norm", plot=plt); plt.title('Q-Q Plot')
                plt.subplot(2,2,3); plt.hist(residuals, bins=20, density=True, alpha=0.7); plt.title('Residuals Distribution'); plt.xlabel('Residuals'); plt.ylabel('Density')
                plt.subplot(2,2,4); plt.scatter(y_pred, np.sqrt(np.abs(residuals)), alpha=0.6); plt.title('Scale-Location'); plt.xlabel('Fitted'); plt.ylabel('√|Residuals|')
                plt.tight_layout(); lav_path = 'outputs_ml/linear_assumptions_validation.png'; plt.savefig(lav_path, dpi=300, bbox_inches='tight'); mlflow.log_artifact(lav_path); plt.close()
                mlflow.log_metric("residuals_mean", float(np.mean(residuals))); mlflow.log_metric("residuals_std", float(np.std(residuals)))
                dw_stat = durbin_watson(residuals); mlflow.log_metric("durbin_watson_stat", float(dw_stat))
                print(f"    Residuals mean: {np.mean(residuals):.4f}\n    Residuals std: {np.std(residuals):.4f}\n    Durbin-Watson: {dw_stat:.4f}")
            except Exception as e:
                print(f"Linear assumption validation skipped: {e}")
        print("Advanced analysis complete.\n")
ModelDevelopmentPipeline.advanced_model_analysis = _mdp_advanced_model_analysis

## 4) Run Steps

In [85]:

# Instantiate pipeline
pipeline = ModelDevelopmentPipeline(cfg)
pipeline

MLflow experiment: Orthopedic_Patients_Classification
Experiment ID: 1


<__main__.ModelDevelopmentPipeline at 0x165014910>

In [86]:

# Load data
pipeline.load_and_prepare_data()

Loading and preparing data...
Dataset shape: (310, 8)
Target distribution:
binary_class
Abnormal    210
Normal      100
Name: count, dtype: int64


In [87]:

# Split & scale
pipeline.split_and_scale_data()

Splitting and scaling data...
Training set: (186, 6)
Validation set: (62, 6)
Test set: (62, 6)
Checking linear model assumptions...
  Checking multicollinearity...
    Features with VIF > 5.0: 5
  Checking feature normality...
    Non-normal features: 6
  Checking for outliers...
    Total outliers detected: 42
Applying data transformations...
  Applying standard scaling to all columns...
  Applying feature selection...
    Selected features: ['pelvic_tilt', 'sacral_slope', 'lumbar_lordosis_angle', 'pelvic_radius', 'degree_spondylolisthesis_PowerTransformer']
Applying data transformations...
  Applying standard scaling to all columns...
Handling class imbalance using SMOTE...
  Original distribution: {0: np.int64(126), 1: np.int64(60)}
  New distribution: {0: np.int64(126), 1: np.int64(126)}
Handling class imbalance using SMOTE...
  Original distribution: {0: np.int64(126), 1: np.int64(60)}
  New distribution: {0: np.int64(126), 1: np.int64(126)}


In [88]:

# Train & evaluate models (with tuning)
pipeline.train_and_evaluate_models()

Training and evaluating models...

Training LOGISTIC_REGRESSION
Performing hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits




✓ logistic_regression training completed
  Best validation F1: 0.8124
  Test F1: 0.8127

Training RANDOM_FOREST
Performing hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits




✓ random_forest training completed
  Best validation F1: 0.8184
  Test F1: 0.7938

Training GRADIENT_BOOSTING
Performing hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits




✓ gradient_boosting training completed
  Best validation F1: 0.8363
  Test F1: 0.7938

Training SVM
Performing hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits




✓ svm training completed
  Best validation F1: 0.7709
  Test F1: 0.8406

Training NAIVE_BAYES
Performing hyperparameter tuning...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




✓ naive_bayes training completed
  Best validation F1: 0.7239
  Test F1: 0.7815

Training DECISION_TREE
Performing hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits




✓ decision_tree training completed
  Best validation F1: 0.7618
  Test F1: 0.7596

Training KNN
Performing hyperparameter tuning...
Fitting 5 folds for each of 36 candidates, totalling 180 fits




✓ knn training completed
  Best validation F1: 0.8572
  Test F1: 0.8420


In [89]:

# Compare models
best_model_name = pipeline.compare_models()
best_model_name


MODEL COMPARISON AND SELECTION
Model Performance Comparison:
                 Model  Val_Accuracy  Val_Precision  Val_Recall  Val_F1  \
6                  knn        0.8548         0.8637      0.8548  0.8572   
3                  svm        0.7742         0.7693      0.7742  0.7709   
0  logistic_regression        0.8065         0.8449      0.8065  0.8124   
1        random_forest        0.8226         0.8187      0.8226  0.8184   
2    gradient_boosting        0.8387         0.8359      0.8387  0.8363   
4          naive_bayes        0.7258         0.7224      0.7258  0.7239   
5        decision_tree        0.7742         0.7667      0.7742  0.7618   

   Test_Accuracy  Test_Precision  Test_Recall  Test_F1  Test_ROC_AUC  
6         0.8387          0.8524       0.8387   0.8420        0.9214  
3         0.8387          0.8443       0.8387   0.8406        0.9345  
0         0.8065          0.8606       0.8065   0.8127        0.8893  
1         0.7903          0.8009       0.7903   0.793

'knn'

In [90]:

# Advanced analysis on best model
pipeline.advanced_model_analysis(best_model_name)


ADVANCED ANALYSIS - KNN
Generating learning curves...
Calculating permutation importance...
Analyzing model calibration...
Computing SHAP values...
SHAP analysis skipped: The passed model is not callable and cannot be analyzed directly with the given masker! Model: KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance')
Advanced analysis complete.

