# Model Training and Evaluation Notebook
This notebook handles model training, cross-validation, performance evaluation, and saving the trained models.

## 1. Import Required Libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import (
    get_xgb_classifier,
    get_xgb_regressor,
    evaluate_classifier,
    evaluate_regressor,
    plot_confusion_matrix,
    plot_roc_curves,
    plot_feature_importance
)
from sklearn.model_selection import StratifiedKFold, KFold
import xgboost as xgb
import joblib

## 2. Load Preprocessed Data
Make sure you've run the preprocessing notebook first: this notebook expects `preprocessed_data_root_all.csv` in the working directory.

In [12]:
# Load the preprocessed feature table and verify the columns the later cells need.
# The path is relative to the notebook's working directory.
data_path = "preprocessed_data_root_all.csv"
required_columns = {'ID', 'max_diameter'}

# Until a load succeeds, treat every required column as missing. This keeps the
# `not missing_columns` guard in the feature-preparation cell closed after a
# failed load, even if a stale `final_features` is still alive in the kernel.
missing_columns = set(required_columns)

try:
    final_features = pd.read_csv(data_path)
except FileNotFoundError:
    # Report the file that was actually requested, not a hard-coded name.
    print(f"Error: Could not find {data_path}")
    print("Please run the data preprocessing notebook first")
else:
    print("Data loaded successfully!")
    print(f"Shape: {final_features.shape}")
    print("\nColumns available:")
    print(final_features.columns.tolist())

    # Check for required columns
    missing_columns = required_columns - set(final_features.columns)

    if missing_columns:
        print(f"\nERROR: Missing required columns: {missing_columns}")
    else:
        print("\nAll required columns present")

Data loaded successfully!
Shape: (215, 461)

Columns available:
['ID', 'centroid_area', 'centroid_offset', 'c2c_dist_lcs', 'c2c_dist_rcs', 'c2c_dist_ncs', 's2c_dist_lcs', 's2c_dist_rcs', 's2c_dist_ncs', 's2s_dist_lcs_rcs', 's2s_dist_rcs_ncs', 's2s_dist_ncs_lcs', 'width_lcs', 'width_rcs', 'width_ncs', 'radius_lcs', 'radius_rcs', 'radius_ncs', 'angle_lcs', 'angle_rcs', 'angle_ncs', 'contour_length_lcs', 'contour_length_rcs', 'contour_length_ncs', 'area_lcs', 'area_rcs', 'area_ncs', 'volume_lcs', 'volume_rcs', 'volume_ncs', 'height_lcs', 'height_rcs', 'height_ncs', 'original_shape_Elongation_LCS', 'original_shape_Flatness_LCS', 'original_shape_LeastAxisLength_LCS', 'original_shape_MajorAxisLength_LCS', 'original_shape_Maximum2DDiameterColumn_LCS', 'original_shape_Maximum2DDiameterRow_LCS', 'original_shape_Maximum2DDiameterSlice_LCS', 'original_shape_Maximum3DDiameter_LCS', 'original_shape_MeshVolume_LCS', 'original_shape_MinorAxisLength_LCS', 'original_shape_Sphericity_LCS', 'original_sha

## 3. Prepare Features and Targets

In [6]:
# Lower bounds (mm) of classes 1, 2 and 3 for the diameter-derived label:
#   < 40 -> 0,  [40, 45) -> 1,  [45, 50) -> 2,  >= 50 -> 3
diameter_class_edges_mm = [40, 45, 50]


def diameter_to_label(diameters):
    """
    Map diameters (mm) to the integer class labels 0-3.

    Args:
        diameters: Series (or array-like) of numeric diameters in mm.

    Returns:
        Integer Series of labels aligned to the input's index.

    Raises:
        ValueError: if any diameter is missing. A NaN fails every `<`
            comparison, so a threshold chain would silently place it in
            the top class (3); refusing it avoids that mislabeling.
    """
    diameters = pd.Series(diameters, dtype=float)
    n_missing = int(diameters.isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} row(s) have a missing 'max_diameter'; "
            "cannot derive class labels for them"
        )
    # np.digitize returns i such that edges[i-1] <= x < edges[i], which is
    # exactly the class index defined by the thresholds above.
    labels = np.digitize(diameters.to_numpy(), diameter_class_edges_mm)
    return pd.Series(labels, index=diameters.index, dtype="int64")


if 'final_features' in locals() and not missing_columns:
    # Create Label column if it doesn't exist
    if 'Label' not in final_features.columns:
        print("\nCreating 'Label' column from 'max_diameter'")
        final_features['Label'] = diameter_to_label(final_features['max_diameter'])

    # Prepare features and targets; errors='ignore' already tolerates any of
    # the identifier/target columns being absent.
    X = final_features.drop(columns=["ID", "Label", "max_diameter"], errors='ignore')
    y_class = final_features["Label"]  # For classification
    y_reg = final_features["max_diameter"]  # For regression

    print("\nFeature matrix shape:", X.shape)
    print("Class labels distribution:")
    print(y_class.value_counts().sort_index())
    print("\nDiameter statistics (mm):")
    print(y_reg.describe())
else:
    print("\nCannot prepare features - missing required data")


Cannot prepare features - missing required data


## 4. Model Training and Evaluation

In [None]:
def train_and_evaluate(X, y, task="classification", n_splits=5, random_state=42):
    """
    Perform cross-validated training and evaluation.

    One estimator is created and re-fit on each fold's training split; the
    held-out split is scored with `evaluate_classifier` or
    `evaluate_regressor`. Per-fold metrics are averaged and displayed, and
    feature importance is plotted for the estimator as left by the final fold.

    Args:
        X: Feature matrix (DataFrame; rows are selected with `.iloc`)
        y: Target values (Series; rows are selected with `.iloc`)
        task: "classification" or "regression"
        n_splits: Number of CV folds
        random_state: Random seed for the fold shuffling

    Returns:
        The estimator after its last `fit` call, i.e. fitted on the final
        fold's training split only, not on the full dataset.

    Raises:
        ValueError: if `task` is neither "classification" nor "regression".
    """
    # Validate up front: otherwise a typo in `task` would silently run regression.
    if task not in ("classification", "regression"):
        raise ValueError(
            f"Unknown task {task!r}; expected 'classification' or 'regression'"
        )
    is_classification = task == "classification"

    if is_classification:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        model = get_xgb_classifier()
    else:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        model = get_xgb_regressor()

    metrics = []

    # StratifiedKFold stratifies on y; KFold accepts and ignores it.
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)

        if is_classification:
            acc, cm, roc_data = evaluate_classifier(model, X_test, y_test)
            metrics.append({"fold": fold, "accuracy": acc})

            if fold == 1:  # Plot for first fold only to avoid repetition
                plot_confusion_matrix(cm)
                plot_roc_curves(roc_data)
        else:
            rmse, mae, r2 = evaluate_regressor(model, X_test, y_test)
            metrics.append({"fold": fold, "rmse": rmse, "mae": mae, "r2": r2})

    # Display average metrics; the fold counter is an identifier, not a metric,
    # so it is excluded from the mean.
    metrics_df = pd.DataFrame(metrics)
    print(f"\nAverage {task} metrics across {n_splits} folds:")
    display(metrics_df.drop(columns="fold").mean().to_frame().T)

    # Plot feature importance
    plot_feature_importance(model)

    return model

In [None]:
# Only run if data is prepared. Both targets are checked because the
# regression call below needs y_reg as well as X and y_class.
if 'X' in locals() and 'y_class' in locals() and 'y_reg' in locals():
    print("Training classification model...")
    clf_model = train_and_evaluate(X, y_class, task="classification")

    print("\nTraining regression model...")
    reg_model = train_and_evaluate(X, y_reg, task="regression")
else:
    print("Cannot train models - features not prepared")

## 5. Save Models

In [None]:
# Persist both trained models to the working directory. If either model is
# absent, always say so; the outcome should not depend on whether X exists.
if 'clf_model' in locals() and 'reg_model' in locals():
    joblib.dump(clf_model, "aortic_classifier.joblib")
    joblib.dump(reg_model, "aortic_regressor.joblib")
    print("Models saved successfully!")
else:
    print("Cannot save models - models not trained")