In [34]:
import pandas as pd
from data_preprocessing_pipeline import load_preprocessed_data
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [18]:
# Load every preprocessed split plus the feature metadata in one call.
# `data` is a dict-like; this notebook later reads the keys 'X_train', 'X_val',
# 'X_test', 'y_train', 'y_val', 'y_test', 'categorical_features' and
# 'numerical_features' from it.
data = load_preprocessed_data('Proprocessed_Dataset.csv')

# Convenience handles for the training split.
X_train, y_train = data['X_train'], data['y_train']

Loading preprocessed data from Proprocessed_Dataset.csv/...
✓ Loaded 173407 train, 24773 val, 49545 test samples
✓ Target: ACTUAL_SURGERY_DURATION
✓ Features: 38


In [35]:
# Get categorical feature names.
# Take a private copy: the loop below may append to this list, and appending
# to data['categorical_features'] itself would silently alter the loader's
# metadata for every later consumer of `data`. list(...) also guarantees that
# .append() exists and works in place whatever sequence type is stored there.
cat_feature_names = list(data['categorical_features'])

print("=== DEBUGGING FEATURE TYPES ===")
print(f"Total features: {len(data['X_train'].columns)}")
print(f"Categorical features from metadata: {len(cat_feature_names)}")
print(f"Numerical features from metadata: {len(data['numerical_features'])}")

# DEBUG: Check which columns have string values
print("\n=== CHECKING FOR STRING VALUES ===")
for col in data['X_train'].columns:
    # Object-dtype columns are reported so they can be treated as categorical.
    if data['X_train'][col].dtype == 'object':
        print(f"Column '{col}' has object dtype (should be categorical)")
        sample_values = data['X_train'][col].head(3).tolist()
        print(f"  Sample values: {sample_values}")
        # Add to categorical features if not already there
        if col not in cat_feature_names:
            print(f"  ⚠️  WARNING: '{col}' not in categorical_features list! Adding it.")
            cat_feature_names.append(col)

print(f"\n✓ Final categorical features count: {len(cat_feature_names)}")

# Work on copies of the three feature frames so the frames held in `data`
# are left untouched by the dtype conversion below.
X_train, X_val, X_test = (
    data[split].copy() for split in ('X_train', 'X_val', 'X_test')
)

# Cast every categorical column to str in all three splits.
# Names that are not columns of X_train are skipped.
for col in cat_feature_names:
    if col not in X_train.columns:
        continue
    for frame in (X_train, X_val, X_test):
        frame[col] = frame[col].astype(str)

print("✓ Converted categorical columns to string type")

# Create CatBoost Pools with column names.
# One Pool per split: features, matching target, and the categorical columns
# identified by name.
print("\n=== CREATING CATBOOST POOLS ===")

train_pool = Pool(data=X_train, label=data['y_train'], cat_features=cat_feature_names)
print("✓ Train pool created")

val_pool = Pool(data=X_val, label=data['y_val'], cat_features=cat_feature_names)
print("✓ Validation pool created")

test_pool = Pool(data=X_test, label=data['y_test'], cat_features=cat_feature_names)
print("✓ Test pool created")

# Train model
print("\n=== TRAINING CATBOOST MODEL ===")
model = CatBoostRegressor(
    # Objective and the metric tracked on the eval set.
    loss_function='RMSE',
    eval_metric='RMSE',
    # Boosting schedule: at most 1000 trees of depth 6.
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    # Early stopping is evaluated against the eval_set passed to fit().
    early_stopping_rounds=50,
    # Fixed seed for reproducibility; progress is logged every 100 iterations.
    random_seed=42,
    verbose=100,
)

# Fit on the training pool while monitoring the validation pool.
model.fit(train_pool, eval_set=val_pool, plot=False)

def _regression_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` for one set of predictions.

    RMSE is derived as the square root of the MSE rather than requested
    from scikit-learn directly.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # Calculate RMSE manually
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, r2


# Evaluate on validation set
print("\n=== VALIDATION SET PERFORMANCE ===")
y_val_pred = model.predict(val_pool)
val_mse, val_rmse, val_mae, val_r2 = _regression_metrics(data['y_val'], y_val_pred)

print(f"Validation RMSE: {val_rmse:.2f} minutes")
print(f"Validation MAE:  {val_mae:.2f} minutes")
print(f"Validation R²:   {val_r2:.3f}")

# Evaluate on test set
print("\n=== TEST SET PERFORMANCE ===")
y_test_pred = model.predict(test_pool)
test_mse, test_rmse, test_mae, test_r2 = _regression_metrics(data['y_test'], y_test_pred)

print(f"Test RMSE: {test_rmse:.2f} minutes")
print(f"Test MAE:  {test_mae:.2f} minutes")
print(f"Test R²:   {test_r2:.3f}")

# Prediction analysis
# Compare the location and spread of the actual vs. predicted test targets.
print("\n=== PREDICTION ANALYSIS ===")
print(f"Mean actual duration: {data['y_test'].mean():.2f} minutes")
print(f"Mean predicted duration: {y_test_pred.mean():.2f} minutes")
# Use the same estimator (sample std, ddof=1) for both spreads. Calling
# .std() on each object mixes conventions when y_test is a pandas Series
# (Series.std defaults to ddof=1) and y_test_pred is a NumPy array
# (ndarray.std defaults to ddof=0) -- presumably the case here; TODO confirm
# the type the loader returns for 'y_test'.
print(f"Std actual duration: {np.std(np.asarray(data['y_test']), ddof=1):.2f} minutes")
print(f"Std predicted duration: {np.std(np.asarray(y_test_pred), ddof=1):.2f} minutes")

# Feature importance
print("\n=== TOP 15 MOST IMPORTANT FEATURES ===")
feature_names = X_train.columns
feature_importance = model.get_feature_importance()

# Pair each column name with its score, then rank highest-first.
# NOTE: despite its name, importance_df is a plain list of (name, score) tuples.
importance_df = list(zip(feature_names, feature_importance))
importance_df.sort(key=lambda x: x[1], reverse=True)

# Print the 15 top-ranked features, numbered from 1.
for i, (feature, importance) in enumerate(importance_df[:15], start=1):
    print(f"{i:2d}. {feature:35s} {importance:8.2f}")

# Save model
# Written to the current working directory.
model.save_model('catboost_model.cbm')
print("\n✓ Model saved to 'catboost_model.cbm'")

# Optional: Create a results dataframe for further analysis
# Columns: actual target, prediction, signed error (actual - predicted)
# and its absolute value.
results_df = pd.DataFrame({'actual': data['y_test'], 'predicted': y_test_pred})
results_df['error'] = results_df['actual'] - results_df['predicted']
results_df['abs_error'] = results_df['error'].abs()

results_df.to_csv('catboost_predictions.csv', index=False)
print("✓ Predictions saved to 'catboost_predictions.csv'")

# Closing banner: blank line, rule, message, rule.
print("\n" + "="*70, "TRAINING COMPLETE!", "="*70, sep="\n")

=== DEBUGGING FEATURE TYPES ===
Total features: 38
Categorical features from metadata: 25
Numerical features from metadata: 15

=== CHECKING FOR STRING VALUES ===
Column 'PATIENT_CODE' has object dtype (should be categorical)
  Sample values: ['RARE', 'RARE', 'RARE']
Column 'ADMISSION_BED' has object dtype (should be categorical)
  Sample values: ['N09B024', 'Not Admitted', 'Not Admitted']

✓ Final categorical features count: 25
✓ Converted categorical columns to string type

=== CREATING CATBOOST POOLS ===
✓ Train pool created
✓ Validation pool created
✓ Test pool created

=== TRAINING CATBOOST MODEL ===
0:	learn: 55.1673507	test: 55.3207195	best: 55.3207195 (0)	total: 141ms	remaining: 2m 21s
100:	learn: 27.2109858	test: 27.1516629	best: 27.1516629 (100)	total: 23.6s	remaining: 3m 29s
200:	learn: 26.5723289	test: 26.5959539	best: 26.5959539 (200)	total: 49.4s	remaining: 3m 16s
300:	learn: 26.1522916	test: 26.2819793	best: 26.2819793 (300)	total: 1m 14s	remaining: 2m 53s
400:	learn: 25