# Advanced Modeling with PyGeomodeling

This tutorial covers advanced features:
- Model serialization and versioning
- Spatial cross-validation
- Hyperparameter tuning (not demonstrated here; see Next Steps for pointers)
- Parallel model training

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

from pygeomodeling import (
    load_spe9_data,
    UnifiedSPE9Toolkit,
    SpatialKFold,
    cross_validate_spatial,
    ParallelModelTrainer,
    save_model,
    load_model,
)

# Only reached if every import above resolved without raising.
print("✓ Imports successful!")

## 1. Load and Prepare Data

In [None]:
# Read the sample grid file; the path is relative to the notebook's
# working directory.
data = load_spe9_data('../../data/sample_small.grdecl')

# Hand the loaded data to the toolkit, then ask it for a train/test split.
# test_size=0.2 presumably holds out 20% of the samples -- confirm against
# the toolkit documentation.
toolkit = UnifiedSPE9Toolkit()
toolkit.load_spe9_data(data)
split = toolkit.create_train_test_split(test_size=0.2)
X_train, X_test, y_train, y_test = split

# Report the size of each partition, one per line.
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)

## 2. Spatial Cross-Validation

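Regular K-Fold CV can overestimate performance on spatial data: because of spatial autocorrelation, nearby samples are similar, so randomly chosen test points are partly "known" from their neighbours in the training set. Spatial cross-validation is used here to get a more honest estimate.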

In [None]:
# Estimator to be scored; random_state is fixed so repeated runs match.
model = RandomForestRegressor(n_estimators=50, random_state=42)

# Spatial cross-validation splitter: 5 folds, shuffled with a fixed seed.
cv = SpatialKFold(n_splits=5, shuffle=True, random_state=42)

# Keep the CV output under its own name. The parallel-training section
# assigns `results` to a differently shaped dict (per-model 'metrics'),
# so sharing that name would discard these scores and would break the
# later cells if this cell were re-run out of order.
cv_results = cross_validate_spatial(
    model=model,
    X=X_train,
    y=y_train,
    cv=cv,
    scoring='r2',
    return_train_score=True
)

# Mean ± standard deviation across folds. A large gap between the train
# and test scores is a sign of overfitting.
print(f"Test R²: {cv_results['test_score'].mean():.4f} ± {cv_results['test_score'].std():.4f}")
print(f"Train R²: {cv_results['train_score'].mean():.4f} ± {cv_results['train_score'].std():.4f}")

## 3. Parallel Model Training

Train multiple models simultaneously for faster experimentation.

In [None]:
# Candidate regressors, keyed by the label used in the report below.
models = {
    'rf_50': RandomForestRegressor(n_estimators=50, random_state=42),
    'rf_100': RandomForestRegressor(n_estimators=100, random_state=42),
    'gpr': GaussianProcessRegressor(random_state=42),
}

# Fit and score every candidate through the parallel trainer.
# n_jobs=-1 conventionally means "use all available cores" -- presumably
# the same here; confirm against the trainer documentation.
trainer = ParallelModelTrainer(n_jobs=-1, verbose=1)
results = trainer.train_and_evaluate(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Rank the labels from highest to lowest R², then print one summary per model.
ranking = sorted(
    results,
    key=lambda label: results[label]['metrics']['r2'],
    reverse=True,
)
for name in ranking:
    result = results[name]
    print(f"\n{name}:")
    print(f"  R²: {result['metrics']['r2']:.4f}")
    print(f"  MSE: {result['metrics']['mse']:.4f}")
    print(f"  Time: {result['training_time']:.2f}s")

## 4. Model Serialization

Save models with metadata for reproducibility.

In [None]:
# Select the label whose entry has the highest R² metric, then pull the
# fitted estimator and its metrics out of that entry.
best_name = max(results, key=lambda label: results[label]['metrics']['r2'])
best_entry = results[best_name]
best_model, best_metrics = best_entry['model'], best_entry['metrics']

# Persist the winner together with descriptive metadata; the call returns
# the location it wrote to.
model_path = save_model(
    model=best_model,
    model_name=f'tutorial_{best_name}',
    model_type=best_name,
    backend='sklearn',
    metrics=best_metrics,
    description='Best model from tutorial',
    save_dir='../../saved_models',
)

# Confirmation: which model was saved, where, and how well it scored.
print(
    f"✓ Saved: {best_name}",
    f"  Location: {model_path}",
    f"  R²: {best_metrics['r2']:.4f}",
    sep="\n",
)

In [None]:
# Reload the model under the same name it was saved with; load_model
# returns the estimator, its metadata record, and a scaler.
saved_name = f'tutorial_{best_name}'
loaded_model, metadata, scaler = load_model(
    saved_name,
    save_dir='../../saved_models',
)

# Show the metadata fields stored alongside the model.
summary_lines = [
    f"Loaded model: {metadata.model_name}",
    f"Version: {metadata.version}",
    f"Created: {metadata.created_at}",
    f"Metrics: {metadata.performance_metrics}",
]
print("\n".join(summary_lines))

## 5. Visualize Model Comparison

In [None]:
# Collect the per-model scores in a single, consistent order.
model_names = list(results)
per_model_metrics = [results[name]['metrics'] for name in model_names]
r2_scores = [metrics['r2'] for metrics in per_model_metrics]
mse_scores = [metrics['mse'] for metrics in per_model_metrics]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Each panel is a horizontal bar chart that differs only in its data,
# colour and labels, so describe both and draw them in one loop.
panels = (
    (ax1, r2_scores, 'steelblue', 'R² Score', 'Model Comparison - R² Score'),
    (ax2, mse_scores, 'coral', 'MSE', 'Model Comparison - MSE'),
)
for axis, scores, bar_color, x_label, title in panels:
    axis.barh(model_names, scores, color=bar_color)
    axis.set_xlabel(x_label)
    axis.set_title(title)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

You've learned:

✓ Spatial cross-validation for geostatistical data  
✓ Parallel model training for efficiency  
✓ Model serialization with metadata  
✓ Model comparison and selection  

## Next Steps

- Try hyperparameter tuning with Optuna
- Experiment with different CV strategies
- Use batch predictions for large datasets