In [None]:
# Visualize feature importance for ensemble models
# One panel per target variable; bars are ordered from most to least important.
# NOTE(review): `feature_names` comes from the linear-model plotting cell and is
# assumed to list the features in the column order used for training.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# zip() stops at the shorter input, so targets beyond the grid capacity are skipped.
for ax, (target_var, model_info) in zip(axes, ensemble_models.items()):
    # Get feature importance
    model = model_info['model']
    importances = model.feature_importances_

    # Sort features by importance (descending)
    indices = np.argsort(importances)[::-1]
    positions = range(len(importances))

    # Create bar plot
    ax.bar(positions, importances[indices])
    ax.set_xticks(positions)
    ax.set_xticklabels([feature_names[i] for i in indices], rotation=45)
    ax.set_ylabel('Feature Importance')
    ax.set_title(f'{target_var} - Ensemble Feature Importance')
    ax.grid(True, alpha=0.3)

# Remove every subplot that did not receive a model, not just the last one,
# so no blank panel is left when fewer than five targets were trained.
for unused_ax in axes[len(ensemble_models):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Print best hyperparameters
# Each target gets a blank line, its name, then one indented "param: value" line
# per entry of the tuned parameter dict.
print("\nBest hyperparameters for ensemble models:")
print("-" * 80)
for target_var, model_info in ensemble_models.items():
    report_lines = [f"\n{target_var}:"]
    report_lines.extend(
        f"  {param}: {value}" for param, value in model_info['best_params'].items()
    )
    print("\n".join(report_lines))

In [None]:
# Save all trained models
import json

# A single timestamp ties together every artifact written by this cell.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save linear and ensemble models (one joblib pickle per model family)
linear_models_path = MODELS_DIR / f'linear_crosswalk_models_{timestamp}.pkl'
ensemble_models_path = MODELS_DIR / f'ensemble_crosswalk_models_{timestamp}.pkl'
for family_label, fitted_models, destination in (
    ('Linear', linear_models, linear_models_path),
    ('Ensemble', ensemble_models, ensemble_models_path),
):
    joblib.dump(fitted_models, destination)
    print(f"✅ {family_label} models saved to: {destination}")


def _test_scores(model_label):
    """Return the Target / Test_R2 / Test_MAE rows of comparison_df for one model family."""
    family_rows = comparison_df[comparison_df['Model'] == model_label]
    return family_rows[['Target', 'Test_R2', 'Test_MAE']].to_dict('records')


# Save model metadata
metadata = {
    'timestamp': timestamp,
    'target_variables': target_variables,
    'satellite_features': list(satellite_numeric.columns),
    # Row count of the full satellite frame (before any train/test split).
    'n_training_samples': len(satellite_features),
    'sites': {
        'fire_sites': FIRE_SITES,
        'baseline_sites': BASELINE_SITES
    },
    'performance_summary': {
        'linear': _test_scores('Linear'),
        'ensemble': _test_scores('Ensemble')
    }
}

metadata_path = MODELS_DIR / f'model_metadata_{timestamp}.json'
with metadata_path.open('w') as f:
    json.dump(metadata, f, indent=2)
print(f"✅ Model metadata saved to: {metadata_path}")

# Save comparison results
comparison_path = RESULTS_DIR / f'model_comparison_{timestamp}.csv'
comparison_df.to_csv(comparison_path, index=False)
print(f"✅ Comparison results saved to: {comparison_path}")

# Create a summary report
# Every aggregate is computed once, outside the template, so the numbers in the
# report and the findings derived from them cannot drift apart.
linear_rows = comparison_df[comparison_df['Model'] == 'Linear']
ensemble_rows = comparison_df[comparison_df['Model'] == 'Ensemble']
linear_test_r2 = linear_rows['Test_R2'].mean()
linear_test_mae = linear_rows['Test_MAE'].mean()
ensemble_test_r2 = ensemble_rows['Test_R2'].mean()
ensemble_test_mae = ensemble_rows['Test_MAE'].mean()


def _mean_site_gap(site_results):
    """Average (baseline R² - fire R²) over `targets`, ignoring targets with a NaN score."""
    gaps = np.array(
        [site_results[t]['baseline_r2'] - site_results[t]['fire_r2'] for t in targets],
        dtype=float,
    )
    usable = gaps[~np.isnan(gaps)]
    # A single target without fire (or baseline) samples must not blank the average.
    return float(usable.mean()) if usable.size else float('nan')


def _gap_finding(model_label, gap):
    """Describe which site type a model family scores better on, based on the measured gap."""
    if np.isnan(gap):
        return f"- {model_label} models: fire vs baseline comparison unavailable (no valid R² scores)"
    if gap > 0:
        return (f"- {model_label} models perform better on baseline sites than on "
                f"fire-impacted sites (average R² gap: {gap:.3f})")
    if gap < 0:
        return (f"- {model_label} models perform better on fire-impacted sites than on "
                f"baseline sites (average R² gap: {gap:.3f})")
    return f"- {model_label} models perform equally on fire-impacted and baseline sites"


def _family_finding(linear_r2, ensemble_r2):
    """Describe which model family has the higher average test R²."""
    if np.isnan(linear_r2) or np.isnan(ensemble_r2):
        return "- Linear vs ensemble comparison unavailable (missing test R² values)"
    if ensemble_r2 > linear_r2:
        return (f"- Ensemble models outperform linear models on average test R² "
                f"({ensemble_r2:.3f} vs {linear_r2:.3f})")
    if linear_r2 > ensemble_r2:
        return (f"- Linear models outperform ensemble models on average test R² "
                f"({linear_r2:.3f} vs {ensemble_r2:.3f})")
    return f"- Linear and ensemble models have the same average test R² ({linear_r2:.3f})"


linear_gap = _mean_site_gap(linear_site_results)
ensemble_gap = _mean_site_gap(ensemble_site_results)

# Findings are derived from the metrics above instead of being hard-coded, so the
# report never claims a result that this run did not produce.
key_findings = [
    _gap_finding('Linear', linear_gap),
    _gap_finding('Ensemble', ensemble_gap),
    _family_finding(linear_test_r2, ensemble_test_r2),
]
if linear_gap > 0 or ensemble_gap > 0:
    key_findings.append(
        "- The performance gap between fire and baseline sites suggests the need for site-specific calibration"
    )
else:
    key_findings.append(
        "- No average performance deficit on fire-impacted sites was measured in this run"
    )
key_findings_text = "\n".join(key_findings)

summary_report = f"""
NEON AOP Crosswalk Model Training Summary
========================================
Timestamp: {timestamp}

Training Configuration:
- Target variables: {', '.join(target_variables)}
- Number of satellite features: {len(satellite_numeric.columns)}
- Total training samples: {len(satellite_features)}
- Fire sites: {', '.join(FIRE_SITES)}
- Baseline sites: {', '.join(BASELINE_SITES)}

Model Performance Summary:
-------------------------
Linear Models:
  Average Test R²: {linear_test_r2:.3f}
  Average Test MAE: {linear_test_mae:.3f}

Ensemble Models:
  Average Test R²: {ensemble_test_r2:.3f}
  Average Test MAE: {ensemble_test_mae:.3f}

Site-Specific Performance:
-------------------------
Linear Models - Average performance gap (Baseline - Fire):
  R² difference: {linear_gap:.3f}

Ensemble Models - Average performance gap (Baseline - Fire):
  R² difference: {ensemble_gap:.3f}

Key Findings:
{key_findings_text}

Files Saved:
- Linear models: {linear_models_path.name}
- Ensemble models: {ensemble_models_path.name}
- Model metadata: {metadata_path.name}
- Comparison results: model_comparison_{timestamp}.csv
"""

# Save summary report
# The text contains non-ASCII characters ("²"), so the encoding is pinned rather
# than left to the platform default.
report_path = RESULTS_DIR / f'training_summary_{timestamp}.txt'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(summary_report)
print(f"\n✅ Training summary saved to: {report_path}")

print("\n" + "="*80)
print("MODEL TRAINING COMPLETE!")
print("="*80)
print(summary_report)

## 6. Model Persistence

Finally, let's save our trained models for later use.

In [None]:
# Analyze performance on fire vs baseline sites
def evaluate_by_site_type(models, X, y, site_mask, model_type='linear'):
    """Evaluate model performance separately for fire and baseline sites.

    Rows with a NaN in any feature or in the target are excluded *before*
    prediction, so the estimator never receives NaN input.

    Parameters
    ----------
    models : dict
        Maps a target-variable name to a dict whose ``'model'`` entry exposes
        ``predict``. Targets listed in the global ``target_variables`` but
        absent from ``models`` are skipped.
    X : array-like
        Feature matrix with one row per sample (callers in this notebook pass
        a 2-D float numpy array).
    y : mapping
        Indexed as ``y[target_var].values`` to obtain the true target values
        (callers in this notebook pass a pandas DataFrame).
    site_mask : array-like of bool
        True for fire-site rows; every other row is treated as baseline.
    model_type : str, optional
        Not used by the computation; kept so existing calls remain valid.

    Returns
    -------
    dict
        ``{target_var: {'fire_r2', 'fire_mae', 'baseline_r2', 'baseline_mae',
        'fire_samples', 'baseline_samples'}}``. Scores are NaN for a site type
        with no valid rows; sample counts are plain ints.
    """
    results = {}

    site_mask = np.asarray(site_mask, dtype=bool)
    # Feature completeness does not depend on the target, so compute it once.
    rows_complete = ~np.isnan(np.asarray(X)).any(axis=1)

    for target_var in target_variables:
        if target_var not in models:
            continue

        model = models[target_var]['model']
        y_true = y[target_var].values

        # Remove NaN values
        valid_mask = rows_complete & ~np.isnan(y_true)

        # Predict only on valid rows; the remaining slots stay NaN and are never
        # scored because both site masks are intersected with valid_mask.
        y_pred = np.full(len(y_true), np.nan)
        if valid_mask.any():
            y_pred[valid_mask] = np.ravel(model.predict(X[valid_mask]))

        fire_r2, fire_mae, fire_samples = _score_subset(
            y_true, y_pred, site_mask & valid_mask)
        baseline_r2, baseline_mae, baseline_samples = _score_subset(
            y_true, y_pred, ~site_mask & valid_mask)

        results[target_var] = {
            'fire_r2': fire_r2,
            'fire_mae': fire_mae,
            'baseline_r2': baseline_r2,
            'baseline_mae': baseline_mae,
            'fire_samples': fire_samples,
            'baseline_samples': baseline_samples
        }

    return results


def _score_subset(y_true, y_pred, mask):
    """Return (R², MAE, n_samples) for the rows selected by ``mask``; NaN scores if none."""
    n_selected = int(mask.sum())
    if n_selected == 0:
        return np.nan, np.nan, 0
    return (
        r2_score(y_true[mask], y_pred[mask]),
        mean_absolute_error(y_true[mask], y_pred[mask]),
        n_selected,
    )

# Evaluate both model types
# fire_mask is a plain boolean array: True where the sample's site is in FIRE_SITES.
fire_mask = satellite_features['site'].isin(FIRE_SITES).values
site_eval_inputs = (satellite_numeric.values, aop_features, fire_mask)
linear_site_results = evaluate_by_site_type(linear_models, *site_eval_inputs, 'linear')
ensemble_site_results = evaluate_by_site_type(ensemble_models, *site_eval_inputs, 'ensemble')

# Create visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

targets = list(linear_site_results.keys())
x = np.arange(len(targets))
width = 0.35


def _per_target(site_results, key):
    """Collect one metric across `targets`, in plotting order."""
    return [site_results[t][key] for t in targets]


# Per-target series for every panel
fire_r2_linear = _per_target(linear_site_results, 'fire_r2')
baseline_r2_linear = _per_target(linear_site_results, 'baseline_r2')
fire_r2_ensemble = _per_target(ensemble_site_results, 'fire_r2')
baseline_r2_ensemble = _per_target(ensemble_site_results, 'baseline_r2')

# Performance difference (baseline - fire)
diff_linear = np.array(baseline_r2_linear) - np.array(fire_r2_linear)
diff_ensemble = np.array(baseline_r2_ensemble) - np.array(fire_r2_ensemble)

# Sample counts are taken from the linear results only
fire_samples = _per_target(linear_site_results, 'fire_samples')
baseline_samples = _per_target(linear_site_results, 'baseline_samples')

# Bar styling: (left bar kwargs, right bar kwargs)
site_bars = (
    dict(label='Fire Sites', alpha=0.7, color='red'),
    dict(label='Baseline Sites', alpha=0.7, color='green'),
)
model_bars = (
    dict(label='Linear', alpha=0.7),
    dict(label='Ensemble', alpha=0.7),
)

# (axis, left series, right series, bar styles, y label, title, draw zero line?)
panels = (
    (axes[0, 0], fire_r2_linear, baseline_r2_linear, site_bars,
     'R² Score', 'Linear Model Performance by Site Type', False),
    (axes[0, 1], fire_r2_ensemble, baseline_r2_ensemble, site_bars,
     'R² Score', 'Ensemble Model Performance by Site Type', False),
    (axes[1, 0], diff_linear, diff_ensemble, model_bars,
     'R² Difference (Baseline - Fire)', 'Performance Gap Between Site Types', True),
    (axes[1, 1], fire_samples, baseline_samples, site_bars,
     'Number of Samples', 'Sample Distribution by Site Type', False),
)

for ax, left_values, right_values, (left_style, right_style), y_label, title, zero_line in panels:
    ax.bar(x - width/2, left_values, width, **left_style)
    ax.bar(x + width/2, right_values, width, **right_style)

    if zero_line:
        # Reference line so positive and negative gaps are easy to tell apart
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(targets, rotation=45)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary
# The same three lines are printed per target for each model family.
print("\nSite-Specific Performance Summary:")
print("=" * 80)

for heading, site_results in (
    ("Linear Models:", linear_site_results),
    ("Ensemble Models:", ensemble_site_results),
):
    print(f"\n{heading}")
    print("-" * 40)
    for target in targets:
        res = site_results[target]
        fire_score = res['fire_r2']
        baseline_score = res['baseline_r2']
        print(f"{target}:")
        print(f"  Fire sites R²: {fire_score:.3f}")
        print(f"  Baseline sites R²: {baseline_score:.3f}")
        print(f"  Performance gap: {baseline_score - fire_score:.3f}")

## 5. Fire Site Performance Analysis

Let's analyze how the models perform specifically on fire-impacted sites versus baseline sites.

In [None]:
# Create comprehensive comparison of model performance
# Output column name -> key in each model's 'metrics' dict
metric_columns = {
    'Train_R2': 'train_r2',
    'Test_R2': 'test_r2',
    'Train_MAE': 'train_mae',
    'Test_MAE': 'test_mae',
    'Train_RMSE': 'train_rmse',
    'Test_RMSE': 'test_rmse',
}

# Row order: for each target, the linear row comes first, then the ensemble row.
model_families = (('Linear', linear_models), ('Ensemble', ensemble_models))

comparison_data = [
    {
        'Target': target_var,
        'Model': family_label,
        **{
            column: fitted_models[target_var]['metrics'][metric_key]
            for column, metric_key in metric_columns.items()
        },
    }
    for target_var in target_variables
    for family_label, fitted_models in model_families
]

# Create comparison DataFrame
comparison_df = pd.DataFrame(comparison_data)

# Visualize model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))


def _by_target_and_model(metric_column):
    """Reshape comparison_df to targets (rows) x model types (columns) for one metric."""
    return comparison_df.pivot(index='Target', columns='Model', values=metric_column)


pivot_r2 = _by_target_and_model('Test_R2')
pivot_mae = _by_target_and_model('Test_MAE')
pivot_rmse = _by_target_and_model('Test_RMSE')

# R², MAE and RMSE comparison: (axis, table, y label, title)
bar_panels = (
    (axes[0, 0], pivot_r2, 'Test R²', 'Model Comparison: R² Score'),
    (axes[0, 1], pivot_mae, 'Test MAE', 'Model Comparison: Mean Absolute Error'),
    (axes[1, 0], pivot_rmse, 'Test RMSE', 'Model Comparison: Root Mean Squared Error'),
)
for ax, metric_table, y_label, title in bar_panels:
    metric_table.plot(kind='bar', ax=ax)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend(title='Model Type')
    ax.grid(True, alpha=0.3)

# Overfitting analysis: train vs test R² per target, one scatter series per model type
ax = axes[1, 1]
for model_type in ('Linear', 'Ensemble'):
    subset = comparison_df.loc[comparison_df['Model'] == model_type]
    train_scores = subset['Train_R2']
    test_scores = subset['Test_R2']
    ax.scatter(train_scores, test_scores, label=model_type, s=100, alpha=0.7)

    # Add target labels
    for target_label, train_score, test_score in zip(subset['Target'], train_scores, test_scores):
        ax.annotate(target_label, (train_score, test_score),
                    fontsize=8, alpha=0.7)

# Diagonal where train and test R² are equal
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax.set_xlabel('Train R²')
ax.set_ylabel('Test R²')
ax.set_title('Overfitting Analysis')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\nModel Performance Summary:")
print("=" * 80)
mean_test_metrics = comparison_df.groupby('Model')[['Test_R2', 'Test_MAE', 'Test_RMSE']].mean()
print(mean_test_metrics)

print("\nBest performing model for each target:")
print("-" * 80)
for target in target_variables:
    # Row of comparison_df with the highest test R² among this target's rows
    target_scores = comparison_df.loc[comparison_df['Target'] == target, 'Test_R2']
    best_model = comparison_df.loc[target_scores.idxmax()]
    print(f"{target}: {best_model['Model']} (R² = {best_model['Test_R2']:.3f})")

## 4. Model Comparison

Let's compare the performance of linear and ensemble models across all target variables.

In [None]:
# Train ensemble models with hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

def train_ensemble_with_tuning(X, y, target_var, test_size=0.2, n_iter=50, cv=5,
                               random_state=42):
    """Train a gradient boosting regressor with randomized hyperparameter search.

    The data is split randomly into train and test parts, a
    ``RandomizedSearchCV`` is fitted on the training part, and the best
    estimator is scored on both parts.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values aligned with the rows of ``X``.
    target_var : str
        Name of the target. Not used by the computation; kept so existing
        calls remain valid.
    test_size : float, optional
        Passed to ``train_test_split`` (default 0.2).
    n_iter : int, optional
        Number of parameter combinations the random search tries (default 50).
    cv : int, optional
        ``cv`` argument of the random search (default 5).
    random_state : int, optional
        Seed shared by the split, the base model and the search (default 42).

    Returns
    -------
    dict
        ``'model'`` (best estimator), ``'best_params'``, ``'metrics'`` (train
        and test R², MAE, RMSE plus ``'cv_score'``), and the held-out
        ``'X_test'``, ``'y_test'``, ``'y_test_pred'``.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Define parameter distributions for random search
    param_distributions = {
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.15],
        'max_depth': [3, 4, 5, 6, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'max_features': ['sqrt', 'log2', None]
    }

    # Random search with cross-validation over a seeded base model
    random_search = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=random_state),
        param_distributions,
        n_iter=n_iter,  # Number of parameter combinations to try
        cv=cv,
        scoring='r2',
        n_jobs=-1,
        random_state=random_state,
        verbose=0
    )

    # Fit the random search
    random_search.fit(X_train, y_train)

    # Get best model
    best_model = random_search.best_estimator_

    # Evaluate on both splits so train vs test scores can be compared later
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_r2, train_mae, train_rmse = _regression_metrics(y_train, y_train_pred)
    test_r2, test_mae, test_rmse = _regression_metrics(y_test, y_test_pred)

    return {
        'model': best_model,
        'best_params': random_search.best_params_,
        'metrics': {
            'train_r2': train_r2,
            'test_r2': test_r2,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'cv_score': random_search.best_score_
        },
        'X_test': X_test,
        'y_test': y_test,
        'y_test_pred': y_test_pred
    }


def _regression_metrics(y_true, y_pred):
    """Return (R², MAE, RMSE) for one set of true and predicted values."""
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
    )

# Train ensemble models for each target variable
ensemble_models = {}

print("Training ensemble models with hyperparameter tuning...")
print("-" * 60)

# The feature matrix and its NaN rows are the same for every target.
feature_matrix = satellite_numeric.values
feature_has_nan = np.isnan(feature_matrix).any(axis=1)

# (printed label, key in the metrics dict)
reported_metrics = (
    ('Best CV R²', 'cv_score'),
    ('Test R²', 'test_r2'),
    ('Test MAE', 'test_mae'),
    ('Test RMSE', 'test_rmse'),
)

for target_var in target_variables:
    print(f"\nTraining {target_var}...")

    # Get target values
    y = aop_features[target_var].values

    # Keep only rows where neither the features nor the target contain NaN
    valid_mask = ~(feature_has_nan | np.isnan(y))
    X_valid, y_valid = feature_matrix[valid_mask], y[valid_mask]

    # Train model with tuning
    ensemble_models[target_var] = train_ensemble_with_tuning(X_valid, y_valid, target_var)

    # Print results
    metrics = ensemble_models[target_var]['metrics']
    for label, key in reported_metrics:
        print(f"  {label}: {metrics[key]:.3f}")

## 3. Ensemble Model Training

Now we'll train Gradient Boosting models to capture non-linear relationships between satellite and AOP features. These models are more flexible but require careful tuning to avoid overfitting.

In [None]:
# Visualize feature importance for linear models
# One panel per target variable; each bar is a fitted coefficient, in feature order.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Column order of the numeric satellite frame, used as tick labels here and
# reused by later cells.
feature_names = list(satellite_numeric.columns)

# zip() stops at the shorter input, so targets beyond the grid capacity are skipped.
for ax, (var, model_info) in zip(axes, linear_models.items()):
    # Get coefficients
    model = model_info['model']
    coefficients = model.coef_
    positions = range(len(coefficients))

    # Create bar plot
    ax.bar(positions, coefficients)
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_names, rotation=45)
    ax.set_ylabel('Coefficient Value')
    ax.set_title(f'{var} - Feature Importance')
    ax.grid(True, alpha=0.3)

    # Add horizontal line at y=0 to separate positive from negative coefficients
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# Remove every subplot that did not receive a model, not just the last one,
# so no blank panel is left when fewer than five targets were trained.
for unused_ax in axes[len(linear_models):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Print top features for each target
print("\nTop 3 features by absolute coefficient value:")
print("-" * 60)
for var, model_info in linear_models.items():
    coefficients = model_info['model'].coef_

    # Ascending sort by magnitude, then walk it backwards and keep the first three
    ascending_by_magnitude = np.argsort(np.abs(coefficients))
    top_indices = ascending_by_magnitude[::-1][:3]

    print(f"\n{var}:")
    for feature_idx in top_indices:
        print(f"  {feature_names[feature_idx]}: {coefficients[feature_idx]:.3f}")

In [None]:
# Train linear crosswalk models using our custom function
target_variables = ['NDVI_AOP', 'NBR_AOP', 'Canopy_Height', 'LAI', 'Biomass']

# Train models for all target variables
linear_models = calibrate_satellite_indices(
    satellite_numeric, aop_features, target_vars=target_variables, model_type='linear'
)

# Display training results
# (printed label, key in the metrics dict) for the single-value lines
single_value_metrics = (
    ('Train R²', 'train_r2'),
    ('Test R²', 'test_r2'),
    ('Train MAE', 'train_mae'),
    ('Test MAE', 'test_mae'),
    ('Best alpha', 'best_alpha'),
)

print("Linear Model Training Results:")
print("-" * 60)
for var, model_info in linear_models.items():
    metrics = model_info['metrics']
    print(f"\n{var}:")
    for label, key in single_value_metrics:
        print(f"  {label}: {metrics[key]:.3f}")
    # Cross-validation score is reported as mean ± standard deviation
    print(f"  CV R² mean: {metrics['cv_r2_mean']:.3f} ± {metrics['cv_r2_std']:.3f}")

## 2. Linear Model Training

We'll start with Ridge Regression models, which provide interpretable linear mappings between satellite and AOP features. These models are fast to train and give us baseline performance metrics.

In [None]:
# Split data for training and validation
# We'll use temporal splitting to be more realistic: samples dated before the
# 80th-percentile date form the training set, the rest form the test set.
# NOTE(review): the training cells visible in this notebook pass the full frames
# to calibrate_satellite_indices and use a random train_test_split, so these
# split variables do not appear to be consumed there - confirm intent.
sample_dates = satellite_features['date']
split_date = sample_dates.quantile(0.8)

train_mask = sample_dates < split_date
test_mask = ~train_mask

# Get numeric features only
satellite_numeric = satellite_features.select_dtypes(include=[np.number])

# Create train/test splits
X_train, X_test = satellite_numeric[train_mask], satellite_numeric[test_mask]
y_train, y_test = aop_features[train_mask], aop_features[test_mask]

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
for split_label, split_mask in (('Train', train_mask), ('Test', test_mask)):
    split_dates = sample_dates[split_mask]
    print(f"{split_label} date range: {split_dates.min()} to {split_dates.max()}")

# Also create site-specific splits for fire analysis
fire_data_mask = satellite_features['site'].isin(FIRE_SITES)
baseline_data_mask = ~fire_data_mask

print(f"\nFire site samples: {fire_data_mask.sum()}")
print(f"Baseline site samples: {baseline_data_mask.sum()}")

In [None]:
# Examine data quality and distributions
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Plot distributions of key features (first five panels)
index_columns = ['NDVI', 'NBR', 'NDWI', 'EVI', 'SAVI']
for ax, col in zip(axes, index_columns):
    ax.hist(satellite_features[col], bins=30, alpha=0.7, label='Satellite')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{col} Distribution')

# Compare fire vs baseline sites (sixth panel): overlaid NDVI histograms
ax = axes[5]
fire_mask = satellite_features['site'].isin(FIRE_SITES)
ndvi_by_site_type = (
    (satellite_features.loc[fire_mask, 'NDVI'], 'Fire Sites', 'red'),
    (satellite_features.loc[~fire_mask, 'NDVI'], 'Baseline Sites', 'green'),
)
for ndvi_values, site_label, bar_color in ndvi_by_site_type:
    ax.hist(ndvi_values, bins=20, alpha=0.5, label=site_label, color=bar_color)
ax.set_xlabel('NDVI')
ax.set_ylabel('Frequency')
ax.set_title('NDVI: Fire vs Baseline Sites')
ax.legend()

plt.tight_layout()
plt.show()

# Check for missing values (per-column NaN counts)
numeric_satellite = satellite_features.select_dtypes(include=[np.number])
print("\nMissing values in satellite features:")
print(numeric_satellite.isnull().sum())
print("\nMissing values in AOP features:")
print(aop_features.isnull().sum())

In [None]:
# Load processed data
# These would typically be generated by the feature engineering notebook
# For demonstration, we'll create synthetic data matching expected structure
# NOTE: the order of the np.random calls below determines the generated values,
# so the draws must stay in this sequence.

# Define our study sites
FIRE_SITES = ['GRSM', 'SOAP', 'SYCA']  # Fire-impacted sites
BASELINE_SITES = ['SRER', 'JORN', 'ONAQ', 'SJER']  # Control sites
ALL_SITES = FIRE_SITES + BASELINE_SITES

# Create synthetic satellite features (would normally load from CSV)
n_samples = 1000

# index name -> (mean, standard deviation) of its normal distribution
index_distributions = {
    'NDVI': (0.7, 0.15),
    'NBR': (0.6, 0.2),
    'NDWI': (0.2, 0.1),
    'EVI': (0.5, 0.15),
    'SAVI': (0.4, 0.12),
}
synthetic_columns = {
    index_name: np.random.normal(mean, std, n_samples)
    for index_name, (mean, std) in index_distributions.items()
}
synthetic_columns['site'] = np.random.choice(ALL_SITES, n_samples)
synthetic_columns['date'] = pd.date_range('2020-01-01', periods=n_samples, freq='D')
satellite_features = pd.DataFrame(synthetic_columns)

# Add some site-specific variations
fire_rows = satellite_features['site'].isin(FIRE_SITES)
satellite_features.loc[fire_rows, 'NDVI'] *= 0.8  # Fire sites have lower NDVI
satellite_features.loc[fire_rows, 'NBR'] *= 0.7   # And lower NBR

# Create synthetic AOP features (ground truth)
# These would be derived from high-resolution airborne data
aop_features = pd.DataFrame(index=satellite_features.index)

# Each AOP variable is offset + gain * <satellite index> + Gaussian noise,
# then clipped to a realistic range.
# (column, source index, offset, gain, noise std, lower bound, upper bound)
aop_recipes = (
    ('NDVI_AOP', 'NDVI', 0, 1.1, 0.05, -1, 1),
    ('NBR_AOP', 'NBR', 0, 1.15, 0.06, -1, 1),
    ('Canopy_Height', 'NDVI', 15, 20, 2, 0, 50),
    ('LAI', 'NDVI', 2, 3, 0.3, 0, 8),
    ('Biomass', 'NDVI', 50, 100, 10, 0, 300),
)
for column, source_index, offset, gain, noise_std, lower, upper in aop_recipes:
    noise = np.random.normal(0, noise_std, n_samples)
    raw_values = offset + gain * satellite_features[source_index] + noise
    aop_features[column] = raw_values.clip(lower, upper)

numeric_feature_names = list(satellite_features.select_dtypes(include=[np.number]).columns)
print(f"Loaded {len(satellite_features)} samples")
print(f"Satellite features: {numeric_feature_names}")
print(f"AOP target variables: {list(aop_features.columns)}")
print(f"\nSite distribution:")
print(satellite_features['site'].value_counts())

## 1. Data Preparation

First, we'll load the processed data from feature engineering (for this demonstration, synthetic data with the same structure is generated instead) and prepare it for model training. This includes:
- Loading satellite and AOP features
- Handling missing values
- Creating training/validation splits
- Feature scaling and normalization

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import joblib
from datetime import datetime

# Scientific computing
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor

# Import our custom modules
import sys
sys.path.append('..')
from src.features.aop_crosswalk import (
    calibrate_satellite_indices,
    fit_linear_crosswalk,
    fit_ensemble_crosswalk,
    validate_crosswalk,
    save_crosswalk_models,
    apply_crosswalk_models
)

# Configure settings
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session
plt.style.use('seaborn-v0_8-darkgrid')
np.random.seed(42)  # seeds NumPy's global random generator

# Set up directories (relative to the notebook's working directory)
DATA_DIR = Path('../data')
PROCESSED_DIR = DATA_DIR / 'processed'
MODELS_DIR = DATA_DIR / 'models'
RESULTS_DIR = Path('../results/model_training')

# Create directories if needed
for output_dir in (MODELS_DIR, RESULTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

startup_messages = (
    f"Working directory: {Path.cwd()}",
    f"Models will be saved to: {MODELS_DIR}",
    f"Results will be saved to: {RESULTS_DIR}",
)
print("\n".join(startup_messages))

# Model Training - NEON AOP Crosswalk

This notebook trains crosswalk models to map satellite features to AOP (Airborne Observation Platform) features. The goal is to enhance satellite data resolution by learning from high-resolution airborne measurements.

## Overview

We'll train two types of models:
1. **Linear Models (Ridge Regression)**: For basic linear relationships
2. **Ensemble Models (Gradient Boosting)**: For capturing non-linear patterns

Special attention is given to fire-impacted sites to understand how crosswalk models perform in disturbed ecosystems.