# Notebook 10: Heuristic Validation

## Purpose
Validate that scoring heuristics (nutrition, security, etc.) align with actual elk behavior and are predictive of presence.

## Key Questions
- Which heuristic scores are available in the dataset?
- Do heuristic scores correlate with their source features?
- Are heuristics discriminative for elk presence?
- Do composite scores outperform individual heuristics?
- Should heuristics be kept as features or replaced with raw features?

## Key Observations to Look For
- **Score-Feature Correlation**: Each heuristic should correlate with its source features (absolute Pearson correlation |r| > 0.5)
- **Discriminative Power**: AUC-ROC > 0.5 is better than random; this notebook treats AUC < 0.55 as effectively random
- **Composite Performance**: Does the composite score improve on the individual scores?
- **Weight Validation**: Do heuristic weights align with data-driven importance?


In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so random draws are repeatable between runs
np.random.seed(42)

# Global plot appearance: seaborn grid theme, default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

# Locate the data directory. The notebook may be started from the project
# root, the notebooks directory, or a subdirectory, so each candidate is
# probed in that order and the first one containing data/features wins.
possible_roots = [Path('.'), Path('..'), Path('../..')]
data_root = next(
    (base / 'data' for base in possible_roots
     if (base / 'data' / 'features').exists()),
    Path('../data'),  # fallback when none of the candidates match
)

# Output folders live under the data directory; create them when missing
figures_dir = data_root / 'figures'
reports_dir = data_root / 'reports'
for out_dir in (figures_dir, reports_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

print('✓ Setup complete')
print(f'  Output directory: {data_root.absolute()}')


## 1. Load Data and Identify Heuristic Scores


In [None]:
# Load the combined feature table, then discover the heuristic score columns
# and the presence/absence label column by name.
from pathlib import Path

# The working directory depends on how the notebook was launched, so try
# each known location in turn and use the first file that exists.
possible_paths = [
    Path('data/features/complete_context.csv'),  # From project root
    Path('../data/features/complete_context.csv'),  # From notebooks directory
    Path('../../data/features/complete_context.csv'),  # From subdirectory
]

data_path = next((path for path in possible_paths if path.exists()), None)

if data_path is None:
    raise FileNotFoundError(
        f'Data file not found. Tried: {[str(p) for p in possible_paths]}\n'
        f'Please run: python scripts/combine_feature_files.py\n'
        f'Or ensure you are running the notebook from the project root directory.'
    )

print(f'Loading data from: {data_path}')
df = pd.read_csv(data_path)

# Heuristic scores are detected purely by name: any column containing "score".
score_named_cols = [col for col in df.columns if 'score' in col.lower()]

# Only numeric columns can be averaged, correlated or ranked, so a text column
# that merely has "score" in its name is reported and left out.
heuristic_cols = [
    col for col in score_named_cols
    if pd.api.types.is_numeric_dtype(df[col])
]
skipped_cols = [col for col in score_named_cols if col not in heuristic_cols]

# Detect the presence column up front so `presence_col` is always defined
# (None when no known label name is present), even without score columns.
presence_names = {'presence', 'target', 'label', 'is_presence', 'elk_present'}
presence_col = next(
    (col for col in df.columns if col.lower() in presence_names),
    None,
)

print(f'Dataset shape: {df.shape}')
print(f'\nFound {len(heuristic_cols)} heuristic score columns:')
for col in heuristic_cols:
    print(f'  - {col}')

if skipped_cols:
    print(f'\n⚠ Ignoring {len(skipped_cols)} non-numeric column(s) with "score" in the name: {skipped_cols}')

if len(heuristic_cols) == 0:
    print('\n⚠ No heuristic score columns found. Skipping heuristic validation.')
    print('This notebook requires columns containing "score" in the name.')
else:
    print(f'\nPresence column: {presence_col}')

    # Display heuristic score summary statistics
    print('\nHeuristic score statistics:')
    print(df[heuristic_cols].describe())


## 2. Heuristic Score Distributions


In [None]:
# Draw one histogram per heuristic score (three per row), save the figure,
# then print the mean score for presence rows versus absence rows.
if len(heuristic_cols) == 0:
    print('⚠ No heuristic scores to analyze')
else:
    n_cols = 3
    n_rows = (len(heuristic_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D grid, so one flatten covers every layout
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, heuristic_cols):
        values = df[col].dropna()
        if len(values) == 0:
            # Nothing to draw for an all-NaN column; leave the panel blank
            continue

        ax.hist(values, bins=50, alpha=0.7, color='steelblue', edgecolor='black')

        # Mark the centre of the distribution two ways
        mean_val, median_val = values.mean(), values.median()
        ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
        ax.axvline(median_val, color='blue', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')

        ax.set_xlabel(col, fontsize=10)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.set_title(col, fontsize=11)
        ax.legend(fontsize=8)
        ax.grid(alpha=0.3)

    # Panels beyond the last heuristic are unused; hide their frames
    for spare_ax in axes[len(heuristic_cols):]:
        spare_ax.axis('off')

    plt.suptitle('Heuristic Score Distributions', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'heuristic_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved heuristic distributions')

    # Group means are only available when a presence column was detected
    if presence_col:
        print('\n### Presence vs Absence Comparison:')
        present_rows = df[df[presence_col] == 1]
        absent_rows = df[df[presence_col] == 0]
        for col in heuristic_cols:
            presence_mean = present_rows[col].mean()
            absence_mean = absent_rows[col].mean()
            diff = presence_mean - absence_mean
            print(f'  {col}: Presence={presence_mean:.3f}, Absence={absence_mean:.3f}, Diff={diff:.3f}')


## 3. Score vs Source Features Validation


In [None]:
# Validate heuristic scores against their source features: guess each score's
# source columns from naming rules, scatter score vs source with a fitted
# line, and flag pairs whose absolute Pearson correlation is not above 0.5.
if len(heuristic_cols) > 0:
    # (keyword expected in the heuristic name,
    #  test applied to the lower-cased name of a candidate source column)
    source_rules = [
        ('nutrition', lambda name: 'ndvi' in name),
        ('water', lambda name: 'water' in name and 'distance' in name),
        ('elevation', lambda name: 'elev' in name or 'altitude' in name),
        ('security', lambda name: 'security' in name or 'road' in name),
        ('snow', lambda name: 'snow' in name),
    ]

    # Map heuristic names to likely source features
    source_mapping = {}
    for score_col in heuristic_cols:
        # Correlation and line fitting need numeric data on both sides
        if not pd.api.types.is_numeric_dtype(df[score_col]):
            continue

        score_lower = score_col.lower()
        active_rules = [matches for keyword, matches in source_rules
                        if keyword in score_lower]

        source_features = [
            col for col in df.columns
            if col != score_col
            and pd.api.types.is_numeric_dtype(df[col])
            and any(matches(col.lower()) for matches in active_rules)
        ]

        if source_features:
            source_mapping[score_col] = source_features

    # Create scatter plots for score vs source features
    if len(source_mapping) > 0:
        n_pairs = sum(len(sources) for sources in source_mapping.values())
        n_cols = 3
        n_rows = (n_pairs + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D grid, so one flatten covers every layout
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.flatten()

        # Cap the plotted points at 5000. The seed is fixed, so the sample is
        # the same for every pair and only needs to be drawn once.
        sample_size = min(5000, len(df))
        df_sample = df.sample(n=sample_size, random_state=42)

        plot_idx = 0
        validation_results = []

        for score_col, source_features in source_mapping.items():
            for source_col in source_features:
                ax = axes[plot_idx]

                # Drop rows where EITHER value is missing so x and y stay
                # aligned; dropping NaNs per column would give arrays of
                # different lengths or pair up values from different rows.
                pairs = df_sample[[source_col, score_col]].dropna()

                # Scatter plot
                ax.scatter(pairs[source_col], pairs[score_col],
                           alpha=0.3, s=10, edgecolors='none')

                # Correlation uses the full dataset, not just the plotted sample
                corr = df[[source_col, score_col]].corr().iloc[0, 1]

                # Add regression line. A line fit needs at least two points
                # and some spread in x; otherwise the panel shows points only.
                if len(pairs) >= 2 and pairs[source_col].nunique() > 1:
                    x_vals = pairs[source_col].to_numpy(dtype=float)
                    y_vals = pairs[score_col].to_numpy(dtype=float)
                    slope, intercept = np.polyfit(x_vals, y_vals, 1)
                    x_line = np.linspace(x_vals.min(), x_vals.max(), 100)
                    ax.plot(x_line, slope * x_line + intercept, "r--", alpha=0.8, linewidth=2)

                ax.set_xlabel(source_col, fontsize=10)
                ax.set_ylabel(score_col, fontsize=10)
                ax.set_title(f'{score_col} vs {source_col}\nr={corr:.3f}', fontsize=11)
                ax.grid(alpha=0.3)

                # A NaN correlation fails the comparison and is reported as FAIL
                validation_results.append({
                    'heuristic': score_col,
                    'source_feature': source_col,
                    'correlation': corr,
                    'status': 'PASS' if abs(corr) > 0.5 else 'FAIL'
                })

                plot_idx += 1

        # Hide extra subplots
        for idx in range(plot_idx, len(axes)):
            axes[idx].axis('off')

        plt.suptitle('Heuristic Score Validation (vs Source Features)', fontsize=16, y=1.00)
        plt.tight_layout()
        plt.savefig(figures_dir / 'heuristic_validation.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('✓ Saved heuristic validation plots')

        # Print validation results
        validation_df = pd.DataFrame(validation_results)
        print('\n### Validation Results:')
        print(validation_df)

        failed = validation_df[validation_df['status'] == 'FAIL']
        if len(failed) > 0:
            print(f'\n⚠ WARNING: {len(failed)} heuristic-source pairs have low correlation (<0.5):')
            for _, row in failed.iterrows():
                print(f"  - {row['heuristic']} vs {row['source_feature']}: r={row['correlation']:.3f}")
        else:
            print('\n✓ All heuristic-source correlations meet threshold (>0.5)')
    else:
        print('⚠ Could not identify source features for heuristics')
else:
    print('⚠ No heuristic scores to validate')


## 4. Discriminative Power (AUC-ROC)


In [None]:
# Calculate AUC-ROC for each heuristic: plot every ROC curve on one figure,
# then rank the heuristics by AUC in a bar chart and flag the weak ones.
if len(heuristic_cols) > 0 and presence_col:
    auc_results = []
    undefined_roc = []  # heuristics whose valid rows lack one of the two classes

    plt.figure(figsize=(12, 8))

    for col in heuristic_cols:
        # Keep only rows where both the score and the label are present
        valid_mask = df[[col, presence_col]].notna().all(axis=1)
        y_true = df.loc[valid_mask, presence_col]
        y_scores = df.loc[valid_mask, col]

        # A ROC curve needs both classes. With a single class the AUC is NaN,
        # which would slip past the "AUC < 0.55" check below and be reported
        # as passing, so such heuristics are skipped and listed instead.
        if y_true.nunique() < 2:
            undefined_roc.append(col)
            continue

        # Calculate ROC curve
        fpr, tpr, _ = roc_curve(y_true, y_scores)
        roc_auc = auc(fpr, tpr)

        auc_results.append({
            'heuristic': col,
            'auc': roc_auc
        })

        # Plot ROC curve
        plt.plot(fpr, tpr, linewidth=2, label=f'{col} (AUC = {roc_auc:.3f})')

    # Add diagonal line (random classifier)
    plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC = 0.5)')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves for Heuristic Scores', fontsize=14, pad=20)
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(figures_dir / 'heuristic_roc_curves.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved ROC curves')

    if undefined_roc:
        print(f'\n⚠ Skipped {len(undefined_roc)} heuristic(s) whose valid rows do not '
              f'contain both presence and absence: {undefined_roc}')

    # Create AUC comparison bar chart
    if len(auc_results) > 0:
        auc_df = pd.DataFrame(auc_results).sort_values('auc', ascending=False)

        plt.figure(figsize=(12, 6))
        # Bars are coloured against the random baseline of 0.5
        colors = ['green' if x > 0.5 else 'red' for x in auc_df['auc']]
        plt.barh(range(len(auc_df)), auc_df['auc'], color=colors, alpha=0.7, edgecolor='black')
        plt.axvline(0.5, color='red', linestyle='--', linewidth=2, label='Random (0.5)')
        plt.yticks(range(len(auc_df)), auc_df['heuristic'])
        plt.xlabel('AUC-ROC', fontsize=12)
        plt.ylabel('Heuristic Score', fontsize=12)
        plt.title('Heuristic Discriminative Power (AUC-ROC)', fontsize=14, pad=20)
        plt.legend()
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.savefig(figures_dir / 'heuristic_auc_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('\n### AUC-ROC Results (sorted by AUC):')
        print(auc_df)

        # Flag poor performers; 0.55 leaves a small margin above pure chance
        poor_performers = auc_df[auc_df['auc'] < 0.55]
        if len(poor_performers) > 0:
            print(f'\n⚠ WARNING: {len(poor_performers)} heuristics perform no better than random (AUC < 0.55):')
            for _, row in poor_performers.iterrows():
                print(f"  - {row['heuristic']}: AUC={row['auc']:.3f}")
        else:
            print('\n✓ All heuristics perform better than random')
else:
    print('⚠ Cannot calculate AUC without heuristic scores and presence column')


## 5. Confusion Matrix for Top Heuristic


In [None]:
# Create confusion matrix for best heuristic: turn the top-AUC score into a
# binary classifier by thresholding at its median, then report the confusion
# matrix and standard metrics against the majority-class baseline.
if len(heuristic_cols) > 0 and presence_col and len(auc_results) > 0:
    # First row of auc_df is taken as the best heuristic
    # (auc_df is sorted by AUC, descending, where it is built)
    best_heuristic = auc_df.iloc[0]['heuristic']
    best_auc = auc_df.iloc[0]['auc']

    print(f'Using best heuristic: {best_heuristic} (AUC={best_auc:.3f})')

    # Keep only rows where both the score and the label are present
    valid_mask = df[[best_heuristic, presence_col]].notna().all(axis=1)
    y_true = df.loc[valid_mask, presence_col]
    y_scores = df.loc[valid_mask, best_heuristic]

    # Create binary classifier using median threshold:
    # scores strictly above the median are predicted as presence (1)
    threshold = y_scores.median()
    y_pred = (y_scores > threshold).astype(int)

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # Absence/Presence tick labels, even if one class never occurs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Calculate metrics
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    accuracy = (y_pred == y_true).mean()

    # Accuracy of always predicting the most common class
    baseline = (y_true == y_true.mode()[0]).mean()

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
                xticklabels=['Absence', 'Presence'],
                yticklabels=['Absence', 'Presence'])
    plt.xlabel('Predicted', fontsize=12)
    plt.ylabel('Actual', fontsize=12)
    plt.title(f'Confusion Matrix: {best_heuristic}\n'
              f'Threshold={threshold:.3f}, Accuracy={accuracy:.3f}, F1={f1:.3f}',
              fontsize=14, pad=20)
    plt.tight_layout()
    plt.savefig(figures_dir / 'heuristic_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f'\n### Performance Metrics:')
    print(f'  Accuracy: {accuracy:.3f}')
    print(f'  Precision: {precision:.3f}')
    print(f'  Recall: {recall:.3f}')
    print(f'  F1-Score: {f1:.3f}')
    print(f'  Baseline (always majority): {baseline:.3f}')

    if accuracy > baseline:
        print('\n✓ Heuristic outperforms majority class baseline')
    else:
        print('\n⚠ Heuristic does not outperform majority class baseline')
else:
    print('⚠ Cannot create confusion matrix without heuristic scores and presence column')


## 6. Heuristic Recommendations


In [None]:
# Generate recommendations: combine the AUC ranking and (when available) the
# source-correlation check into a KEEP / REVIEW / DROP verdict per heuristic,
# print the table and write it out as a Markdown report.
if len(heuristic_cols) > 0:
    recommendations = []

    # auc_df / validation_df exist only if the earlier cells produced them.
    # At notebook top level locals() is the global namespace.
    if 'auc_df' in locals():
        has_validation = 'validation_df' in locals()

        for _, row in auc_df.iterrows():
            heuristic = row['heuristic']
            auc_val = row['auc']

            rec = {
                'heuristic': heuristic,
                'auc': auc_val,
                'recommendation': 'KEEP' if auc_val > 0.55 else 'DROP',
                'reason': f'AUC={auc_val:.3f} {"above" if auc_val > 0.55 else "below"} 0.55 threshold'
            }

            # Check validation if available. Any failed source-correlation
            # check replaces the AUC-based verdict (KEEP or DROP) with REVIEW.
            if has_validation:
                validation = validation_df[validation_df['heuristic'] == heuristic]
                if (validation['status'] == 'FAIL').any():
                    rec['recommendation'] = 'REVIEW'
                    rec['reason'] += '; Low correlation with source features'

            recommendations.append(rec)

    if len(recommendations) > 0:
        rec_df = pd.DataFrame(recommendations)

        print('### Heuristic Recommendations:')
        print(rec_df)

        # Save recommendations
        report = f'''# Heuristic Validation Recommendations

Generated: {pd.Timestamp.now()}

## Summary

Total heuristics analyzed: {len(heuristic_cols)}

## Recommendations

'''

        # One Markdown subsection per heuristic, joined in a single pass
        report += ''.join(
            f'''
### {rec['heuristic']}
- **AUC-ROC**: {rec['auc']:.3f}
- **Recommendation**: {rec['recommendation']}
- **Reason**: {rec['reason']}

'''
            for _, rec in rec_df.iterrows()
        )

        report += '''
## General Guidelines

1. **KEEP**: Heuristics with AUC > 0.55 and good source feature correlation
2. **REVIEW**: Heuristics with low correlation to source features (may need recalibration)
3. **DROP**: Heuristics with AUC < 0.55 (no better than random)

## Next Steps

- Consider keeping high-performing heuristics as features
- Review and potentially recalibrate heuristics with low source correlation
- Replace low-performing heuristics with raw features in modeling
'''

        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding (column names may contain non-ASCII characters).
        report_path = reports_dir / 'heuristic_recommendations.md'
        report_path.write_text(report, encoding='utf-8')

        # Report the path actually written; reports_dir depends on where
        # the notebook was started from.
        print(f'\n✓ Saved recommendations to {report_path}')
    else:
        print('⚠ Could not generate recommendations')
else:
    print('⚠ No heuristics to recommend on')


## Summary

This notebook validated heuristic scores across multiple dimensions:

1. **Score Distributions**: Examined distribution of each heuristic
2. **Source Feature Validation**: Checked correlation with source features
3. **Discriminative Power**: Calculated AUC-ROC for each heuristic
4. **Confusion Matrix**: Evaluated best heuristic as binary classifier
5. **Recommendations**: Generated actionable recommendations

**Key Findings**:
- Review `data/reports/heuristic_recommendations.md` for detailed recommendations
- Heuristics with AUC > 0.55 are potentially useful features
- Heuristics with low source correlation may need recalibration

**Next Steps**:
- Proceed to Notebook 11 for target variable analysis
- Consider keeping high-performing heuristics as model features
- Review and fix heuristics with validation issues
