# Model Evaluation and Results Analysis

This notebook covers:
- Comprehensive model evaluation
- Performance visualization
- Error analysis
- Model interpretation
- Final recommendations

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
import sys
import warnings
# Silence every warning category for the rest of the session. This also hides
# warnings raised by libraries in later cells, so remove it when debugging.
warnings.filterwarnings('ignore')

# Make the project modules importable. The entry is relative, so it is resolved
# against the current working directory.
# NOTE(review): presumably the notebook is run from its own folder — confirm.
sys.path.append('../src')

# Project-local modules found through the path entry added above.
from evaluate_model import ModelEvaluator, OutcomeEvaluator, TreatmentEvaluator
from recommend import TreatmentRecommendationSystem

# Plotting defaults shared by every figure in the notebook: matplotlib's
# default style, the seaborn "husl" palette and a 12x8 inch figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Show all DataFrame columns instead of truncating wide frames.
pd.set_option('display.max_columns', None)

## 1. Load Models and Data

In [None]:
# Load the trained models.
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only load
# model artifacts that were produced by this project's own training pipeline.
with open('../models/outcome_prediction_model.pkl', 'rb') as f:
    outcome_model = pickle.load(f)

with open('../models/treatment_recommendation_model.pkl', 'rb') as f:
    treatment_model = pickle.load(f)

# Load model metadata. The encoding is explicit so that decoding does not
# depend on the platform's locale default.
with open('../models/model_metadata.json', 'r', encoding='utf-8') as f:
    model_metadata = json.load(f)

print("Models loaded successfully!")
print(f"\nOutcome Model: {model_metadata['outcome_model']['name']}")
print(f"Treatment Model: {model_metadata['treatment_model']['name']}")

# Load test data: the feature matrix and the 'y_test' label column.
X_test = pd.read_csv('../data/processed/X_test_final.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')['y_test']

# Fail early on mismatched files: later cells combine features and labels by
# index, which would otherwise silently introduce missing labels.
if len(X_test) != len(y_test):
    raise ValueError(
        f"X_test has {len(X_test)} rows but y_test has {len(y_test)}; "
        "the processed test files are out of sync"
    )

print(f"\nTest data loaded: {X_test.shape}")

## 2. Comprehensive Model Evaluation

In [None]:
# Section banner.
print("=" * 60, "COMPREHENSIVE MODEL EVALUATION", "=" * 60, sep="\n")

# Evaluator for the outcome model.
outcome_evaluator = OutcomeEvaluator()

# Display names for the outcome classes and the feature columns of the test set.
# NOTE(review): the label order presumably matches the model's integer class
# encoding — confirm against the training pipeline.
outcome_labels = ['Improved', 'Not Improved', 'Stable']
feature_names = list(X_test.columns)

# Run the evaluator's report on the outcome model. The call returns three
# values, kept under the names the later cells rely on.
evaluation_output = outcome_evaluator.generate_evaluation_report(
    outcome_model,
    X_test,
    y_test,
    feature_names,
    outcome_labels,
    model_metadata['outcome_model']['name'],
)
outcome_results, outcome_cm, outcome_feature_imp = evaluation_output

## 3. Error Analysis

In [None]:
# Detailed error analysis of the outcome model on the test set: a per-sample
# correctness table, misclassification patterns, and a 2x3 diagnostic figure.
from sklearn.metrics import classification_report, confusion_matrix

print("\n" + "=" * 60)
print("ERROR ANALYSIS")
print("=" * 60)

# Get predictions and, when the estimator exposes them, class probabilities.
y_pred = outcome_model.predict(X_test)
y_pred_proba = outcome_model.predict_proba(X_test) if hasattr(outcome_model, 'predict_proba') else None
has_proba = y_pred_proba is not None

# Error analysis dataframe: the test features plus true label, predicted label
# and a per-row correctness flag.
error_df = X_test.copy()
error_df['true_outcome'] = y_test
error_df['predicted_outcome'] = y_pred
error_df['correct_prediction'] = (y_test == y_pred)

if has_proba:
    # Confidence is the highest class probability of each row.
    error_df['prediction_confidence'] = np.max(y_pred_proba, axis=1)

# Analyze misclassifications.
misclassified = error_df[~error_df['correct_prediction']]
print(f"Total misclassifications: {len(misclassified)} out of {len(error_df)} ({len(misclassified)/len(error_df)*100:.1f}%)")

# Misclassification patterns: true class (rows) vs. predicted class (columns),
# with a 'Total' margin row and column.
print("\nMisclassification patterns:")
misclass_patterns = pd.crosstab(misclassified['true_outcome'], misclassified['predicted_outcome'],
                                margins=True, margins_name='Total')
print(misclass_patterns)

# Error rate and retained sample count when only predictions at or above a
# confidence threshold are kept.
if has_proba:
    confidence_thresholds = np.arange(0.3, 1.0, 0.05)
    error_rates = []
    sample_sizes = []

    for threshold in confidence_thresholds:
        retained = error_df.loc[error_df['prediction_confidence'] >= threshold, 'correct_prediction']
        sample_sizes.append(len(retained))
        # No sample reaches this threshold: leave a gap in the curve.
        error_rates.append(1 - retained.mean() if len(retained) else np.nan)

# Visualize error patterns.
plt.figure(figsize=(15, 10))

# Panel 1: confusion matrix.
plt.subplot(2, 3, 1)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=outcome_labels, yticklabels=outcome_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

if has_proba:
    # Panel 2: prediction confidence distribution, correct vs. incorrect.
    plt.subplot(2, 3, 2)
    plt.hist(error_df[error_df['correct_prediction']]['prediction_confidence'],
             alpha=0.7, label='Correct', bins=20)
    plt.hist(error_df[~error_df['correct_prediction']]['prediction_confidence'],
             alpha=0.7, label='Incorrect', bins=20)
    plt.xlabel('Prediction Confidence')
    plt.ylabel('Frequency')
    plt.title('Confidence Distribution')
    plt.legend()

    # Panel 3: error rate by confidence threshold.
    plt.subplot(2, 3, 3)
    plt.plot(confidence_thresholds, error_rates, 'o-')
    plt.xlabel('Confidence Threshold')
    plt.ylabel('Error Rate')
    plt.title('Error Rate vs Confidence Threshold')
    plt.grid(alpha=0.3)

# Panel 4: first ten rows of the evaluator's feature-importance table. The
# table is used as returned, not filtered to the misclassified samples.
if outcome_feature_imp is not None:
    plt.subplot(2, 3, 4)
    top_features = outcome_feature_imp.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 10 Most Important Features')
    # Draw the first row at the top, consistent with the Top 15 chart in the
    # interpretation section.
    # NOTE(review): assumes the table is sorted by descending importance — confirm.
    plt.gca().invert_yaxis()

# Panel 5: class-wise precision / recall / F1.
plt.subplot(2, 3, 5)
report = classification_report(y_test, y_pred, target_names=outcome_labels, output_dict=True)
# Drop the three trailing summary rows (accuracy, macro avg, weighted avg).
metrics_df = pd.DataFrame(report).T.iloc[:-3]
metrics_df[['precision', 'recall', 'f1-score']].plot(kind='bar', ax=plt.gca())
plt.title('Class-wise Performance Metrics')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Panel 6: number of samples retained at each confidence threshold.
if has_proba:
    plt.subplot(2, 3, 6)
    plt.plot(confidence_thresholds, sample_sizes, 'o-', color='orange')
    plt.xlabel('Confidence Threshold')
    plt.ylabel('Number of Samples')
    plt.title('Sample Size vs Confidence Threshold')
    plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Model Interpretation

In [None]:
print("\n" + "=" * 60)
print("MODEL INTERPRETATION")
print("=" * 60)

# Feature importance analysis. Skipped when the evaluator returned no table.
# NOTE(review): the "top N" views and the cumulative curve assume the table is
# sorted by descending importance — confirm in the evaluator.
if outcome_feature_imp is not None:
    print("\nTop 10 Most Important Features:")
    print("-" * 40)
    for _, row in outcome_feature_imp.head(10).iterrows():
        print(f"{row['feature']:<25}: {row['importance']:.4f}")

    # Cumulative share of the total importance. Dividing by the total keeps the
    # 80% / 90% thresholds meaningful even when the raw importances do not sum
    # to 1.
    importance_values = np.asarray(outcome_feature_imp['importance'], dtype=float)
    total_importance = importance_values.sum()
    cumulative_importance = np.cumsum(importance_values)
    if total_importance > 0:
        cumulative_importance = cumulative_importance / total_importance

    # Feature importance visualization.
    plt.figure(figsize=(15, 8))

    plt.subplot(1, 2, 1)
    top_15_features = outcome_feature_imp.head(15)
    plt.barh(range(len(top_15_features)), top_15_features['importance'])
    plt.yticks(range(len(top_15_features)), top_15_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 15 Feature Importances')
    plt.gca().invert_yaxis()  # first row of the table at the top

    plt.subplot(1, 2, 2)
    plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance)
    plt.axhline(y=0.8, color='r', linestyle='--', label='80% threshold')
    plt.axhline(y=0.9, color='orange', linestyle='--', label='90% threshold')
    plt.xlabel('Number of Features')
    plt.ylabel('Cumulative Importance')
    plt.title('Cumulative Feature Importance')
    plt.legend()
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Number of leading features needed to reach 80% and 90% of the total.
    # np.argmax returns 0 when no element is True, so a threshold that is never
    # reached falls back to "all features" instead of reporting 1.
    reached_80 = cumulative_importance >= 0.8
    reached_90 = cumulative_importance >= 0.9
    features_80 = int(np.argmax(reached_80)) + 1 if reached_80.any() else len(cumulative_importance)
    features_90 = int(np.argmax(reached_90)) + 1 if reached_90.any() else len(cumulative_importance)

    print(f"\nFeatures needed for 80% importance: {features_80}")
    print(f"Features needed for 90% importance: {features_90}")
    print(f"Total features: {len(outcome_feature_imp)}")

## 5. Recommendation System Testing

In [None]:
print("\n" + "=" * 60)
print("RECOMMENDATION SYSTEM TESTING")
print("=" * 60)

# Sample patient profiles. They are only displayed below; no model in this
# cell receives them.
sample_patients = [
    {
        'patient_id': 1001,
        'age': 35,
        'gender': 'Female',
        'symptoms': 'Fatigue',
        'diagnosis': 'Diabetes',
        'previous_treatment': 'None',
        'severity': 'Mild'
    },
    {
        'patient_id': 1002,
        'age': 65,
        'gender': 'Male',
        'symptoms': 'Cough',
        'diagnosis': 'Hypertension',
        'previous_treatment': 'Medication A',
        'severity': 'Severe'
    },
    {
        'patient_id': 1003,
        'age': 45,
        'gender': 'Female',
        'symptoms': 'Headache',
        'diagnosis': 'Depression',
        'previous_treatment': 'Therapy',
        'severity': 'Moderate'
    }
]

print("Testing recommendation system with sample patients...")
print("\nNote: This is a demonstration of the recommendation system structure.")
print("In a production environment, you would:")
print("1. Load the trained models using the TreatmentRecommendationSystem class")
print("2. Preprocess new patient data")
print("3. Generate recommendations and outcome predictions")

# Display sample patient profiles (the identifier is omitted from the output).
for i, patient in enumerate(sample_patients, 1):
    print(f"\nSample Patient {i}:")
    print("-" * 20)
    for key, value in patient.items():
        if key != 'patient_id':
            print(f"{key.replace('_', ' ').title()}: {value}")

# Demonstrate prediction on test data.
print("\n\nDemonstration with actual test data:")
print("-" * 40)

# Predictions for the first test samples: at most 5, fewer when the test set
# is smaller.
sample_predictions = outcome_model.predict(X_test.head())
sample_probabilities = outcome_model.predict_proba(X_test.head()) if hasattr(outcome_model, 'predict_proba') else None

# Iterate over the predictions actually returned so a test set with fewer than
# five rows does not raise an IndexError.
# NOTE(review): indexing outcome_labels and the probability columns with the
# label value assumes integer-encoded classes 0..n-1 in column order — confirm.
for i, pred_outcome in enumerate(sample_predictions):
    true_outcome = y_test.iloc[i]

    print(f"\nTest Sample {i+1}:")
    print(f"  True Outcome: {outcome_labels[true_outcome]}")
    print(f"  Predicted Outcome: {outcome_labels[pred_outcome]}")
    print(f"  Correct: {'✓' if true_outcome == pred_outcome else '✗'}")

    if sample_probabilities is not None:
        confidence = sample_probabilities[i][pred_outcome]
        print(f"  Confidence: {confidence:.2%}")

## 6. Performance Benchmarking

In [None]:
# Benchmark the outcome model against trivial DummyClassifier baselines.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _weighted_scores(y_true, y_hat):
    """Return accuracy and support-weighted precision, recall and F1.

    zero_division=0 states explicitly that a class with no predicted (or no
    true) samples scores 0. This happens for the constant baselines, and it is
    the same value sklearn's default produces, without the warning.
    """
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'precision': precision_score(y_true, y_hat, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_hat, average='weighted', zero_division=0),
        'f1': f1_score(y_true, y_hat, average='weighted', zero_division=0),
    }


print("\n" + "=" * 60)
print("PERFORMANCE BENCHMARKING")
print("=" * 60)

# Baseline models.
baseline_models = {
    'Random': DummyClassifier(strategy='uniform', random_state=42),
    'Most Frequent': DummyClassifier(strategy='most_frequent'),
    'Stratified': DummyClassifier(strategy='stratified', random_state=42)
}

# Fit and score the baselines.
# NOTE: the baselines are fitted on the test set itself because no training
# data is loaded in this notebook. They therefore see the test label
# distribution, which can only flatter the baselines, not the model.
baseline_results = {}
for name, model in baseline_models.items():
    model.fit(X_test, y_test)
    y_pred_baseline = model.predict(X_test)
    baseline_results[name] = _weighted_scores(y_test, y_pred_baseline)

# Our model's performance, using the test predictions from the error analysis.
our_model_results = _weighted_scores(y_test, y_pred)

# Long-format comparison table: one row per (model, metric).
scores_by_model = {
    f'Baseline - {model_name}': metrics
    for model_name, metrics in baseline_results.items()
}
scores_by_model[f'Our Model - {model_metadata["outcome_model"]["name"]}'] = our_model_results

comparison_data = [
    {'Model': model_label, 'Metric': metric_name.title(), 'Score': value}
    for model_label, metrics in scores_by_model.items()
    for metric_name, value in metrics.items()
]
comparison_df = pd.DataFrame(comparison_data)

# Visualize comparison.
plt.figure(figsize=(15, 8))

plt.subplot(1, 2, 1)
pivot_df = comparison_df.pivot(index='Model', columns='Metric', values='Score')
sns.heatmap(pivot_df, annot=True, fmt='.3f', cmap='RdYlGn', center=0.5)
plt.title('Model Performance Comparison')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
sns.barplot(data=comparison_df, x='Metric', y='Score', hue='Model')
plt.title('Performance Metrics Comparison')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Print comparison table.
print("\nPerformance Comparison:")
print(pivot_df.round(4))

# Absolute and relative accuracy gain over the strongest baseline.
best_baseline_acc = max(results['accuracy'] for results in baseline_results.values())
improvement = our_model_results['accuracy'] - best_baseline_acc
print(f"\nImprovement over best baseline: {improvement:.4f} ({improvement/best_baseline_acc*100:.1f}%)")

## 7. Final Evaluation Summary

In [None]:
# Final summary. Relies on results produced by the earlier cells (error
# analysis, interpretation and benchmarking).
summary_rule = "=" * 70
print("\n" + summary_rule)
print("FINAL EVALUATION SUMMARY")
print(summary_rule)

# Values reused throughout the summary, computed once.
cv_score = model_metadata['outcome_model']['cv_score']
test_accuracy = our_model_results['accuracy']
# Absolute gap between cross-validation and test accuracy. The direction is
# ignored, so a test score above the CV score also counts towards the gap.
cv_gap = abs(cv_score - test_accuracy)

# Model performance summary.
print("\n1. MODEL PERFORMANCE:")
print(f"   - Model: {model_metadata['outcome_model']['name']}")
print(f"   - Test Accuracy: {test_accuracy:.4f}")
print(f"   - Test Precision: {our_model_results['precision']:.4f}")
print(f"   - Test Recall: {our_model_results['recall']:.4f}")
print(f"   - Test F1-Score: {our_model_results['f1']:.4f}")

# Cross-validation performance.
if cv_gap < 0.05:
    overfitting_level = 'Low'
elif cv_gap < 0.1:
    overfitting_level = 'Moderate'
else:
    overfitting_level = 'High'

print("\n2. CROSS-VALIDATION PERFORMANCE:")
print(f"   - CV Accuracy: {cv_score:.4f}")
print(f"   - CV-Test Gap: {cv_gap:.4f}")
print(f"   - Overfitting: {overfitting_level}")

# Feature analysis.
if outcome_feature_imp is not None:
    print("\n3. FEATURE ANALYSIS:")
    print(f"   - Total features: {len(outcome_feature_imp)}")
    print(f"   - Features for 80% importance: {features_80}")
    print(f"   - Most important feature: {outcome_feature_imp.iloc[0]['feature']}")
    print(f"   - Top feature importance: {outcome_feature_imp.iloc[0]['importance']:.4f}")

# Error analysis summary.
print("\n4. ERROR ANALYSIS:")
print(f"   - Total misclassifications: {len(misclassified)} ({len(misclassified)/len(error_df)*100:.1f}%)")

# Largest cell of the misclassification table, with the 'Total' margin row and
# column stripped. Guarded because the table is empty when nothing was
# misclassified.
confusion_counts = misclass_patterns.iloc[:-1, :-1]
if confusion_counts.size:
    count_matrix = confusion_counts.to_numpy()
    row_pos, col_pos = np.unravel_index(count_matrix.argmax(), count_matrix.shape)
    worst_true = confusion_counts.index[row_pos]
    worst_pred = confusion_counts.columns[col_pos]
    print(f"   - Most confused classes: {count_matrix.max()} misclassifications "
          f"(true={worst_true}, predicted={worst_pred})")
else:
    print("   - Most confused classes: none (no misclassifications)")

if y_pred_proba is not None:
    avg_confidence = error_df['prediction_confidence'].mean()
    correct_confidence = error_df[error_df['correct_prediction']]['prediction_confidence'].mean()
    incorrect_confidence = error_df[~error_df['correct_prediction']]['prediction_confidence'].mean()

    print(f"   - Average prediction confidence: {avg_confidence:.3f}")
    print(f"   - Correct predictions confidence: {correct_confidence:.3f}")
    print(f"   - Incorrect predictions confidence: {incorrect_confidence:.3f}")

# Baseline comparison.
print("\n5. BASELINE COMPARISON:")
print(f"   - Best baseline accuracy: {best_baseline_acc:.4f}")
print(f"   - Our model accuracy: {test_accuracy:.4f}")
print(f"   - Improvement: {improvement:.4f} ({improvement/best_baseline_acc*100:.1f}%)")

# Recommendations.
print("\n6. RECOMMENDATIONS:")
if test_accuracy > 0.8:
    print("   ✓ Model shows good performance (>80% accuracy)")
elif test_accuracy > 0.7:
    print("   ⚠ Model shows moderate performance (70-80% accuracy)")
else:
    print("   ✗ Model needs improvement (<70% accuracy)")

if cv_gap < 0.05:
    print("   ✓ Low overfitting - model generalizes well")
else:
    print("   ⚠ Consider regularization to reduce overfitting")

if improvement > 0.1:
    print("   ✓ Significant improvement over baseline models")
else:
    print("   ⚠ Consider more advanced techniques or feature engineering")

# Deployment readiness: one point each for accuracy, CV stability and gain
# over the baselines; at least two points are required.
print("\n7. DEPLOYMENT READINESS:")
deployment_checks = [
    test_accuracy > 0.75,
    cv_gap < 0.05,
    improvement > 0.05,
]
deployment_score = sum(bool(passed) for passed in deployment_checks)

if deployment_score >= 2:
    print("   ✓ Model is ready for deployment")
    print("   ✓ Consider A/B testing in production")
else:
    print("   ⚠ Model needs further improvement before deployment")
    print("   ⚠ Consider collecting more data or trying different algorithms")

print("\n" + summary_rule)
print("EVALUATION COMPLETED SUCCESSFULLY! 🎉")
print(summary_rule)