# Phase 9.4: Final Results Analysis & Production Readiness

This notebook analyzes the best trained model and provides production recommendations.

## Setup & Configuration

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project to path
project_root = Path('..')
sys.path.insert(0, str(project_root))

# Configure plotting
%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Suppress warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Configuration
OUTPUT_DIR = Path('outputs/results')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

DATA_DIR = project_root / 'data' / 'processed'
MODELS_DIR = project_root / 'models'

CLASSES = ['No Leak', '1/16"', '3/32"', '1/8"']

print(f"Output directory: {OUTPUT_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Models directory: {MODELS_DIR}")

## Load Data and Model

In [None]:
# Load the test split. If either file is missing, both arrays are set to None
# so that the analysis cells below skip themselves instead of crashing.
try:
    X_test = np.load(DATA_DIR / 'X_test.npy')
    y_test = np.load(DATA_DIR / 'y_test.npy')
    print("✓ Test data loaded")
    print(f"  Shape: {X_test.shape}")
except FileNotFoundError:
    print("⚠ Test data not found")
    X_test = y_test = None

# Try to load trained model (best effort: any failure leaves `model` as None).
model = None
try:
    from tensorflow.keras.models import load_model

    # Candidates: every top-level .h5 file plus any best_model.h5 at any depth.
    # A set removes the duplicate produced when best_model.h5 sits directly in
    # MODELS_DIR (it matches both patterns).
    candidates = set(MODELS_DIR.glob('*.h5')) | set(MODELS_DIR.glob('**/best_model.h5'))

    # glob() order is arbitrary, so sort for a reproducible choice and put
    # files named best_model.h5 first, since this notebook analyzes the best model.
    model_files = sorted(
        candidates,
        key=lambda path: (path.name != 'best_model.h5', str(path)),
    )

    if model_files:
        model_path = model_files[0]
        model = load_model(model_path)
        print(f"\n✓ Model loaded: {model_path.name}")
        if len(model_files) > 1:
            print(f"  ({len(model_files) - 1} other candidate file(s) ignored; loaded {model_path})")
    else:
        print("\n⚠ No trained model found")
        print("  Train a model first: python scripts/train_model.py ...")
except Exception as e:
    print(f"\n⚠ Could not load model: {e}")

## Evaluation Metrics

In [None]:
# Compute overall and per-class metrics on the test split. Skipped entirely
# when the model or the test data could not be loaded.
if model is not None and X_test is not None:
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        confusion_matrix, roc_auc_score, roc_curve, auc, classification_report
    )
    from tensorflow.keras.utils import to_categorical

    # Integer class ids are the positions in CLASSES.
    label_ids = np.arange(len(CLASSES))

    # Make predictions: per-class scores, then the highest-scoring class id.
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Compute metrics (weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print("\n=== Overall Metrics ===")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    # Per-class metrics: accuracy restricted to samples whose true label is
    # the given class (i.e. the fraction of that class predicted correctly).
    print("\n=== Per-Class Metrics ===")
    for class_idx, class_name in enumerate(CLASSES):
        mask = y_test == class_idx
        if mask.sum() > 0:
            class_acc = accuracy_score(y_test[mask], y_pred[mask])
            print(f"{class_name:10s}: {class_acc:.4f} ({mask.sum()} samples)")

    # Classification report. `labels` is passed explicitly so the report always
    # has one row per entry in CLASSES; without it, a class missing from both
    # y_test and y_pred makes the label count differ from len(target_names)
    # and scikit-learn raises a ValueError.
    print("\n=== Detailed Classification Report ===")
    print(classification_report(
        y_test, y_pred,
        labels=label_ids,
        target_names=CLASSES,
        zero_division=0,
    ))

## Confusion Matrix

In [None]:
# Plot and save the confusion matrix for the test split. Relies on `y_pred`
# from the evaluation cell, which runs under the same guard.
if model is not None and X_test is not None:
    from sklearn.metrics import confusion_matrix

    # Fix the label set so the matrix is always len(CLASSES) x len(CLASSES);
    # otherwise a class absent from both y_test and y_pred shrinks the matrix
    # and the tick labels below no longer line up with its rows/columns.
    cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(CLASSES)))

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(8, 7))

    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=CLASSES,
        yticklabels=CLASSES,
        ax=ax,
        cbar_kws={'label': 'Count'}
    )

    ax.set_title('Confusion Matrix - Test Set', fontweight='bold', fontsize=14)
    ax.set_xlabel('Predicted', fontweight='bold')
    ax.set_ylabel('True', fontweight='bold')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'confusion_matrix.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: confusion_matrix.png")

## ROC Curves (One-vs-Rest)

In [None]:
# One-vs-rest ROC curve per class, drawn on a 2x2 grid and saved to disk.
# Uses `y_pred_proba` produced by the evaluation cell.
if model is not None and X_test is not None:
    from sklearn.metrics import roc_curve, auc
    from sklearn.preprocessing import label_binarize

    # One indicator column per class id (0 .. len(CLASSES) - 1).
    y_test_bin = label_binarize(y_test, classes=np.arange(len(CLASSES)))

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    axes = axes.flatten()

    for class_idx, class_name in enumerate(CLASSES):
        # This class's indicator column against the model's score for it.
        fpr, tpr, _ = roc_curve(
            y_test_bin[:, class_idx],
            y_pred_proba[:, class_idx],
        )
        roc_auc = auc(fpr, tpr)

        ax = axes[class_idx]

        # Model curve first, then the chance diagonal for reference.
        curve_label = f'ROC (AUC = {roc_auc:.3f})'
        ax.plot(fpr, tpr, color='b', lw=2, label=curve_label)
        ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random')

        ax.set(
            xlim=[0.0, 1.0],
            ylim=[0.0, 1.05],
            xlabel='False Positive Rate',
            ylabel='True Positive Rate',
        )
        ax.set_title(f'{class_name} (One-vs-Rest)', fontweight='bold')
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'roc_curves.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: roc_curves.png")

## Error Analysis

In [None]:
# Summarize misclassifications: overall error rate, error rate per true class,
# and the five most frequent (true → predicted) confusions.
if model is not None and X_test is not None:
    # Identify misclassifications
    errors = y_test != y_pred
    error_indices = np.where(errors)[0]

    print(f"\n=== Error Analysis ===")
    print(f"Total errors: {errors.sum()} / {len(y_test)} ({100*errors.sum()/len(y_test):.2f}%)")

    # Error distribution
    print(f"\nError distribution by true class:")
    for class_idx, class_name in enumerate(CLASSES):
        mask = y_test == class_idx
        class_errors = errors[mask].sum()
        class_total = mask.sum()
        if class_total > 0:
            print(f"  {class_name:10s}: {class_errors:3d} / {class_total:3d} ({100*class_errors/class_total:5.2f}%)")

    # Analyze common confusions
    print(f"\nCommon misclassification patterns:")
    from sklearn.metrics import confusion_matrix

    # The matrix is indexed below by positions in CLASSES, so pin its label set.
    # Without `labels`, a class missing from both y_test and y_pred yields a
    # smaller matrix and the indexing either fails or reports wrong class names.
    n_classes = len(CLASSES)
    cm = confusion_matrix(y_test, y_pred, labels=np.arange(n_classes))

    # Every off-diagonal cell with at least one sample, as (count, description).
    confusions = [
        (cm[true_idx, pred_idx], f"{CLASSES[true_idx]} → {CLASSES[pred_idx]}")
        for true_idx in range(n_classes)
        for pred_idx in range(n_classes)
        if true_idx != pred_idx and cm[true_idx, pred_idx] > 0
    ]

    # Largest counts first; report the top five.
    confusions.sort(reverse=True)
    for count, pattern in confusions[:5]:
        print(f"  {count:3d}x {pattern}")

## Confidence Analysis

In [None]:
# Analyze prediction confidence (the highest per-class score of each sample):
# distribution for correct vs. incorrect predictions, and how accuracy and
# coverage change when only predictions above a threshold are kept.
if model is not None and X_test is not None:
    max_proba = y_pred_proba.max(axis=1)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram of confidences
    ax = axes[0]
    correct = max_proba[y_test == y_pred]
    incorrect = max_proba[y_test != y_pred]

    ax.hist(correct, bins=20, alpha=0.6, label='Correct', color='green')
    ax.hist(incorrect, bins=20, alpha=0.6, label='Incorrect', color='red')
    ax.set_xlabel('Prediction Confidence')
    ax.set_ylabel('Frequency')
    ax.set_title('Prediction Confidence Distribution', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Accuracy vs confidence threshold
    ax = axes[1]
    thresholds = np.linspace(0, 1, 101)
    accuracies = []
    coverage = []

    for threshold in thresholds:
        mask = max_proba >= threshold
        kept = mask.sum()
        if kept > 0:
            acc = (y_test[mask] == y_pred[mask]).mean()
        else:
            # No prediction passes this threshold, so accuracy is undefined.
            # NaN leaves a gap in the curve instead of a misleading drop to 0.
            acc = np.nan
        cov = kept / len(y_test)
        accuracies.append(acc)
        coverage.append(cov)

    ax.plot(thresholds, accuracies, 'b-', linewidth=2, label='Accuracy')
    ax.plot(thresholds, coverage, 'r--', linewidth=2, label='Coverage')
    ax.set_xlabel('Confidence Threshold')
    ax.set_ylabel('Rate')
    ax.set_title('Accuracy-Coverage Tradeoff', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'confidence_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: confidence_analysis.png")

    # Either group can be empty (e.g. a model with no errors on this split);
    # report that explicitly rather than printing the mean of an empty array.
    mean_correct = f"{correct.mean():.4f}" if correct.size else "n/a (no correct predictions)"
    mean_incorrect = f"{incorrect.mean():.4f}" if incorrect.size else "n/a (no incorrect predictions)"

    print(f"\n=== Confidence Statistics ===")
    print(f"Mean confidence (correct):   {mean_correct}")
    print(f"Mean confidence (incorrect): {mean_incorrect}")
    print(f"\nRecommended threshold: 0.8")
    mask_08 = max_proba >= 0.8
    if mask_08.sum() > 0:
        acc_08 = (y_test[mask_08] == y_pred[mask_08]).mean()
        cov_08 = mask_08.sum() / len(y_test)
        print(f"  Accuracy at 0.8: {acc_08:.4f}")
        print(f"  Coverage at 0.8: {cov_08:.4f}")
    else:
        print("  No predictions reach 0.8 confidence (coverage 0.0000)")

## Model Deployment Checklist

In [None]:
# Build, print and save the deployment checklist.
#
# The metric variables only exist when the evaluation cell actually ran (model
# and test data both loaded), and the value range needs the test data. Build
# those fragments defensively so this cell still produces a checklist instead
# of raising when something upstream was unavailable.
if model is not None and X_test is not None:
    performance_section = "\n".join([
        f"  ✓ Test Accuracy:     {accuracy:.4f}",
        f"  ✓ Precision:         {precision:.4f}",
        f"  ✓ Recall:            {recall:.4f}",
        f"  ✓ F1-Score:          {f1:.4f}",
    ])
else:
    performance_section = "  ⚠ Not available (model or test data was not loaded)"

if X_test is not None:
    value_range = f"[{X_test.min():.4f}, {X_test.max():.4f}]"
else:
    value_range = "[unknown - test data was not loaded]"

checklist = f"""
{"="*70}
PRODUCTION DEPLOYMENT CHECKLIST
{"="*70}

MODEL PERFORMANCE:
{performance_section}

DATA VALIDATION:
  [ ] Input data shape validation (1024, 9)
  [ ] Value range checks: {value_range}
  [ ] NaN/Inf detection
  [ ] Scaling/normalization verification

MODEL OPTIMIZATION:
  [ ] Convert to TFLite for edge deployment
  [ ] Quantize model (INT8/FP16)
  [ ] Test on target hardware
  [ ] Measure latency: <100ms target
  [ ] Measure memory: <50MB target

EXPORT & VERSIONING:
  [ ] Version model checkpoints
  [ ] Document training parameters
  [ ] Version training data
  [ ] Create model card

MONITORING & LOGGING:
  [ ] Log all predictions
  [ ] Track prediction confidence
  [ ] Monitor class distribution drift
  [ ] Set alerts for accuracy drop

TESTING:
  [ ] Unit tests for preprocessing
  [ ] Integration tests for inference
  [ ] Regression tests on known samples
  [ ] Edge cases (min/max values, zeros)

DOCUMENTATION:
  [ ] Document model architecture
  [ ] List training hyperparameters
  [ ] Provide usage examples
  [ ] Include confidence thresholds
  [ ] Document failure modes

COMPLIANCE:
  [ ] Data privacy review
  [ ] Model explainability assessment
  [ ] Bias audit across classes
  [ ] Performance on edge cases
  [ ] Error handling procedures

{"="*70}
KEY RECOMMENDATIONS:

1. Confidence Threshold
   - Recommended: 0.8
   - Use for high-confidence predictions only
   - Log low-confidence predictions for review

2. Error Handling
   - Handle cases where confidence < threshold
   - Flag uncertain predictions for manual review
   - Implement fallback procedures

3. Monitoring
   - Track prediction distribution per class
   - Monitor accuracy metrics continuously
   - Alert on significant performance drops

4. Maintenance
   - Retrain periodically with new data
   - A/B test model updates
   - Keep historical models for fallback

{"="*70}
"""

print(checklist)

# Save checklist. The text contains non-ASCII characters (e.g. "✓"), so the
# encoding is set explicitly; the platform default may be unable to encode them.
with open(OUTPUT_DIR / 'deployment_checklist.txt', 'w', encoding='utf-8') as f:
    f.write(checklist)

print("✓ Checklist saved: deployment_checklist.txt")

## Final Summary

In [None]:
# Closing summary: generated artifacts, headline metrics, and next steps.
print("\n" + "="*70)
print("RESULTS ANALYSIS COMPLETE")
print("="*70)
print("\nOutputs saved to:", OUTPUT_DIR)
# The plots are only written when the analysis cells ran, so flag any
# artifact that is not actually present instead of claiming it was saved.
for artifact_name in (
    'confusion_matrix.png',
    'roc_curves.png',
    'confidence_analysis.png',
    'deployment_checklist.txt',
):
    missing_note = "" if (OUTPUT_DIR / artifact_name).exists() else "  (not generated)"
    print(f"  - {artifact_name}{missing_note}")
print("\nKEY METRICS:")
# The metric variables exist only when the evaluation cell ran, which requires
# both the model and the test data; avoid a NameError otherwise.
if model is not None and X_test is not None:
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
else:
    print("  Not available (model or test data was not loaded)")
print("\nNEXT STEPS:")
print("  1. Review deployment checklist")
print("  2. Export model: python scripts/export_model.py --model-path ...")
print("  3. Run benchmarks: python scripts/benchmark.py --model-path ...")
print("  4. Deploy to production")
print("\n" + "="*70)