# In [None]:
# Model Evaluation Notebook

import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from pathlib import Path

# Add project root to path
sys.path.append('..')

from src.models import SystemsThinkingClassifier, SubdimensionClassifier
from src.preprocessing import DocumentParser, TextCleaner, ContentFilter
from src.rag import SemanticRetriever, ContextProcessor

# Initialize models and components
# Each constructor receives a single string argument that looks like a
# relative model location.
# NOTE(review): these strings have no '..' prefix, whereas the sys.path entry
# and the test-data CSV are both resolved from the parent directory — confirm
# that a "models" directory really exists relative to the working directory.
classifier = SystemsThinkingClassifier("models/systems_thinking_classifier")
subdim_classifier = SubdimensionClassifier("models/subdimension_classifier")

# Load test data
def load_test_data():
    """Read the annotated test set from ``../data/test_data.csv``.

    The path is relative to the current working directory.

    Returns:
        The parsed CSV as a pandas DataFrame.
    """
    csv_location = '../data/test_data.csv'
    annotated_frame = pd.read_csv(csv_location)
    return annotated_frame

test_data = load_test_data()

# 1. High-Level Classification Performance
print("High-Level Classification Performance")
print("-" * 50)

# Ground-truth labels are taken directly from the annotated column.
y_true = test_data['is_systems_thinking']

# Classify every document once; each call returns a (prediction, confidence)
# pair, which is then split into two parallel lists.
scored = [classifier.predict(sample) for sample in test_data['text']]
y_pred = [label for label, _score in scored]
confidences = [score for _label, score in scored]

# Plot confusion matrix
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix - High Level Classification')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
plt.show()

# Classification report
print("\nClassification Report:")
report_text = classification_report(y_true, y_pred)
print(report_text)

# 2. Confidence Distribution
# Histogram (30 bins) of the confidence values collected in `confidences`.
conf_fig, conf_ax = plt.subplots(figsize=(10, 6))
sns.histplot(confidences, bins=30, ax=conf_ax)
conf_ax.set_title('Distribution of Prediction Confidences')
conf_ax.set_xlabel('Confidence')
conf_ax.set_ylabel('Count')
plt.show()

# 3. Subdimension Performance
print("\nSubdimension Performance")
print("-" * 50)

# Filter for systems thinking examples.
# Comparing against 1 yields an explicit boolean mask whether the label column
# was parsed as bool or as 0/1 integers; indexing the frame with an integer
# Series directly would be treated as a column lookup rather than a row filter.
systems_thinking_data = test_data[test_data['is_systems_thinking'] == 1]

# Per-dimension tallies: 'total' counts rows annotated 1 for that dimension,
# 'correct' counts how many of those rows the model scored above 0.5.
subdim_names = [
    'purpose',
    'tensions',
    'macro_issue_why',
    'macro_issue_how',
    'micro_issue_why',
    'micro_issue_how',
    'collaboration',
    'agency',
]
subdim_metrics = {dim: {'correct': 0, 'total': 0} for dim in subdim_names}

# Get subdimension predictions (one classifier call per document).
for _, row in systems_thinking_data.iterrows():
    predictions = subdim_classifier.predict(row['text'])
    for dim, tally in subdim_metrics.items():
        if row[dim] != 1:
            # Rows not annotated with this dimension are not counted at all.
            continue
        tally['total'] += 1
        if predictions[dim] > 0.5:
            tally['correct'] += 1

# Calculate accuracy per dimension (0 when a dimension has no positive rows).
# NOTE(review): only positively-annotated rows are counted, so this ratio is
# per-dimension recall rather than accuracy over all rows. The variable name
# and plot labels are kept unchanged — confirm which metric is intended.
accuracies = {
    dim: (tally['correct'] / tally['total'] if tally['total'] > 0 else 0)
    for dim, tally in subdim_metrics.items()
}

# Plot subdimension accuracies
plt.figure(figsize=(12, 6))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))
plt.title('Accuracy by Subdimension')
plt.xticks(rotation=45)
plt.ylabel('Accuracy')
plt.tight_layout()
plt.show()

# 4. Misclassification Analysis
print("\nMisclassification Analysis")
print("-" * 50)

# Compute the error mask once. Attaching the confidences to the full frame
# positionally *before* filtering keeps every confidence on its own row without
# relying on the frame's index labels being 0..n-1.
error_mask = y_true != y_pred
misclassified = test_data.assign(confidence=confidences)[error_mask]

# Sort by confidence: the most confidently wrong predictions come first.
misclassified_high_conf = misclassified.sort_values('confidence', ascending=False).head(5)

print("\nTop 5 High Confidence Misclassifications:")
for _, row in misclassified_high_conf.iterrows():
    print(f"\nTrue Label: {row['is_systems_thinking']}")
    print(f"Confidence: {row['confidence']:.3f}")
    # Only the first 200 characters of the document are shown.
    print(f"Text: {row['text'][:200]}...")

# 5. RAG Impact Analysis
print("\nRAG Impact Analysis")
print("-" * 50)

# Initialize RAG components
# NOTE(review): the retriever is built with vector_store=None and
# embeddings_generator=None; whether retrieve() works in that configuration
# cannot be told from here — confirm real components are not required.
retriever = SemanticRetriever(vector_store=None, embeddings_generator=None)
context_processor = ContextProcessor()

# Compare performance with and without RAG on the first 10 test rows.
# Rows are gathered in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call. Building the frame in one go also gives numeric confidence
# columns instead of object ones.
rag_rows = []

for _, row in test_data.head(10).iterrows():
    # Without RAG
    pred_no_rag, conf_no_rag = classifier.predict(row['text'])

    # With RAG: retrieve contexts for the text, assemble them, and pass the
    # result to the same classifier through its `context` argument.
    contexts = retriever.retrieve(row['text'])
    processed_context = context_processor.assemble_context(contexts)
    pred_with_rag, conf_with_rag = classifier.predict(row['text'], context=processed_context)

    rag_rows.append({
        'Text': row['text'][:100],
        'Without_RAG': conf_no_rag,
        'With_RAG': conf_with_rag,
    })

rag_comparison = pd.DataFrame(rag_rows, columns=['Text', 'Without_RAG', 'With_RAG'])

# Plot RAG comparison
# DataFrame.plot opens its own figure unless given an axes, so the sized
# figure is created first and passed in; otherwise an empty figure is left over.
rag_fig, rag_ax = plt.subplots(figsize=(10, 6))
rag_comparison[['Without_RAG', 'With_RAG']].plot(kind='bar', ax=rag_ax)
rag_ax.set_title('Impact of RAG on Prediction Confidence')
rag_ax.set_xlabel('Sample')
rag_ax.set_ylabel('Confidence')
rag_ax.legend()
rag_fig.tight_layout()
plt.show()

# 6. Error Analysis by Text Length
# Character count of every document, stored as a new column on the test frame.
test_data['text_length'] = test_data['text'].str.len()

# Pair each document's length with a flag that is True where the prediction
# differs from the true label.
wrong_prediction = y_true != y_pred
error_by_length = test_data[['text_length']].assign(is_error=wrong_prediction)

length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='is_error', y='text_length', data=error_by_length, ax=length_ax)
length_ax.set_title('Text Length Distribution for Correct vs Incorrect Predictions')
length_ax.set_xlabel('Is Error')
length_ax.set_ylabel('Text Length')
plt.show()

# 7. Summary Statistics
print("\nSummary Statistics")
print("-" * 50)

# Fraction of documents whose prediction equals the true label.
overall_accuracy = (y_true == y_pred).mean()
print(f"Overall Accuracy: {overall_accuracy:.3f}")

# Mean of the confidence values across all predictions.
mean_confidence = np.mean(confidences)
print(f"Average Confidence: {mean_confidence:.3f}")

# One line per subdimension, in the order stored in `accuracies`.
print("\nSubdimension Performance:")
for dim_name, dim_score in accuracies.items():
    print(f"{dim_name}: {dim_score:.3f}")