# Validation Module

> Model performance evaluation and validation metrics.

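This module provides:
- `compute_metrics()`: Compute metrics from true and predicted labels
- `validate_classifications()`: Evaluate model on test set
- `cross_validate()`: k-fold cross-validation
- `performance_comparison()`: Compare metrics across runs or folds
- `ClassificationMetrics`: Accuracy, precision, recall, F1, Cohen's kappa
- `ValidationResult`: Complete validation results with confusion matrices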

In [None]:
#| default_exp validation

In [None]:
#| export
from __future__ import annotations
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, List, Dict, Tuple, Any
import json
import logging

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    cohen_kappa_score,
)
from sklearn.model_selection import StratifiedKFold

from openness_classifier.core import (
    OpennessCategory,
    ClassificationType,
    Classification,
    LLMConfiguration,
)
from openness_classifier.config import ClassifierConfig, load_config
from openness_classifier.data import (
    TrainingExample,
    load_training_data,
    train_test_split,
    EmbeddingModel,
    compute_embeddings,
)
from openness_classifier.classifier import OpennessClassifier

## Metrics Data Classes

In [None]:
#| export
@dataclass
class ClassificationMetrics:
    """Classification performance metrics for a single type (data or code).

    Attributes:
        accuracy: Overall accuracy
        precision_per_class: Precision for each category
        recall_per_class: Recall for each category
        f1_per_class: F1 score for each category
        macro_f1: Macro-averaged F1
        weighted_f1: Weighted F1 by class support
        cohens_kappa: Inter-rater agreement (vs human coders)
        support_per_class: Number of samples per class
    """
    accuracy: float
    precision_per_class: Dict[str, float]
    recall_per_class: Dict[str, float]
    f1_per_class: Dict[str, float]
    macro_f1: float
    weighted_f1: float
    cohens_kappa: float
    support_per_class: Dict[str, int]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary of plain Python values.

        Every number is coerced to a built-in ``float`` or ``int`` so the
        result can be handed to ``json.dumps`` even when the fields hold
        NumPy scalars (NumPy integer types are not JSON serializable).
        The per-class dictionaries are copied, not returned by reference.

        Returns:
            Dictionary with the same keys as the dataclass fields.
        """
        def as_floats(per_class: Dict[str, float]) -> Dict[str, float]:
            return {cls: float(value) for cls, value in per_class.items()}

        return {
            'accuracy': float(self.accuracy),
            'precision_per_class': as_floats(self.precision_per_class),
            'recall_per_class': as_floats(self.recall_per_class),
            'f1_per_class': as_floats(self.f1_per_class),
            'macro_f1': float(self.macro_f1),
            'weighted_f1': float(self.weighted_f1),
            'cohens_kappa': float(self.cohens_kappa),
            'support_per_class': {
                cls: int(count) for cls, count in self.support_per_class.items()
            },
        }

    def to_markdown(self) -> str:
        """Generate markdown tables for manuscript.

        The first table lists the aggregate metrics; the second has one row
        per class. The four openness categories come first in their
        canonical order, followed by any other class present in
        ``precision_per_class`` so that no class is silently dropped.

        Returns:
            Markdown text with both tables separated by a blank line.
        """
        lines = [
            "| Metric | Value |",
            "|--------|-------|",
            f"| Accuracy | {self.accuracy:.3f} |",
            f"| Macro F1 | {self.macro_f1:.3f} |",
            f"| Weighted F1 | {self.weighted_f1:.3f} |",
            f"| Cohen's Kappa | {self.cohens_kappa:.3f} |",
            "",
            "| Class | Precision | Recall | F1 | Support |",
            "|-------|-----------|--------|----|---------|",
        ]

        canonical = ['open', 'mostly_open', 'mostly_closed', 'closed']
        ordered = [cls for cls in canonical if cls in self.precision_per_class]
        # Classes outside the canonical set keep their dictionary order.
        ordered.extend(
            cls for cls in self.precision_per_class if cls not in canonical
        )

        for cls in ordered:
            lines.append(
                f"| {cls} | {self.precision_per_class[cls]:.3f} | "
                f"{self.recall_per_class[cls]:.3f} | "
                f"{self.f1_per_class[cls]:.3f} | "
                f"{self.support_per_class[cls]} |"
            )

        return "\n".join(lines)

In [None]:
#| export
@dataclass
class ValidationResult:
    """Outcome of one validation run: metrics, matrices and run metadata.

    Attributes:
        data_metrics: Metrics for data classification (None if not computed)
        code_metrics: Metrics for code classification (None if not computed)
        overall_accuracy: Combined accuracy
        test_set_size: Number of test samples
        train_set_size: Number of training samples
        validation_timestamp: When validation was performed
        model_config: LLM configuration used
        confusion_matrices: {"data": matrix, "code": matrix}
        misclassified_examples: List of misclassified statements
    """
    data_metrics: Optional[ClassificationMetrics] = None
    code_metrics: Optional[ClassificationMetrics] = None
    overall_accuracy: float = 0.0
    test_set_size: int = 0
    train_set_size: int = 0
    validation_timestamp: datetime = field(default_factory=datetime.utcnow)
    model_config: Optional[LLMConfiguration] = None
    confusion_matrices: Dict[str, np.ndarray] = field(default_factory=dict)
    misclassified_examples: List[Tuple[str, str, str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Build a JSON-friendly dictionary view of this result.

        Nested objects are expanded through their own ``to_dict()``,
        confusion matrices become nested lists, and the misclassified
        examples are reduced to a count.
        """
        def expand(obj):
            # Falsy (e.g. missing) components are exported as None.
            return obj.to_dict() if obj else None

        payload: Dict[str, Any] = {}
        payload['data_metrics'] = expand(self.data_metrics)
        payload['code_metrics'] = expand(self.code_metrics)
        payload['overall_accuracy'] = self.overall_accuracy
        payload['test_set_size'] = self.test_set_size
        payload['train_set_size'] = self.train_set_size
        payload['validation_timestamp'] = self.validation_timestamp.isoformat()
        payload['model_config'] = expand(self.model_config)
        payload['confusion_matrices'] = {
            name: matrix.tolist()
            for name, matrix in self.confusion_matrices.items()
        }
        payload['misclassified_count'] = len(self.misclassified_examples)
        return payload

    def to_json(self, path: Optional[str | Path] = None) -> str:
        """Serialize to indented JSON, optionally writing it to ``path``.

        Args:
            path: Destination file; nothing is written when it is falsy.

        Returns:
            The JSON text.
        """
        serialized = json.dumps(self.to_dict(), indent=2)
        if not path:
            return serialized
        Path(path).write_text(serialized)
        return serialized

    def to_markdown(self) -> str:
        """Render a markdown report: run summary, then per-type metrics."""
        stamp = self.validation_timestamp.strftime('%Y-%m-%d %H:%M UTC')
        report = [
            "# Validation Results",
            "",
            f"**Date**: {stamp}",
            f"**Test Set Size**: {self.test_set_size}",
            f"**Training Set Size**: {self.train_set_size}",
            f"**Overall Accuracy**: {self.overall_accuracy:.1%}",
            "",
        ]

        sections = (
            ("## Data Classification Metrics", self.data_metrics),
            ("## Code Classification Metrics", self.code_metrics),
        )
        for heading, metrics in sections:
            if metrics:
                report += [heading, "", metrics.to_markdown(), ""]

        return "\n".join(report)

## Validation Functions

In [None]:
#| export
def compute_metrics(
    y_true: List[str],
    y_pred: List[str],
    labels: Optional[List[str]] = None
) -> ClassificationMetrics:
    """Compute classification metrics from true and predicted labels.

    Per-class values are reported only for labels that occur in ``y_true``
    or ``y_pred``. All numbers are stored as built-in ``float``/``int``
    rather than NumPy scalars so the result can be serialized to JSON.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        labels: Preferred label order for the per-class dictionaries
            (default: openness categories). Observed labels missing from
            this list are still reported, appended in sorted order.

    Returns:
        ClassificationMetrics with all computed values
    """
    if labels is None:
        labels = ['open', 'mostly_open', 'mostly_closed', 'closed']

    # Keep only labels that appear in the data, in the requested order;
    # dict.fromkeys drops duplicates while preserving that order.
    observed = set(y_true) | set(y_pred)
    present_labels = [lab for lab in dict.fromkeys(labels) if lab in observed]
    present_labels.extend(sorted(observed.difference(present_labels)))

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=present_labels, zero_division=0
    )

    # Macro and weighted F1
    _, _, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    _, _, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    # Cohen's kappa
    kappa = cohen_kappa_score(y_true, y_pred)

    def per_class(values, cast):
        # Pair each label with its value, converted to a native Python type.
        return {lab: cast(val) for lab, val in zip(present_labels, values)}

    return ClassificationMetrics(
        accuracy=float(accuracy),
        precision_per_class=per_class(precision, float),
        recall_per_class=per_class(recall, float),
        f1_per_class=per_class(f1, float),
        macro_f1=float(macro_f1),
        weighted_f1=float(weighted_f1),
        cohens_kappa=float(kappa),
        support_per_class=per_class(support, int),
    )

In [None]:
#| export
def validate_classifications(
    test_examples: List[TrainingExample],
    classifier: OpennessClassifier,
    progress_callback: Optional[callable] = None,
) -> ValidationResult:
    """Run the classifier over a test set and score it against ground truth.

    Data and code statements are evaluated in two separate passes (data
    first); each pass gets its own metrics and confusion matrix, and the
    labels of both passes are pooled for the overall accuracy.

    Args:
        test_examples: Test examples with ground truth labels
        classifier: Trained classifier
        progress_callback: Optional callback(processed, total), where
            total is the size of the whole test set

    Returns:
        ValidationResult with metrics and confusion matrices
    """
    total = len(test_examples)
    category_order = ['open', 'mostly_open', 'mostly_closed', 'closed']

    data_subset = [
        ex for ex in test_examples
        if ex.statement_type == ClassificationType.DATA
    ]
    code_subset = [
        ex for ex in test_examples
        if ex.statement_type == ClassificationType.CODE
    ]

    report = ValidationResult(
        test_set_size=total,
        train_set_size=len(classifier.data_examples) + len(classifier.code_examples),
        model_config=classifier.config.llm,
    )

    pooled_true = []
    pooled_pred = []
    already_done = 0

    passes = (
        ('data', 'data_metrics', ClassificationType.DATA, data_subset),
        ('code', 'code_metrics', ClassificationType.CODE, code_subset),
    )
    for key, metrics_attr, kind, subset in passes:
        if not subset:
            continue

        def forward_progress(done, _offset=already_done):
            # Translate the per-pass count into a whole-test-set count.
            if progress_callback:
                return progress_callback(_offset + done, total)
            return None

        truths, guesses, misses = _validate_examples(
            subset, classifier, kind, forward_progress
        )

        # Examples that failed to classify are absent from these lists,
        # so a pass can end up with nothing to score.
        if truths:
            setattr(report, metrics_attr, compute_metrics(truths, guesses))
            report.confusion_matrices[key] = confusion_matrix(
                truths, guesses, labels=category_order
            )
            pooled_true.extend(truths)
            pooled_pred.extend(guesses)
            report.misclassified_examples.extend(misses)

        already_done += len(subset)

    # Overall accuracy
    if pooled_true:
        report.overall_accuracy = accuracy_score(pooled_true, pooled_pred)

    return report


def _validate_examples(
    examples: List[TrainingExample],
    classifier: OpennessClassifier,
    statement_type: ClassificationType,
    progress_callback: Optional[callable] = None,
) -> Tuple[List[str], List[str], List[Tuple[str, str, str]]]:
    """Classify each example and collect true vs. predicted labels.

    An example whose classification raises is logged and left out of all
    three returned lists; the progress callback still fires for it.

    Args:
        examples: Examples to classify
        classifier: Classifier whose ``classify_statement`` is called
        statement_type: Type passed through to the classifier
        progress_callback: Optional callback(n_processed), 1-based

    Returns:
        (true labels, predicted labels, misclassified), where each
        misclassified entry is (first 100 chars of the statement,
        true label, predicted label)
    """
    truths = []
    guesses = []
    mismatches = []

    for position, example in enumerate(examples, start=1):
        try:
            outcome = classifier.classify_statement(
                example.statement_text,
                statement_type,
                return_reasoning=False,
            )

            expected = example.ground_truth.value
            predicted = outcome.category.value

            truths.append(expected)
            guesses.append(predicted)

            if expected != predicted:
                snippet = example.statement_text[:100]
                mismatches.append((snippet, expected, predicted))

        except Exception as err:
            # Best effort: one failing example must not abort the run.
            logging.error(f"Failed to classify example {example.id}: {err}")

        if progress_callback:
            progress_callback(position)

    return truths, guesses, mismatches

## Cross-Validation

In [None]:
#| export
def cross_validate(
    examples: List[TrainingExample],
    config: ClassifierConfig,
    n_folds: int = 5,
    progress_callback: Optional[callable] = None,
) -> List[ValidationResult]:
    """Perform k-fold cross-validation.

    Data and code statements are cross-validated separately: the folds are
    drawn from one statement type while all examples of the other type are
    handed to the classifier unchanged.

    Args:
        examples: All training examples
        config: Classifier configuration
        n_folds: Number of folds (default: 5)
        progress_callback: Optional callback(fold, n_folds), called after
            each completed fold; the fold counter restarts for each
            statement type

    Returns:
        List of ValidationResult, one per fold: the data folds first,
        followed by the code folds
    """
    # Separate by type
    data_examples = [e for e in examples if e.statement_type == ClassificationType.DATA]
    code_examples = [e for e in examples if e.statement_type == ClassificationType.CODE]

    runs = (
        (ClassificationType.DATA, data_examples, code_examples),
        (ClassificationType.CODE, code_examples, data_examples),
    )

    results = []
    for statement_type, own_examples, other_examples in runs:
        if not own_examples:
            continue
        results.extend(
            _cross_validate_type(
                own_examples, other_examples, config, n_folds,
                statement_type, progress_callback,
            )
        )

    return results


def _cross_validate_type(
    examples: List[TrainingExample],
    other_examples: List[TrainingExample],
    config: ClassifierConfig,
    n_folds: int,
    statement_type: ClassificationType,
    progress_callback: Optional[callable] = None,
) -> List[ValidationResult]:
    """Cross-validate for a single type.

    Args:
        examples: Examples of ``statement_type`` to split into folds
        other_examples: Examples of the other type, passed to every
            fold's classifier as-is
        config: Classifier configuration
        n_folds: Requested number of folds; reduced when the data cannot
            support that many stratified splits
        statement_type: Which kind of statement ``examples`` contains
        progress_callback: Optional callback(fold, n_splits)

    Returns:
        One ValidationResult per fold, or an empty list when fewer than
        two splits are possible
    """
    results = []

    labels = [e.ground_truth.value for e in examples]

    # StratifiedKFold rejects n_splits above the sample count, and also
    # n_splits above the size of every class, so cap at the largest class.
    largest_class = max((labels.count(lab) for lab in set(labels)), default=0)
    n_splits = min(n_folds, len(examples), largest_class)
    if n_splits < 2:
        logging.warning(
            "Skipping cross-validation for %s: cannot form 2 or more folds "
            "(n_folds=%s, examples=%s, largest class=%s)",
            statement_type, n_folds, len(examples), largest_class,
        )
        return results

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    embedding_model = EmbeddingModel(config.embedding_model)

    for fold, (train_idx, test_idx) in enumerate(kfold.split(examples, labels), start=1):
        train_ex = [examples[i] for i in train_idx]
        test_ex = [examples[i] for i in test_idx]

        # Compute embeddings
        compute_embeddings(train_ex, embedding_model)

        # Create classifier with fold's training data
        if statement_type == ClassificationType.DATA:
            classifier = OpennessClassifier(
                config=config,
                data_examples=train_ex,
                code_examples=other_examples,
                embedding_model=embedding_model,
            )
        else:
            classifier = OpennessClassifier(
                config=config,
                data_examples=other_examples,
                code_examples=train_ex,
                embedding_model=embedding_model,
            )

        # Validate
        result = validate_classifications(test_ex, classifier)
        results.append(result)

        # Report against the number of splits actually used, which may be
        # smaller than the requested n_folds.
        logging.info(f"Fold {fold}/{n_splits}: Accuracy = {result.overall_accuracy:.3f}")

        if progress_callback:
            progress_callback(fold, n_splits)

    return results

## Performance Comparison

In [None]:
#| export
def performance_comparison(
    results: List[ValidationResult],
    labels: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Compare performance across multiple validation results.

    Useful for comparing model versions or cross-validation folds.

    Args:
        results: List of validation results to compare
        labels: Optional labels for each result (default: "Run 1",
            "Run 2", ...). Results and labels are paired with ``zip``,
            so the shorter of the two determines the number of rows.

    Returns:
        Tuple ``(df, summary)``: ``df`` has one row per result with its
        label, overall accuracy and, where available, the data/code
        accuracy, kappa and macro F1; ``summary`` holds the ``mean`` and
        ``std`` rows of ``df.describe()``. For empty input both frames
        are empty (``summary`` keeps the ``mean``/``std`` index).
    """
    if labels is None:
        labels = [f"Run {i+1}" for i in range(len(results))]

    rows = []
    for label, result in zip(labels, results):
        row = {'label': label, 'overall_accuracy': result.overall_accuracy}

        per_type = (('data', result.data_metrics), ('code', result.code_metrics))
        for prefix, metrics in per_type:
            if metrics:
                row[f'{prefix}_accuracy'] = metrics.accuracy
                row[f'{prefix}_kappa'] = metrics.cohens_kappa
                row[f'{prefix}_macro_f1'] = metrics.macro_f1

        rows.append(row)

    df = pd.DataFrame(rows)

    # describe() raises on a frame without columns, so handle "no results"
    # explicitly instead of crashing.
    if df.empty:
        return df, pd.DataFrame(index=['mean', 'std'])

    # Add summary statistics
    summary = df.describe().loc[['mean', 'std']]

    return df, summary

In [None]:
# Smoke-test compute_metrics on a small hand-written sample:
# four of the six predictions agree with the ground truth.
y_true = [
    'open', 'open', 'closed',
    'closed', 'mostly_open', 'mostly_closed',
]
y_pred = [
    'open', 'mostly_open', 'closed',
    'closed', 'mostly_open', 'closed',
]

metrics = compute_metrics(y_true, y_pred)

# One print call with a newline separator emits the same text as
# printing each line separately.
print(
    f"Accuracy: {metrics.accuracy:.3f}",
    f"Cohen's Kappa: {metrics.cohens_kappa:.3f}",
    f"Macro F1: {metrics.macro_f1:.3f}",
    "\nMarkdown output:",
    metrics.to_markdown()[:500],  # cap the preview at 500 characters
    sep="\n",
)

In [None]:
#| hide
# Run nbdev's export step (kept out of the rendered docs by the directive above).
import nbdev

nbdev.nbdev_export()