<cell_type>markdown</cell_type># Classifier Module

> Core classification functions for data and code availability statements.

This module provides:
- `classify_statement()`: Classify a single availability statement
- `classify_publication()`: Classify both data and code for a publication
- `validate_classification_precedence()`: Enforce FR-004 hard precedence rule

The classifier uses few-shot learning with semantically similar examples selected via kNN.

**Refined Taxonomy (002-refine-classification-taxonomy)**:
- 5-step chain-of-thought reasoning for boundary classification
- Hard precedence rule: substantial barriers → mostly_closed regardless of completeness
- Post-classification validation to ensure rule enforcement

In [None]:
#| default_exp classifier

In [None]:
#| export
from __future__ import annotations
from typing import Optional, List, Tuple
from datetime import datetime
import logging

from openness_classifier.core import (
    OpennessCategory,
    ClassificationType,
    Classification,
    LLMConfiguration,
    LLMProvider,
    ClassificationLogger,
    ClassificationError,
    LLMError,
)
from openness_classifier.config import ClassifierConfig, load_config
from openness_classifier.data import (
    TrainingExample,
    Publication,
    EmbeddingModel,
    load_training_data,
    compute_embeddings,
)
from openness_classifier.prompts import (
    select_knn_examples,
    build_few_shot_prompt,
    parse_classification_response,
    extract_completeness_attributes,
    has_substantial_barrier,
    SYSTEM_PROMPT,
    SUBSTANTIAL_BARRIERS,
)

## Classifier Class

Main classifier that manages training data, embeddings, and LLM calls.

In [None]:
#| export
class OpennessClassifier:
    """Few-shot LLM classifier for data and code openness.

    Holds the training examples (one pool for data statements, one for code
    statements), the embedding model used to pick similar examples, the LLM
    provider, and an optional classification logger.
    Implements refined taxonomy with 5-step CoT and FR-004 hard precedence rule.

    Example:
        >>> classifier = OpennessClassifier.from_config(load_config())
        >>> result = classifier.classify_statement(
        ...     "Data available at https://zenodo.org/record/12345",
        ...     ClassificationType.DATA
        ... )
        >>> print(result.category)  # OpennessCategory.OPEN
    """

    def __init__(
        self,
        config: ClassifierConfig,
        data_examples: List[TrainingExample],
        code_examples: List[TrainingExample],
        embedding_model: EmbeddingModel,
        logger: Optional[ClassificationLogger] = None,
    ):
        """Store the collaborators and build the LLM provider.

        Args:
            config: Classifier configuration; ``config.llm`` is used to build
                the LLM provider and ``config.few_shot_k`` is read at
                classification time.
            data_examples: Training examples used for DATA statements.
            code_examples: Training examples used for CODE statements.
            embedding_model: Model passed to the kNN example selection.
            logger: Optional classification logger; when ``None`` nothing is
                logged by ``classify_statement``.
        """
        self.config = config
        self.data_examples = data_examples
        self.code_examples = code_examples
        self.embedding_model = embedding_model
        self.llm_provider = LLMProvider(config.llm)
        self.logger = logger

    @classmethod
    def from_config(cls, config: ClassifierConfig) -> 'OpennessClassifier':
        """Create classifier from configuration.

        Loads the training data from ``config.training_data_path``, builds the
        embedding model named by ``config.embedding_model``, computes
        embeddings for both example pools and attaches a logger writing to a
        date-stamped ``classifications_YYYYMMDD.jsonl`` file in
        ``config.log_dir``.

        Args:
            config: Classifier configuration.

        Returns:
            A fully initialised classifier.
        """
        # Load training data
        data_examples, code_examples = load_training_data(config.training_data_path)

        # Initialize embedding model
        embedding_model = EmbeddingModel(config.embedding_model)

        # Compute embeddings. The return value is not used here, so this
        # presumably attaches embeddings to the examples in place -- verify
        # against compute_embeddings.
        compute_embeddings(data_examples, embedding_model)
        compute_embeddings(code_examples, embedding_model)

        # Setup logger (one log file per local calendar day)
        log_path = config.log_dir / f"classifications_{datetime.now().strftime('%Y%m%d')}.jsonl"
        logger = ClassificationLogger(log_path)

        return cls(
            config=config,
            data_examples=data_examples,
            code_examples=code_examples,
            embedding_model=embedding_model,
            logger=logger,
        )

    def classify_statement(
        self,
        statement: str,
        statement_type: ClassificationType,
        return_reasoning: bool = True,
        publication_id: Optional[str] = None,
        enforce_precedence: bool = True,
    ) -> Classification:
        """Classify a single availability statement.

        Implements the refined taxonomy with 5-step CoT reasoning and
        hard precedence rule enforcement (FR-004).

        Args:
            statement: The availability statement text
            statement_type: DATA or CODE
            return_reasoning: Include chain-of-thought reasoning
            publication_id: Optional ID for logging. The classification is
                only written to the logger when both a logger and a
                publication ID are present.
            enforce_precedence: If True, apply FR-004 hard precedence rule
                               post-classification to validate/correct

        Returns:
            Classification result with category, confidence, and reasoning
        """
        # Select appropriate training examples
        examples = (self.data_examples if statement_type == ClassificationType.DATA
                   else self.code_examples)

        # Select kNN examples
        selected = select_knn_examples(
            statement=statement,
            training_examples=examples,
            embedding_model=self.embedding_model,
            k=self.config.few_shot_k,
        )

        # Build prompt
        prompt = build_few_shot_prompt(
            statement=statement,
            statement_type=statement_type,
            examples=selected,
            include_reasoning=return_reasoning,
        )

        # Prepend system prompt
        full_prompt = f"{SYSTEM_PROMPT}\n\n{prompt}"

        # Call LLM
        response = self.llm_provider.complete(full_prompt)

        # Parse response
        category, confidence, reasoning = parse_classification_response(response)

        # Apply FR-004 hard precedence rule if enabled (T014)
        original_category = category
        if enforce_precedence:
            category, precedence_applied = validate_classification_precedence(
                category=category,
                statement=statement,
                reasoning=reasoning or "",
            )
            if precedence_applied:
                # Use a module logger rather than the logging.info() shortcut:
                # the shortcut implicitly calls logging.basicConfig(), which
                # would install a root handler from library code.
                logging.getLogger(__name__).info(
                    "FR-004 precedence rule applied: %s -> %s "
                    "(substantial barrier detected in statement)",
                    original_category.value,
                    category.value,
                )
                # Adjust reasoning to note the precedence application
                if reasoning:
                    reasoning = (
                        f"{reasoning}\n\n[VALIDATION NOTE: FR-004 precedence rule applied - "
                        f"substantial access barrier detected, classification adjusted from "
                        f"{original_category.value} to {category.value}]"
                    )

        # Create classification result
        classification = Classification(
            category=category,
            statement_type=statement_type,
            confidence_score=confidence,
            reasoning=reasoning if return_reasoning else None,
            model_config=self.config.llm,
            few_shot_example_ids=[ex.id for ex in selected],
        )

        # Log classification with extra metadata
        if self.logger and publication_id:
            extra_metadata = None
            if enforce_precedence:
                # Completeness attributes for the audit trail (SC-004) can only
                # be extracted when reasoning text exists. The precedence
                # outcome is recorded regardless, so an override is never
                # silently missing from the log.
                extra_metadata = (
                    extract_completeness_attributes(reasoning, statement_type)
                    if reasoning
                    else {}
                )
                category_changed = original_category != category
                extra_metadata['precedence_applied'] = category_changed
                extra_metadata['original_category'] = (
                    original_category.value if category_changed else None
                )

            self.logger.log_classification(
                publication_id=publication_id,
                classification=classification,
                statement_text=statement,
                extra=extra_metadata,
            )

        return classification

    def classify_publication(
        self,
        publication: Publication,
        return_reasoning: bool = True,
    ) -> Tuple[Optional[Classification], Optional[Classification]]:
        """Classify both data and code availability for a publication.

        Each statement is classified independently via ``classify_statement``
        with the publication's ID, using the default precedence enforcement.

        Args:
            publication: Publication with data/code statements
            return_reasoning: Include reasoning in results

        Returns:
            Tuple of (data_classification, code_classification)
            Either can be None if statement is missing
        """
        data_result = None
        code_result = None

        if publication.has_data_statement():
            data_result = self.classify_statement(
                statement=publication.data_statement,
                statement_type=ClassificationType.DATA,
                return_reasoning=return_reasoning,
                publication_id=publication.id,
            )

        if publication.has_code_statement():
            code_result = self.classify_statement(
                statement=publication.code_statement,
                statement_type=ClassificationType.CODE,
                return_reasoning=return_reasoning,
                publication_id=publication.id,
            )

        return data_result, code_result

## Convenience Functions

Module-level functions for simpler usage.

In [None]:
#| export
# Process-wide classifier shared by the convenience functions in this module.
_default_classifier: Optional[OpennessClassifier] = None


def get_classifier(config: Optional[ClassifierConfig] = None) -> OpennessClassifier:
    """Return the shared classifier, building it when required.

    An explicit ``config`` always triggers a rebuild and replaces the shared
    instance. Without a config, the existing instance is reused, or one is
    built from ``load_config()`` if none exists yet.

    Args:
        config: Optional configuration (loaded via ``load_config()`` when
            omitted and no classifier exists yet)

    Returns:
        OpennessClassifier instance
    """
    global _default_classifier

    if config is not None:
        # Caller supplied a configuration: always rebuild from it.
        _default_classifier = OpennessClassifier.from_config(config)
    elif _default_classifier is None:
        # First use without a configuration: fall back to the loaded default.
        _default_classifier = OpennessClassifier.from_config(load_config())

    return _default_classifier


def classify_statement(
    statement: str,
    statement_type: ClassificationType | str,
    config: Optional[ClassifierConfig] = None,
    return_reasoning: bool = True,
) -> Classification:
    """Classify one availability statement using the shared classifier.

    Thin wrapper around ``get_classifier(config).classify_statement(...)``.
    No publication ID is forwarded by this wrapper.

    Args:
        statement: The availability statement text
        statement_type: A ClassificationType, or its string value in any
            letter case (e.g. "data" or "code")
        config: Optional configuration, forwarded to ``get_classifier``
        return_reasoning: Include reasoning in result

    Returns:
        Classification result

    Example:
        >>> result = classify_statement(
        ...     "Data available at https://zenodo.org/record/12345",
        ...     "data"
        ... )
        >>> print(result.category.value)  # 'open'
    """
    # Normalise string input to the enum before touching the classifier, so an
    # invalid type string fails before any classifier is built.
    resolved_type = (
        ClassificationType(statement_type.lower())
        if isinstance(statement_type, str)
        else statement_type
    )

    return get_classifier(config).classify_statement(
        statement=statement,
        statement_type=resolved_type,
        return_reasoning=return_reasoning,
    )


def classify_publication(
    data_statement: Optional[str] = None,
    code_statement: Optional[str] = None,
    publication_id: str = "unknown",
    config: Optional[ClassifierConfig] = None,
) -> Tuple[Optional[Classification], Optional[Classification]]:
    """Classify the data and code statements of one publication.

    Wraps the statements in a ``Publication`` and delegates to the shared
    classifier's ``classify_publication`` method.

    Args:
        data_statement: Data availability statement (optional)
        code_statement: Code availability statement (optional)
        publication_id: Identifier for logging
        config: Optional configuration, forwarded to ``get_classifier``

    Returns:
        Tuple of (data_classification, code_classification)
    """
    # Build the publication record first, then obtain the classifier.
    publication = Publication(
        id=publication_id,
        data_statement=data_statement,
        code_statement=code_statement,
    )
    return get_classifier(config).classify_publication(publication)

## Low Confidence Identification

In [None]:
#| export
def identify_low_confidence(
    classifications: List[Classification],
    threshold: float = 0.5
) -> List[Classification]:
    """Return the classifications whose confidence is strictly below a threshold.

    Useful for picking out statements that may need manual review. Input
    order is preserved.

    Args:
        classifications: List of classification results
        threshold: Confidence threshold (default: 0.5); a score equal to the
            threshold is not considered low

    Returns:
        List of low-confidence classifications
    """
    flagged = []
    for result in classifications:
        if result.confidence_score < threshold:
            flagged.append(result)
    return flagged


def suggest_training_examples(
    classifications: List[Tuple[str, Classification]],
    threshold: float = 0.5,
    max_suggestions: int = 10
) -> List[Tuple[str, Classification]]:
    """Pick the least confident classifications as candidates for manual coding.

    Keeps the pairs whose confidence is strictly below ``threshold``, orders
    them from lowest to highest confidence (ties keep their input order) and
    returns the leading ``max_suggestions`` entries.

    Args:
        classifications: List of (statement_text, classification) tuples
        threshold: Confidence threshold
        max_suggestions: Maximum suggestions to return

    Returns:
        List of (statement, classification) tuples needing review
    """
    candidates = (
        (text, result)
        for text, result in classifications
        if result.confidence_score < threshold
    )

    # sorted() is stable, so equally confident entries stay in input order.
    ranked = sorted(candidates, key=lambda pair: pair[1].confidence_score)

    return ranked[:max_suggestions]


def validate_classification_precedence(
    category: OpennessCategory,
    statement: str,
    reasoning: str,
) -> Tuple[OpennessCategory, bool]:
    """Enforce the FR-004 hard precedence rule on an LLM classification.

    CRITICAL RULE: If substantial access barriers exist (data use agreements,
    proprietary terms, confidentiality restrictions), classification MUST be
    mostly_closed or closed, REGARDLESS of completeness or repository quality.

    Two checks are made against the statement text, in this order:

    1. If it contains an "upon request" / "contact author" style phrase, the
       result is forced to CLOSED.
    2. Otherwise, if ``has_substantial_barrier`` reports a barrier, OPEN and
       MOSTLY_OPEN are downgraded to MOSTLY_CLOSED.

    Any other classification is returned untouched.

    Args:
        category: The LLM's initial classification
        statement: The original availability statement
        reasoning: The LLM's reasoning text. Accepted as part of the
            interface; it is not inspected by this function.

    Returns:
        Tuple of (corrected_category, precedence_was_applied)
        - corrected_category: The validated/corrected classification
        - precedence_was_applied: True if a correction was made

    Examples:
        >>> # Statement mentions DUA but LLM classified as mostly_open
        >>> category, applied = validate_classification_precedence(
        ...     OpennessCategory.MOSTLY_OPEN,
        ...     "Data available via data use agreement from ICPSR",
        ...     "High completeness, all data types available..."
        ... )
        >>> category == OpennessCategory.MOSTLY_CLOSED and applied == True
        True

        >>> # Statement has no barriers, mostly_open classification preserved
        >>> category, applied = validate_classification_precedence(
        ...     OpennessCategory.MOSTLY_OPEN,
        ...     "All data available on Zenodo with free registration",
        ...     "High completeness with minor registration barrier..."
        ... )
        >>> category == OpennessCategory.MOSTLY_OPEN and applied == False
        True
    """
    # Barrier detection is delegated to the prompts helper.
    barrier_present = has_substantial_barrier(statement)

    # Phrases meaning access goes through the authors; matched as
    # case-insensitive substrings of the statement.
    request_phrases = (
        'upon request', 'upon reasonable request', 'contact the author',
        'contact author', 'available from the author', 'request from',
    )
    lowered = statement.lower()
    request_only = False
    for phrase in request_phrases:
        if phrase in lowered:
            request_only = True
            break

    if request_only:
        # "Upon request" ALWAYS means CLOSED, and takes priority over the
        # generic barrier check below.
        if category != OpennessCategory.CLOSED:
            return OpennessCategory.CLOSED, True
        return category, False

    # Substantial barrier (but not "upon request") -> at most mostly_closed.
    too_open = category in (OpennessCategory.OPEN, OpennessCategory.MOSTLY_OPEN)
    if barrier_present and too_open:
        return OpennessCategory.MOSTLY_CLOSED, True

    return category, False

In [None]:
# Test FR-004 precedence validation with boundary cases.
# Each case feeds a fixed LLM category (mostly_open) plus a statement into
# validate_classification_precedence and checks the corrected category and
# the "precedence applied" flag. No LLM call is involved.
from openness_classifier.core import OpennessCategory, ClassificationType

print("=== Testing FR-004 Hard Precedence Rule (T014) ===\n")

# Test case 1: DUA statement should be mostly_closed
cat1, applied1 = validate_classification_precedence(
    OpennessCategory.MOSTLY_OPEN,
    "All data available via data use agreement from ICPSR",
    "High completeness data"
)
print("Test 1 - DUA present (LLM: mostly_open):")
print(f"  Result: {cat1.value}, Precedence applied: {applied1}")
# Identity checks pin the flag to a real bool rather than any truthy value.
assert cat1 == OpennessCategory.MOSTLY_CLOSED and applied1 is True

# Test case 2: "Upon request" should be closed
cat2, applied2 = validate_classification_precedence(
    OpennessCategory.MOSTLY_OPEN,
    "Data available upon reasonable request from the corresponding author",
    "Some data"
)
print("\nTest 2 - 'Upon request' present (LLM: mostly_open):")
print(f"  Result: {cat2.value}, Precedence applied: {applied2}")
assert cat2 == OpennessCategory.CLOSED and applied2 is True

# Test case 3: No barriers, classification preserved
cat3, applied3 = validate_classification_precedence(
    OpennessCategory.MOSTLY_OPEN,
    "Raw; Results; Source Data available at Figshare with free registration",
    "High completeness"
)
print("\nTest 3 - No substantial barriers (LLM: mostly_open):")
print(f"  Result: {cat3.value}, Precedence applied: {applied3}")
assert cat3 == OpennessCategory.MOSTLY_OPEN and applied3 is False

# Test case 4: GitHub with all code should be mostly_open
cat4, applied4 = validate_classification_precedence(
    OpennessCategory.MOSTLY_OPEN,
    "All analysis and figure generation code on GitHub at github.com/user/repo",
    "High completeness"
)
print("\nTest 4 - All code on GitHub (LLM: mostly_open):")
print(f"  Result: {cat4.value}, Precedence applied: {applied4}")
assert cat4 == OpennessCategory.MOSTLY_OPEN and applied4 is False

print("\n=== All FR-004 precedence tests passed! ===")
print("\nClassifier module ready with refined taxonomy!")
print("To test with LLM, set up your .env file with API keys and run:")
print("  from openness_classifier import classify_statement")
print("  result = classify_statement('Data available at Zenodo', 'data')")

In [None]:
#| hide
# Run nbdev's export step for this notebook (presumably writes the cells
# marked `#| export` to the module named by `#| default_exp` -- see nbdev docs).
import nbdev; nbdev.nbdev_export()