# Configuration Module

> Configuration management for the openness classifier.

This module handles:
- Loading configuration from environment variables
- Saving/loading configuration to/from JSON files
- Validating configuration settings

Configuration follows the 12-factor app methodology, loading sensitive values (API keys) from environment variables.

In [None]:
#| default_exp config

In [None]:
#| export
from __future__ import annotations
import os
import json
from pathlib import Path
from typing import Optional, Dict, Any
from dataclasses import dataclass, field, asdict
from datetime import datetime

from dotenv import load_dotenv

from openness_classifier.core import (
    LLMConfiguration, 
    LLMProviderType,
    ConfigurationError
)

## ClassifierConfig

Main configuration class for the classifier, including LLM settings and paths.

In [None]:
#| export
@dataclass
class ClassifierConfig:
    """Complete configuration for the openness classifier.

    Attributes:
        llm: LLM provider configuration
        training_data_path: Path to training data CSV
        log_dir: Directory for classification logs
        few_shot_k: Number of few-shot examples to use
        embedding_model: Sentence transformer model for similarity
    """
    llm: LLMConfiguration
    training_data_path: Path = Path('resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv')
    log_dir: Path = Path('logs')
    few_shot_k: int = 5
    embedding_model: str = 'all-MiniLM-L6-v2'

    def __post_init__(self):
        """Coerce path-like field values to ``Path`` objects.

        Strings and any other ``os.PathLike`` value are converted; values that
        are already ``Path`` instances (and anything that is not path-like)
        are left untouched.
        """
        for field_name in ('training_data_path', 'log_dir'):
            value = getattr(self, field_name)
            if isinstance(value, Path):
                continue
            if isinstance(value, (str, os.PathLike)):
                setattr(self, field_name, Path(value))

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Paths are rendered as strings; the LLM settings are serialized by
        ``self.llm.to_dict()``.
        """
        return {
            'llm': self.llm.to_dict(),
            'training_data_path': str(self.training_data_path),
            'log_dir': str(self.log_dir),
            'few_shot_k': self.few_shot_k,
            'embedding_model': self.embedding_model,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ClassifierConfig':
        """Create a configuration from a dictionary (the inverse of ``to_dict``).

        Args:
            data: Mapping with a required ``'llm'`` entry and optional
                ``'training_data_path'``, ``'log_dir'``, ``'few_shot_k'`` and
                ``'embedding_model'`` entries.

        Returns:
            A new ClassifierConfig. Optional entries that are absent or
            ``None`` take the dataclass defaults.

        Raises:
            KeyError: If ``'llm'`` is missing from *data*.
        """
        optional_keys = ('training_data_path', 'log_dir', 'few_shot_k', 'embedding_model')
        # Only forward values that were actually supplied so the field defaults
        # declared on the class remain the single source of truth.
        overrides = {
            key: data[key] for key in optional_keys if data.get(key) is not None
        }
        # String paths are converted to Path by __post_init__.
        return cls(llm=LLMConfiguration.from_dict(data['llm']), **overrides)

## Loading Configuration

Load configuration from environment variables or JSON files.

In [None]:
#| export
def load_config(
    config_path: Optional[str | Path] = None,
    env_file: Optional[str | Path] = None
) -> ClassifierConfig:
    """Load classifier configuration.

    Priority order:
    1. JSON config file (if provided)
    2. Environment variables (from .env file or system)
    3. Default values

    When ``config_path`` is given, the configuration is built entirely from
    that file; environment variables are only consulted when it is omitted.

    Args:
        config_path: Optional path to JSON config file
        env_file: Optional path to .env file (default: .env)

    Returns:
        ClassifierConfig with loaded settings

    Raises:
        ConfigurationError: If the config file is missing, is not valid JSON,
            lacks required values, or the environment configuration is invalid
    """
    # Load the .env file first so its values are visible to os.getenv later.
    if env_file:
        load_dotenv(env_file)
    else:
        load_dotenv()  # Load from .env in current directory

    # No JSON file requested: fall back to environment variables.
    if not config_path:
        return _load_from_env()

    config_path = Path(config_path)
    if not config_path.exists():
        raise ConfigurationError(f"Config file not found: {config_path}")

    try:
        with open(config_path, encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        raise ConfigurationError(
            f"Config file is not valid JSON: {config_path} ({err})"
        ) from err

    try:
        return ClassifierConfig.from_dict(data)
    except ConfigurationError:
        # Already the documented error type; pass it through unchanged.
        raise
    except (KeyError, TypeError, ValueError) as err:
        # Missing keys or wrongly-typed values surface as the documented
        # ConfigurationError instead of leaking low-level exceptions.
        raise ConfigurationError(
            f"Invalid configuration in {config_path}: {err!r}"
        ) from err


def _env_number(name, default, cast):
    """Return environment variable *name* converted by *cast* (``int``/``float``).

    Args:
        name: Environment variable to read.
        default: String used when the variable is unset; parsed like any value.
        cast: Conversion callable, e.g. ``int`` or ``float``.

    Raises:
        ConfigurationError: If the value cannot be converted by *cast*.
    """
    raw = os.getenv(name, default)
    try:
        return cast(raw)
    except ValueError as err:
        raise ConfigurationError(
            f"Invalid value for {name}: {raw!r} (expected {cast.__name__})"
        ) from err


def _load_from_env() -> ClassifierConfig:
    """Load configuration from environment variables.

    Reads ``LLM_PROVIDER`` (default ``claude``), ``LLM_MODEL_NAME``,
    ``LLM_TEMPERATURE``, ``LLM_MAX_TOKENS``, ``LLM_TOP_P``, ``OLLAMA_BASE_URL``,
    ``TRAINING_DATA_PATH``, ``LOG_DIR``, ``FEW_SHOT_K`` and ``EMBEDDING_MODEL``,
    plus ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` for those providers.

    Returns:
        ClassifierConfig built from the environment, with defaults for any
        unset optional variable.

    Raises:
        ConfigurationError: If the provider is unknown, a required API key is
            missing, or a numeric variable does not parse.
    """
    # Get provider
    provider_str = os.getenv('LLM_PROVIDER', 'claude').lower()
    try:
        provider = LLMProviderType(provider_str)
    except ValueError as err:
        raise ConfigurationError(
            f"Invalid LLM_PROVIDER: {provider_str}. "
            f"Must be one of: claude, openai, ollama"
        ) from err

    # Get model name with provider-specific defaults
    default_models = {
        LLMProviderType.CLAUDE: 'claude-3-5-sonnet-20241022',
        LLMProviderType.OPENAI: 'gpt-4-turbo',
        LLMProviderType.OLLAMA: 'llama3:8b',
    }
    model_name = os.getenv('LLM_MODEL_NAME', default_models[provider])

    # Providers that need an API key, mapped to the variable holding it.
    # Ollama is absent from the table, so no key is required for it.
    api_key_vars = {
        LLMProviderType.CLAUDE: 'ANTHROPIC_API_KEY',
        LLMProviderType.OPENAI: 'OPENAI_API_KEY',
    }
    api_key = None
    key_var = api_key_vars.get(provider)
    if key_var is not None:
        api_key = os.getenv(key_var)
        if not api_key:
            raise ConfigurationError(
                f"{key_var} not found in environment. "
                "Set it in your .env file or environment."
            )

    # Build LLM config; only the hash of the API key is stored on it.
    llm_config = LLMConfiguration(
        provider=provider,
        model_name=model_name,
        temperature=_env_number('LLM_TEMPERATURE', '0.1', float),
        max_tokens=_env_number('LLM_MAX_TOKENS', '500', int),
        top_p=_env_number('LLM_TOP_P', '0.95', float),
        api_endpoint=os.getenv('OLLAMA_BASE_URL') if provider == LLMProviderType.OLLAMA else None,
        api_key_hash=LLMConfiguration.hash_api_key(api_key) if api_key else None,
    )

    # Build full config
    return ClassifierConfig(
        llm=llm_config,
        training_data_path=Path(os.getenv(
            'TRAINING_DATA_PATH',
            'resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv'
        )),
        log_dir=Path(os.getenv('LOG_DIR', 'logs')),
        few_shot_k=_env_number('FEW_SHOT_K', '5', int),
        embedding_model=os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2'),
    )

## Saving Configuration

In [None]:
#| export
def save_config(config: ClassifierConfig, path: str | Path) -> None:
    """Write a configuration to disk as indented JSON.

    The file content is whatever ``config.to_dict()`` returns. Missing parent
    directories of the target are created first.

    Note: API keys are NOT saved - only their hashes for audit trail.

    Args:
        config: Configuration to save
        path: Output path for JSON file
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open('w') as handle:
        json.dump(config.to_dict(), handle, indent=2)


def create_example_config(path: str | Path = 'example_config.json') -> None:
    """Write a sample configuration file for documentation purposes.

    The sample uses the Claude provider with a placeholder API key hash and
    is written via ``save_config``.

    Args:
        path: Output path for example config
    """
    # Placeholder LLM settings; the hash is a stand-in, not a real value.
    sample_llm = LLMConfiguration(
        provider=LLMProviderType.CLAUDE,
        model_name='claude-3-5-sonnet-20241022',
        temperature=0.1,
        max_tokens=500,
        top_p=0.95,
        api_key_hash='<hash_of_your_api_key>',
    )
    sample_settings = {
        'llm': sample_llm,
        'training_data_path': Path('resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv'),
        'log_dir': Path('logs'),
        'few_shot_k': 5,
        'embedding_model': 'all-MiniLM-L6-v2',
    }

    save_config(ClassifierConfig(**sample_settings), path)

In [None]:
# Smoke-test configuration loading from environment variables.
import os

# Point the loader at Ollama, which needs no API key, so loading is expected
# to succeed without any secrets being set.
os.environ.update({
    'LLM_PROVIDER': 'ollama',
    'LLM_MODEL_NAME': 'llama3:8b',
    'OLLAMA_BASE_URL': 'http://localhost:11434',
})

try:
    config = load_config()
except ConfigurationError as e:
    # Reached only if the environment configuration is rejected.
    print(f"Expected error (no API key): {e}")
else:
    print(f"Provider: {config.llm.provider.value}")
    print(f"Model: {config.llm.model_name}")
    print(f"Training data: {config.training_data_path}")
    print("Config loading test passed!")

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev

nbdev.nbdev_export()