# YAML Configuration - Solutions

Solutions to exercises from [exercises.ipynb](exercises.ipynb).

## Solution 1: Fix the Broken YAML

In [None]:
import yaml

# Corrected version of the exercise's broken YAML document. The inline YAML
# comments mark each individual fix.
fixed_yaml = """
project: Data Pipeline
version: "1.0"  # ✅ Fixed: Quote to keep as string
enabled: true   # ✅ Fixed: Use lowercase true

connections:    # ✅ Fixed: Added colon
  database:
    host: localhost
    port: 5432  # ✅ Fixed: Removed quotes to make it int
  storage:
    type: local  # ✅ Fixed: Proper indentation
    path: /data

countries:
  - "NO"  # ✅ Fixed: Quoted to prevent boolean conversion
  - "SE"
"""

# Parse the document and echo it back in block style.
data = yaml.safe_load(fixed_yaml)
print("✅ Fixed YAML:")
print(yaml.dump(data, default_flow_style=False))

# Verify fixes: each assertion pins the parsed type/value of one corrected key.
assert isinstance(data['version'], str)
assert data['enabled'] is True
assert isinstance(data['connections']['database']['port'], int)
assert data['countries'][0] == 'NO'  # String, not False!
print("\n✅ All assertions passed!")

## Solution 2: Use Anchors to DRY Up Config

In [None]:
# One anchored mapping (&db_defaults) holds the shared settings; every
# environment merges it with `<<:` and then lists only what differs.
refactored_yaml = """
# Define defaults with anchor
_defaults: &db_defaults
  port: 5432
  timeout: 30
  pool_size: 5
  retry_attempts: 3

dev_database:
  <<: *db_defaults  # Merge defaults
  host: dev.db.com

staging_database:
  <<: *db_defaults
  host: staging.db.com

prod_database:
  <<: *db_defaults
  host: prod.db.com
  timeout: 60    # Override for production
  pool_size: 10  # Override for production
"""

data = yaml.safe_load(refactored_yaml)
print("Dev database:")
print(data['dev_database'])
print("\nProd database:")
print(data['prod_database'])

# Verify: dev keeps the anchored defaults, prod sees its explicit overrides.
assert data['dev_database']['port'] == 5432
assert data['dev_database']['timeout'] == 30
assert data['prod_database']['timeout'] == 60  # Overridden
assert data['prod_database']['pool_size'] == 10  # Overridden
print("\n✅ Refactored successfully with 70% less duplication!")

## Solution 3: Create a Pydantic Schema

In [None]:
from typing import Dict, List, Literal, Optional

from pydantic import BaseModel, Field, field_validator, validator

class ConnectionConfig(BaseModel):
    """Settings for a single named connection."""
    # Connection kind; only the three literal values listed here validate.
    type: Literal['local', 'azure_adls', 'azure_sql']
    # Optional base path; None when the key is omitted.
    base_path: Optional[str] = None

class RetryConfig(BaseModel):
    """Retry settings: attempt count and backoff delay."""
    # Required; must lie between 1 and 10 inclusive (ge=1, le=10).
    max_attempts: int = Field(ge=1, le=10, description="Max retry attempts")
    # Required; must be strictly positive (gt=0).
    backoff_seconds: float = Field(gt=0, description="Backoff time in seconds")

class NodeConfig(BaseModel):
    """A single pipeline node: its name, operation kind and dependencies."""
    name: str
    operation: Literal['read', 'transform', 'write']
    # default_factory gives every instance its own empty list.
    depends_on: List[str] = Field(default_factory=list)

    # Pydantic v2 decorator: this notebook already relies on v2
    # (`model_dump_json`), where the v1-style `@validator` is deprecated.
    @field_validator('name')
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Reject names containing anything but alphanumerics and underscores.

        Raises:
            ValueError: if ``v`` with underscores removed is not alphanumeric
                (this also rejects empty and underscore-only names).
        """
        if not v.replace('_', '').isalnum():
            raise ValueError('Node name must be alphanumeric + underscores')
        return v

class PipelineConfig(BaseModel):
    """A named pipeline made of an ordered list of nodes."""
    pipeline: str
    nodes: List[NodeConfig]

    # Pydantic v2 decorator: this notebook already relies on v2
    # (`model_dump_json`), where the v1-style `@validator` is deprecated.
    @field_validator('nodes')
    @classmethod
    def validate_dependencies(cls, nodes: List[NodeConfig]) -> List[NodeConfig]:
        """Check that depends_on references valid nodes.

        Raises:
            ValueError: if any node lists a dependency whose name does not
                belong to a node of this same pipeline.
        """
        # Build the lookup set once so each membership test is O(1).
        node_names = {n.name for n in nodes}
        for node in nodes:
            for dep in node.depends_on:
                if dep not in node_names:
                    raise ValueError(f"Node '{node.name}' depends on unknown node '{dep}'")
        return nodes

class AppConfig(BaseModel):
    """Top-level schema for the whole configuration document."""
    project: str
    # Falls back to 'pandas' when the key is omitted.
    engine: Literal['pandas', 'spark'] = 'pandas'
    # Connection settings keyed by a string name.
    connections: Dict[str, ConnectionConfig]
    retry: RetryConfig
    pipelines: List[PipelineConfig]

# Test: a small document that satisfies every model defined above.
pipeline_yaml = """
project: Sales ETL
engine: pandas

connections:
  data:
    type: local
    base_path: ./data

retry:
  max_attempts: 3
  backoff_seconds: 2.0

pipelines:
  - pipeline: bronze_to_silver
    nodes:
      - name: load_sales
        operation: read
      - name: clean_sales
        operation: transform
        depends_on: [load_sales]
"""

# Parse to plain Python objects first, then let Pydantic validate the result.
data = yaml.safe_load(pipeline_yaml)
config = AppConfig(**data)
print("✅ Valid config loaded!")
print(config.model_dump_json(indent=2))

## Solution 4: Environment Variable Substitution

In [None]:
import os
import re

def load_yaml_with_env(yaml_string: str) -> dict:
    """
    Load YAML with environment variable substitution.

    Supports:
    - ${VAR} - replace with env var (error if not set)
    - ${VAR:-default} - replace with env var, or with the default when the
      variable is unset or empty (the same rule the shell applies to ``:-``)

    Substitution runs on the raw text *before* YAML parsing. Consequences:
    - substituted values are typed by YAML (``${PORT:-5432}`` loads as an int);
    - placeholders inside YAML comments are expanded as well;
    - a value containing YAML syntax can change the document structure, so
      only use this with environment variables you trust.

    Args:
        yaml_string: YAML source text that may contain ``${...}`` placeholders.

    Returns:
        Whatever ``yaml.safe_load`` returns for the expanded text.

    Raises:
        ValueError: if a ``${VAR}`` placeholder without a default names a
            variable that is not set.
    """
    def replace_env(match):
        # partition() yields an empty separator when ':-' is absent, which
        # distinguishes "${VAR}" from "${VAR:-}" (explicit empty default).
        var_name, separator, default = match.group(1).partition(':-')
        value = os.environ.get(var_name)

        if separator:
            # Unset (None) and empty ('') both fall back to the default.
            return value or default

        # No default - must be set
        if value is None:
            raise ValueError(f"Environment variable '{var_name}' not set")
        return value

    # Replace ${...} patterns
    expanded = re.sub(r'\$\{([^}]+)\}', replace_env, yaml_string)
    return yaml.safe_load(expanded)

# Test
os.environ['DB_HOST'] = 'production.db.com'
# DB_PORT / TIMEOUT must be unset for the defaults to apply. Remove them
# explicitly so the assertions below do not depend on the caller's shell.
os.environ.pop('DB_PORT', None)
os.environ.pop('TIMEOUT', None)

test_yaml = """
database:
  host: ${DB_HOST}
  port: ${DB_PORT:-5432}
  timeout: ${TIMEOUT:-30}
"""

config = load_yaml_with_env(test_yaml)
print("Config with env vars:")
print(config)

assert config['database']['host'] == 'production.db.com'
# Default used; the substituted text "5432" is then parsed by YAML as an int.
assert config['database']['port'] == 5432
assert config['database']['timeout'] == 30
print("\n✅ All tests passed!")

## Solution 5: Multi-Environment Configs

In [None]:
from pathlib import Path
from typing import Any, Dict

def deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively merge override into base and return a new dict.

    When both sides hold a dict under the same key the two are merged
    recursively; in every other case the value from ``override`` wins.
    Key order is: the keys of ``base`` in their original order, followed by
    keys that only exist in ``override``.

    Neither input is modified, and the result shares no mutable containers
    with them: every value is deep-copied, so later changes to the merged
    config cannot leak back into ``base`` or ``override``.

    Args:
        base: Mapping providing the default values.
        override: Mapping whose values take precedence.

    Returns:
        A new, fully independent merged dictionary.
    """
    import copy  # local import keeps this block self-contained

    merged: Dict[str, Any] = {}

    for key, base_value in base.items():
        if key not in override:
            merged[key] = copy.deepcopy(base_value)
            continue

        new_value = override[key]
        if isinstance(base_value, dict) and isinstance(new_value, dict):
            # The recursive call already returns an independent copy.
            merged[key] = deep_merge(base_value, new_value)
        else:
            merged[key] = copy.deepcopy(new_value)

    # Keys that only the override defines are appended after the base keys.
    for key, new_value in override.items():
        if key not in base:
            merged[key] = copy.deepcopy(new_value)

    return merged

def load_config_for_env(base_path: Path, env: str) -> dict:
    """
    Load config with inheritance:
    1. Load base.yaml
    2. Load {env}.yaml
    3. Merge (env overrides base)

    Args:
        base_path: Directory containing ``base.yaml`` and the per-environment
            files. A plain string path is accepted as well.
        env: Environment name; ``{env}.yaml`` is looked up in ``base_path``.

    Returns:
        The base config merged with the environment config via ``deep_merge``.
        A missing environment file, or an empty file, contributes nothing.

    Raises:
        FileNotFoundError: if ``base.yaml`` does not exist.
    """
    config_dir = Path(base_path)
    base_file = config_dir / 'base.yaml'
    env_file = config_dir / f'{env}.yaml'

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # would otherwise mis-decode non-ASCII characters in the config.
    with open(base_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat it as {}.
        base_config = yaml.safe_load(f) or {}

    # The environment-specific file is optional.
    env_config = {}
    if env_file.exists():
        with open(env_file, 'r', encoding='utf-8') as f:
            env_config = yaml.safe_load(f) or {}

    # Merge
    return deep_merge(base_config, env_config)

# Create test files
config_dir = Path('example_configs/multi_env')
config_dir.mkdir(exist_ok=True, parents=True)

base_yaml = """
project: My Project
timeout: 30
log_level: INFO
database:
  host: localhost
  port: 5432
"""

prod_yaml = """
timeout: 60
log_level: WARNING
replicas: 3
database:
  host: prod.db.com  # Override
  # port inherited from base
"""

# Write with an explicit encoding so the files do not depend on the
# platform's default.
(config_dir / 'base.yaml').write_text(base_yaml, encoding='utf-8')
(config_dir / 'prod.yaml').write_text(prod_yaml, encoding='utf-8')

# Test
config = load_config_for_env(config_dir, 'prod')
print("Merged prod config:")
print(yaml.dump(config, default_flow_style=False))

assert config['timeout'] == 60  # Overridden
assert config['project'] == 'My Project'  # From base
assert config['replicas'] == 3  # From prod
assert config['database']['host'] == 'prod.db.com'  # Overridden
assert config['database']['port'] == 5432  # Inherited from base
print("\n✅ Multi-environment config works!")

## Solution 6: Analyze Odibi Config

In [None]:
from collections import Counter

def analyze_odibi_config(yaml_path: str) -> dict:
    """Analyze an Odibi pipeline configuration.

    Args:
        yaml_path: Path of the YAML file to analyze.

    Returns:
        A dict with these keys:
        - ``total_pipelines``: number of entries under ``pipelines``
        - ``total_nodes``: number of nodes across all pipelines
        - ``connection_types``: sorted, de-duplicated ``type`` values of the
          connections (connections without a type are skipped)
        - ``formats``: mapping of format name to how often it appears in a
          node's ``read`` or ``write`` section
        - ``nodes_with_deps``: number of nodes with a non-empty ``depends_on``
        - ``pipeline_names``: the ``pipeline`` value of each pipeline
          (``None`` where the key is missing)

    An empty file, or ``pipelines`` / ``connections`` / ``nodes`` keys that
    are missing or null, are treated as empty rather than raising.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(f) or {}

    # `or` also covers keys that are present but explicitly null.
    pipelines = config.get('pipelines') or []
    connections = config.get('connections') or {}

    # Flatten once; every per-node statistic below works on this list.
    all_nodes = [
        node
        for pipeline in pipelines
        for node in (pipeline.get('nodes') or [])
    ]

    # Connection types: unique, without missing values, in a stable order.
    conn_types = {(conn or {}).get('type') for conn in connections.values()}
    conn_types.discard(None)

    # Formats used, counted over both the read and the write section.
    formats = Counter()
    for node in all_nodes:
        for section in ('read', 'write'):
            spec = node.get(section)
            if isinstance(spec, dict) and spec.get('format'):
                formats[spec['format']] += 1

    nodes_with_deps = sum(1 for node in all_nodes if node.get('depends_on'))

    return {
        'total_pipelines': len(pipelines),
        'total_nodes': len(all_nodes),
        'connection_types': sorted(conn_types, key=str),
        'formats': dict(formats),
        'nodes_with_deps': nodes_with_deps,
        'pipeline_names': [p.get('pipeline') for p in pipelines],
    }

# Test with real Odibi config
# NOTE: this absolute path is specific to the author's machine; point it at
# your own copy of example_delta_pipeline.yaml before running the cell.
odibi_path = r'c:\Users\hodibi\OneDrive - Ingredion\Desktop\Repos\Odibi\examples\example_delta_pipeline.yaml'
analysis = analyze_odibi_config(odibi_path)

# str() keeps the report printable even when a name or type is missing (None).
pipeline_names = ', '.join(str(name) for name in analysis['pipeline_names'])
connection_types = ', '.join(str(kind) for kind in analysis['connection_types'])

print("📊 Odibi Config Analysis")
print("=" * 50)
print(f"Pipelines: {analysis['total_pipelines']}")
print(f"  Names: {pipeline_names}")
print(f"\nNodes: {analysis['total_nodes']}")
print(f"  With dependencies: {analysis['nodes_with_deps']}")
print(f"\nConnection types: {connection_types}")
print("\nFormats used:")
for fmt, count in analysis['formats'].items():
    print(f"  {fmt}: {count}x")
print("\n✅ Analysis complete!")

## Bonus Solution: Config Linter

In [None]:
import re
from pathlib import Path
from typing import List

def lint_yaml_config(yaml_path: Path) -> List[str]:
    """Check YAML config for common issues.

    Checks, in the order their warnings are reported:
    1. Possible hardcoded secrets (``password`` / ``secret`` / ``api_key``
       keys with a value on a line that contains no ``${`` placeholder).
    2. Unquoted boolean-like scalars (NO/YES/ON/OFF in upper, lower or
       capitalized form) used as a mapping value or list item, with or
       without a trailing comment.
    3. Lines containing TODO or FIXME.
    4. camelCase keys anywhere in the parsed document, or a single parsing
       error message when the file is not valid YAML.

    Checks 1-3 work on the raw text, so they are reported even when the file
    cannot be parsed.

    Args:
        yaml_path: Path of the YAML file to lint.

    Returns:
        Human-readable warning strings; empty when nothing was found.
    """
    issues: List[str] = []

    # Read raw YAML (before parsing)
    content = yaml_path.read_text(encoding='utf-8')
    lines = content.split('\n')

    # Check 1: Hardcoded secrets (at most one warning per line)
    secret_re = re.compile(r'(?:password|secret|api_key)\s*:\s*\S+', re.IGNORECASE)
    for lineno, line in enumerate(lines, 1):
        # A `${` on the line means the value comes from an env var.
        if '${' not in line and secret_re.search(line):
            issues.append(f"Line {lineno}: Possible hardcoded secret")

    # Check 2: Unquoted boolean-like values
    risky_forms = [
        form
        for word in ('NO', 'YES', 'ON', 'OFF')
        for form in (word, word.capitalize(), word.lower())
    ]
    # Value position is after "key: " or after a list dash; an optional
    # "  # comment" may follow, otherwise commented lines would be missed.
    risky_re = re.compile(
        rf'(?::|^\s*-)\s+({"|".join(risky_forms)})(?:\s+#.*)?\s*$'
    )
    for lineno, line in enumerate(lines, 1):
        found = risky_re.search(line)
        if found:
            issues.append(
                f"Line {lineno}: Unquoted '{found.group(1)}' may be misinterpreted"
            )

    # Check 3: TODO/FIXME markers
    for lineno, line in enumerate(lines, 1):
        if 'TODO' in line or 'FIXME' in line:
            issues.append(f"Line {lineno}: Contains TODO/FIXME")

    # Check 4: Parse and check structure. Only the parse itself is guarded.
    try:
        config = yaml.safe_load(content)
    except yaml.YAMLError as e:
        issues.append(f"YAML parsing error: {e}")
        return issues

    camel_re = re.compile(r'[a-z][A-Z]')

    def check_keys(obj, path=''):
        """Walk dicts and lists, recording every camelCase key with its path."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                # Top-level keys have no parent, hence no leading dot.
                key_path = f"{path}.{key}" if path else str(key)
                # YAML keys may be ints, bools, ...; only strings have a case.
                if isinstance(key, str) and camel_re.search(key):
                    issues.append(f"{key_path}: Uses camelCase (prefer snake_case)")
                check_keys(value, key_path)
        elif isinstance(obj, list):
            for index, item in enumerate(obj):
                check_keys(item, f"{path}[{index}]")

    check_keys(config)

    return issues

# Test: a document that deliberately contains one instance of each problem.
test_yaml = """
project: Test
database:
  host: localhost
  password: hardcoded_secret_123  # Bad!
  country: NO  # Unquoted!
camelCaseKey: value  # Bad naming
# TODO: fix this later
"""

test_file = Path('example_configs/test_lint.yaml')
# Create the directory here so this cell also runs on its own.
test_file.parent.mkdir(parents=True, exist_ok=True)
test_file.write_text(test_yaml, encoding='utf-8')

warnings = lint_yaml_config(test_file)
print("🔍 Linter Results:")
for warning in warnings:
    print(f"  ⚠️  {warning}")

if not warnings:
    print("  ✅ No issues found!")