# Module 10.3: Documentation & Reproducibility

**Learning Objectives:**
- Set up automated documentation generation with MkDocs
- Enforce dataset path resolution standards across modules
- Implement reproducible environment management
- Create robust error handling and CLI patterns

**Semiconductor Context:**
Production ML systems require comprehensive documentation and reproducible environments to ensure consistent results across different teams, environments, and time periods. This is critical for regulatory compliance and process validation in semiconductor manufacturing.

In [None]:
import json
import sys
from pathlib import Path
import subprocess
import tempfile
import os

# Dataset path resolution following copilot instructions.
# The relative path is resolved against the current working directory, so the
# two prints below show where it actually points and whether it is present.
DATA_DIR = Path('../../../datasets').resolve()
print(f"Dataset directory: {DATA_DIR}")
print(f"Exists: {DATA_DIR.exists()}")

# Import our pipeline
sys.path.append('.')
# The pipeline's file name contains dots and dashes, so it cannot be imported
# with a plain `import` statement; it is loaded from its file path instead.
# The importlib helper for that is `spec_from_file_location`.
from importlib.util import spec_from_file_location, module_from_spec

# Load the pipeline module
spec = spec_from_file_location("doc_pipeline", "10.3-documentation-reproducibility-pipeline.py")
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader can handle the path
    raise ImportError(
        "Could not create an import spec for 10.3-documentation-reproducibility-pipeline.py"
    )
doc_pipeline = module_from_spec(spec)
# Register the module before executing it, as the importlib recipe for
# importing a source file directly does; code that looks itself up through
# sys.modules during import then finds the module.
sys.modules["doc_pipeline"] = doc_pipeline
spec.loader.exec_module(doc_pipeline)

# Create pipeline instance
pipeline = doc_pipeline.DocumentationReproducibilityPipeline()

## 1. Documentation Generation

### Converting Notebooks to Documentation

The first step in establishing a documentation workflow is converting Jupyter notebooks to readable documentation formats.

In [None]:
# Scratch workspace for the demonstration; the demo notebook gets its own sub-folder
temp_dir = Path(tempfile.mkdtemp())
demo_nb_dir = temp_dir / "demo_notebooks"
demo_nb_dir.mkdir(parents=True)

# Markdown cell: a title followed by a one-line description
intro_cell = {
    "cell_type": "markdown",
    "metadata": {},
    "source": ["# Demo Notebook\n", "\n", "This is a demonstration notebook for documentation generation."],
}

# Code cell: a single print statement together with its recorded stdout
hello_cell = {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [
        {
            "name": "stdout",
            "output_type": "stream",
            "text": ["Hello, semiconductor world!\n"],
        },
    ],
    "source": ["print('Hello, semiconductor world!')"],
}

# Assemble the notebook document (nbformat 4.4) from the two cells above
demo_notebook = {
    "cells": [intro_cell, hello_cell],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
    },
    "nbformat": 4,
    "nbformat_minor": 4,
}

# Serialize the notebook to disk as indented JSON
demo_nb_path = demo_nb_dir / "demo.ipynb"
with demo_nb_path.open('w') as nb_file:
    nb_file.write(json.dumps(demo_notebook, indent=2))

print(f"Created demo notebook: {demo_nb_path}")

In [None]:
# Convert the demo notebook; converted files go into a folder inside the scratch workspace
output_dir = temp_dir / "docs_output"

# format="both" asks for markdown and HTML output
result = pipeline.generate_documentation(input_dir=demo_nb_dir, output_dir=output_dir, format="both")

print("Documentation generation result:")
print(json.dumps(result, indent=2))

# On success, list every converted file that is actually present on disk, with its size
if result["success"]:
    print("\nGenerated files:")
    for entry in result["converted_files"]:
        produced = Path(entry["output"])
        if not produced.exists():
            continue
        print(f"  {entry['format']}: {produced} ({produced.stat().st_size} bytes)")

## 2. Dataset Path Validation

### Enforcing Standard Path Resolution

One of the critical aspects of reproducibility is ensuring consistent dataset access patterns across all modules.

In [None]:
# Run the dataset-path validation over the directory two levels up
modules_dir = Path("../../")
validation_result = pipeline.validate_dataset_paths(modules_dir)

print("Dataset path validation result:")
# default=str lets values that are not JSON-serializable be printed as strings
print(json.dumps(validation_result, indent=2, default=str))

# Report either the failure or the three summary counters
if not validation_result["success"]:
    print("\n❌ Validation failed")
else:
    summary = validation_result["summary"]
    print("\n✅ Validation Summary:")
    for label, key in (
        ("Total notebooks", "total_notebooks"),
        ("Warnings", "total_warnings"),
        ("Invalid paths", "total_invalid_paths"),
    ):
        print(f"  {label}: {summary[key]}")

## 3. Environment Export for Reproducibility

### Creating Reproducible Environment Specifications

For reproducible results, we need to capture exact environment specifications.

In [None]:
# All environment exports are written into one folder of the scratch workspace
export_dir = temp_dir / "environment_exports"
export_dir.mkdir(exist_ok=True)

# Conda specification first
conda_result = pipeline.export_environment(output_path=export_dir / "environment.yml", format="conda")
print("Conda environment export:")
print(json.dumps(conda_result, indent=2))

# Pip requirements as the fallback format
pip_result = pipeline.export_environment(output_path=export_dir / "requirements.txt", format="pip")
print("\nPip requirements export:")
print(json.dumps(pip_result, indent=2))

# Preview the first ten lines of every export that succeeded and exists on disk
for result in (conda_result, pip_result):
    if not result["success"]:
        continue
    file_path = Path(result["output_file"])
    if not file_path.exists():
        continue
    print(f"\nFirst 10 lines of {file_path.name}:")
    with open(file_path, 'r') as f:
        head = f.readlines()[:10]
    for lineno, text in enumerate(head, 1):
        print(f"  {lineno:2d}: {text.rstrip()}")

## 4. MkDocs Documentation Site Setup

### Creating a Professional Documentation Site

MkDocs with the Material theme provides a modern, searchable documentation site.

In [None]:
# Scaffold an MkDocs project inside the scratch workspace
mkdocs_dir = temp_dir / "mkdocs_site"
setup_result = pipeline.setup_mkdocs(mkdocs_dir)

print("MkDocs setup result:")
print(json.dumps(setup_result, indent=2))

if setup_result["success"]:
    # Dump the generated configuration file, if it is present on disk
    config_file = Path(setup_result["config_file"])
    if config_file.exists():
        print(f"\nContents of {config_file.name}:")
        print(config_file.read_text())

        # List every file of the project, relative to the project root
        print("\nMkDocs project structure:")
        project_files = sorted(p for p in mkdocs_dir.rglob("*") if p.is_file())
        for item in project_files:
            print(f"  {item.relative_to(mkdocs_dir)}")

In [None]:
# Try to build the static site from the MkDocs project created above
build_result = pipeline.build_docs(mkdocs_dir)

print("MkDocs build result:")
print(json.dumps(build_result, indent=2))

if not build_result["success"]:
    print("Note: MkDocs build may fail if mkdocs is not installed. This is expected in some environments.")
else:
    # List every generated HTML page with its size, relative to the site root
    site_dir = Path(build_result["site_dir"])
    if site_dir.exists():
        print("\nGenerated site files:")
        for page in sorted(site_dir.rglob("*.html")):
            print(f"  {page.relative_to(site_dir)} ({page.stat().st_size} bytes)")

## 5. CLI Usage Examples

### Command-Line Interface for Production Use

The pipeline provides a comprehensive CLI for automation and CI/CD integration.

In [None]:
# Demonstrate CLI help
import subprocess

# Run the pipeline script with --help in a child interpreter and show its output.
try:
    result = subprocess.run(
        [sys.executable, "10.3-documentation-reproducibility-pipeline.py", "--help"],
        capture_output=True,
        text=True,
        cwd=Path.cwd(),
        timeout=10,  # same limit as the subcommand help calls; avoids hanging the notebook
    )
except Exception as e:
    # Covers a missing interpreter as well as subprocess.TimeoutExpired
    print(f"Could not run CLI help: {e}")
else:
    print("CLI Help Output:")
    print(result.stdout)
    # A non-zero exit (e.g. script not found) leaves stdout empty; surface the
    # captured stderr so the failure is visible instead of silently blank.
    if result.returncode != 0:
        print(f"CLI exited with code {result.returncode}:")
        print(result.stderr)

In [None]:
# Test CLI subcommand help
subcommands = ['generate-docs', 'validate-paths', 'export-env', 'setup-mkdocs', 'build-docs']

# Ask each subcommand for its --help text in a child interpreter.
for cmd in subcommands:
    try:
        result = subprocess.run(
            [sys.executable, "10.3-documentation-reproducibility-pipeline.py", cmd, "--help"],
            capture_output=True,
            text=True,
            cwd=Path.cwd(),
            timeout=10
        )
    except Exception as e:
        # Covers a missing interpreter as well as subprocess.TimeoutExpired;
        # move on to the next subcommand rather than aborting the loop.
        print(f"Could not get help for {cmd}: {e}")
        continue

    print(f"\n{'='*50}")
    print(f"CLI Help for '{cmd}':")
    print('='*50)
    print(result.stdout)
    # A non-zero exit (e.g. unknown subcommand, script not found) leaves stdout
    # empty; surface the captured stderr so the failure is visible.
    if result.returncode != 0:
        print(f"'{cmd}' exited with code {result.returncode}:")
        print(result.stderr)

## 6. Best Practices Summary

### Key Reproducibility Patterns

1. **Standardized Dataset Paths**: Use the `Path('../../../datasets').resolve()` pattern
2. **Version Pinning**: Export exact environment specifications
3. **Documentation Automation**: Convert notebooks to multiple formats
4. **Error Handling**: JSON error responses for programmatic usage
5. **CLI Design**: Comprehensive help text with examples

In [None]:
# Cleanup temporary files
import shutil

# Best-effort removal of the scratch workspace; a failure is reported, not raised
try:
    shutil.rmtree(temp_dir)
except Exception as e:
    print(f"Could not clean up {temp_dir}: {e}")
else:
    print(f"Cleaned up temporary directory: {temp_dir}")

# Closing banner followed by the suggested follow-up actions
print("\n✅ Module 10.3 Documentation & Reproducibility Complete!")
print("\nNext steps:")
for step in (
    "1. Set up mkdocs in your project root",
    "2. Configure CI/CD to validate dataset paths",
    "3. Automate environment exports for releases",
    "4. Integrate notebook conversion into documentation pipeline",
):
    print(step)