# Pipeline Orchestration Exercises

Practice orchestrating complete pipeline execution flows.

## Exercise 1: Build a Simple Pipeline Executor

Create a simplified Pipeline class that:
1. Takes a list of node configs
2. Determines execution order
3. Executes nodes sequentially
4. Tracks which nodes completed, failed, or were skipped

In [None]:
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

@dataclass
class SimpleNodeConfig:
    """Configuration for a single node in a SimplePipeline.

    ``execute_fn`` is optional: it defaults to ``None``, so code that runs a
    node must check for ``None`` before calling it.
    """
    # Node name; other nodes refer to it through their ``depends_on`` lists.
    name: str
    # Names of the nodes this node depends on (a fresh list per instance).
    depends_on: List[str] = field(default_factory=list)
    # Function that executes the node (returns True for success, False for failure).
    # Annotated Optional because the default is None (no executor attached).
    execute_fn: Optional[Callable[[], bool]] = None

@dataclass
class SimplePipelineResults:
    """Outcome buckets returned by ``SimplePipeline.run``.

    Each bucket is a list of node names. Every instance gets its own empty
    lists through ``default_factory``, so results are never shared between
    runs.
    """
    # Names of nodes that were executed and reported success.
    completed: List[str] = field(default_factory=list)
    # Names of nodes that were executed and reported failure.
    failed: List[str] = field(default_factory=list)
    # Names of nodes that were not executed because a dependency failed.
    skipped: List[str] = field(default_factory=list)

class SimplePipeline:
    """Exercise scaffold for a sequential pipeline executor.

    The methods are intentionally left as TODOs. As written, ``__init__``
    stores nothing, ``_topological_sort`` returns ``None`` and ``run``
    returns an empty ``SimplePipelineResults``.
    """

    def __init__(self, nodes: List[SimpleNodeConfig]):
        """Create a pipeline from ``nodes`` (storing them is part of the exercise)."""
        # TODO: Store nodes
        pass
    
    def _topological_sort(self) -> List[str]:
        """Return execution order using topological sort."""
        # TODO: Implement topological sort
        # Hint: Use the algorithm from Module 5
        pass
    
    def run(self) -> SimplePipelineResults:
        """Execute pipeline and return results.

        Returns:
            A SimplePipelineResults instance. Until the TODOs below are
            implemented, all three of its lists are empty.
        """
        results = SimplePipelineResults()
        
        # TODO: Get execution order
        # TODO: For each node:
        #   - Check if dependencies failed
        #   - Skip if yes
        #   - Execute and track result
        
        return results

# Try out your implementation.
# B reports failure, so D (which depends on B) and E (which depends on D)
# should end up skipped, while A and C should complete.
nodes = [
    SimpleNodeConfig(name="A", depends_on=[], execute_fn=lambda: True),
    SimpleNodeConfig(name="B", depends_on=[], execute_fn=lambda: False),
    SimpleNodeConfig(name="C", depends_on=["A"], execute_fn=lambda: True),
    SimpleNodeConfig(name="D", depends_on=["B"], execute_fn=lambda: True),
    SimpleNodeConfig(name="E", depends_on=["C", "D"], execute_fn=lambda: True),
]

pipeline = SimplePipeline(nodes)
results = pipeline.run()

# Expected output for a correct implementation:
#   Completed: ['A', 'C']
#   Failed: ['B']
#   Skipped: ['D', 'E']
print(f"Completed: {results.completed}")
print(f"Failed: {results.failed}")
print(f"Skipped: {results.skipped}")

## Exercise 2: Implement Layer Detection

Group nodes into execution layers based on their dependencies, so that all nodes in the same layer can run in parallel.

In [None]:
def get_execution_layers(nodes: Dict[str, List[str]]) -> List[List[str]]:
    """
    Group nodes into layers for parallel execution.
    
    Args:
        nodes: Dict mapping node name to list of dependencies
    
    Returns:
        List of layers, where each layer is a list of node names.
        The order of names inside a layer is not significant (the
        accompanying tests sort each layer before comparing).
    
    Example:
        Input: {'A': [], 'B': [], 'C': ['A'], 'D': ['A', 'B'], 'E': ['C', 'D']}
        Output: [['A', 'B'], ['C', 'D'], ['E']]

    Note:
        This is an exercise scaffold: as written the function returns
        ``None`` until the TODO below is implemented.
    """
    # TODO: Implement layer detection
    # Hint: Layer 0 has no dependencies
    # Layer 1 depends only on Layer 0
    # Layer N depends only on Layers 0..N-1
    pass

# Fixtures: each case pairs a dependency map with the layers it should yield.
test_cases = [
    # Diamond-shaped graph with two roots.
    dict(
        nodes={'A': [], 'B': [], 'C': ['A'], 'D': ['A', 'B'], 'E': ['C', 'D']},
        expected=[['A', 'B'], ['C', 'D'], ['E']],
    ),
    # Straight chain: one node per layer.
    dict(
        nodes={'A': [], 'B': ['A'], 'C': ['B'], 'D': ['C']},
        expected=[['A'], ['B'], ['C'], ['D']],
    ),
    # Three independent roots feeding a single node.
    dict(
        nodes={'A': [], 'B': [], 'C': [], 'D': ['A', 'B', 'C']},
        expected=[['A', 'B', 'C'], ['D']],
    ),
]

for i, test in enumerate(test_cases, start=1):
    result = get_execution_layers(test['nodes'])
    # Order inside a layer is irrelevant, so normalise both sides first.
    result_sorted = list(map(sorted, result))
    expected_sorted = list(map(sorted, test['expected']))

    if result_sorted != expected_sorted:
        print(f"❌ Test {i} failed")
        print(f"   Expected: {expected_sorted}")
        print(f"   Got: {result_sorted}")
        continue

    print(f"✅ Test {i} passed")

## Exercise 3: Build PipelineResults Aggregator

Aggregate results from multiple pipeline runs.

In [None]:
@dataclass
class PipelineRun:
    """Record of one pipeline execution, used as input to the aggregator."""
    # Name identifying the pipeline that was run.
    pipeline_name: str
    # Names of nodes that completed.
    completed: List[str]
    # Names of nodes that failed; an empty list means the pipeline counts as successful.
    failed: List[str]
    # Names of nodes that were skipped.
    skipped: List[str]
    # Run time of the pipeline (the driver below prints the total with an "s" suffix).
    duration: float

@dataclass
class AggregatedResults:
    """Summary counters describing a batch of pipeline runs."""
    # Pipeline-level counts.
    total_pipelines: int
    successful_pipelines: int
    failed_pipelines: int
    # Node-level counts.
    total_nodes: int
    total_completed: int
    total_failed: int
    total_skipped: int
    # Combined duration of all runs.
    total_duration: float

    def success_rate(self) -> float:
        """Return completed nodes as a percentage of all nodes.

        Yields 0.0 when there are no nodes at all, which avoids a division
        by zero.
        """
        node_count = self.total_nodes
        return (self.total_completed / node_count) * 100 if node_count != 0 else 0.0

def aggregate_pipeline_results(runs: List[PipelineRun]) -> AggregatedResults:
    """
    Aggregate results from multiple pipeline runs.
    
    A pipeline is considered successful if it has no failed nodes.

    Args:
        runs: The pipeline runs to summarise.

    Returns:
        An AggregatedResults with the combined statistics. This is an
        exercise scaffold: as written the function returns ``None`` until
        the TODO below is implemented.
    """
    # TODO: Calculate aggregated statistics
    pass

# Sample runs: two clean pipelines and one with a failed node.
runs = [
    PipelineRun(pipeline_name="bronze_to_silver", completed=["A", "B", "C"],
                failed=[], skipped=[], duration=10.5),
    PipelineRun(pipeline_name="silver_to_gold", completed=["D", "E"],
                failed=["F"], skipped=["G", "H"], duration=5.2),
    PipelineRun(pipeline_name="gold_analytics", completed=["I", "J", "K", "L"],
                failed=[], skipped=[], duration=8.3),
]

agg = aggregate_pipeline_results(runs)

# Expected values for a correct implementation:
#   pipelines : 3 total, 2 successful, 1 failed
#   nodes     : 12 total = 9 completed + 1 failed + 2 skipped
#   duration  : 10.5 + 5.2 + 8.3 = 24.0
#   success   : 9 / 12 = 75.0%
print(f"Total Pipelines: {agg.total_pipelines}")
print(f"Successful: {agg.successful_pipelines}")
print(f"Failed: {agg.failed_pipelines}")
print(f"Total Nodes: {agg.total_nodes}")
print(f"Completed: {agg.total_completed}")
print(f"Failed: {agg.total_failed}")
print(f"Skipped: {agg.total_skipped}")
print(f"Total Duration: {agg.total_duration:.1f}s")
print(f"Success Rate: {agg.success_rate():.1f}%")

## Exercise 4: Pipeline Validation

Implement comprehensive pipeline validation.

In [None]:
@dataclass
class ValidationResult:
    """Outcome of ``validate_pipeline``: an overall verdict plus messages."""
    # Overall verdict; expected to be False whenever ``errors`` is non-empty.
    valid: bool
    # Problems that make the pipeline invalid (fresh list per instance).
    errors: List[str] = field(default_factory=list)
    # Non-fatal findings, e.g. an unavailable connection (fresh list per instance).
    warnings: List[str] = field(default_factory=list)

@dataclass
class NodeDef:
    """Minimal node definition consumed by ``validate_pipeline``."""
    # Node name; the validation exercise treats repeated names as an error.
    name: str
    # Names of the nodes this node depends on.
    depends_on: List[str]
    # Name of the connection the node uses; checked against ``available_connections``.
    connection: str

def validate_pipeline(
    nodes: List[NodeDef],
    available_connections: List[str]
) -> ValidationResult:
    """
    Validate pipeline configuration.
    
    Checks:
    1. No circular dependencies
    2. All dependencies exist
    3. Connections are available (warning if missing)
    4. No duplicate node names

    Args:
        nodes: Node definitions that make up the pipeline.
        available_connections: Connection names that nodes may reference.

    Returns:
        A ValidationResult. This is an exercise scaffold: until the TODO
        below is implemented the result is always ``valid=True`` with empty
        ``errors`` and ``warnings``, whatever the input.
    """
    # Start optimistic; the checks are expected to flip ``valid`` on any error.
    result = ValidationResult(valid=True)
    
    # TODO: Implement validation checks
    # Set result.valid = False if any errors found
    # Add errors to result.errors
    # Add warnings to result.warnings
    
    return result

# Fixtures: one well-formed pipeline and three broken ones.
test_valid = [
    NodeDef("A", [], "local"),
    NodeDef("B", ["A"], "local"),
    NodeDef("C", ["B"], "azure"),
]

# A -> B -> C -> A forms a cycle.
test_circular = [
    NodeDef("A", ["B"], "local"),
    NodeDef("B", ["C"], "local"),
    NodeDef("C", ["A"], "local"),
]

# B depends on "X", which is not defined anywhere.
test_missing_dep = [
    NodeDef("A", [], "local"),
    NodeDef("B", ["X"], "local"),
]

# The name "A" appears twice.
test_duplicate = [
    NodeDef("A", [], "local"),
    NodeDef("A", [], "local"),
]

connections = ["local", "azure"]

# (heading, fixture) pairs; every heading after the first starts with a
# newline so the reports are separated by a blank line.
scenarios = [
    ("Test 1 - Valid pipeline:", test_valid),
    ("\nTest 2 - Circular dependency:", test_circular),
    ("\nTest 3 - Missing dependency:", test_missing_dep),
    ("\nTest 4 - Duplicate nodes:", test_duplicate),
]

for heading, node_defs in scenarios:
    print(heading)
    result = validate_pipeline(node_defs, connections)
    print(f"  Valid: {result.valid}")
    print(f"  Errors: {result.errors}")

## Exercise 5: Pipeline Execution Simulator

Create a complete pipeline execution simulator with timing and random failures.

In [None]:
import random
import time
from datetime import datetime

@dataclass
class SimulatedNode:
    """Description of one node for the pipeline execution simulator."""
    # Node name, referenced by other nodes' ``depends_on`` lists.
    name: str
    # Names of the nodes this node depends on.
    depends_on: List[str]
    # How long the simulated execution should take.
    duration: float  # Seconds
    # Probability that the simulated execution fails.
    failure_rate: float  # 0.0 to 1.0

@dataclass
class ExecutionResults:
    """Outcome of one simulated pipeline run.

    All fields except ``pipeline_name`` have defaults; the list and dict
    fields get a fresh container per instance.
    """
    # Name of the pipeline that was run.
    pipeline_name: str
    # Names of nodes that completed.
    completed: List[str] = field(default_factory=list)
    # Names of nodes that failed.
    failed: List[str] = field(default_factory=list)
    # Names of nodes that were skipped.
    skipped: List[str] = field(default_factory=list)
    # Presumably maps node name -> execution time in seconds; confirm in your implementation.
    node_durations: Dict[str, float] = field(default_factory=dict)
    # Run start/end timestamps as strings; the format is not fixed by this scaffold.
    start_time: str = ""
    end_time: str = ""
    # Duration of the whole run (the driver below prints it with an "s" suffix).
    total_duration: float = 0.0

class PipelineSimulator:
    """Exercise scaffold for a pipeline execution simulator.

    Every method is intentionally left as a TODO and currently returns
    ``None``; in particular ``run`` must be implemented before the driver
    code below can read attributes from its result.
    """

    def __init__(self, pipeline_name: str, nodes: List[SimulatedNode]):
        """Create a simulator for ``nodes`` under the given pipeline name."""
        # TODO: Initialize simulator
        pass
    
    def _topological_sort(self) -> List[str]:
        """Return node names in an order that respects dependencies (to be implemented)."""
        # TODO: Implement topological sort
        pass
    
    def _execute_node(self, node: SimulatedNode) -> bool:
        """
        Simulate node execution.
        
        Returns True for success, False for failure.
        """
        # TODO: Sleep for node.duration
        # TODO: Randomly fail based on failure_rate
        pass
    
    def run(self, verbose: bool = True) -> ExecutionResults:
        """
        Execute pipeline with simulation.
        
        If verbose, print progress as nodes execute.

        Returns:
            An ExecutionResults describing the run (currently ``None``
            because the method body is still a stub).
        """
        # TODO: Implement full execution with:
        # - Timing
        # - Dependency checking
        # - Failure propagation
        # - Progress logging
        pass

# Try out the simulator: the two raw_* nodes have a 10% failure chance each,
# the downstream nodes never fail on their own.
nodes = [
    SimulatedNode(name="raw_customers", depends_on=[], duration=0.5, failure_rate=0.1),
    SimulatedNode(name="raw_orders", depends_on=[], duration=0.5, failure_rate=0.1),
    SimulatedNode(name="clean_customers", depends_on=["raw_customers"],
                  duration=1.0, failure_rate=0.0),
    SimulatedNode(name="clean_orders", depends_on=["raw_orders"],
                  duration=1.0, failure_rate=0.0),
    SimulatedNode(name="customer_orders", depends_on=["clean_customers", "clean_orders"],
                  duration=1.5, failure_rate=0.0),
]

simulator = PipelineSimulator("bronze_to_silver", nodes)
results = simulator.run(verbose=True)

# Summary block framed by a 60-character rule.
divider = "=" * 60
print("\n" + divider)
print(f"Pipeline: {results.pipeline_name}")
print(f"Completed: {len(results.completed)}")
print(f"Failed: {len(results.failed)}")
print(f"Skipped: {len(results.skipped)}")
print(f"Total Duration: {results.total_duration:.2f}s")
print(divider)

## Challenge Exercise: Multi-Pipeline Manager

Build a manager that can run multiple pipelines with dependencies between them.

In [None]:
@dataclass
class PipelineDef:
    """Definition of one pipeline handled by ``MultiPipelineManager``."""
    # Pipeline name, referenced by other pipelines' ``depends_on_pipelines``.
    name: str
    # Names of the nodes that belong to this pipeline.
    nodes: List[str]
    # Names of pipelines this pipeline depends on (fresh list per instance).
    depends_on_pipelines: List[str] = field(default_factory=list)

class MultiPipelineManager:
    """
    Manages multiple pipelines where pipelines can depend on other pipelines.
    
    Example:
        bronze_to_silver: no dependencies
        silver_to_gold: depends on bronze_to_silver
        analytics: depends on silver_to_gold

    This is an exercise scaffold: every method body is a TODO and currently
    returns ``None``.
    """
    
    def __init__(self, pipelines: List[PipelineDef]):
        """Create a manager for ``pipelines`` (storing them is part of the exercise)."""
        # TODO: Store pipelines
        pass
    
    def _get_pipeline_execution_order(self) -> List[str]:
        """
        Get execution order for pipelines based on their dependencies.
        """
        # TODO: Topological sort on pipelines
        pass
    
    def run(self, pipeline_names: Optional[List[str]] = None) -> Dict[str, bool]:
        """
        Run specified pipelines (or all if None).
        
        Returns dict mapping pipeline name to success status.
        
        If a pipeline fails, dependent pipelines are skipped.
        """
        # TODO: Implement multi-pipeline execution
        pass

# Try out the manager with a four-stage chain:
# raw_ingestion -> bronze_to_silver -> silver_to_gold -> analytics
pipelines = [
    PipelineDef(name="raw_ingestion", nodes=["fetch_customers", "fetch_orders"]),
    PipelineDef(
        name="bronze_to_silver",
        nodes=["clean_customers", "clean_orders"],
        depends_on_pipelines=["raw_ingestion"],
    ),
    PipelineDef(
        name="silver_to_gold",
        nodes=["customer_orders", "order_metrics"],
        depends_on_pipelines=["bronze_to_silver"],
    ),
    PipelineDef(
        name="analytics",
        nodes=["customer_analytics", "revenue_report"],
        depends_on_pipelines=["silver_to_gold"],
    ),
]

manager = MultiPipelineManager(pipelines)

print("Running all pipelines:")
results = manager.run()

# One line per pipeline, prefixed with a pass/fail marker.
for name, success in results.items():
    if success:
        status = "✅"
    else:
        status = "❌"
    print(f"{status} {name}")