# Pipeline Orchestration Solutions

Complete solutions with detailed explanations.

## Solution 1: Simple Pipeline Executor

In [None]:
from dataclasses import dataclass, field
from typing import List, Dict, Any, Callable
from collections import defaultdict, deque

@dataclass
class SimpleNodeConfig:
    """Declarative description of a single node handled by SimplePipeline."""
    # Node identifier; SimplePipeline keys its node map on this value.
    name: str
    # Names of the nodes that must be processed before this one.
    depends_on: List[str] = field(default_factory=list)
    # Zero-argument callable invoked to run the node. A truthy return value
    # marks the node as completed; a falsy one marks it as failed. With the
    # default of None the call raises TypeError, which SimplePipeline.run
    # catches and records as a failure.
    execute_fn: Callable[[], bool] = None

@dataclass
class SimplePipelineResults:
    """Outcome buckets filled in by SimplePipeline.run, in execution order."""
    # Nodes whose execute_fn returned a truthy value.
    completed: List[str] = field(default_factory=list)
    # Nodes whose execute_fn returned a falsy value or raised an exception.
    failed: List[str] = field(default_factory=list)
    # Nodes that were not executed because a dependency had failed.
    skipped: List[str] = field(default_factory=list)

class SimplePipeline:
    """Run a set of nodes one after another in dependency order.

    A node is executed only when every one of its dependencies completed.
    If any dependency failed *or was itself skipped*, the node is skipped,
    so a failure blocks the whole downstream sub-graph.
    """

    def __init__(self, nodes: List["SimpleNodeConfig"]):
        # Map name -> node; a later node with a repeated name replaces an
        # earlier one.
        self.nodes = {node.name: node for node in nodes}

    def _topological_sort(self) -> List[str]:
        """Return node names ordered so each node follows its dependencies.

        Implements Kahn's algorithm. Nodes that become ready at the same
        time keep the order in which they were passed to the constructor.

        Returns:
            List of node names in a valid execution order.

        Raises:
            ValueError: If a node depends on a name that is not part of the
                pipeline, or if the dependency graph contains a cycle.
        """
        in_degree = {name: 0 for name in self.nodes}
        dependents = defaultdict(list)

        for name, node in self.nodes.items():
            # dict.fromkeys de-duplicates while preserving order, so a
            # dependency listed twice is only counted once.
            for dep in dict.fromkeys(node.depends_on):
                if dep not in self.nodes:
                    raise ValueError(
                        f"Node '{name}': dependency '{dep}' does not exist"
                    )
                in_degree[name] += 1
                dependents[dep].append(name)

        # Start with the nodes that have no dependencies at all.
        queue = deque(name for name, degree in in_degree.items() if degree == 0)
        order = []

        while queue:
            current = queue.popleft()
            order.append(current)

            # Release every node that was waiting on the current one.
            for name in dependents.get(current, ()):
                in_degree[name] -= 1
                if in_degree[name] == 0:
                    queue.append(name)

        # Nodes that never reached in-degree zero are part of a cycle.
        if len(order) != len(self.nodes):
            raise ValueError("Circular dependency detected")

        return order

    def run(self) -> "SimplePipelineResults":
        """Execute every node in dependency order and collect the outcomes.

        Returns:
            A SimplePipelineResults whose lists hold the completed, failed
            and skipped node names in the order they were processed.

        Raises:
            ValueError: Propagated from the topological sort when the graph
                is invalid (unknown dependency or cycle).
        """
        results = SimplePipelineResults()

        # Names of nodes that failed or were skipped; anything depending on
        # one of these must not run.
        blocked = set()

        for node_name in self._topological_sort():
            node = self.nodes[node_name]

            if any(dep in blocked for dep in node.depends_on):
                results.skipped.append(node_name)
                blocked.add(node_name)
                print(f"⏭️  SKIPPED: {node_name} (dependency failed)")
                continue

            try:
                success = node.execute_fn()
            except Exception as e:
                # Node boundary: any error raised by user code is recorded
                # as a failure instead of aborting the whole pipeline.
                results.failed.append(node_name)
                blocked.add(node_name)
                print(f"❌ FAILED: {node_name} - {str(e)}")
                continue

            if success:
                results.completed.append(node_name)
                print(f"✅ COMPLETED: {node_name}")
            else:
                results.failed.append(node_name)
                blocked.add(node_name)
                print(f"❌ FAILED: {node_name}")

        return results

# Test: two independent roots (A succeeds, B fails) feeding nodes C, D and E.
nodes = [
    SimpleNodeConfig("A", [], lambda: True),
    SimpleNodeConfig("B", [], lambda: False),  # always reports failure
    SimpleNodeConfig("C", ["A"], lambda: True),
    SimpleNodeConfig("D", ["B"], lambda: True),  # skipped: its dependency B fails
    SimpleNodeConfig("E", ["C", "D"], lambda: True),  # depends on both branches
]

pipeline = SimplePipeline(nodes)
results = pipeline.run()

# Summarize which node names ended up in each outcome bucket.
print(f"\nCompleted: {results.completed}")
print(f"Failed: {results.failed}")
print(f"Skipped: {results.skipped}")

## Solution 2: Layer Detection

In [None]:
def get_execution_layers(nodes: Dict[str, List[str]]) -> List[List[str]]:
    """Group nodes into layers that could be executed in parallel.

    Layer 0 holds the nodes without dependencies; layer N holds the nodes
    whose dependencies all live in layers 0..N-1. Within a layer the names
    keep the insertion order of ``nodes``, so the result is deterministic.

    Args:
        nodes: Mapping of node name to the names it depends on.

    Returns:
        The layers, outermost list ordered from first to last to execute.
        An empty mapping yields an empty list.

    Raises:
        ValueError: If a dependency is not a key of ``nodes``, or if the
            dependencies form a cycle.
    """
    # Report unknown dependencies explicitly; otherwise the affected node
    # could never be scheduled and would be misreported as a cycle.
    for node, deps in nodes.items():
        for dep in deps:
            if dep not in nodes:
                raise ValueError(
                    f"Node '{node}': dependency '{dep}' does not exist"
                )

    layers = []
    processed = set()
    # A list (not a set) keeps the caller's ordering inside every layer.
    remaining = list(nodes)

    while remaining:
        current_layer = [
            node
            for node in remaining
            if all(dep in processed for dep in nodes[node])
        ]

        # Nothing is ready although nodes remain: they wait on each other.
        if not current_layer:
            raise ValueError("Circular dependency detected")

        layers.append(current_layer)
        processed.update(current_layer)
        remaining = [node for node in remaining if node not in processed]

    return layers

# Test cases: each entry pairs a dependency mapping (node -> dependencies)
# with the layers get_execution_layers is expected to produce for it.
test_cases = [
    {
        # Diamond-like graph with two roots.
        'nodes': {'A': [], 'B': [], 'C': ['A'], 'D': ['A', 'B'], 'E': ['C', 'D']},
        'expected': [['A', 'B'], ['C', 'D'], ['E']]
    },
    {
        # Straight chain: every layer holds exactly one node.
        'nodes': {'A': [], 'B': ['A'], 'C': ['B'], 'D': ['C']},
        'expected': [['A'], ['B'], ['C'], ['D']]
    },
    {
        # Fan-in: three independent roots feeding a single node.
        'nodes': {'A': [], 'B': [], 'C': [], 'D': ['A', 'B', 'C']},
        'expected': [['A', 'B', 'C'], ['D']]
    },
]

for i, test in enumerate(test_cases, 1):
    result = get_execution_layers(test['nodes'])
    # Sort each layer on both sides so the comparison does not depend on the
    # order of names inside a layer.
    result_sorted = [sorted(layer) for layer in result]
    expected_sorted = [sorted(layer) for layer in test['expected']]
    
    if result_sorted == expected_sorted:
        print(f"✅ Test {i} passed")
        print(f"   Layers: {result}")
    else:
        print(f"❌ Test {i} failed")
        print(f"   Expected: {expected_sorted}")
        print(f"   Got: {result_sorted}")

## Solution 3: PipelineResults Aggregator

In [None]:
@dataclass
class PipelineRun:
    """Summary of one pipeline run, as consumed by aggregate_pipeline_results."""
    # Name of the pipeline this run belongs to.
    pipeline_name: str
    # Node names per outcome; the aggregator only uses the list lengths.
    completed: List[str]
    failed: List[str]
    skipped: List[str]
    # Run duration; the test data and report treat it as seconds.
    duration: float

@dataclass
class AggregatedResults:
    """Totals accumulated over several pipeline runs."""
    total_pipelines: int
    successful_pipelines: int
    failed_pipelines: int
    total_nodes: int
    total_completed: int
    total_failed: int
    total_skipped: int
    total_duration: float

    def success_rate(self) -> float:
        """Return the percentage of nodes that completed (0.0 when there are no nodes)."""
        node_count = self.total_nodes
        # Guard the division: an empty aggregate has a rate of zero.
        return (self.total_completed / node_count) * 100 if node_count != 0 else 0.0

def aggregate_pipeline_results(runs: List[PipelineRun]) -> AggregatedResults:
    """Combine several pipeline runs into a single set of totals.

    A run counts as successful when its ``failed`` list is empty; the node
    total is the sum of completed, failed and skipped nodes over all runs.
    """
    run_count = len(runs)
    clean_runs = 0
    completed_count = 0
    failed_count = 0
    skipped_count = 0

    # One pass over the runs gathers every integer tally.
    for run in runs:
        run_failures = len(run.failed)
        if run_failures == 0:
            clean_runs += 1
        completed_count += len(run.completed)
        failed_count += run_failures
        skipped_count += len(run.skipped)

    # Durations are floats, so they stay with the built-in sum().
    duration_sum = sum(run.duration for run in runs)

    return AggregatedResults(
        total_pipelines=run_count,
        successful_pipelines=clean_runs,
        failed_pipelines=run_count - clean_runs,
        total_nodes=completed_count + failed_count + skipped_count,
        total_completed=completed_count,
        total_failed=failed_count,
        total_skipped=skipped_count,
        total_duration=duration_sum,
    )

# Test: three runs, of which only "silver_to_gold" has a failed node.
runs = [
    PipelineRun(
        pipeline_name="bronze_to_silver",
        completed=["A", "B", "C"],
        failed=[],
        skipped=[],
        duration=10.5
    ),
    PipelineRun(
        pipeline_name="silver_to_gold",
        completed=["D", "E"],
        failed=["F"],
        skipped=["G", "H"],
        duration=5.2
    ),
    PipelineRun(
        pipeline_name="gold_analytics",
        completed=["I", "J", "K", "L"],
        failed=[],
        skipped=[],
        duration=8.3
    ),
]

agg = aggregate_pipeline_results(runs)

# With the data above: 3 pipelines (2 successful, 1 failed) and 12 nodes
# (9 completed, 1 failed, 2 skipped), i.e. a 75.0% success rate.
print("Aggregated Results:")
print(f"  Total Pipelines: {agg.total_pipelines}")
print(f"  Successful: {agg.successful_pipelines}")
print(f"  Failed: {agg.failed_pipelines}")
print(f"  Total Nodes: {agg.total_nodes}")
print(f"  Completed: {agg.total_completed}")
print(f"  Failed: {agg.total_failed}")
print(f"  Skipped: {agg.total_skipped}")
print(f"  Total Duration: {agg.total_duration:.1f}s")
print(f"  Success Rate: {agg.success_rate():.1f}%")

## Solution 4: Pipeline Validation

In [None]:
@dataclass
class ValidationResult:
    """Outcome of validate_pipeline."""
    # False as soon as at least one error has been recorded.
    valid: bool
    # Problems that make the pipeline invalid (duplicates, missing
    # dependencies, cycles).
    errors: List[str] = field(default_factory=list)
    # Non-fatal findings (unavailable connections); they do not affect `valid`.
    warnings: List[str] = field(default_factory=list)

@dataclass
class NodeDef:
    """Node definition checked by validate_pipeline."""
    # Node identifier; expected to be unique within a pipeline.
    name: str
    # Names of the nodes this node depends on.
    depends_on: List[str]
    # Name of the connection the node uses; validate_pipeline compares it
    # against the list of available connections and only warns on a miss.
    connection: str

def validate_pipeline(
    nodes: List[NodeDef],
    available_connections: List[str]
) -> ValidationResult:
    """Validate a pipeline definition and report every problem found.

    Checks, in order: duplicate node names, dependencies on unknown nodes,
    circular dependencies (only when no earlier error was found, because
    the cycle check needs a consistent graph) and unavailable connections.

    Args:
        nodes: Node definitions making up the pipeline.
        available_connections: Connection names the nodes may refer to.

    Returns:
        A ValidationResult. ``valid`` is False when at least one error was
        recorded; unavailable connections only produce warnings.
    """
    result = ValidationResult(valid=True)

    # Detect duplicate names in a single pass (a name seen twice is a
    # duplicate) instead of calling list.count() once per node.
    seen = set()
    duplicates = set()
    for node in nodes:
        if node.name in seen:
            duplicates.add(node.name)
        else:
            seen.add(node.name)
    if duplicates:
        result.valid = False
        result.errors.append(f"Duplicate node names: {duplicates}")

    # Name -> node lookup; with duplicates the last definition wins.
    node_map = {node.name: node for node in nodes}

    # Every dependency must refer to a node that exists.
    for node in nodes:
        for dep in node.depends_on:
            if dep not in node_map:
                result.valid = False
                result.errors.append(
                    f"Node '{node.name}': dependency '{dep}' does not exist"
                )

    # Cycle detection is only meaningful on an otherwise valid graph.
    if result.valid:
        try:
            _topological_sort(node_map)
        except ValueError as e:
            result.valid = False
            result.errors.append(str(e))

    # Unknown connections are reported as warnings, not errors.
    for node in nodes:
        if node.connection not in available_connections:
            result.warnings.append(
                f"Node '{node.name}': connection '{node.connection}' not available"
            )

    return result

def _topological_sort(node_map: Dict[str, "NodeDef"]) -> List[str]:
    """Order the nodes topologically; used to detect circular dependencies.

    Implements Kahn's algorithm. Dependencies that are not keys of
    ``node_map`` are ignored, and a dependency listed more than once by the
    same node is counted a single time.

    Args:
        node_map: Mapping of node name to a node exposing ``depends_on``.

    Returns:
        Node names ordered so that every node follows its dependencies.

    Raises:
        ValueError: If the dependency graph contains a cycle.
    """
    in_degree = {name: 0 for name in node_map}
    # Reverse adjacency: dependency name -> nodes waiting on it.
    dependents = {name: [] for name in node_map}

    for name, node in node_map.items():
        # dict.fromkeys drops repeated entries but keeps their order, so
        # the increment here matches the single decrement below.
        for dep in dict.fromkeys(node.depends_on):
            if dep in in_degree:
                in_degree[name] += 1
                dependents[dep].append(name)

    queue = deque(name for name, degree in in_degree.items() if degree == 0)
    order = []

    while queue:
        current = queue.popleft()
        order.append(current)

        for name in dependents[current]:
            in_degree[name] -= 1
            if in_degree[name] == 0:
                queue.append(name)

    # Any node left unprocessed is part of (or behind) a cycle.
    if len(order) != len(node_map):
        raise ValueError("Circular dependency detected")

    return order

# Tests: one fixture per validation scenario.
# Linear chain A -> B -> C using known connections.
test_valid = [
    NodeDef("A", [], "local"),
    NodeDef("B", ["A"], "local"),
    NodeDef("C", ["B"], "azure"),
]

# A -> B -> C -> A forms a cycle.
test_circular = [
    NodeDef("A", ["B"], "local"),
    NodeDef("B", ["C"], "local"),
    NodeDef("C", ["A"], "local"),
]

# B depends on "X", which is not defined.
test_missing_dep = [
    NodeDef("A", [], "local"),
    NodeDef("B", ["X"], "local"),
]

# Two nodes share the name "A".
test_duplicate = [
    NodeDef("A", [], "local"),
    NodeDef("A", [], "local"),
]

# Connection names considered available for all four scenarios.
connections = ["local", "azure"]

print("Test 1 - Valid pipeline:")
result = validate_pipeline(test_valid, connections)
print(f"  Valid: {result.valid}")
print(f"  Errors: {result.errors}")
print(f"  Warnings: {result.warnings}")

print("\nTest 2 - Circular dependency:")
result = validate_pipeline(test_circular, connections)
print(f"  Valid: {result.valid}")
print(f"  Errors: {result.errors}")

print("\nTest 3 - Missing dependency:")
result = validate_pipeline(test_missing_dep, connections)
print(f"  Valid: {result.valid}")
print(f"  Errors: {result.errors}")

print("\nTest 4 - Duplicate nodes:")
result = validate_pipeline(test_duplicate, connections)
print(f"  Valid: {result.valid}")
print(f"  Errors: {result.errors}")

## Solution 5: Pipeline Execution Simulator

In [None]:
import random
import time
from datetime import datetime

@dataclass
class SimulatedNode:
    """Node description used by PipelineSimulator."""
    # Node identifier; the simulator keys its node map on this value.
    name: str
    # Names of the nodes that must be processed before this one.
    depends_on: List[str]
    # Seconds the simulator sleeps to imitate the node's work.
    duration: float
    # Compared against random.random(): the node succeeds when the random
    # draw is greater than this value (0.0 = practically never fails).
    failure_rate: float

@dataclass
class ExecutionResults:
    """Outcome of one PipelineSimulator.run call."""
    # Name of the simulated pipeline.
    pipeline_name: str
    # Node names per outcome, in the order the nodes were processed.
    completed: List[str] = field(default_factory=list)
    failed: List[str] = field(default_factory=list)
    skipped: List[str] = field(default_factory=list)
    # Measured elapsed seconds per executed node; skipped nodes have no entry.
    node_durations: Dict[str, float] = field(default_factory=dict)
    # ISO-8601 timestamps taken from datetime.now() at the start and end of
    # the run.
    start_time: str = ""
    end_time: str = ""
    # Elapsed seconds for the whole run.
    total_duration: float = 0.0

class PipelineSimulator:
    """Simulate a pipeline run with sleeping nodes and random failures.

    Nodes are processed sequentially in dependency order. A node is skipped
    when any of its dependencies failed or was itself skipped.
    """

    def __init__(self, pipeline_name: str, nodes: List["SimulatedNode"]):
        self.pipeline_name = pipeline_name
        # Map name -> node; a repeated name replaces the earlier node.
        self.nodes = {node.name: node for node in nodes}

    def _topological_sort(self) -> List[str]:
        """Return node names ordered so each node follows its dependencies.

        Implements Kahn's algorithm; nodes that become ready together keep
        the order in which they were passed to the constructor.

        Raises:
            ValueError: If a node depends on an unknown node, or if the
                dependency graph contains a cycle. Without this check such
                nodes would silently disappear from the execution order.
        """
        in_degree = {name: 0 for name in self.nodes}
        # Reverse adjacency: dependency name -> nodes waiting on it.
        dependents = {name: [] for name in self.nodes}

        for name, node in self.nodes.items():
            # De-duplicate so a dependency listed twice is counted once.
            for dep in dict.fromkeys(node.depends_on):
                if dep not in self.nodes:
                    raise ValueError(
                        f"Node '{name}': dependency '{dep}' does not exist"
                    )
                in_degree[name] += 1
                dependents[dep].append(name)

        queue = deque(name for name, degree in in_degree.items() if degree == 0)
        order = []

        while queue:
            current = queue.popleft()
            order.append(current)

            for name in dependents[current]:
                in_degree[name] -= 1
                if in_degree[name] == 0:
                    queue.append(name)

        if len(order) != len(self.nodes):
            raise ValueError("Circular dependency detected")

        return order

    def _execute_node(self, node: "SimulatedNode") -> bool:
        """Sleep for the node's duration, then succeed or fail at random.

        Returns:
            True when the random draw exceeds ``node.failure_rate``.
        """
        time.sleep(node.duration)
        return random.random() > node.failure_rate

    def run(self, verbose: bool = True) -> "ExecutionResults":
        """Execute the simulated pipeline and collect the outcomes.

        Args:
            verbose: When True, print the execution order and one status
                line per node.

        Returns:
            An ExecutionResults with per-node outcomes, measured durations
            and start/end timestamps.

        Raises:
            ValueError: Propagated from the topological sort when the graph
                is invalid (unknown dependency or cycle).
        """
        # perf_counter is monotonic, so elapsed times cannot go negative
        # when the system clock is adjusted during the run.
        start = time.perf_counter()
        results = ExecutionResults(
            pipeline_name=self.pipeline_name,
            start_time=datetime.now().isoformat()
        )

        execution_order = self._topological_sort()

        if verbose:
            print(f"\nExecuting pipeline: {self.pipeline_name}")
            print(f"Execution order: {' → '.join(execution_order)}\n")

        # Names of nodes that failed or were skipped; their dependents must
        # not run either.
        blocked = set()

        for node_name in execution_order:
            node = self.nodes[node_name]

            if any(dep in blocked for dep in node.depends_on):
                results.skipped.append(node_name)
                blocked.add(node_name)
                if verbose:
                    print(f"⏭️  SKIPPED: {node_name} (dependency failed)")
                continue

            if verbose:
                print(f"⚙️  EXECUTING: {node_name} ({node.duration}s)...", end=" ")

            node_start = time.perf_counter()
            success = self._execute_node(node)
            results.node_durations[node_name] = time.perf_counter() - node_start

            if success:
                results.completed.append(node_name)
                if verbose:
                    print("✅ COMPLETED")
            else:
                results.failed.append(node_name)
                blocked.add(node_name)
                if verbose:
                    print("❌ FAILED")

        results.total_duration = time.perf_counter() - start
        results.end_time = datetime.now().isoformat()

        return results

# Test the simulator: two raw sources that can fail at random, two cleaning
# steps, and a final join that needs both cleaned inputs.
# Each executed node sleeps for its duration, so a fully successful run
# takes roughly 4.5 seconds.
nodes = [
    SimulatedNode("raw_customers", [], duration=0.5, failure_rate=0.1),
    SimulatedNode("raw_orders", [], duration=0.5, failure_rate=0.1),
    SimulatedNode("clean_customers", ["raw_customers"], duration=1.0, failure_rate=0.0),
    SimulatedNode("clean_orders", ["raw_orders"], duration=1.0, failure_rate=0.0),
    SimulatedNode("customer_orders", ["clean_customers", "clean_orders"], duration=1.5, failure_rate=0.0),
]

simulator = PipelineSimulator("bronze_to_silver", nodes)
results = simulator.run(verbose=True)

# Summary report; outcomes vary between runs because failures are random.
print("\n" + "="*60)
print(f"Pipeline: {results.pipeline_name}")
print(f"Completed: {len(results.completed)} {results.completed}")
print(f"Failed: {len(results.failed)} {results.failed}")
print(f"Skipped: {len(results.skipped)} {results.skipped}")
print(f"Total Duration: {results.total_duration:.2f}s")
print("="*60)

## Challenge Solution: Multi-Pipeline Manager

In [None]:
@dataclass
class PipelineDef:
    """Definition of one pipeline managed by MultiPipelineManager."""
    # Pipeline identifier; the manager keys its pipeline map on this value.
    name: str
    # Names of the nodes in the pipeline; the manager only reports how many
    # there are.
    nodes: List[str]
    # Names of the pipelines that must succeed before this one may run.
    depends_on_pipelines: List[str] = field(default_factory=list)

class MultiPipelineManager:
    """Run whole pipelines in an order that respects their dependencies.

    Pipeline execution is simulated: every pipeline that is not skipped
    fails at random with probability ``failure_rate``. A pipeline whose
    dependency failed (or was skipped) is skipped and reported as failed.
    """

    def __init__(self, pipelines: List["PipelineDef"], failure_rate: float = 0.2):
        """Store the pipelines by name.

        Args:
            pipelines: Pipeline definitions to manage.
            failure_rate: Probability in [0, 1] that a simulated pipeline
                run fails. 1.0 makes every run fail; 0.0 makes failures
                practically impossible. Defaults to the original 20%.
        """
        self.pipelines = {p.name: p for p in pipelines}
        self.failure_rate = failure_rate

    def _get_pipeline_execution_order(self) -> List[str]:
        """Return all pipeline names in dependency order (Kahn's algorithm).

        Raises:
            ValueError: If a pipeline depends on an unknown pipeline, or if
                the pipeline dependencies form a cycle.
        """
        in_degree = {name: 0 for name in self.pipelines}
        # Reverse adjacency: dependency name -> pipelines waiting on it.
        dependents = {name: [] for name in self.pipelines}

        for name, pipeline in self.pipelines.items():
            # De-duplicate so a dependency listed twice is counted once.
            for dep in dict.fromkeys(pipeline.depends_on_pipelines):
                if dep not in self.pipelines:
                    raise ValueError(
                        f"Pipeline '{name}': dependency '{dep}' does not exist"
                    )
                in_degree[name] += 1
                dependents[dep].append(name)

        queue = deque(name for name, degree in in_degree.items() if degree == 0)
        order = []

        while queue:
            current = queue.popleft()
            order.append(current)

            for name in dependents[current]:
                in_degree[name] -= 1
                if in_degree[name] == 0:
                    queue.append(name)

        if len(order) != len(self.pipelines):
            raise ValueError("Circular pipeline dependency")

        return order

    def run(self, pipeline_names: List[str] = None) -> Dict[str, bool]:
        """Run pipelines in dependency order.

        Args:
            pipeline_names: Pipelines to run. Their transitive dependencies
                are run as well. ``None`` (the default) runs every pipeline.

        Returns:
            Mapping of pipeline name to True (succeeded) or False (failed
            or skipped), in execution order.

        Raises:
            KeyError: If a requested pipeline, or one of its dependencies,
                is unknown.
            ValueError: If the pipeline graph has an unknown dependency or
                a cycle.
        """
        if pipeline_names is None:
            to_run = self._get_pipeline_execution_order()
        else:
            # Collect the requested pipelines plus everything they need,
            # then keep the global ordering restricted to that subset.
            all_needed = set()
            for name in pipeline_names:
                self._collect_dependencies(name, all_needed)

            to_run = [
                p for p in self._get_pipeline_execution_order() if p in all_needed
            ]

        results = {}
        failed_pipelines = set()

        for name in to_run:
            pipeline = self.pipelines[name]

            # A skipped pipeline is added to failed_pipelines as well, so
            # the skip propagates to everything downstream.
            if any(dep in failed_pipelines for dep in pipeline.depends_on_pipelines):
                print(f"⏭️  SKIPPED: {name} (pipeline dependency failed)")
                results[name] = False
                failed_pipelines.add(name)
                continue

            # Simulated execution: succeed when the draw beats the rate.
            print(f"⚙️  RUNNING: {name} ({len(pipeline.nodes)} nodes)")
            success = random.random() > self.failure_rate

            if success:
                print(f"✅ COMPLETED: {name}")
                results[name] = True
            else:
                print(f"❌ FAILED: {name}")
                results[name] = False
                failed_pipelines.add(name)

        return results

    def _collect_dependencies(self, pipeline_name: str, collected: set):
        """Add ``pipeline_name`` and all its transitive dependencies to ``collected``.

        Uses an explicit stack instead of recursion so long dependency
        chains cannot hit the recursion limit.

        Raises:
            KeyError: If a pipeline name encountered on the way is unknown.
        """
        pending = [pipeline_name]
        while pending:
            name = pending.pop()
            if name in collected:
                continue
            if name not in self.pipelines:
                raise KeyError(f"Unknown pipeline '{name}'")
            collected.add(name)
            pending.extend(self.pipelines[name].depends_on_pipelines)

# Test: four pipelines forming a linear chain
# raw_ingestion -> bronze_to_silver -> silver_to_gold -> analytics.
pipelines = [
    PipelineDef("raw_ingestion", ["fetch_customers", "fetch_orders"]),
    PipelineDef("bronze_to_silver", ["clean_customers", "clean_orders"], ["raw_ingestion"]),
    PipelineDef("silver_to_gold", ["customer_orders", "order_metrics"], ["bronze_to_silver"]),
    PipelineDef("analytics", ["customer_analytics", "revenue_report"], ["silver_to_gold"]),
]

manager = MultiPipelineManager(pipelines)

print("Running all pipelines:\n")
results = manager.run()

# Final report; pipeline failures are simulated at random, so the outcome
# differs from run to run.
print("\n" + "="*60)
print("Final Results:")
for name, success in results.items():
    status = "✅" if success else "❌"
    print(f"{status} {name}")
print("="*60)