# Node Execution Exercises

Practice implementing and debugging Node execution patterns.

## Exercise 1: Build a Custom Node Executor

Create a simplified Node executor that handles read → transform → write.

In [None]:
from typing import Any, Dict, Optional, List
from dataclasses import dataclass
import pandas as pd
import time

@dataclass
class SimpleNodeConfig:
    """Settings for one SimpleNode; every phase except the name is optional."""

    # Identifier of the node.
    name: str
    # CSV file the read phase is meant to load; None means no read phase.
    read_path: Optional[str] = None
    # SQL the transform phase is meant to apply; None means no transform phase.
    transform_sql: Optional[str] = None
    # CSV file the write phase is meant to produce; None means no write phase.
    write_path: Optional[str] = None

class SimpleNode:
    """Simplified Node implementation."""

    def __init__(self, config: SimpleNodeConfig):
        self.config = config
        # Human-readable log of the phases that ran; filled in by execute().
        self._execution_steps = []

    def execute(self) -> Dict[str, Any]:
        """Execute the node.

        TODO: Implement the 3-phase execution:
        1. Read phase: Load CSV if read_path is set
        2. Transform phase: Apply SQL if transform_sql is set
        3. Write phase: Save CSV if write_path is set

        Returns:
            {"success": bool, "duration": float, "rows": int, "steps": list}.
            "duration" is wall-clock seconds, "rows" is 0 when no DataFrame
            was produced, and "steps" is a snapshot of the execution log
            (mutating it does not affect the node).
        """
        start_time = time.time()
        result_df = None

        # TODO: Implement read phase

        # TODO: Implement transform phase

        # TODO: Implement write phase

        duration = time.time() - start_time
        return {
            "success": True,
            "duration": duration,
            "rows": len(result_df) if result_df is not None else 0,
            # Return a copy so callers cannot alter the node's own log.
            "steps": list(self._execution_steps),
        }

# Test your implementation
# Write a three-row fixture CSV (path is relative to the current working directory).
test_data = pd.DataFrame({
    "id": [1, 2, 3],
    "value": [10, 20, 30]
})
test_data.to_csv("test_input.csv", index=False)

# No transform_sql is given, so this config only requests the read and write phases.
config = SimpleNodeConfig(
    name="test_node",
    read_path="test_input.csv",
    write_path="test_output.csv"
)

# Run the node once and print the summary dict returned by execute().
node = SimpleNode(config)
result = node.execute()
print(result)

## Exercise 2: Transform Step Router

Implement a router that handles different transform types.

In [None]:
from typing import Union, Callable
import pandasql as ps

class TransformRouter:
    """Dispatches each transform step to the executor that understands it."""

    def __init__(self):
        # Registry of user-supplied transforms, keyed by function name.
        self.functions = dict()

    def register_function(self, name: str, func: Callable):
        """Make ``func`` available under ``name``.

        Registering a name that already exists replaces the earlier callable.
        """
        self.functions.update({name: func})

    def execute_step(self, step: Union[str, Dict], current_df: pd.DataFrame) -> pd.DataFrame:
        """
        Run a single transform step against ``current_df``.

        TODO: Implement the routing rules:
        - ``step`` is a string: treat it as SQL and run it with pandasql
        - ``step`` is a dict with a "function" key: call the registered function
        - ``step`` is a dict with an "operation" key: handle the built-in operation

        Args:
            step: Transform step (SQL string or dict)
            current_df: DataFrame the step operates on

        Returns:
            The transformed DataFrame (``None`` until the routing is implemented)
        """
        # TODO: Implement routing
        return None

# Test cases
# One router instance is shared by the registration and both step checks in this cell.
router = TransformRouter()

# Register a custom function
def double_values(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return a copy of ``df`` in which ``column`` is multiplied by two.

    The input frame is not modified, so a fixture passed to several
    transform steps keeps its original values.

    Args:
        df: Source DataFrame.
        column: Name of the column to double.

    Returns:
        A new DataFrame with the same columns, in the same order.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    doubled = df.copy()
    doubled[column] = doubled[column] * 2
    return doubled

# Expose the helper to function steps under the same name used in the step dict below.
router.register_function("double_values", double_values)

# Test data
df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})

# Test SQL step
# The query names its table `df`, while execute_step receives the frame as
# `current_df`; the SQL route has to make the frame visible under the name `df`.
result1 = router.execute_step("SELECT * FROM df WHERE value > 15", df)
print("SQL result:", result1)

# Test function step
# "params" carries the keyword arguments intended for the registered function.
result2 = router.execute_step(
    {"function": "double_values", "params": {"column": "value"}},
    df
)
print("Function result:", result2)

## Exercise 3: Validation Engine

Build a validation engine with multiple check types.

In [None]:
from typing import List, Optional
from dataclasses import dataclass

@dataclass
class ValidationConfig:
    """Rules that ValidationEngine.validate is expected to check."""

    # Require the DataFrame to have rows.
    not_empty: bool = False
    # Columns that must not contain nulls.
    no_nulls: Optional[List[str]] = None
    # Minimum row count.
    min_rows: Optional[int] = None
    # Maximum row count.
    max_rows: Optional[int] = None
    # Columns that must all be present.
    required_columns: Optional[List[str]] = None

class ValidationEngine:
    """Checks DataFrames against a ValidationConfig."""

    def validate(self, df: pd.DataFrame, config: ValidationConfig) -> List[str]:
        """
        Check ``df`` against the rules in ``config``.

        TODO: Implement every validation check:
        1. not_empty: the DataFrame has rows
        2. no_nulls: the listed columns contain no nulls
        3. min_rows: the row count is at least the minimum
        4. max_rows: the row count is at most the maximum
        5. required_columns: every required column exists

        Args:
            df: DataFrame to validate
            config: Validation configuration

        Returns:
            One message per failed check; an empty list when everything passes
        """
        problems = []

        # TODO: Implement validation logic

        return problems

# Test cases
# Each test builds its own frame and config; the asserts stop the cell at the
# first check that validate() does not handle as expected.
validator = ValidationEngine()

# Test 1: Valid DataFrame
# Every configured rule is satisfied here, so no failure messages are expected.
df1 = pd.DataFrame({"id": [1, 2, 3], "name": ["A", "B", "C"]})
config1 = ValidationConfig(
    not_empty=True,
    no_nulls=["id", "name"],
    min_rows=2,
    required_columns=["id", "name"]
)
failures1 = validator.validate(df1, config1)
assert len(failures1) == 0, f"Expected no failures, got: {failures1}"
print("✓ Test 1 passed")

# Test 2: Empty DataFrame
# pd.DataFrame() has no rows and no columns, so not_empty must report a failure.
df2 = pd.DataFrame()
config2 = ValidationConfig(not_empty=True)
failures2 = validator.validate(df2, config2)
assert len(failures2) > 0, "Expected failure for empty DataFrame"
print("✓ Test 2 passed")

# Test 3: Null values
# The "id" column holds one None, which the no_nulls rule must flag.
df3 = pd.DataFrame({"id": [1, None, 3], "name": ["A", "B", "C"]})
config3 = ValidationConfig(no_nulls=["id"])
failures3 = validator.validate(df3, config3)
assert len(failures3) > 0, "Expected failure for null values"
print("✓ Test 3 passed")

# Test 4: Row count
# Two rows against min_rows=5 must produce a failure.
df4 = pd.DataFrame({"id": [1, 2]})
config4 = ValidationConfig(min_rows=5)
failures4 = validator.validate(df4, config4)
assert len(failures4) > 0, "Expected failure for min_rows"
print("✓ Test 4 passed")

## Exercise 4: Error Context Builder

Create rich error messages with execution context.

In [None]:
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ExecutionContext:
    """Execution state handed to ErrorContextBuilder when a node fails."""

    # Name of the node being executed.
    node_name: str
    # Config file the node came from, when known.
    config_file: Optional[str] = None
    # Current transform step and the total number of steps, when in the transform phase.
    step_index: Optional[int] = None
    total_steps: Optional[int] = None
    # Descriptions of the steps that completed before the error.
    previous_steps: List[str] = field(default_factory=list)
    # Column names of the input, when available.
    input_schema: List[str] = field(default_factory=list)

class ErrorContextBuilder:
    """Turns an exception plus its ExecutionContext into a readable report."""

    @staticmethod
    def build_message(error: Exception, context: ExecutionContext) -> str:
        """
        Compose a comprehensive error message.

        TODO: Produce a formatted message containing:
        1. Node name and config file
        2. Current step (if in transform phase)
        3. Previous successful steps
        4. Input schema (if available)
        5. Original error message
        6. Suggestions based on error type

        Args:
            error: Original exception
            context: Execution context

        Returns:
            Formatted error message (``None`` until implemented)
        """
        # TODO: Build formatted message
        return None

    @staticmethod
    def generate_suggestions(error: Exception) -> List[str]:
        """
        Propose fixes that match the kind of error raised.

        TODO: Inspect the error and suggest, for:
        - KeyError: Check context registration
        - AttributeError: Check column names
        - ValueError: Check data types
        - FileNotFoundError: Check paths

        Args:
            error: Exception

        Returns:
            List of suggestions (``None`` until implemented)
        """
        # TODO: Generate suggestions
        return None

# Test cases
# Both builder methods are static; calling them through an instance works the same way.
builder = ErrorContextBuilder()

# Test 1: KeyError
# Fully populated context: every optional field is set.
context1 = ExecutionContext(
    node_name="transform_data",
    config_file="pipeline.yaml",
    step_index=2,
    total_steps=5,
    previous_steps=["Read from CSV", "Applied filter"],
    input_schema=["id", "name", "value"]
)
error1 = KeyError("missing_column")
message1 = builder.build_message(error1, context1)
print("Error message 1:")
print(message1)
print()

# Test 2: Column not found
# NOTE(review): context2 is built but never passed anywhere below;
# generate_suggestions only receives the exception.
context2 = ExecutionContext(
    node_name="aggregate_data",
    previous_steps=["Read from source"]
)
error2 = ValueError("Column 'total' not found")
suggestions2 = builder.generate_suggestions(error2)
print("Suggestions for column error:")
# Iterating requires generate_suggestions to return a list, not None.
for s in suggestions2:
    print(f"  - {s}")

## Exercise 5: Metadata Collector

Build a system to collect and track execution metadata.

In [None]:
from datetime import datetime
from typing import Any, Dict
import pandas as pd

class MetadataCollector:
    """Accumulates execution steps and packages them with run metadata."""

    def __init__(self):
        # Step descriptions in the order they were tracked.
        self.execution_steps = []

    def track_step(self, step: str):
        """Record ``step`` as the most recent execution step."""
        self.execution_steps += [step]

    def collect(self, df: Optional[pd.DataFrame], duration: float) -> Dict[str, Any]:
        """
        Assemble the metadata dictionary for a finished run.

        TODO: The returned dictionary should contain:
        1. timestamp: ISO format
        2. duration: Execution time
        3. steps: List of execution steps
        4. rows: Row count (if df is not None)
        5. columns: Column count
        6. schema: Column names and types
        7. memory_usage: DataFrame memory usage in MB
        8. null_counts: Null count per column

        Items 1-3 are provided; the DataFrame-derived entries are still to do.

        Args:
            df: Result DataFrame (can be None)
            duration: Execution duration in seconds

        Returns:
            Metadata dictionary
        """
        metadata: Dict[str, Any] = {}
        metadata["timestamp"] = datetime.now().isoformat()
        metadata["duration"] = duration
        # Snapshot of the steps, so later tracking does not change this result.
        metadata["steps"] = list(self.execution_steps)

        # TODO: Add DataFrame metadata if available

        return metadata

# Test
# Track three steps so the "steps" entry of the metadata has content.
collector = MetadataCollector()
collector.track_step("Read from CSV")
collector.track_step("Applied filter: value > 10")
collector.track_step("Grouped by category")

# The "value" column holds one None, giving the per-column null counts something to report.
test_df = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "value": [10, None, 30, 40, 50],
    "category": ["A", "B", "A", "B", "A"]
})

# Print one metadata entry per line.
metadata = collector.collect(test_df, duration=0.125)
print("Collected metadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

## Exercise 6: Node Execution Debugger

Create a debugging tool that inspects node execution.

In [None]:
class NodeDebugger:
    """Debug tool for node execution."""

    # Display label -> config attribute names to try, in order. The first name
    # is the one show_config originally read; the second, where present, is the
    # matching SimpleNodeConfig field from Exercise 1.
    _CONFIG_FIELDS = (
        ("Read", ("read", "read_path")),
        ("Transform", ("transform", "transform_sql")),
        ("Validate", ("validation",)),
        ("Write", ("write", "write_path")),
    )

    def __init__(self, node):
        # Any object exposing a ``config`` attribute with at least ``name``.
        self.node = node

    def show_config(self):
        """Display node configuration.

        Prints the node name followed by its read, transform, validate and
        write settings. A setting whose attribute is missing from the config
        (for example, SimpleNodeConfig has no validation field) is printed as
        ``None`` instead of raising AttributeError.
        """
        cfg = self.node.config
        print(f"Node: {cfg.name}")
        for label, candidates in self._CONFIG_FIELDS:
            # The first attribute that exists wins, even if its value is None.
            value = next(
                (getattr(cfg, attr) for attr in candidates if hasattr(cfg, attr)),
                None,
            )
            print(f"{label}: {value}")

    def execute_with_trace(self) -> Dict[str, Any]:
        """
        Execute node with detailed tracing.

        TODO: Implement execution with phase-by-phase output:
        1. Print "Starting READ phase" and show result shape
        2. Print "Starting TRANSFORM phase" and show each step
        3. Print "Starting VALIDATE phase" and show checks
        4. Print "Starting WRITE phase" and confirm success
        5. Return detailed trace with timing for each phase

        Returns:
            Trace dictionary with phase timings and results
            (empty until the traced execution is implemented)
        """
        trace = {}

        # TODO: Implement traced execution

        return trace

    def inspect_dataframe(self, df: pd.DataFrame, label: str = "DataFrame"):
        """Display DataFrame inspection.

        Prints the shape, column names, dtypes, per-column null counts and
        the first three rows of ``df`` under a heading built from ``label``.
        """
        print(f"\n{label} Inspection:")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)}")
        print(f"  Dtypes:\n{df.dtypes}")
        print(f"  Null counts:\n{df.isnull().sum()}")
        print(f"  Sample (first 3 rows):\n{df.head(3)}")

# Test with a mock node
# (Create test implementation based on Exercise 1)

## Bonus Challenge: Build a Mini Node Orchestrator

Create a simple orchestrator that executes multiple nodes in dependency order.

In [None]:
from typing import List, Dict

class MiniOrchestrator:
    """Minimal orchestrator that keeps nodes together with their dependencies."""

    def __init__(self):
        # nodes: node name -> {"node": ..., "depends_on": [...]}
        # results: storage for per-node outcomes (not yet written by this scaffold)
        self.nodes, self.results = {}, {}

    def add_node(self, node, depends_on: List[str] = None):
        """Register ``node`` under its config name with its upstream dependencies.

        A missing or empty ``depends_on`` is stored as an empty list.
        Adding a node whose name is already registered replaces the entry.
        """
        dependencies = depends_on if depends_on else []
        entry = {"node": node, "depends_on": dependencies}
        self.nodes[node.config.name] = entry

    def execute_all(self) -> Dict[str, Any]:
        """
        Run every registered node in dependency order.

        TODO: Implement execution driven by a topological sort:
        1. Build dependency graph
        2. Sort nodes topologically
        3. Execute each node in order
        4. Track results and handle failures
        5. Return execution summary

        Returns:
            Summary with node results and total duration
            (``None`` until the orchestration is implemented)
        """
        # TODO: Implement orchestration
        return None

# Test: Create a 3-node pipeline
# node_a: Read data
# node_b: Transform (depends on node_a)
# node_c: Write (depends on node_b)