# Node Execution Exercise Solutions

## Solution 1: Custom Node Executor

In [None]:
from typing import Any, Dict, Optional, List
from dataclasses import dataclass
import pandas as pd
import time
import pandasql as ps

@dataclass
class SimpleNodeConfig:
    """Settings for one SimpleNode: a name plus optional read/transform/write phases."""
    # Identifier for the node.
    name: str
    # CSV path loaded in the read phase; the phase is skipped when unset.
    read_path: Optional[str] = None
    # SQL text run through pandasql in the transform phase; skipped when unset.
    transform_sql: Optional[str] = None
    # CSV path the result is written to in the write phase; skipped when unset.
    write_path: Optional[str] = None

class SimpleNode:
    """Simplified node that runs an optional read -> transform -> write pipeline.

    Each phase is driven by the matching config attribute (``read_path``,
    ``transform_sql``, ``write_path``) and is skipped when that attribute is
    falsy. The transform and write phases are also skipped when no DataFrame
    has been loaded.
    """

    def __init__(self, config: "SimpleNodeConfig"):
        """Store the configuration and start with an empty step log."""
        self.config = config
        self._execution_steps = []

    def execute(self) -> Dict[str, Any]:
        """Run the configured phases and report the outcome.

        Returns:
            On success, a dict with ``success`` (True), ``duration`` in
            seconds, ``rows`` (row count of the final frame, 0 when nothing
            was loaded) and ``steps``. On failure, a dict with ``success``
            (False), ``duration``, ``error`` (the exception text) and the
            ``steps`` completed before the failure.
        """
        start_time = time.time()
        # Start each run with a fresh log; otherwise a second execute() call
        # would report the previous run's steps as well.
        self._execution_steps = []
        result_df = None

        # Failures are reported through the result dict rather than raised,
        # so every exception from the phases below is converted here.
        try:
            # Read phase
            if self.config.read_path:
                result_df = pd.read_csv(self.config.read_path)
                self._execution_steps.append(f"Read from {self.config.read_path}")

            # Transform phase
            if self.config.transform_sql and result_df is not None:
                # Expose only the loaded frame, under the table name queries
                # use (``result_df``), instead of every local in this method.
                result_df = ps.sqldf(
                    self.config.transform_sql, {"result_df": result_df}
                )
                self._execution_steps.append("Applied SQL transform")

            # Write phase
            if self.config.write_path and result_df is not None:
                result_df.to_csv(self.config.write_path, index=False)
                self._execution_steps.append(f"Written to {self.config.write_path}")

            return {
                "success": True,
                "duration": time.time() - start_time,
                "rows": len(result_df) if result_df is not None else 0,
                # Hand out a copy so later runs cannot alter this result.
                "steps": list(self._execution_steps),
            }

        except Exception as e:
            return {
                "success": False,
                "duration": time.time() - start_time,
                "error": str(e),
                "steps": list(self._execution_steps),
            }

# Test: write a small CSV fixture, run a node over it, then read the result back.
test_data = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "value": [10, 20, 30, 40, 50]
})
# The fixture is written to the current working directory.
test_data.to_csv("test_input.csv", index=False)

# The SQL refers to `result_df` because that is the name SimpleNode.execute
# holds the loaded frame under when it calls pandasql.
config = SimpleNodeConfig(
    name="test_node",
    read_path="test_input.csv",
    transform_sql="SELECT * FROM result_df WHERE value > 20",
    write_path="test_output.csv"
)

node = SimpleNode(config)
result = node.execute()
print("Execution result:", result)

# Verify output: only the rows with value > 20 (30, 40, 50) should remain.
output = pd.read_csv("test_output.csv")
print("\nOutput DataFrame:")
print(output)

## Solution 2: Transform Step Router

In [None]:
from typing import Union, Callable
import pandasql as ps

class TransformRouter:
    """Routes transform steps to appropriate executors."""
    
    def __init__(self):
        self.functions = {}
    
    def register_function(self, name: str, func: Callable):
        """Register a transform function."""
        self.functions[name] = func
    
    def execute_step(self, step: Union[str, Dict], current_df: pd.DataFrame) -> pd.DataFrame:
        """Execute a transform step."""
        
        # SQL step (string)
        if isinstance(step, str):
            # Execute SQL using pandasql
            # Make current_df available as 'df' in SQL
            df = current_df
            return ps.sqldf(step, locals())
        
        # Function or operation step (dict)
        elif isinstance(step, dict):
            if "function" in step:
                # Function step
                func_name = step["function"]
                params = step.get("params", {})
                
                if func_name not in self.functions:
                    raise ValueError(f"Function '{func_name}' not registered")
                
                func = self.functions[func_name]
                return func(current_df.copy(), **params)
            
            elif "operation" in step:
                # Built-in operation
                operation = step["operation"]
                params = step.get("params", {})
                
                if operation == "pivot":
                    return current_df.pivot(**params)
                elif operation == "melt":
                    return current_df.melt(**params)
                elif operation == "sort":
                    return current_df.sort_values(**params)
                else:
                    raise ValueError(f"Unknown operation: {operation}")
        
        raise ValueError(f"Invalid step format: {step}")

# Test: create the router that the steps below are executed through.
router = TransformRouter()

# Register custom functions
def double_values(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Double every value in ``column``, modifying ``df`` in place, and return ``df``."""
    doubled = df[column] * 2
    df[column] = doubled
    return df

def add_column(df: pd.DataFrame, name: str, value: Any) -> pd.DataFrame:
    """Assign ``value`` to column ``name`` of ``df`` in place and return ``df``."""
    # Plain item assignment: creates the column, or overwrites it if it exists.
    df[name] = value
    return df

router.register_function("double_values", double_values)
router.register_function("add_column", add_column)

# Test data
df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})

# Test 1: SQL step
# The query names the table `df` because execute_step binds the current frame
# to a local called `df` before handing its locals to pandasql.
print("Test 1: SQL step")
result1 = router.execute_step("SELECT * FROM df WHERE value > 15", df)
print(result1)

# Test 2: Function step
# execute_step passes a copy to registered functions, so the in-place
# doubling does not change `df` itself.
print("\nTest 2: Function step")
result2 = router.execute_step(
    {"function": "double_values", "params": {"column": "value"}},
    df
)
print(result2)

# Test 3: Operation step
# "sort" is routed to DataFrame.sort_values with the given params.
print("\nTest 3: Operation step")
result3 = router.execute_step(
    {"operation": "sort", "params": {"by": "value", "ascending": False}},
    df
)
print(result3)

# Test 4: Chained steps
# Each step receives the previous step's output, starting from `df`.
print("\nTest 4: Chained steps")
steps = [
    {"function": "add_column", "params": {"name": "category", "value": "A"}},
    "SELECT id, value * 2 as value, category FROM df",
    {"operation": "sort", "params": {"by": "value"}}
]
result = df
for step in steps:
    result = router.execute_step(step, result)
print(result)

## Solution 3: Validation Engine

In [None]:
from typing import List, Optional
from dataclasses import dataclass

@dataclass
class ValidationConfig:
    """Rules that ValidationEngine.validate checks a DataFrame against."""
    # When True, a DataFrame with zero rows is reported as a failure.
    not_empty: bool = False
    # Columns that must exist and contain no null values.
    no_nulls: Optional[List[str]] = None
    # Inclusive lower bound on the row count; None disables the check.
    min_rows: Optional[int] = None
    # Inclusive upper bound on the row count; None disables the check.
    max_rows: Optional[int] = None
    # Columns that must be present in the DataFrame.
    required_columns: Optional[List[str]] = None

class ValidationEngine:
    """Validates DataFrames against rules."""

    def validate(self, df: pd.DataFrame, config: "ValidationConfig") -> List[str]:
        """Check ``df`` against every rule enabled in ``config``.

        Args:
            df: DataFrame to inspect.
            config: Object exposing ``not_empty``, ``no_nulls``, ``min_rows``,
                ``max_rows`` and ``required_columns``.

        Returns:
            One human-readable message per violated rule, in rule order
            (empty, nulls, min rows, max rows, required columns). An empty
            list means the frame passed.
        """
        failures = []
        row_count = len(df)

        # Check 1: Not empty
        if config.not_empty and row_count == 0:
            failures.append("DataFrame is empty")

        # Check 2: No nulls (a missing column is reported, not raised)
        for col in config.no_nulls or []:
            if col not in df.columns:
                failures.append(f"Column '{col}' does not exist")
                continue

            null_count = int(df[col].isnull().sum())
            if null_count > 0:
                failures.append(f"Column '{col}' has {null_count} null values")

        # Check 3: Min rows
        if config.min_rows is not None and row_count < config.min_rows:
            failures.append(
                f"DataFrame has {row_count} rows, minimum required is {config.min_rows}"
            )

        # Check 4: Max rows
        if config.max_rows is not None and row_count > config.max_rows:
            failures.append(
                f"DataFrame has {row_count} rows, maximum allowed is {config.max_rows}"
            )

        # Check 5: Required columns
        if config.required_columns:
            present = set(df.columns)
            # Keep the order the caller listed the columns in (deduplicated)
            # so the message is deterministic; a set difference would vary
            # between runs under string hash randomisation.
            missing_cols = [
                col
                for col in dict.fromkeys(config.required_columns)
                if col not in present
            ]
            if missing_cols:
                # str() so non-string column labels do not break the join.
                failures.append(
                    f"Missing required columns: {', '.join(map(str, missing_cols))}"
                )

        return failures

# Test suite: each case builds a frame and a config, then checks the
# failures reported by ValidationEngine.validate.
validator = ValidationEngine()

print("Running validation tests...\n")

# Test 1: Valid DataFrame
df1 = pd.DataFrame({"id": [1, 2, 3], "name": ["A", "B", "C"]})
config1 = ValidationConfig(
    not_empty=True,
    no_nulls=["id", "name"],
    min_rows=2,
    required_columns=["id", "name"]
)
failures1 = validator.validate(df1, config1)
assert len(failures1) == 0, f"Expected no failures, got: {failures1}"
print("✓ Test 1 passed: Valid DataFrame")

# Test 2: Empty DataFrame
df2 = pd.DataFrame()
config2 = ValidationConfig(not_empty=True)
failures2 = validator.validate(df2, config2)
assert len(failures2) > 0
print(f"✓ Test 2 passed: Empty DataFrame detected - {failures2[0]}")

# Test 3: Null values
df3 = pd.DataFrame({"id": [1, None, 3], "name": ["A", "B", "C"]})
config3 = ValidationConfig(no_nulls=["id"])
failures3 = validator.validate(df3, config3)
assert len(failures3) > 0
print(f"✓ Test 3 passed: Null values detected - {failures3[0]}")

# Test 4: Row count violations
df4 = pd.DataFrame({"id": [1, 2]})
config4 = ValidationConfig(min_rows=5, max_rows=1)
failures4 = validator.validate(df4, config4)
assert len(failures4) == 2  # Both min and max violated
print(f"✓ Test 4 passed: Row count violations - {len(failures4)} failures")

# Test 5: Missing columns
df5 = pd.DataFrame({"id": [1, 2, 3]})
config5 = ValidationConfig(required_columns=["id", "name", "value"])
failures5 = validator.validate(df5, config5)
assert len(failures5) > 0
print(f"✓ Test 5 passed: Missing columns detected - {failures5[0]}")

# Test 6: Multiple failures
df6 = pd.DataFrame({"id": [None, None]})
config6 = ValidationConfig(
    not_empty=True,
    no_nulls=["id"],
    min_rows=5,
    required_columns=["id", "name"]
)
failures6 = validator.validate(df6, config6)
# Two rows, so not_empty passes; the nulls in 'id', the row minimum and the
# missing 'name' column each add one failure.
assert len(failures6) == 3, f"Expected 3 failures, got: {failures6}"
print(f"\n✓ Test 6 passed: Multiple failures ({len(failures6)}):")
for failure in failures6:
    print(f"  - {failure}")

print("\nAll validation tests passed!")

## Solution 4: Error Context Builder

In [None]:
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ExecutionContext:
    """Where a node failure happened; consumed by ErrorContextBuilder.build_message."""
    # Name of the node that failed.
    node_name: str
    # Config file the node came from; omitted from the message when unset.
    config_file: Optional[str] = None
    # Zero-based index of the failing step (rendered as step_index + 1).
    step_index: Optional[int] = None
    # Total number of steps; the step line is only shown when both are set.
    total_steps: Optional[int] = None
    # Descriptions of the steps that completed before the failure.
    previous_steps: List[str] = field(default_factory=list)
    # Names in the input schema, joined with ", " in the message.
    input_schema: List[str] = field(default_factory=list)

class ErrorContextBuilder:
    """Builds rich error messages with context."""

    @staticmethod
    def build_message(error: Exception, context: "ExecutionContext") -> str:
        """Build a multi-line report for ``error`` raised while running a node.

        Args:
            error: The exception that was raised.
            context: Object exposing ``node_name``, ``config_file``,
                ``step_index``, ``total_steps``, ``previous_steps`` and
                ``input_schema``.

        Returns:
            The report as one string: a banner, node/config info, the failing
            step (only when both ``step_index`` and ``total_steps`` are set),
            previously successful steps, the input schema, the error type and
            text, and any suggestions from ``generate_suggestions``.
        """
        banner = "=" * 70
        lines = [banner, "NODE EXECUTION ERROR", banner]

        # Node and file info
        lines.append(f"\nNode: {context.node_name}")
        if context.config_file:
            lines.append(f"Config: {context.config_file}")

        # Step info (step_index is zero-based, shown one-based)
        if context.step_index is not None and context.total_steps is not None:
            lines.append(
                f"\nFailed at: Step {context.step_index + 1} of {context.total_steps}"
            )

        # Previous steps (what succeeded)
        if context.previous_steps:
            lines.append("\nPrevious successful steps:")
            lines.extend(
                f"  {i}. ✓ {step}"
                for i, step in enumerate(context.previous_steps, 1)
            )

        # Input schema
        if context.input_schema:
            lines.append(f"\nInput schema: {', '.join(context.input_schema)}")

        # Original error
        lines.append(f"\nError type: {error.__class__.__name__}")
        lines.append(f"Error message: {str(error)}")

        # Suggestions
        suggestions = ErrorContextBuilder.generate_suggestions(error)
        if suggestions:
            lines.append("\nSuggestions:")
            lines.extend(f"  💡 {suggestion}" for suggestion in suggestions)

        lines.append("\n" + banner)

        return "\n".join(lines)

    @staticmethod
    def generate_suggestions(error: Exception) -> List[str]:
        """Return hints chosen from the error's class name and message text.

        Matching is case-insensitive substring matching, so one error can
        trigger several groups; an error matching none yields an empty list.
        """
        suggestions = []
        error_str = str(error).lower()
        error_type = error.__class__.__name__.lower()

        # KeyError suggestions
        if "keyerror" in error_type:
            suggestions.extend([
                "Check that all referenced DataFrames are registered in context",
                "Verify node dependencies in 'depends_on' list",
                "Ensure previous nodes executed successfully",
            ])

        # Column errors
        if "column" in error_str or "attributeerror" in error_type:
            suggestions.extend([
                "Check that previous nodes output the expected columns",
                "Use df.columns to inspect available columns",
                "Verify column names match exactly (case-sensitive)",
            ])

        # Type errors
        if "valueerror" in error_type or "typeerror" in error_type:
            suggestions.extend([
                "Check data types match expected types",
                "Use df.dtypes to inspect column types",
                "Consider adding type conversions in transform step",
            ])

        # File errors
        if "filenotfounderror" in error_type or "not found" in error_str:
            suggestions.extend([
                "Verify file paths are correct and files exist",
                "Check file permissions",
                "Use absolute paths instead of relative paths",
            ])

        # Function errors
        if "function" in error_str and "not" in error_str:
            suggestions.extend([
                "Ensure the transform function is decorated with @transform",
                "Import the module containing the transform function",
                "Check function name spelling in config",
            ])

        return suggestions

# Test cases
# build_message is a staticmethod, so the instance is only a convenience.
builder = ErrorContextBuilder()

# Full context: step_index=2 is zero-based and is rendered as "Step 3 of 5".
print("Test 1: Transform step error\n")
context1 = ExecutionContext(
    node_name="transform_data",
    config_file="pipeline.yaml",
    step_index=2,
    total_steps=5,
    previous_steps=["Read from CSV", "Applied filter"],
    input_schema=["id", "name", "value"]
)
error1 = KeyError("customer_id")
message1 = builder.build_message(error1, context1)
print(message1)

# No config file or step info: those sections are left out of the message.
print("\n\nTest 2: Column not found error\n")
context2 = ExecutionContext(
    node_name="aggregate_data",
    previous_steps=["Read from source"],
    input_schema=["id", "name"]
)
error2 = AttributeError("DataFrame has no column 'total'")
message2 = builder.build_message(error2, context2)
print(message2)

# Minimal context: only the node name and config file are known.
print("\n\nTest 3: File not found error\n")
context3 = ExecutionContext(
    node_name="load_data",
    config_file="etl.yaml"
)
error3 = FileNotFoundError("data/input.csv not found")
message3 = builder.build_message(error3, context3)
print(message3)

## Solution 5: Metadata Collector

In [None]:
from datetime import datetime
from typing import Any, Dict, Optional
import pandas as pd
import json

class MetadataCollector:
    """Collects execution metadata."""

    def __init__(self):
        # Step descriptions in the order they were tracked.
        self.execution_steps = []

    def track_step(self, step: str):
        """Track an execution step."""
        self.execution_steps.append(step)

    def collect(self, df: Optional[pd.DataFrame], duration: float) -> Dict[str, Any]:
        """Collect metadata about a run and, optionally, its result frame.

        Args:
            df: Result DataFrame, or None when there is nothing to describe.
            duration: Run duration in seconds.

        Returns:
            A dict with ``timestamp`` (ISO format, local time), ``duration``
            (rounded to 3 places) and a copy of the tracked ``steps``. When
            ``df`` is given it also holds ``rows``, ``columns``, ``schema``
            (column -> dtype string), ``memory_usage_mb``, ``null_counts``
            (only columns that have nulls) and ``data_quality_score`` (the
            percentage of non-null cells, 0.0 for a frame with no cells).
            All numbers are plain Python ``int``/``float`` values.
        """
        metadata = {
            "timestamp": datetime.now().isoformat(),
            "duration": round(duration, 3),
            "steps": self.execution_steps.copy()
        }

        if df is not None:
            row_count = len(df)
            column_count = len(df.columns)

            # Basic stats
            metadata["rows"] = row_count
            metadata["columns"] = column_count

            # Schema with types
            metadata["schema"] = {
                col: str(dtype) for col, dtype in df.dtypes.items()
            }

            # Memory usage (in MB); float() keeps numpy scalars out of the dict
            memory_bytes = float(df.memory_usage(deep=True).sum())
            metadata["memory_usage_mb"] = round(memory_bytes / (1024 ** 2), 2)

            # Null counts per column, computed once and reused for the score
            null_counts = df.isnull().sum()
            metadata["null_counts"] = {
                col: int(count) for col, count in null_counts.items() if count > 0
            }

            # Data quality score (% of non-null values)
            total_cells = row_count * column_count
            null_cells = int(null_counts.sum())
            if total_cells > 0:
                score = (total_cells - null_cells) / total_cells * 100
            else:
                score = 0.0
            metadata["data_quality_score"] = round(score, 2)

        return metadata

    def to_json(self, df: Optional[pd.DataFrame], duration: float) -> str:
        """Export the metadata from ``collect`` as an indented JSON string."""
        metadata = self.collect(df, duration)
        return json.dumps(metadata, indent=2)

    def display(self, df: Optional[pd.DataFrame], duration: float):
        """Print the metadata from ``collect`` as a formatted report."""
        metadata = self.collect(df, duration)

        print("=" * 60)
        print("EXECUTION METADATA")
        print("=" * 60)

        print(f"\nTimestamp: {metadata['timestamp']}")
        print(f"Duration: {metadata['duration']}s")

        print("\nExecution Steps:")
        for i, step in enumerate(metadata['steps'], 1):
            print(f"  {i}. {step}")

        if df is not None:
            print("\nDataFrame Stats:")
            print(f"  Rows: {metadata['rows']:,}")
            print(f"  Columns: {metadata['columns']}")
            print(f"  Memory: {metadata['memory_usage_mb']} MB")
            print(f"  Quality Score: {metadata['data_quality_score']}%")

            print("\nSchema:")
            for col, dtype in metadata['schema'].items():
                null_info = ""
                if col in metadata['null_counts']:
                    null_info = f" ({metadata['null_counts'][col]} nulls)"
                print(f"  {col}: {dtype}{null_info}")

        print("\n" + "=" * 60)

# Test: track a few steps, then describe a 1000-row frame.
collector = MetadataCollector()
collector.track_step("Read from CSV")
collector.track_step("Applied filter: value > 10")
collector.track_step("Grouped by category")
collector.track_step("Calculated aggregates")

# Create test DataFrame with some nulls:
# "value" is None for every multiple of 5, "score" for every multiple of 7.
test_df = pd.DataFrame({
    "id": range(1, 1001),
    "value": [i * 10 if i % 5 != 0 else None for i in range(1, 1001)],
    "category": ["A", "B", "C", "D", "E"] * 200,
    "name": [f"Item_{i}" for i in range(1, 1001)],
    "score": [i / 10 if i % 7 != 0 else None for i in range(1, 1001)]
})

# Display metadata (the duration is a fixed sample value, not a measurement)
collector.display(test_df, duration=0.245)

# Export as JSON
print("\n\nMetadata as JSON:")
print(collector.to_json(test_df, duration=0.245))

## Solution 6: Mini Orchestrator (Bonus)

In [None]:
from typing import List, Dict, Any, Set
from collections import defaultdict, deque
import time

class MiniOrchestrator:
    """Simple node orchestrator with topological sort."""

    def __init__(self):
        self.nodes = {}  # node_name -> {node, depends_on}
        self.results = {}  # node_name -> result

    def add_node(self, node, depends_on: List[str] = None):
        """Register ``node`` under ``node.config.name`` with its dependencies.

        Args:
            node: Object exposing ``config.name`` and ``execute()``.
            depends_on: Names of nodes that must run first; None means none.
        """
        self.nodes[node.config.name] = {
            "node": node,
            "depends_on": depends_on or []
        }

    def _topological_sort(self) -> List[str]:
        """Sort nodes in dependency order using Kahn's algorithm.

        Dependencies naming a node that was never added are ignored. A
        dependency listed more than once counts once.

        Raises:
            ValueError: If the dependency graph contains a cycle.
        """
        # Build the in-degree map and a dependency -> dependents adjacency
        # list in one pass, so each dequeue touches only its own dependents
        # instead of rescanning every node.
        in_degree = dict.fromkeys(self.nodes, 0)
        dependents = defaultdict(list)

        for name, info in self.nodes.items():
            # dict.fromkeys drops repeated names while keeping their order; a
            # repeated dependency would otherwise inflate the in-degree and be
            # misreported as a cycle.
            for dep in dict.fromkeys(info["depends_on"]):
                if dep in in_degree:
                    in_degree[name] += 1
                    dependents[dep].append(name)

        # Queue nodes with no dependencies
        queue = deque(name for name, degree in in_degree.items() if degree == 0)
        sorted_nodes = []

        while queue:
            # Process node with no remaining dependencies
            current = queue.popleft()
            sorted_nodes.append(current)

            # Reduce in-degree for dependent nodes
            for name in dependents[current]:
                in_degree[name] -= 1
                if in_degree[name] == 0:
                    queue.append(name)

        # Any node left unsorted sits on a cycle
        if len(sorted_nodes) != len(self.nodes):
            raise ValueError("Circular dependency detected in node graph")

        return sorted_nodes

    def execute_all(self) -> Dict[str, Any]:
        """Execute all nodes in dependency order.

        A node is skipped, and recorded as a failure, when one of its
        dependencies failed or was itself skipped.

        Returns:
            A summary dict with ``total_nodes``, ``execution_order``,
            ``node_results`` (executed nodes only), ``failures`` (failed and
            skipped node names), ``total_duration`` and ``success_count``.

        Raises:
            ValueError: If the dependency graph contains a cycle.
        """
        start_time = time.time()
        execution_order = self._topological_sort()

        summary = {
            "total_nodes": len(self.nodes),
            "execution_order": execution_order,
            "node_results": {},
            "failures": [],
            "total_duration": 0
        }

        print(f"Executing {len(execution_order)} nodes in order: {' → '.join(execution_order)}\n")

        for node_name in execution_order:
            node_info = self.nodes[node_name]
            node = node_info["node"]

            # Check dependencies succeeded: the first failed one decides
            failed_dep = next(
                (dep for dep in node_info["depends_on"] if dep in summary["failures"]),
                None,
            )
            if failed_dep is not None:
                print(f"⏭️  Skipping {node_name} (dependency {failed_dep} failed)")
                summary["failures"].append(node_name)
                continue

            # Execute node
            print(f"▶️  Executing {node_name}...")
            result = node.execute()

            self.results[node_name] = result
            summary["node_results"][node_name] = result

            if result["success"]:
                print(f"   ✓ Success ({result['duration']:.3f}s, {result.get('rows', 0)} rows)")
            else:
                print(f"   ✗ Failed: {result.get('error', 'Unknown error')}")
                summary["failures"].append(node_name)

            print()

        summary["total_duration"] = time.time() - start_time
        summary["success_count"] = len(execution_order) - len(summary["failures"])

        return summary

    def display_summary(self, summary: Dict[str, Any]):
        """Print the summary dict returned by ``execute_all``."""
        print("=" * 60)
        print("ORCHESTRATION SUMMARY")
        print("=" * 60)
        print(f"\nTotal nodes: {summary['total_nodes']}")
        print(f"Successful: {summary['success_count']}")
        print(f"Failed: {len(summary['failures'])}")
        print(f"Total duration: {summary['total_duration']:.3f}s")

        if summary['failures']:
            print(f"\nFailed nodes: {', '.join(summary['failures'])}")

        print("\nNode Details:")
        for node_name in summary['execution_order']:
            # Skipped nodes have no result and are not listed here
            if node_name in summary['node_results']:
                result = summary['node_results'][node_name]
                status = "✓" if result['success'] else "✗"
                print(f"  {status} {node_name}: {result['duration']:.3f}s")

        print("\n" + "=" * 60)

# Test with a 4-node pipeline
from dataclasses import dataclass

@dataclass
class MockNodeConfig:
    """Minimal config for MockNode; carries only the node name."""
    # Name the orchestrator registers the node under (read as node.config.name).
    name: str

class MockNode:
    """Stand-in node that sleeps for a random interval and returns a canned result."""

    def __init__(self, name: str, should_fail: bool = False):
        self.config = MockNodeConfig(name=name)
        self.should_fail = should_fail

    def execute(self):
        """Sleep 0.1-0.3 s, then report a simulated failure or success.

        Returns:
            A dict with ``success``, ``duration`` (the time slept) and
            ``steps``, plus ``error`` on failure or a random ``rows`` count
            (100-1000) on success.
        """
        import random
        import time

        elapsed = random.uniform(0.1, 0.3)
        time.sleep(elapsed)

        outcome = {"success": not self.should_fail, "duration": elapsed}
        if self.should_fail:
            outcome["error"] = "Simulated failure"
            outcome["steps"] = []
        else:
            outcome["rows"] = random.randint(100, 1000)
            outcome["steps"] = [f"Executed {self.config.name}"]
        return outcome

# Create orchestrator
orchestrator = MiniOrchestrator()

# Add nodes (none is configured to fail)
node_a = MockNode("load_data")
node_b = MockNode("transform_data")
node_c = MockNode("aggregate_data")
node_d = MockNode("write_results")

# Wire a linear chain: load_data -> transform_data -> aggregate_data -> write_results
orchestrator.add_node(node_a)  # No dependencies
orchestrator.add_node(node_b, depends_on=["load_data"])  # Depends on A
orchestrator.add_node(node_c, depends_on=["transform_data"])  # Depends on B
orchestrator.add_node(node_d, depends_on=["aggregate_data"])  # Depends on C

# Execute pipeline (each mock node sleeps 0.1-0.3 s)
summary = orchestrator.execute_all()

# Display summary
orchestrator.display_summary(summary)