# ODIBI Framework - Test Exploration

This notebook lets you run and explore the tests interactively to understand how each component works.

In [None]:
# Setup: make the odibi package importable from this notebook
import sys
from pathlib import Path

# The current working directory is treated as the project root.
project_root = Path.cwd()
root_entry = str(project_root)
if root_entry not in sys.path:
    # Prepend so this directory is searched before the existing sys.path entries.
    sys.path.insert(0, root_entry)

# Compare as Path objects so the check matches how project_root itself is represented.
path_entries = [Path(entry) for entry in sys.path]
print(f"Project root: {project_root}")
print(f"Python path includes: {project_root in path_entries}")

In [None]:
# Verify imports work
# Assumes the setup cell above has made the `odibi` package importable
# (or that it is otherwise installed) - an ImportError here means it is not.
import pandas as pd
# Config schemas exercised in sections 1 and 4.
from odibi.config import NodeConfig, ReadConfig, WriteConfig, TransformConfig, PipelineConfig
# PandasContext is exercised in section 2; create_context is imported but not
# used by any cell visible in this notebook.
from odibi.context import PandasContext, create_context
# Decorator, registry and parameter validator exercised in sections 3 and 4.
from odibi.registry import transform, FunctionRegistry, validate_function_params

print("✅ All imports successful!")

---

## 1. Config Validation Tests

Test Pydantic schemas that validate YAML configs.

In [None]:
# Test 1.1: Valid ReadConfig with path
# ValidationError is not needed in this cell; it is imported here because the
# failure-case cells further down catch it.
from pydantic import ValidationError

config = ReadConfig(connection="local", format="csv", path="data/input.csv")

# Echo every field of interest; `table` was not supplied, so its printed value
# is whatever the schema defaults it to.
print("Valid ReadConfig:")
for label, field in (
    ("Connection", "connection"),
    ("Format", "format"),
    ("Path", "path"),
    ("Table", "table"),
):
    print(f"  {label}: {getattr(config, field)}")

In [None]:
# Test 1.2: Invalid ReadConfig - missing path AND table (should fail)
try:
    # Neither `path` nor `table` is given, which the schema is expected to reject.
    config = ReadConfig(connection="local", format="csv")
except ValidationError as exc:
    print("✅ Validation caught the error:")
    first_error = exc.errors()[0]
    print(f"   {first_error['msg']}")
else:
    # Reached only if construction unexpectedly succeeded.
    print("❌ Should have failed!")

In [None]:
# Test 1.3: Valid NodeConfig with read operation
# Build the read operation first, then attach it to the node.
load_read = ReadConfig(connection="local", format="csv", path="input.csv")
node = NodeConfig(name="load_data", description="Load CSV data", read=load_read)

print(f"Valid NodeConfig: {node.name}")
# Report which of the three operations the node carries.
for operation in ("read", "transform", "write"):
    print(f"  Has {operation}: {getattr(node, operation) is not None}")

In [None]:
# Test 1.4: Invalid NodeConfig - no operations (should fail)
try:
    # Only a name: no read, transform, or write operation is supplied.
    node = NodeConfig(name="empty_node")
except ValidationError as exc:
    print("✅ Validation caught the error:")
    error_msg = str(exc)
    mentions_requirement = 'must have at least one' in error_msg
    print(f"   Error contains 'must have at least one': {mentions_requirement}")
else:
    # Reached only if construction unexpectedly succeeded.
    print("❌ Should have failed!")

In [None]:
# Test 1.5: NodeConfig with dependencies
# A transform-only node that names two upstream nodes.
sql_transform = TransformConfig(steps=["SELECT * FROM load_data"])
node = NodeConfig(
    name="process_data",
    depends_on=["load_data", "load_reference"],
    transform=sql_transform,
)

print(f"Node '{node.name}' depends on: {node.depends_on}")

In [None]:
# Test 1.6: PipelineConfig rejects duplicate node names
try:
    # Two nodes that differ only in their input file but share the same name.
    clashing_nodes = [
        NodeConfig(
            name="duplicate",
            read=ReadConfig(connection="local", format="csv", path=csv_path),
        )
        for csv_path in ("a.csv", "b.csv")
    ]
    pipeline = PipelineConfig(pipeline="test_pipeline", nodes=clashing_nodes)
except ValidationError as exc:
    print("✅ Validation caught duplicate names:")
    error_msg = str(exc)
    mentions_duplicate = 'Duplicate' in error_msg
    print(f"   Error contains 'Duplicate': {mentions_duplicate}")
else:
    # Reached only if the pipeline was unexpectedly accepted.
    print("❌ Should have failed!")

---

## 2. Context API Tests

Test the unified Context for passing DataFrames between nodes.

In [None]:
# Test 2.1: Register and retrieve DataFrame
ctx = PandasContext()

# Small two-column frame to round-trip through the context.
sample_columns = {"id": [1, 2, 3], "value": [10, 20, 30]}
df = pd.DataFrame(sample_columns)

# Store under a name, then fetch it back by that same name.
ctx.register("my_data", df)
retrieved = ctx.get("my_data")

print("✅ Registered and retrieved DataFrame:")
print(retrieved)

In [None]:
# Test 2.2: Check if DataFrame exists
# Probe one name registered in Test 2.1 and one that was never registered.
for frame_name in ("my_data", "does_not_exist"):
    print(f"Has '{frame_name}': {ctx.has(frame_name)}")

In [None]:
# Test 2.3: Error when DataFrame not found
try:
    # This name was never registered on ctx.
    ctx.get("missing_dataframe")
except KeyError as exc:
    print("✅ Got helpful KeyError:")
    print(f"   {exc!s}")
else:
    # Reached only if the lookup unexpectedly succeeded.
    print("❌ Should have failed!")

In [None]:
# Test 2.4: List all registered names
# Register two more single-row frames alongside whatever ctx already holds.
extra_frames = {"data1": {"a": [1]}, "data2": {"b": [2]}}
for frame_name, columns in extra_frames.items():
    ctx.register(frame_name, pd.DataFrame(columns))

registered_names = ctx.list_names()
print(f"Registered DataFrames: {registered_names}")

In [None]:
# Test 2.5: Clear all DataFrames
count_before = len(ctx.list_names())
print(f"Before clear: {count_before} DataFrames")

ctx.clear()

# Re-count after clearing and re-check a name that was registered earlier.
count_after = len(ctx.list_names())
print(f"After clear: {count_after} DataFrames")
print(f"Has 'my_data': {ctx.has('my_data')}")

In [None]:
# Test 2.6: Type validation - rejects non-DataFrame
ctx = PandasContext()
not_a_frame = {"not": "a dataframe"}  # a plain dict, deliberately the wrong type
try:
    ctx.register("invalid", not_a_frame)
except TypeError as exc:
    print("✅ Type validation works:")
    print(f"   {exc!s}")
else:
    # Reached only if the dict was unexpectedly accepted.
    print("❌ Should have failed!")

In [None]:
# Test 2.7: Simulating a pipeline - data flow between nodes
ctx = PandasContext()

# Node 1: build the raw data and publish it under a name
raw = pd.DataFrame({"id": [1, 2, 3, 4], "value": [5, 15, 25, 35]})
ctx.register("raw_data", raw)
print("Node 1: Loaded raw data")

# Node 2: fetch the raw data by name, keep rows with value > 10, publish the result
raw = ctx.get("raw_data")
above_ten = raw["value"] > 10
filtered = raw[above_ten]
ctx.register("filtered_data", filtered)
print("Node 2: Filtered data (value > 10)")

# Node 3: fetch the filtered data by name and report it (row count, then the rows)
filtered = ctx.get("filtered_data")
row_count = len(filtered)
print(f"Node 3: Final result has {row_count} rows")
print(filtered)

---

## 3. Function Registry Tests

Test the `@transform` decorator and parameter validation.

In [None]:
# Test 3.1: Register a function with @transform
# Empty both registry stores first so re-running this cell starts from a clean slate.
# NOTE(review): this reaches into underscore-prefixed attributes; if the registry
# offers a public reset method, prefer that - TODO confirm.
for registry_store in (FunctionRegistry._functions, FunctionRegistry._signatures):
    registry_store.clear()

@transform
def my_transform(context, param1: str, param2: int = 10):
    """Example transform function."""
    # The docstring is kept verbatim: Test 3.6 prints the docstring reported by
    # the registry for this function (presumably this text).
    # `context` is accepted but not used by this example.
    message = "Got {} and {}".format(param1, param2)
    return message

# Show what the registry reports after the decorator above has run.
registered_functions = FunctionRegistry.list_functions()
print(f"Registered functions: {registered_functions}")

In [None]:
# Test 3.2: Call the decorated function
ctx = PandasContext()

# Positional values for param1 and param2, passed after the context.
call_args = ("hello", 42)
result = my_transform(ctx, *call_args)
print(f"Result: {result}")

In [None]:
# Test 3.3: Validate parameters - valid case
# Both declared parameters of my_transform are supplied by name.
complete_params = {"param1": "value", "param2": 20}
try:
    validate_function_params("my_transform", complete_params)
except ValueError as exc:
    print(f"❌ Unexpected error: {exc}")
else:
    print("✅ Parameters validated successfully")

In [None]:
# Test 3.4: Validate parameters - missing required param
incomplete_params = {"param2": 20}  # param1 is deliberately left out
try:
    validate_function_params("my_transform", incomplete_params)
except ValueError as exc:
    print("✅ Caught missing parameter:")
    print(f"   {exc!s}")
else:
    # Reached only if validation unexpectedly passed.
    print("❌ Should have failed!")

In [None]:
# Test 3.5: Validate parameters - unexpected param
# `unknown_param` is not part of my_transform's signature.
params_with_extra = {"param1": "value", "unknown_param": "oops"}
try:
    validate_function_params("my_transform", params_with_extra)
except ValueError as exc:
    print("✅ Caught unexpected parameter:")
    print(f"   {exc!s}")
else:
    # Reached only if validation unexpectedly passed.
    print("❌ Should have failed!")

In [None]:
# Test 3.6: Get function metadata
# Look up what the registry recorded for the function registered in Test 3.1.
info = FunctionRegistry.get_function_info("my_transform")

print("Function info:")
print(f"  Name: {info['name']}")
print(f"  Docstring: {info['docstring']}")
# Plain string: there is nothing to interpolate on this line.
print("  Parameters:")
for param_name, param_info in info['parameters'].items():
    required = "required" if param_info['required'] else "optional"
    # A default of None is treated as "nothing to show", so a parameter whose
    # default really is None prints without a "(default: ...)" suffix.
    default = f" (default: {param_info['default']})" if param_info['default'] is not None else ""
    print(f"    - {param_name}: {required}{default}")

In [None]:
# Test 3.7: Real-world transform function
@transform
def filter_by_threshold(context, source_table: str, threshold: float, column: str = "value"):
    """Filter data by threshold value.

    Args:
        context: Object whose ``get(name)`` returns the DataFrame registered
            under ``name``.
        source_table: Name of the registered DataFrame to filter.
        threshold: Exclusive lower bound; only rows whose ``column`` value is
            strictly greater than this are kept.
        column: Name of the column compared against ``threshold``. Defaults
            to ``"value"``, so existing callers keep their behaviour.

    Returns:
        The rows of the source DataFrame where ``column > threshold``.
    """
    df = context.get(source_table)
    # Boolean-mask selection; a missing column surfaces as pandas' own KeyError.
    return df[df[column] > threshold]

# Set up test data: four rows whose `value` column straddles the threshold used below
ctx = PandasContext()
source_columns = {"id": [1, 2, 3, 4], "value": [5.0, 15.0, 25.0, 35.0]}
data = pd.DataFrame(source_columns)
ctx.register("source_data", data)

# Execute transform against the frame registered above
result = filter_by_threshold(ctx, source_table="source_data", threshold=20.0)

# Show input and output one after the other for comparison
print("Original data:")
print(data)
print("\nFiltered data (threshold=20.0):")
print(result)

---

## 4. Integration Example

Putting it all together: config + context + transforms

In [None]:
# Define a complete pipeline config
# Each node is built on its own, then the pipeline lists them in the same order.

# Node 1: read the raw CSV.
load_sales_node = NodeConfig(
    name="load_sales",
    description="Load raw sales data",
    read=ReadConfig(connection="local", format="csv", path="sales.csv"),
    cache=True,
)

# Node 2: a single function-style transform step that names the upstream node
# as its source table.
threshold_step = {
    "function": "filter_by_threshold",
    "params": {"source_table": "load_sales", "threshold": 0.0},
}
clean_sales_node = NodeConfig(
    name="clean_sales",
    description="Remove invalid records",
    depends_on=["load_sales"],
    transform=TransformConfig(steps=[threshold_step]),
)

# Node 3: write the cleaned data out as parquet.
save_results_node = NodeConfig(
    name="save_results",
    description="Save cleaned data",
    depends_on=["clean_sales"],
    write=WriteConfig(
        connection="local",
        format="parquet",
        path="cleaned_sales.parquet",
        mode="overwrite",
    ),
)

pipeline = PipelineConfig(
    pipeline="sales_pipeline",
    description="Process sales data",
    nodes=[load_sales_node, clean_sales_node, save_results_node],
)

# Summarise the pipeline: name, node count, then each node with its dependencies.
print(f"Pipeline: {pipeline.pipeline}")
print(f"Nodes: {len(pipeline.nodes)}")
for node in pipeline.nodes:
    if node.depends_on:
        deps = f" (depends on: {', '.join(node.depends_on)})"
    else:
        deps = ""
    print(f"  - {node.name}{deps}")

In [None]:
# Validate the transform step parameters
# Take the first step of the second node (clean_sales) from the pipeline above.
# Assumes the step is still subscriptable like the dict it was declared as - TODO confirm.
clean_node = pipeline.nodes[1]
transform_step = clean_node.transform.steps[0]
function_name = transform_step["function"]
params = transform_step["params"]

print(f"Validating transform function: {function_name}")
print(f"Parameters: {params}")

# Check the configured parameters against the registered function's signature.
try:
    validate_function_params(function_name, params)
except ValueError as exc:
    print(f"❌ Validation failed: {exc}")
else:
    print("✅ Transform parameters are valid!")

---

## Summary

**What we've proven:**

1. ✅ **Config validation works** - Invalid configs are caught with clear error messages
2. ✅ **Context API works** - DataFrames can be registered and retrieved by name
3. ✅ **Function registry works** - Transform functions are registered by name, and missing or unexpected parameters are rejected before execution
4. ✅ **Integration ready** - All components work together

**What's missing:**
- Dependency graph builder (to order nodes)
- Pipeline executor (to actually run nodes)
- Engine implementation (to read/write data)

**Next step:** Build the orchestration layer!