# Multi-Account ADLS Test - Phase 2A Core Feature

**Tests:**
1. ✅ Multi-account ADLS configuration with SparkEngine (connections configured in parallel)
2. ✅ Read from account A, write to account B
3. ✅ Schema introspection
4. ✅ Delta Lake time travel
5. ✅ ADLS URI generation

**Cleanup:** All test data removed at end

In [None]:
# Install Odibi straight from GitHub with the spark, pandas and azure extras.
%pip install "git+https://github.com/henryodibi11/Odibi.git#egg=odibi[spark,pandas,azure]" --quiet
# Restart the notebook's Python process after the install (Databricks utility).
dbutils.library.restartPython()

In [None]:
# Setup
import pandas as pd
from pyspark.sql import SparkSession
from odibi.engine import SparkEngine
from odibi.connections import AzureADLS, LocalConnection
from odibi.utils import configure_connections_parallel
import os

# Reuse the notebook's active Spark session. getActiveSession() returns None
# when no session is active in the current thread, so fall back to
# getOrCreate() instead of carrying a None into every later cell.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Test paths (simulating 2 different storage accounts with local DBFS)
ACCOUNT_A_BASE = "/dbfs/tmp/odibi_test_account_a"  # Simulate storage account A
ACCOUNT_B_BASE = "/dbfs/tmp/odibi_test_account_b"  # Simulate storage account B

# Create both base directories up front; exist_ok makes the cell re-runnable.
for base_dir in (ACCOUNT_A_BASE, ACCOUNT_B_BASE):
    os.makedirs(base_dir, exist_ok=True)

# Test data: version 1 has three employees.
employees_v1 = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "department": ["Engineering", "Sales", "Engineering"],
        "salary": [100000, 80000, 95000],
    }
)

# Version 2 adds a fourth employee and changes the salaries; it is used by the
# Delta time-travel test to produce a second table version.
employees_v2 = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "David"],
        "department": ["Engineering", "Sales", "Engineering", "HR"],
        "salary": [105000, 82000, 98000, 70000],
    }
)

print("✓ Setup complete")
print(f"  Account A: {ACCOUNT_A_BASE}")
print(f"  Account B: {ACCOUNT_B_BASE}")

## Test 1: Multi-Account ADLS Configuration

In [None]:
print("=" * 70)
print("TEST 1: Multi-Account ADLS Configuration")
print("=" * 70)

# Both simulated accounts share one key, so define it once.
# NOTE(review): this appears to be the publicly documented Azurite / storage
# emulator development key rather than a real secret — confirm. Real account
# keys must never be committed to a notebook.
TEST_ACCOUNT_KEY = (
    "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
)

# Create 2 ADLS connections (simulating 2 storage accounts)
# In production, these would be real Azure storage accounts
adls_connections = {
    "account_a": AzureADLS(
        account="datalakea",
        container="bronze",
        auth_mode="direct_key",
        account_key=TEST_ACCOUNT_KEY,
        validate=True,
    ),
    "account_b": AzureADLS(
        account="datalakeb",
        container="silver",
        auth_mode="direct_key",
        account_key=TEST_ACCOUNT_KEY,
        validate=True,
    ),
}

print("\n⚡ Configuring 2 storage accounts in parallel...\n")

# Configure connections in parallel (Phase 2C feature)
configured_adls, errors = configure_connections_parallel(
    adls_connections, prefetch_secrets=True, max_workers=2, verbose=True
)

assert len(errors) == 0, f"Configuration errors: {errors}"
print("\n✓ Both accounts configured successfully")

# Create SparkEngine with multi-account configuration
multi_spark_engine = SparkEngine(connections=configured_adls, spark_session=spark)
print("✓ SparkEngine initialized with 2 storage accounts")

# Verify Spark session has both account keys
# NOTE(review): this reads the SparkContext-level SparkConf. If SparkEngine
# registers the keys through the session runtime config (spark.conf.set), they
# may not be visible here — confirm against the engine implementation.
spark_conf = spark.sparkContext.getConf()
key_a = spark_conf.get("fs.azure.account.key.datalakea.dfs.core.windows.net")
key_b = spark_conf.get("fs.azure.account.key.datalakeb.dfs.core.windows.net")

assert key_a is not None, "Account A not configured in Spark!"
assert key_b is not None, "Account B not configured in Spark!"
print("✓ Both storage account keys configured in Spark session")

print("\n" + "=" * 70)
print("✅ TEST 1 PASSED - Multi-account ADLS works!")
print("=" * 70)

## Test 2: Cross-Account Data Transfer

Read from Account A, write to Account B (medallion architecture pattern)

In [None]:
print("=" * 70)
print("TEST 2: Cross-Account Data Transfer")
print("=" * 70)

# Use local connections to simulate the accounts
conn_a = LocalConnection(base_path=ACCOUNT_A_BASE)
conn_b = LocalConnection(base_path=ACCOUNT_B_BASE)

# Engine without ADLS connections; it is reused by the later test cells.
spark_df = spark.createDataFrame(employees_v1)
local_engine = SparkEngine(spark_session=spark)

# Write to Account A (bronze)
print("\n1️⃣  Writing data to Account A (bronze)...")
local_engine.write(
    spark_df, connection=conn_a, path="raw_employees", format="parquet", mode="overwrite"
)
print(f"   ✓ Written to: {ACCOUNT_A_BASE}/raw_employees")

# Read from Account A
print("\n2️⃣  Reading from Account A...")
df_from_a = local_engine.read(connection=conn_a, path="raw_employees", format="parquet")
print(f"   ✓ Read {df_from_a.count()} rows from Account A")

# Transform data: average salary per department
print("\n3️⃣  Transforming data (SQL)...")
result = local_engine.execute_sql(
    "SELECT department, AVG(salary) as avg_salary FROM employees GROUP BY department",
    {"employees": df_from_a},
)
print("   ✓ Transformation complete")
result.show()

# Write to Account B (silver)
print("\n4️⃣  Writing transformed data to Account B (silver)...")
local_engine.write(
    result, connection=conn_b, path="dept_salary_agg", format="parquet", mode="overwrite"
)
print(f"   ✓ Written to: {ACCOUNT_B_BASE}/dept_salary_agg")

# Verify in Account B
print("\n5️⃣  Verifying data in Account B...")
df_from_b = local_engine.read(connection=conn_b, path="dept_salary_agg", format="parquet")
# count() is a Spark action; run it once and reuse the value for the message
# and the assertion instead of triggering a second job.
rows_in_b = df_from_b.count()
print(f"   ✓ Read {rows_in_b} rows from Account B")
df_from_b.show()

assert rows_in_b > 0, "No data in Account B!"

print("\n" + "=" * 70)
print("✅ TEST 2 PASSED - Cross-account transfer works!")
print("   Bronze (Account A) → Transform → Silver (Account B)")
print("=" * 70)

## Test 3: Schema Introspection

In [None]:
print("=" * 70)
print("TEST 3: Schema Introspection")
print("=" * 70)

# employees_v1 has 3 rows and 4 columns (id, name, department, salary); the
# assertions below check the engine helpers against that known shape.
test_df = spark.createDataFrame(employees_v1)

# Test get_schema
print("\n1️⃣  Testing get_schema()...")
schema = local_engine.get_schema(test_df)
print(f"   Schema: {schema}")
assert len(schema) == 4, "Expected 4 columns!"
# The check indexes schema[0][0], i.e. it expects a sequence of entries whose
# first element is the column name.
assert schema[0][0] == "id", "First column should be 'id'"
print("   ✓ get_schema() works")

# Test get_shape
print("\n2️⃣  Testing get_shape()...")
shape = local_engine.get_shape(test_df)
print(f"   Shape: {shape}")
assert shape == (3, 4), "Expected (3, 4)!"
print("   ✓ get_shape() works")

# Test count_rows
print("\n3️⃣  Testing count_rows()...")
count = local_engine.count_rows(test_df)
print(f"   Row count: {count}")
assert count == 3, "Expected 3 rows!"
print("   ✓ count_rows() works")

print("\n" + "=" * 70)
print("✅ TEST 3 PASSED - Schema introspection works!")
print("=" * 70)

## Test 4: Delta Lake Time Travel

In [None]:
print("=" * 70)
print("TEST 4: Delta Lake Time Travel")
print("=" * 70)

# Write version 1
print("\n1️⃣  Writing Delta table version 1 (3 employees)...")
df_v1 = spark.createDataFrame(employees_v1)
local_engine.write(
    df_v1, connection=conn_a, path="employees_delta", format="delta", mode="overwrite"
)
print("   ✓ Version 1 written")

# Write version 2 to the same path so the table gains a second version
print("\n2️⃣  Writing Delta table version 2 (4 employees)...")
df_v2 = spark.createDataFrame(employees_v2)
local_engine.write(
    df_v2, connection=conn_a, path="employees_delta", format="delta", mode="overwrite"
)
print("   ✓ Version 2 written")

# Read latest version
print("\n3️⃣  Reading latest version...")
df_latest = local_engine.read(connection=conn_a, path="employees_delta", format="delta")
# count() is a Spark action; compute each count once and reuse it below.
latest_rows = df_latest.count()
print(f"   ✓ Latest version has {latest_rows} rows")
assert latest_rows == 4, "Latest should have 4 rows!"

# Read version 0 (time travel)
print("\n4️⃣  Reading version 0 (time travel)...")
df_v0 = local_engine.read(
    connection=conn_a, path="employees_delta", format="delta", options={"versionAsOf": "0"}
)
v0_rows = df_v0.count()
print(f"   ✓ Version 0 has {v0_rows} rows")
assert v0_rows == 3, "Version 0 should have 3 rows!"

print("\n5️⃣  Comparing versions...")
print(f"   Version 0: {v0_rows} rows (original)")
print(f"   Latest:    {latest_rows} rows (updated)")

print("\n" + "=" * 70)
print("✅ TEST 4 PASSED - Delta time travel works!")
print("=" * 70)

## Test 5: URI Generation for ADLS

In [None]:
print("=" * 70)
print("TEST 5: ADLS URI Generation")
print("=" * 70)

# Test URI generation for both accounts. Each URI must contain the
# abfss://<container>@<account>.dfs.core.windows.net prefix and the relative
# path that was passed in.
print("\n1️⃣  Account A URI:")
uri_a = configured_adls["account_a"].uri("test/data.parquet")
print(f"   {uri_a}")
assert (
    "abfss://bronze@datalakea.dfs.core.windows.net" in uri_a
), f"Unexpected Account A URI prefix: {uri_a}"
assert "test/data.parquet" in uri_a, f"Account A URI is missing the path: {uri_a}"
print("   ✓ Account A URI correct")

print("\n2️⃣  Account B URI:")
uri_b = configured_adls["account_b"].uri("aggregated/results.parquet")
print(f"   {uri_b}")
assert (
    "abfss://silver@datalakeb.dfs.core.windows.net" in uri_b
), f"Unexpected Account B URI prefix: {uri_b}"
assert "aggregated/results.parquet" in uri_b, f"Account B URI is missing the path: {uri_b}"
print("   ✓ Account B URI correct")

print("\n" + "=" * 70)
print("✅ TEST 5 PASSED - ADLS URI generation works!")
print("=" * 70)

## Cleanup - Remove All Test Data

In [None]:
import shutil

print("=" * 70)
print("CLEANUP")
print("=" * 70)

# Remove the local directories that stand in for the two storage accounts.
for account_label, account_base in (
    ("Account A", ACCOUNT_A_BASE),
    ("Account B", ACCOUNT_B_BASE),
):
    if os.path.exists(account_base):
        shutil.rmtree(account_base)
        print(f"✓ Removed {account_label}: {account_base}")

# Clean DBFS. This stays best-effort (dbutils may be unavailable or the paths
# may not exist), but the reason is reported instead of being silently
# swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
try:
    dbutils.fs.rm(f"dbfs:{ACCOUNT_A_BASE}", recurse=True)
    dbutils.fs.rm(f"dbfs:{ACCOUNT_B_BASE}", recurse=True)
    print("✓ Cleaned up DBFS")
except Exception as exc:
    print(f"  DBFS cleanup skipped: {exc}")

print("\n" + "=" * 70)
print("🎉 ALL TESTS PASSED!")
print("=" * 70)
print("\n✅ Multi-account ADLS configuration")
print("✅ Cross-account data transfer (A → B)")
print("✅ Schema introspection (get_schema, get_shape, count_rows)")
print("✅ Delta Lake time travel")
print("✅ ADLS URI generation")
print("\n🚀 Phase 2A/2B/2C fully validated in Databricks!")
print("=" * 70)