# Delta Lake Optimization Project - Validation & Testing

This notebook provides validation and testing utilities for the Delta Lake optimization project. Use it to:

- ✅ Validate your Databricks environment setup
- 🔍 Test project functionality before running main notebooks
- 🧪 Verify optimization techniques are working as expected
- 📊 Quick environment health check

## Usage
Run this notebook after setting up your environment but before running the main project. It will identify any issues early and provide troubleshooting guidance.

In [0]:
# Import required libraries and run basic checks
import sys
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import datetime

# Report interpreter/library versions and a timestamp so the validation
# output can be matched to the environment it was produced in.
banner_lines = [
    "🔧 Delta Lake Optimization Project - Environment Validation",
    "=" * 60,
    f"Python version: {sys.version}",
    f"PySpark version: {pyspark.__version__}",
    f"Validation time: {datetime.datetime.now()}",
]
print("\n".join(banner_lines))

In [0]:
# Validate Spark and Delta Lake functionality
def _require(condition, message):
    """Raise AssertionError(message) unless condition is truthy.

    Used instead of a bare ``assert`` so the checks still run when Python is
    started with ``-O`` (which strips assert statements).
    """
    if not condition:
        raise AssertionError(message)


def _drop_table_quietly(table_name):
    """Best-effort cleanup: drop table_name, reporting (never raising) errors."""
    try:
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    except Exception as cleanup_error:
        print(f"   ⚠️  Cleanup skipped for {table_name}: {cleanup_error}")


def _check_basic_spark():
    """Test 1: build a 10-row DataFrame and confirm the row count."""
    test_df = spark.range(10).withColumn("doubled", F.col("id") * 2)
    row_count = test_df.count()
    _require(row_count == 10, f"Expected 10 rows, got {row_count}")


def _check_delta_round_trip():
    """Test 2: write a small Delta table and read it back."""
    test_table = "default.validation_test_table"
    try:
        test_df = spark.range(5).withColumn("test_col", F.lit("test_value"))
        test_df.write.format("delta").mode("overwrite").saveAsTable(test_table)

        read_df = spark.table(test_table)
        _require(read_df.count() == 5, "Delta table read/write failed")
    finally:
        # Drop the table whether or not the check passed
        _drop_table_quietly(test_table)


def _check_describe_detail():
    """Test 3: DESCRIBE DETAIL must return numFiles and sizeInBytes."""
    temp_table = "default.temp_detail_test"
    try:
        spark.range(100).write.format("delta").mode("overwrite").saveAsTable(temp_table)

        detail_row = spark.sql(f"DESCRIBE DETAIL {temp_table}").collect()[0]
        _require(detail_row['numFiles'] is not None, "numFiles should not be null")
        _require(detail_row['sizeInBytes'] is not None, "sizeInBytes should not be null")
    finally:
        _drop_table_quietly(temp_table)


def _check_optimize():
    """Test 4: OPTIMIZE must run against a table built from several appends."""
    optimize_table = "default.optimize_test_table"
    try:
        # The writes below append, so start from a clean slate in case an
        # earlier failed run left the table behind.
        _drop_table_quietly(optimize_table)

        # Three separate appends so the table is made of several small writes
        for i in range(3):
            (
                spark.range(10)
                .withColumn("batch_id", F.lit(i))
                .write.format("delta")
                .mode("append")
                .saveAsTable(optimize_table)
            )

        spark.sql(f"OPTIMIZE {optimize_table}")
    finally:
        _drop_table_quietly(optimize_table)


def _check_catalog_and_schema():
    """Test 5: create and switch to a scratch catalog/schema, then undo it.

    The catalog and schema that were current before the check are restored
    afterwards, and the scratch catalog is dropped, regardless of outcome.
    """
    test_catalog = "validation_test_catalog"
    test_schema = "validation_test_schema"

    # Remember where the session was pointing so it can be put back afterwards
    original_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
    original_schema = spark.sql("SELECT current_database()").collect()[0][0]

    catalog_created = False
    context_switched = False
    try:
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {test_catalog}")
        catalog_created = True

        # Flag is set before the first USE so a partial switch is still undone
        context_switched = True
        spark.sql(f"USE CATALOG {test_catalog}")
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {test_schema}")
        spark.sql(f"USE SCHEMA {test_schema}")

        current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
        current_schema = spark.sql("SELECT current_database()").collect()[0][0]

        _require(
            current_catalog == test_catalog,
            f"Expected catalog {test_catalog}, got {current_catalog}",
        )
        _require(
            current_schema == test_schema,
            f"Expected schema {test_schema}, got {current_schema}",
        )
    finally:
        if context_switched:
            # Leave the scratch catalog before dropping it, and return to the
            # caller's original context rather than a hard-coded catalog.
            try:
                spark.sql(f"USE CATALOG `{original_catalog}`")
                spark.sql(f"USE SCHEMA `{original_schema}`")
            except Exception as restore_error:
                print(
                    f"   ⚠️  Could not restore {original_catalog}.{original_schema}: "
                    f"{restore_error}"
                )
        if catalog_created:
            try:
                spark.sql(f"DROP CATALOG IF EXISTS {test_catalog} CASCADE")
            except Exception as cleanup_error:
                print(f"   ⚠️  Cleanup skipped for {test_catalog}: {cleanup_error}")


def validate_spark_environment():
    """Validate that Spark and Delta Lake are working correctly.

    Runs five independent checks (basic DataFrame operations, Delta table
    round trip, DESCRIBE DETAIL, OPTIMIZE, catalog/schema switching). A check
    that raises is reported as failed and the remaining checks still run.
    Objects created by a check are cleaned up whether it passes or fails.

    Returns:
        True if every check passed, otherwise False.
    """
    # (label printed on success, label printed on failure, check function)
    checks = [
        ("Basic Spark DataFrame operations", "Basic Spark operations", _check_basic_spark),
        ("Delta Lake table creation and reading", "Delta Lake functionality", _check_delta_round_trip),
        ("DESCRIBE DETAIL metadata extraction", "DESCRIBE DETAIL functionality", _check_describe_detail),
        ("OPTIMIZE command execution", "OPTIMIZE functionality", _check_optimize),
        ("Catalog and schema operations", "Catalog/Schema operations", _check_catalog_and_schema),
    ]

    tests_passed = 0
    tests_total = len(checks)

    print("🧪 Running Spark and Delta Lake validation tests...\n")

    for number, (pass_label, fail_label, check) in enumerate(checks, start=1):
        try:
            check()
        except Exception as e:
            # Broad on purpose: any error means this capability is unusable,
            # and the remaining checks should still be attempted.
            print(f"❌ Test {number} FAILED: {fail_label} - {e}")
        else:
            print(f"✅ Test {number}: {pass_label}")
            tests_passed += 1

    # Summary
    print(f"\n📊 Validation Summary: {tests_passed}/{tests_total} tests passed")

    if tests_passed == tests_total:
        print("🎉 All tests passed! Your environment is ready for the Delta Lake optimization project.")
        return True

    print(f"⚠️  {tests_total - tests_passed} tests failed. See troubleshooting section below.")
    return False

# Run validation now; later cells read validation_result to decide whether to
# run the performance baseline and which final summary to print.
validation_result = validate_spark_environment()

In [0]:
# Performance baseline test
def run_performance_baseline():
    """Run a basic performance test to establish baseline metrics.

    Writes a 100K-row synthetic Delta table, times one aggregate query
    against it, prints file-count/size details and a summary, then drops the
    table. The table is dropped even when the test fails part-way through.

    Returns:
        True if every step completed, False if any step raised.
    """
    # Imported here so the function works regardless of which cell ran first;
    # it must not depend on a module-level import made elsewhere.
    import time

    print("⏱️  Running performance baseline test...\n")

    baseline_table = "default.performance_baseline_test"

    try:
        # Create a moderately sized dataset for performance testing
        print("   📊 Creating baseline dataset (100K rows)...")
        # perf_counter is monotonic, so elapsed times cannot go negative if
        # the wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        # Generate synthetic data similar to main project
        baseline_df = spark.range(100000).select(
            F.col("id").alias("row_id"),
            (F.rand() * 1000).cast("int").alias("customer_id"),
            (F.rand() * 100).cast("int").alias("product_id"),
            (F.rand() * 500 + 10).cast("decimal(10,2)").alias("amount"),
            F.expr("array('USA', 'Canada', 'UK', 'Germany', 'France')[int(rand() * 5)]").alias("country"),
            F.expr("date_add('2023-01-01', int(rand() * 365))").alias("sale_date")
        )

        baseline_df.write.format("delta").mode("overwrite").saveAsTable(baseline_table)
        create_time = time.perf_counter() - start_time

        print(f"   ✅ Dataset created in {create_time:.2f} seconds")

        # Test query performance
        print("   🔍 Running baseline query performance test...")

        query_start = time.perf_counter()
        result_df = spark.sql(f"""
        SELECT country, 
               COUNT(*) as total_sales,
               AVG(amount) as avg_amount,
               SUM(amount) as total_revenue
        FROM {baseline_table}
        WHERE country IN ('USA', 'Germany')
        GROUP BY country
        ORDER BY total_revenue DESC
        """)

        # collect() materializes every output column, so the timing covers
        # the whole query; count() may let the optimizer skip the aggregates
        # and the sort. The result is at most one row per filtered country.
        result_count = len(result_df.collect())
        query_time = time.perf_counter() - query_start

        print(f"   ✅ Query completed in {query_time:.2f} seconds, returned {result_count} rows")

        # Get table details (either field may be null, hence the `or 0`)
        detail = spark.sql(f"DESCRIBE DETAIL {baseline_table}").collect()[0]
        num_files = detail['numFiles'] or 0
        size_mb = round((detail['sizeInBytes'] or 0) / 1024 / 1024, 2)

        print(f"   📁 Table: {num_files} files, {size_mb} MB total")

        # Display results
        print("\n   📈 Query Results:")
        display(result_df)

        # Performance summary
        print("\n🏁 Performance Baseline Summary:")
        print(f"   • Data creation: {create_time:.2f}s")
        print(f"   • Query execution: {query_time:.2f}s")
        print(f"   • Files generated: {num_files}")
        print(f"   • Data size: {size_mb} MB")

        if query_time < 10:
            print("   ✅ Good baseline performance for optimization experiments")
        else:
            print("   ⚠️  Slower than expected - may indicate resource constraints")

        return True

    except Exception as e:
        print(f"❌ Performance test failed: {e}")
        return False

    finally:
        # Clean up on success and failure alike; a cleanup problem is
        # reported but does not change the test result.
        try:
            spark.sql(f"DROP TABLE IF EXISTS {baseline_table}")
        except Exception as cleanup_error:
            print(f"   ⚠️  Cleanup skipped for {baseline_table}: {cleanup_error}")

# The baseline is only meaningful on an environment that passed validation,
# so bail out with a notice when validation failed.
if not validation_result:
    print("⏭️  Skipping performance test due to validation failures")
else:
    import time
    performance_result = run_performance_baseline()

## 🔧 Troubleshooting Guide

If any validation tests failed, use this guide to resolve common issues:

### Common Issues and Solutions

#### ❌ Basic Spark Operations Failed
- **Cause**: Spark cluster not properly initialized
- **Solution**: Restart your cluster or ensure you're running in a Databricks environment

#### ❌ Delta Lake Functionality Failed  
- **Cause**: Delta Lake not available or not enabled
- **Solution**: 
  - Ensure you're using Databricks Runtime 11.0+ 
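  - For local environments, install delta-spark (`pip install delta-spark`) and create the Spark session with the Delta Lake extensions enabled (for example, by building it with `delta.configure_spark_with_delta_pip`); installing the package alone is not enough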

#### ❌ DESCRIBE DETAIL Failed
- **Cause**: Insufficient permissions or outdated runtime
- **Solution**:
  - Use Databricks Runtime (not Apache Spark)
  - Ensure table access permissions

#### ❌ OPTIMIZE Command Failed
- **Cause**: Command not available or insufficient permissions
- **Solution**: 
  - Use Databricks Runtime with Delta Lake
  - Ensure write permissions to the workspace

#### ❌ Catalog/Schema Operations Failed
- **Cause**: Unity Catalog not enabled or insufficient permissions
- **Solution**:
  - For Free Edition: Use `default` catalog and create schemas only
  - For Standard+: Ensure Unity Catalog is configured
  - Modify the main project to use existing catalog/schema

In [0]:
# Final validation summary and next steps.
# The report is assembled line by line and printed once at the end.

# performance_result only exists if the performance cell actually ran
baseline_passed = bool(locals().get('performance_result'))

report = [
    "🎓 Delta Lake Optimization Project - Validation Complete",
    "=" * 60,
]

if not validation_result:
    report += [
        "❌ Environment validation: FAILED",
        "\n🔧 Action Required:",
        "1. Review the troubleshooting guide above",
        "2. Resolve the failed test issues",
        "3. Re-run this validation notebook",
        "4. Contact support if issues persist",
    ]
elif baseline_passed:
    report += [
        "✅ Environment validation: PASSED",
        "✅ Performance baseline: PASSED",
        "\n🚀 Your environment is fully ready!",
        "\n📋 Next Steps:",
        "1. Open and run project.ipynb - the main learning notebook",
        "2. Try metrics_collection.ipynb for automated tracking",
        "3. Explore partitioning_comparison_extension.ipynb for advanced scenarios",
    ]
else:
    report += [
        "✅ Environment validation: PASSED",
        "⚠️  Performance baseline: FAILED or SKIPPED",
        "\n🚀 Environment is functional but may have performance issues",
        "\n📋 Next Steps:",
        "1. Review cluster configuration (memory, cores)",
        "2. Consider using a larger cluster for better performance",
        "3. Proceed with project.ipynb but expect longer execution times",
    ]

# Pointers shown regardless of the outcome
report += [
    "\n💡 Questions or Issues?",
    "• Check README.md for detailed setup instructions",
    "• Review ARCHITECTURE.md for project overview",
    "• Open GitHub Issues for additional support",
    "\nHappy learning! 🎉",
]

print("\n".join(report))