# Great Expectations V5 - Main Orchestrator



**Steps Overview**:
- **Step 1**: Environment Diagnostics & Package Installation
- **Step 2**: GX Context Setup & API Discovery
- **Step 2.5**: GX Methods Demo
- **Step 3**: SQL Server Connection & Data Loading
- **Step 4**: Data Quality Validation Setup
- **Step 4.5**: Team Rules Definition Library 
- **Step 5**: Expectation Validation Execution
- **Step 6**: Results Analysis & Reporting
- **Step 7**: Data Docs Generation
- **Step 8**: Azure OpenAI Integration (Optional)



In [0]:

# Pipeline configuration

# Workspace folder containing the step notebooks.
BASE_PATH = "/Workspace/Users/prashanth.kumar4@shell.com/Great-expectations-DataQuality-v1"

# Step notebook names, listed in pipeline order.
STEPS = [
    # Environment and Great Expectations setup
    "step1_environment_diagnostics",
    "step2_gx_context_setup",
    "step2_5_gx_methods_demo",
    # Data loading and validation definition
    "step3_sql_connection_data",
    "step4_validation_setup",
    "step4_5_rules_definition",
    # Validation run, analysis and reporting
    "step5_expectation_validation",
    "step6_results_analysis",
    "step7_data_docs",
    "step8_azure_openai",
]

# Shared bookkeeping keyed by step name: the outcome label of each step and
# the result dict recorded for it. Both start empty.
pipeline_status, pipeline_results = {}, {}



# Status values (compared case-insensitively) that a step notebook can report
# in its JSON return value to signal that the step did not succeed.
_FAILURE_STATUSES = ("error", "failed")


def _parse_step_result(result, step_number):
    """Convert the raw return value of a step notebook into a result dict.

    Args:
        result: Value returned by the notebook run (may be empty or None).
        step_number: Step number, used only in the default message.

    Returns:
        The decoded JSON object when ``result`` looks like JSON and parses;
        otherwise a ``{"status": "success", "message": ...}`` dict describing
        the plain-text or empty return value.
    """
    import json

    if not result:
        # No return value - this is normal for most notebooks
        return {"status": "success", "message": f"Step {step_number} executed successfully"}

    text = str(result)
    if not text.strip().startswith("{"):
        # Not JSON: treat the return value as a plain text message
        return {"status": "success", "message": f"Completed: {result}"}

    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError. Output that merely looks like
        # JSON is still a completed run, so keep a short preview of it.
        return {"status": "success", "message": f"Completed (output: {text[:50]}...)"}


def run_pipeline_step(step_name, step_number, timeout_seconds=600, skip_on_error=False):
    """
    Execute a pipeline step notebook and record its outcome.

    The outcome is stored in the module-level ``pipeline_status`` and
    ``pipeline_results`` dicts under ``step_name``. A step counts as failed
    when the notebook run raises, or when the notebook returns a JSON object
    whose ``status`` is "error" or "failed".

    Args:
        step_name: Name of the step notebook (without .ipynb)
        step_number: Step number for tracking
        timeout_seconds: Timeout for notebook execution
        skip_on_error: Whether to continue pipeline if step fails

    Returns:
        True when the step succeeded; False when it failed and
        ``skip_on_error`` is True.

    Raises:
        Exception: When the step failed and ``skip_on_error`` is False. The
            underlying error is attached as the exception's cause.
    """
    print(f"\n STEP {step_number}: {step_name}")
    print("-" * 50)

    step_path = f"{BASE_PATH}/{step_name}"

    try:
        # Run the step notebook
        print(f"Executing: {step_path}")
        result = dbutils.notebook.run(step_path, timeout_seconds)

        # Parse result (notebooks should return JSON status)
        step_result = _parse_step_result(result, step_number)

        # Honor a failure the notebook reported itself instead of recording
        # the step as successful just because the run returned.
        reported_status = str(step_result.get("status", "success")).lower()
        if reported_status in _FAILURE_STATUSES:
            raise RuntimeError(
                step_result.get("message", f"Step reported status '{reported_status}'")
            )
    except Exception as e:
        error_msg = str(e)
        print(f"ERROR in {step_name}: {error_msg}")

        pipeline_status[step_name] = "FAILED"
        pipeline_results[step_name] = {"status": "error", "message": error_msg}

        if skip_on_error:
            print("Continuing pipeline despite error (skip_on_error=True)")
            return False

        print("Stopping pipeline due to error")
        # Chain the original error so its traceback is not lost.
        raise Exception(f"Pipeline failed at {step_name}: {error_msg}") from e

    # Update pipeline status
    pipeline_status[step_name] = "SUCCESS"
    pipeline_results[step_name] = step_result

    print(f"SUCCESS: {step_name}")
    print(f"Result: {step_result.get('message', 'No message')}")

    return True



In [0]:

# EXECUTE PIPELINE STEPS

# One row per step, in run order:
# (notebook name, display number, timeout in seconds, skip_on_error)
step_plan = [
    ("step1_environment_diagnostics", 1, 300, False),
    ("step2_gx_context_setup", 2, 300, False),
    ("step2_5_gx_methods_demo", 3, 300, False),
    ("step3_sql_connection_data", 4, 600, False),
    ("step4_validation_setup", 5, 300, False),
    ("step4_5_rules_definition", 6, 300, False),
    ("step5_expectation_validation", 7, 600, False),
    ("step6_results_analysis", 8, 300, False),
    ("step7_data_docs", 9, 300, True),
    # Disabled: ("step8_azure_openai", 10, 300, True),
]

step_outcome = None
for notebook_name, display_number, timeout, tolerate_failure in step_plan:
    step_outcome = run_pipeline_step(
        notebook_name,
        display_number,
        timeout_seconds=timeout,
        skip_on_error=tolerate_failure,
    )

# Leave the last step's return value as the final expression of the cell so
# the notebook echoes it as the cell result.
step_outcome




 STEP 1: step1_environment_diagnostics
--------------------------------------------------
Executing: /Workspace/Users/prashanth.kumar4@shell.com/Great-expectations-DataQuality-v1/step1_environment_diagnostics
SUCCESS: step1_environment_diagnostics
Result: Step 1 executed successfully

 STEP 2: step2_gx_context_setup
--------------------------------------------------
Executing: /Workspace/Users/prashanth.kumar4@shell.com/Great-expectations-DataQuality-v1/step2_gx_context_setup
SUCCESS: step2_gx_context_setup
Result: No message

 STEP 3: step2_5_gx_methods_demo
--------------------------------------------------
Executing: /Workspace/Users/prashanth.kumar4@shell.com/Great-expectations-DataQuality-v1/step2_5_gx_methods_demo
SUCCESS: step2_5_gx_methods_demo
Result: No message

 STEP 4: step3_sql_connection_data
--------------------------------------------------
Executing: /Workspace/Users/prashanth.kumar4@shell.com/Great-expectations-DataQuality-v1/step3_sql_connection_data
SUCCESS: step3_

True

In [0]:

# PIPELINE REPORT
# Summarizes the outcomes recorded in pipeline_status / pipeline_results,
# prints a report, and exits the notebook with the summary as a JSON string.
import json

total_steps = len(STEPS)
successful_steps = sum(1 for status in pipeline_status.values() if status == "SUCCESS")
failed_steps = sum(1 for status in pipeline_status.values() if status == "FAILED")
# Computed once and reused; guarded so an empty STEPS list cannot divide by zero.
success_rate = (successful_steps / total_steps) * 100 if total_steps else 0.0

print("SUMMARY:")
print(f"   Total Steps: {total_steps}")
print(f"  Successful: {successful_steps}")
print(f"    Failed: {failed_steps}")
print(f"    Success Rate: {success_rate:.1f}%")

# Detailed status
print("\nDETAILED STEP STATUS:")
for i, step in enumerate(STEPS, 1):
    status = pipeline_status.get(step, "NOT_RUN")
    result = pipeline_results.get(step, {})

    # NOTE(review): all three markers are empty strings in this copy of the
    # notebook - presumably status icons were lost; confirm against the source.
    status_emoji = "" if status == "SUCCESS" else "" if status == "FAILED" else ""
    print(f"   {status_emoji} Step {i}: {step} - {status}")

    if "message" in result:
        # A step's JSON result may carry a non-string message; coerce it so
        # the length check and slicing cannot raise.
        text = str(result["message"])
        message = text[:100] + "..." if len(text) > 100 else text
        print(f"       {message}")

# Critical path analysis: the first six entries of STEPS are the core steps.
core_steps = STEPS[:6]
critical_failures = [step for step in core_steps if pipeline_status.get(step) == "FAILED"]
core_not_run = [step for step in core_steps if step not in pipeline_status]
# The core pipeline is healthy only when every core step actually ran and
# succeeded; counting successes across all steps could be satisfied by
# non-core steps or hide core steps that never ran.
core_ok = bool(core_steps) and all(
    pipeline_status.get(step) == "SUCCESS" for step in core_steps
)

if critical_failures:
    print("\n  CRITICAL FAILURES (Steps 1-6):")
    for step in critical_failures:
        print(f"    {step}")
    print("    Recommendation: Fix these before proceeding")
elif core_not_run:
    print("\n  CORE PIPELINE STATUS: INCOMPLETE (core steps not run):")
    for step in core_not_run:
        print(f"    {step}")
else:
    print("\n CORE PIPELINE STATUS: ALL CRITICAL STEPS SUCCESSFUL")

# Generate summary for downstream use
pipeline_summary = {
    "total_steps": total_steps,
    "successful_steps": successful_steps,
    "failed_steps": failed_steps,
    "success_rate": success_rate,
    "critical_failures": critical_failures,
    "status": "SUCCESS" if core_ok else "PARTIAL_SUCCESS",
    "ready_for_production": core_ok,  # every core step ran and succeeded
}

print(f"\n Pipeline ready for production: {pipeline_summary['ready_for_production']}")
print("=" * 80)

# Return summary for parent workflows
dbutils.notebook.exit(json.dumps(pipeline_summary))