# Step 4: Data Quality Validation Setup


In [0]:


import sys
print(f"Python environment: {sys.executable}")

# Step 1: Try to import Great Expectations
try:
    import great_expectations as gx
    print(f"SUCCESS: Great Expectations {gx.__version__} is available!")
    print(f"Module location: {gx.__file__}")
    print(f"Ready to proceed with Step 4 validation setup")
    
    # Verify Databricks environment
    try:
        dbutils.fs.ls('/')
        print("Databricks environment confirmed")
    except NameError:
        print("Warning: dbutils not available")
    


    
except ImportError:
    print("Great Expectations not found - Auto-installing now...")
    
    # Check if we're in Databricks
    try:
        dbutils.fs.ls('/')
        print("Databricks environment confirmed")
        
        print("AUTO-INSTALLING WITH DATABRICKS %pip...")
        print("Installing: great-expectations[sql,azure,databricks]...")
        
        # Auto-install using %pip magic
        get_ipython().run_line_magic('pip', 'install great-expectations[sql,azure,databricks]')
        
        print("Installation completed!")
        print("AUTO-RESTARTING PYTHON ENVIRONMENT...")
        
  
        dbutils.library.restartPython()
        
    except NameError:
        print("Not in Databricks - manual installation required")
        print("MANUAL INSTALLATION REQUIRED:")
        print("pip install great-expectations[sql]")
        raise ImportError("Manual installation required - not in Databricks environment")
    
    except Exception as e:
        print(f"Auto-installation failed: {e}")
        print("FALLBACK - RUN THESE COMMANDS MANUALLY:")
        print("1. %pip install great-expectations[sql,azure,databricks]")
        print("2. dbutils.library.restartPython()")
        print("3. Re-run this cell")
        raise ImportError("Auto-installation failed - use manual commands above")

Python environment: /local_disk0/.ephemeral_nfs/envs/pythonEnv-f38b53fb-ed20-4ae5-94f7-129d43bc14bf/bin/python
SUCCESS: Great Expectations 1.5.6 is available!
Module location: /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/great_expectations/__init__.py
Ready to proceed with Step 4 validation setup
Databricks environment confirmed


In [0]:


import json
import great_expectations as gx
import pandas as pd
from pyspark.sql import functions as F

step4_results = {
    "status": "running",
    "context_recreated": False,
    "datasource_created": False,
    "expectation_suite_created": False,
    "validator_created": False,
    "validation_ready": False,
    "suite_name": "dq_logic_validation_suite",
    "datasource_name": "sql_server_datasource",
    "asset_name": "dq_logic_table",
    "error_message": None
}

try:

    
    context = None
    

    try:
        context = gx.get_context(project_root_dir="/dbfs/FileStore/great_expectations")
        print("File-based context recreated")
        step4_results["context_type"] = "FileDataContext"
    except:
        try:
            context = gx.get_context(mode="ephemeral")
            print("Ephemeral context created")
            step4_results["context_type"] = "EphemeralDataContext"
        except Exception as e:
            print(f"Context creation failed: {e}")
            raise e
    
    step4_results["context_recreated"] = True
    print(f"Context type: {type(context).__name__}")

except Exception as e:
    print(f"Error recreating context: {e}")
    step4_results["status"] = "error"
    step4_results["error_message"] = f"Context recreation failed: {e}"



if step4_results["context_recreated"]:

    
    try:

        suite_name = step4_results["suite_name"]
        

        try:
            suite = context.suites.add(gx.ExpectationSuite(name=suite_name))
            print(f"Expectation suite created: {suite_name}")
            step4_results["expectation_suite_created"] = True
        except Exception as e:

            suite = gx.ExpectationSuite(name=suite_name)
            print(f"Basic expectation suite created: {suite_name}")
            step4_results["expectation_suite_created"] = True
        

        step4_results["validation_framework_ready"] = True
        step4_results["status"] = "success"
        
        print(f"Validation framework ready for expectation creation")
        
    except Exception as e:
        print(f"Validation setup failed: {e}")
        step4_results["error_message"] = f"Validation setup failed: {e}"
        step4_results["status"] = "error" 

File-based context recreated
Context type: FileDataContext
Basic expectation suite created: dq_logic_validation_suite
Validation framework ready for expectation creation


In [0]:

def clean_for_json(obj):
    """Convert non-serializable types to JSON-compatible types"""
    import datetime
    import numpy as np
    
    if isinstance(obj, dict):
        return {k: clean_for_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [clean_for_json(v) for v in obj]
    elif isinstance(obj, (datetime.date, datetime.datetime, np.datetime64)):
        return str(obj)
    elif isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    else:
        return obj

# Clean and return results
step4_results = clean_for_json(step4_results)
dbutils.notebook.exit(json.dumps(step4_results))