# Step 4.5: Data Quality Rules Definition Library


In [0]:


# Environment check for Great Expectations (GX).
# If GX imports, report its version and location; otherwise try to install it
# with the %pip magic and restart Python (Databricks only), or raise ImportError
# with manual instructions.
import sys
print(f"Python environment: {sys.executable}")

# Step 1: Try to import Great Expectations
try:
    import great_expectations as gx
    print(f"SUCCESS: Great Expectations {gx.__version__} is available!")
    print(f"Module location: {gx.__file__}")
    print(f"Ready to proceed with Step 4 validation setup")
    
    # Verify Databricks environment: `dbutils` is referenced as a bare global,
    # so when it is not defined the lookup raises NameError, which is treated
    # here as "not running on Databricks" (warning only, not an error).
    try:
        dbutils.fs.ls('/')
        print("Databricks environment confirmed")
    except NameError:
        print("Warning: dbutils not available")
    


    
except ImportError:
    # GX is not importable: attempt an automatic install.
    print("Great Expectations not found - Auto-installing now...")
    
    # Check if we're in Databricks (same NameError probe as above)
    try:
        dbutils.fs.ls('/')
        print("Databricks environment confirmed")
        
        print("AUTO-INSTALLING WITH DATABRICKS %pip...")
        print("Installing: great-expectations[sql,azure,databricks]...")
        
        # Auto-install using %pip magic (programmatic form of the line magic).
        # NOTE(review): the requirement is not version-pinned; the recorded run
        # below used 1.5.7 - consider pinning for reproducibility.
        get_ipython().run_line_magic('pip', 'install great-expectations[sql,azure,databricks]')
        
        print("Installation completed!")
        print("AUTO-RESTARTING PYTHON ENVIRONMENT...")
        
  
        # Restart the Python process; the next cell re-imports GX afterwards.
        # Presumably nothing after this call runs in the current process - confirm.
        dbutils.library.restartPython()
        
    except NameError:
        # Reached when `dbutils` (or `get_ipython`) is not defined.
        # The ImportError raised here is NOT caught by the `except Exception`
        # below, because both handlers belong to the same `try`.
        print("Not in Databricks - manual installation required")
        print("MANUAL INSTALLATION REQUIRED:")
        print("pip install great-expectations[sql]")
        raise ImportError("Manual installation required - not in Databricks environment")
    
    except Exception as e:
        # Any other failure during the install/restart sequence: print the
        # manual fallback steps and stop the cell with ImportError.
        print(f"Auto-installation failed: {e}")
        print("FALLBACK - RUN THESE COMMANDS MANUALLY:")
        print("1. %pip install great-expectations[sql,azure,databricks]")
        print("2. dbutils.library.restartPython()")
        print("3. Re-run this cell")
        raise ImportError("Auto-installation failed - use manual commands above")

Python environment: /local_disk0/.ephemeral_nfs/envs/pythonEnv-08a0eb54-6f7c-4845-bd4d-6607a9f6a80d/bin/python
SUCCESS: Great Expectations 1.5.7 is available!
Module location: /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/great_expectations/__init__.py
Ready to proceed with Step 4 validation setup
Databricks environment confirmed


In [0]:
# =============================================================================
# DBFS CONTEXT SETUP (Connect to existing GX context from Step 2)
# =============================================================================

print("STEP 4.5 - DBFS CONTEXT SETUP")
print("-" * 50)

import os

# After restart, re-import Great Expectations.
# Only the import itself is guarded: an ImportError raised later (for example
# by an optional dependency while the context is being built) must surface
# as-is instead of being reported as "Great Expectations not available".
try:
    import great_expectations as gx
except ImportError as e:
    raise ImportError("Great Expectations not available - run installation cell above") from e

print(f"Current Great Expectations {gx.__version__} ")

# Databricks probe: `dbutils` is a bare global, so a NameError on lookup means
# the notebook is running somewhere else.
try:
    dbutils.fs.ls('/')
    is_databricks = True
except NameError:
    is_databricks = False

if is_databricks:
    dbfs_gx_path = "/dbfs/FileStore/great_expectations"
    print(f"DBFS context path: {dbfs_gx_path}")

    # exist_ok=True makes this a no-op when the directory is already there,
    # so no separate existence check is needed.
    os.makedirs(dbfs_gx_path, exist_ok=True)
    context = gx.get_context(project_root_dir=dbfs_gx_path)
else:
    # Outside Databricks, root the context in the current working directory.
    local_gx_path = os.path.join(os.getcwd(), "great_expectations")
    context = gx.get_context(project_root_dir=local_gx_path)

# Report the context type on every path (previously only shown when the DBFS
# directory already existed).
print(f"Context type: {type(context).__name__}")

STEP 4.5 - DBFS CONTEXT SETUP
--------------------------------------------------
Current Great Expectations 1.5.7 
DBFS context path: /dbfs/FileStore/great_expectations
Context type: FileDataContext


In [0]:

# Rules initialization




import json
from typing import Dict, List, Any


# Running summary for Step 4.5. Later cells update these fields, and the last
# cell serialises the dict to JSON as the notebook's exit value.
step4_5_results: Dict[str, Any] = dict(
    status="running",
    rules_defined=0,
    rule_categories={},
    custom_rules=[],
    table_specific_rules={},
    error_message=None,
)



In [0]:

# CORE RULE LIBRARY FUNCTIONS (table-level, HIERARCHY_ID and RECORD_CREATE_DATE rules)





import json
from typing import Dict, List, Any


# Step 4.5 summary, reset to its initial "running" state.
# NOTE(review): this repeats the initialisation done in the preceding
# "Rules initialization" cell with identical values; keeping a single copy
# would avoid the two drifting apart.
step4_5_results: Dict[str, Any] = dict(
    status="running",
    rules_defined=0,
    rule_categories={},
    custom_rules=[],
    table_specific_rules={},
    error_message=None,
)



def get_core_data_quality_rules(
    min_rows: int = 1,
    max_rows: int = 1000000,
    min_columns: int = 1,
    max_columns: int = 100,
) -> List[Dict[str, Any]]:
    """
    Core data quality rules that apply to most datasets.

    These are table-level checks (row count and column count) that do not
    reference any particular column.

    Args:
        min_rows: Lower bound for the row-count rule (TBL_001).
        max_rows: Upper bound for the row-count rule (TBL_001).
        min_columns: Lower bound for the column-count rule (TBL_002).
        max_columns: Upper bound for the column-count rule (TBL_002).

    Returns:
        A new list with two rule dicts (TBL_001, TBL_002). Calling with no
        arguments yields the original fixed bounds (1..1000000 rows,
        1..100 columns).

    Raises:
        ValueError: If a minimum is greater than its maximum.
    """
    # An inverted range could never be satisfied, so reject it up front.
    if min_rows > max_rows:
        raise ValueError(f"min_rows ({min_rows}) must not exceed max_rows ({max_rows})")
    if min_columns > max_columns:
        raise ValueError(
            f"min_columns ({min_columns}) must not exceed max_columns ({max_columns})"
        )

    core_rules = [
        # Table-level rules
        {
            "rule_id": "TBL_001",
            "category": "Critical",
            "expectation_type": "expect_table_row_count_to_be_between",
            "kwargs": {"min_value": min_rows, "max_value": max_rows},
            "meta": {
                "description": "Table should have reasonable number of rows",
                "business_impact": "Empty tables or extremely large tables may indicate data pipeline issues",
                "remediation": "Check data pipeline and source systems"
            }
        },
        {
            "rule_id": "TBL_002",
            "category": "Important",
            "expectation_type": "expect_table_column_count_to_be_between",
            "kwargs": {"min_value": min_columns, "max_value": max_columns},
            "meta": {
                "description": "Table should have reasonable number of columns",
                "business_impact": "Schema changes may affect downstream systems",
                "remediation": "Verify table schema matches expectations"
            }
        },
    ]

    return core_rules

def get_hierarchy_id_rules() -> List[Dict[str, Any]]:
    """
    Build the rule set for the HIERARCHY_ID column.

    Four rules are produced, in order: the column exists (HIE_001), has no
    nulls (HIE_002), holds unique values (HIE_003) and matches the
    upper-case/digit/underscore/hyphen pattern (HIE_004).

    Returns:
        A freshly built list of rule dicts, each with the keys rule_id,
        category, expectation_type, kwargs and meta.
    """
    target_column = "HIERARCHY_ID"

    def build(rule_id, category, expectation_type, extra_kwargs,
              description, business_impact, remediation):
        # Every rule targets the same column; extra_kwargs carries any
        # expectation-specific arguments (only the regex rule has one).
        rule_kwargs = {"column": target_column}
        rule_kwargs.update(extra_kwargs)
        return {
            "rule_id": rule_id,
            "category": category,
            "expectation_type": expectation_type,
            "kwargs": rule_kwargs,
            "meta": {
                "description": description,
                "business_impact": business_impact,
                "remediation": remediation,
            },
        }

    specifications = (
        (
            "HIE_001", "Critical", "expect_column_to_exist", {},
            "HIERARCHY_ID column must exist",
            "Missing hierarchy column breaks organizational structure",
            "Verify table schema and data source",
        ),
        (
            "HIE_002", "Critical", "expect_column_values_to_not_be_null", {},
            "HIERARCHY_ID should not be null",
            "Null hierarchy IDs break organizational reporting",
            "Investigate source data quality and implement null checks",
        ),
        (
            "HIE_003", "Important", "expect_column_values_to_be_unique", {},
            "HIERARCHY_ID should be unique",
            "Duplicate hierarchy IDs cause reporting inconsistencies",
            "Implement unique constraints and deduplication logic",
        ),
        (
            "HIE_004", "Important", "expect_column_values_to_match_regex",
            {"regex": r"^[A-Z0-9_-]+$"},
            "HIERARCHY_ID should follow standard format",
            "Non-standard formats may cause integration issues",
            "Standardize hierarchy ID format across systems",
        ),
    )

    return [build(*spec) for spec in specifications]

def get_date_column_rules(
    min_date: str = "2020-01-01",
    max_date: str = "2030-12-31",
) -> List[Dict[str, Any]]:
    """
    Rules for the RECORD_CREATE_DATE audit column.

    Args:
        min_date: Earliest accepted value for the range rule (DTE_003).
        max_date: Latest accepted value for the range rule (DTE_003).
            The defaults reproduce the original fixed window, so existing
            callers are unaffected; pass explicit bounds when the window
            needs to move.

    Returns:
        A new list with three rule dicts: column exists (DTE_001), no nulls
        (DTE_002) and values within [min_date, max_date] (DTE_003).
    """

    date_rules = [
        {
            "rule_id": "DTE_001",
            "category": "Critical",
            "expectation_type": "expect_column_to_exist",
            "kwargs": {"column": "RECORD_CREATE_DATE"},
            "meta": {
                "description": "RECORD_CREATE_DATE column must exist",
                "business_impact": "Missing audit dates affect compliance and traceability",
                "remediation": "Ensure audit columns are included in data pipeline"
            }
        },
        {
            "rule_id": "DTE_002",
            "category": "Critical",
            "expectation_type": "expect_column_values_to_not_be_null",
            "kwargs": {"column": "RECORD_CREATE_DATE"},
            "meta": {
                "description": "RECORD_CREATE_DATE should not be null",
                "business_impact": "Missing audit timestamps affect regulatory compliance",
                "remediation": "Implement default timestamp logic in data pipeline"
            }
        },
        {
            "rule_id": "DTE_003",
            "category": "Important",
            "expectation_type": "expect_column_values_to_be_between",
            # NOTE(review): the bounds are passed as ISO date strings; whether
            # the execution engine compares them correctly against the
            # column's actual type is not established here - confirm.
            "kwargs": {
                "column": "RECORD_CREATE_DATE",
                "min_value": min_date,
                "max_value": max_date
            },
            "meta": {
                "description": "RECORD_CREATE_DATE should be within reasonable range",
                "business_impact": "Invalid dates may indicate data corruption",
                "remediation": "Validate date ranges in source systems"
            }
        },
    ]

    return date_rules

# Progress marker: printed once the core rule-library functions in this cell are defined.
print("Core rules library functions defined")

Core rules library functions defined


In [0]:
# =============================================================================
# BUSINESS-SPECIFIC RULES (CUSTOMIZABLE BY TEAMS)
# =============================================================================

def get_business_specific_rules() -> List[Dict[str, Any]]:
    """
    Business-specific rules that teams can customize.

    Add domain-specific validation rules to the list below. Every entry
    ends with a trailing comma so that a new rule (such as the commented
    example) can be appended or uncommented without editing the entry
    before it.

    Returns:
        A new list of rule dicts (currently BUS_001 for STATUS and
        BUS_002 for PRIORITY).
    """

    business_rules = [
        {
            "rule_id": "BUS_001",
            "category": "Important",
            "expectation_type": "expect_column_values_to_not_be_null",
            # "mostly": 0.95 - the expectation tolerates up to 5% nulls.
            "kwargs": {"column": "STATUS", "mostly": 0.95},
            "meta": {
                "description": "STATUS column should be mostly non-null",
                "business_impact": "Missing status affects operational reporting",
                "remediation": "Implement default status values"
            }
        },
        {
            "rule_id": "BUS_002",
            "category": "Optional",
            "expectation_type": "expect_column_values_to_be_in_set",
            "kwargs": {"column": "PRIORITY", "value_set": ["HIGH", "MEDIUM", "LOW"]},
            "meta": {
                "description": "PRIORITY should be from valid set",
                "business_impact": "Invalid priority values affect workflow routing",
                "remediation": "Standardize priority value lists"
            }
        },
        # ADD YOUR CUSTOM BUSINESS RULES HERE
        # Example:
        # {
        #     "rule_id": "BUS_003",
        #     "category": "Critical",
        #     "expectation_type": "expect_column_values_to_be_between",
        #     "kwargs": {"column": "COST", "min_value": 0, "max_value": 1000000},
        #     "meta": {
        #         "description": "Cost values should be within business limits",
        #         "business_impact": "Invalid costs affect financial reporting",
        #         "remediation": "Review cost calculation logic"
        #     }
        # },
    ]

    return business_rules

def get_data_type_rules() -> List[Dict[str, Any]]:
    """
    Build the format-validation rules for the EMAIL and PHONE columns.

    Both rules use expect_column_values_to_match_regex and are rated
    "Important".

    Returns:
        A freshly built two-element list: DTP_001 (EMAIL) followed by
        DTP_002 (PHONE).
    """
    regex_expectation = "expect_column_values_to_match_regex"

    # Patterns are kept as named locals so each rule dict stays readable.
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    phone_pattern = r"^\+?[\d\s\-\(\)]+$"

    email_rule = {
        "rule_id": "DTP_001",
        "category": "Important",
        "expectation_type": regex_expectation,
        "kwargs": {"column": "EMAIL", "regex": email_pattern},
        "meta": {
            "description": "EMAIL should follow valid email format",
            "business_impact": "Invalid emails affect communication and notifications",
            "remediation": "Implement email validation in input forms",
        },
    }

    phone_rule = {
        "rule_id": "DTP_002",
        "category": "Important",
        "expectation_type": regex_expectation,
        "kwargs": {"column": "PHONE", "regex": phone_pattern},
        "meta": {
            "description": "PHONE should follow valid phone format",
            "business_impact": "Invalid phone numbers affect customer contact",
            "remediation": "Standardize phone number format",
        },
    }

    return [email_rule, phone_rule]

# Progress marker: printed once the business and data-type rule functions in this cell are defined.
print("Business-specific rules functions defined")

Business-specific rules functions defined


In [0]:
# =============================================================================
# TABLE-SPECIFIC RULES CONFIGURATION
# =============================================================================

def get_table_specific_rules() -> Dict[str, List[Dict[str, Any]]]:
    """
    Rules specific to particular tables.

    Teams can add rules for their own tables below. Every entry ends with
    a trailing comma so that a new table block (such as the commented
    template) can be appended or uncommented without editing the entry
    before it.

    Returns:
        A new dict mapping a table name to its list of rule dicts
        (currently "DQ_LOGIC" and "USER_PROFILES").
    """

    table_rules = {
        "DQ_LOGIC": [
            {
                "rule_id": "DQL_001",
                "category": "Critical",
                "expectation_type": "expect_column_values_to_not_be_null",
                # "mostly": 0.98 - the expectation tolerates up to 2% nulls.
                "kwargs": {"column": "LOGIC_TYPE", "mostly": 0.98},
                "meta": {
                    "description": "LOGIC_TYPE should be almost always populated for DQ_LOGIC table",
                    "business_impact": "Missing logic types affect data quality categorization",
                    "remediation": "Review data quality logic classification"
                }
            },
            {
                "rule_id": "DQL_002",
                "category": "Important",
                "expectation_type": "expect_column_distinct_values_to_be_in_set",
                "kwargs": {"column": "ACTIVE_FLAG", "value_set": ["Y", "N", "1", "0", "TRUE", "FALSE"]},
                "meta": {
                    "description": "ACTIVE_FLAG should use standard boolean values",
                    "business_impact": "Non-standard flags affect filtering logic",
                    "remediation": "Standardize boolean value representation"
                }
            },
        ],

        "USER_PROFILES": [
            {
                "rule_id": "USR_001",
                "category": "Critical",
                "expectation_type": "expect_column_values_to_be_unique",
                "kwargs": {"column": "USER_ID"},
                "meta": {
                    "description": "USER_ID must be unique in USER_PROFILES",
                    "business_impact": "Duplicate user IDs cause authentication issues",
                    "remediation": "Implement unique constraints on user ID"
                }
            },
        ],

        # ADD YOUR TABLE-SPECIFIC RULES HERE
        # "YOUR_TABLE_NAME": [
        #     {
        #         "rule_id": "YTN_001",
        #         "category": "Critical",
        #         "expectation_type": "your_expectation_type",
        #         "kwargs": {"column": "your_column"},
        #         "meta": {
        #             "description": "Your rule description",
        #             "business_impact": "Impact on business",
        #             "remediation": "How to fix issues"
        #         }
        #     },
        # ],
    }

    return table_rules

# Progress marker: printed once get_table_specific_rules() is defined.
print("Table-specific rules configuration defined")

Table-specific rules configuration defined


In [0]:
# RULE COMPILATION AND VALIDATION HELPERS

def _select_applicable_rules(
    rules: List[Dict[str, Any]], available_columns: List[str]
) -> List[Dict[str, Any]]:
    """Keep the rules that can run against a table with the given columns.

    A rule whose kwargs name a "column" is kept only when that column is
    available. A rule without a "column" entry is treated as table-level and
    kept. A rule whose "kwargs" is missing or not a dict is also kept, so that
    validate_rules() can report it instead of this step failing with KeyError.
    """
    selected = []
    for rule in rules:
        rule_kwargs = rule.get("kwargs") if isinstance(rule, dict) else None
        if not isinstance(rule_kwargs, dict) or "column" not in rule_kwargs:
            selected.append(rule)
        elif rule_kwargs["column"] in available_columns:
            selected.append(rule)
    return selected


def compile_rules_for_table(table_name: str, available_columns: List[str]) -> List[Dict[str, Any]]:
    """
    Compile all applicable rules for a specific table.

    Rules are appended in a fixed order: core table-level rules, HIERARCHY_ID
    rules, RECORD_CREATE_DATE rules, business rules, data-type rules and
    finally the rules registered for ``table_name``.

    Args:
        table_name: Name of the table to validate. It is looked up as an
            exact (case-sensitive) key in get_table_specific_rules(); an
            unknown name simply contributes no table-specific rules.
        available_columns: List of columns available in the table.

    Returns:
        List of applicable rules for the table.
    """

    all_rules = []

    # 1. Add core rules (apply to all tables)
    all_rules.extend(get_core_data_quality_rules())

    # 2. Add hierarchy rules if HIERARCHY_ID column exists
    # NOTE(review): because of this guard, the "column must exist" rule in
    # this group is only ever applied to tables that already list the column.
    if "HIERARCHY_ID" in available_columns:
        all_rules.extend(get_hierarchy_id_rules())

    # 3. Add date rules if RECORD_CREATE_DATE column exists (same caveat as above)
    if "RECORD_CREATE_DATE" in available_columns:
        all_rules.extend(get_date_column_rules())

    # 4-6. Business, data-type and table-specific rules all go through the
    # same column filter. Data-type rules without a "column" used to be
    # dropped silently, unlike the other two groups; they are now kept.
    all_rules.extend(_select_applicable_rules(get_business_specific_rules(), available_columns))
    all_rules.extend(_select_applicable_rules(get_data_type_rules(), available_columns))
    all_rules.extend(
        _select_applicable_rules(
            get_table_specific_rules().get(table_name, []), available_columns
        )
    )

    return all_rules

def _find_rule_problem(rule: Any, required_fields: List[str], valid_categories: List[str]):
    """Return a message for the first structural problem in ``rule``, or None if it is well-formed."""
    if not isinstance(rule, dict):
        return f"Rule must be a dict, got {type(rule).__name__}"

    for field in required_fields:
        if field not in rule:
            return f"Missing required field: {field}"

    # kwargs and meta are indexed as mappings downstream; a string or list
    # here would otherwise slip through (a string meta containing the word
    # "description" used to pass the substring check below).
    if not isinstance(rule["kwargs"], dict):
        return f"kwargs must be a dict, got {type(rule['kwargs']).__name__}"
    if not isinstance(rule["meta"], dict):
        return f"meta must be a dict, got {type(rule['meta']).__name__}"

    if "description" not in rule["meta"]:
        return "Missing description in meta"

    if rule["category"] not in valid_categories:
        return f"Invalid category: {rule['category']}"

    return None


def validate_rules(rules: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Validate that rules are properly formatted.

    A rule is valid when it is a dict containing rule_id, category,
    expectation_type, kwargs and meta; kwargs and meta are dicts; meta has a
    "description"; and category is one of Critical / Important / Optional.
    Only the first problem found in each rule is reported.

    Args:
        rules: List of rules to validate

    Returns:
        Dict with "valid_rules" and "invalid_rules" counts and "errors", a
        list of messages of the form "Rule <1-based position>: <problem>".
    """

    validation_results = {
        "valid_rules": 0,
        "invalid_rules": 0,
        "errors": []
    }

    required_fields = ["rule_id", "category", "expectation_type", "kwargs", "meta"]
    valid_categories = ["Critical", "Important", "Optional"]

    for position, rule in enumerate(rules, start=1):
        problem = _find_rule_problem(rule, required_fields, valid_categories)
        if problem is None:
            validation_results["valid_rules"] += 1
        else:
            validation_results["invalid_rules"] += 1
            validation_results["errors"].append(f"Rule {position}: {problem}")

    return validation_results

# Progress marker: printed once compile_rules_for_table() and validate_rules() are defined.
print("Rules compilation and validation functions defined")

Rules compilation and validation functions defined


In [0]:
# COMPILE AND VALIDATE THE RULES FOR A SAMPLE TABLE (DEMONSTRATION RUN)

# Demonstration run: compile the rules for one sample table, validate their
# structure, and record the outcome in step4_5_results. Any failure is
# captured in the results (status "error") rather than aborting the notebook,
# so the final cell can still return a JSON summary.
try:
    print("COMPILING RULES FOR DEMONSTRATION")
    print("-" * 50)

    # Simulate available columns for specific table
    sample_columns = ["HIERARCHY_ID", "RECORD_CREATE_DATE", "STATUS", "LOGIC_TYPE", "ACTIVE_FLAG"]

    # Compile rules for specific table
    # NOTE(review): "dq_error_result" is not a key of get_table_specific_rules()
    # as defined in this notebook, so no table-specific rules are added for
    # it, even though LOGIC_TYPE / ACTIVE_FLAG are the DQ_LOGIC columns.
    # Confirm the intended table name.
    compiled_rules = compile_rules_for_table("dq_error_result", sample_columns)

    # Validate rules
    validation_results = validate_rules(compiled_rules)

    # Categorize rules
    rule_categories = {
        category: [r for r in compiled_rules if r["category"] == category]
        for category in ("Critical", "Important", "Optional")
    }

    # Update results
    step4_5_results.update({
        "rules_defined": len(compiled_rules),
        "rule_categories": {
            "Critical": len(rule_categories["Critical"]),
            "Important": len(rule_categories["Important"]),
            "Optional": len(rule_categories["Optional"])
        },
        "custom_rules": compiled_rules,
        "table_specific_rules": get_table_specific_rules(),
        "validation_results": validation_results,
        "status": "success"
    })

    # Display summary
    print("Rules compilation completed")
    print(f"   Total rules defined: {len(compiled_rules)}")
    print(f"   Critical rules: {len(rule_categories['Critical'])}")
    print(f"   Important rules: {len(rule_categories['Important'])}")
    print(f"   Optional rules: {len(rule_categories['Optional'])}")
    print(f"   Valid rules: {validation_results['valid_rules']}")
    print(f"   Invalid rules: {validation_results['invalid_rules']}")

    # Display rule details
    print("COMPILED RULES SUMMARY:")
    print("-" * 50)
    for i, rule in enumerate(compiled_rules, 1):
        print(f"  {i}. [{rule['category']}] {rule['meta']['description']}")

    if validation_results["errors"]:
        print("VALIDATION ERRORS:")
        for error in validation_results["errors"]:
            print(f"   • {error}")

except Exception as e:
    print(f"Rules compilation failed: {e}")
    step4_5_results["error_message"] = f"Rules compilation failed: {e}"
    step4_5_results["status"] = "error"

# Only announce completion when the run actually succeeded; previously the
# "COMPLETED / ready for Step 5" banner was printed even after a failure.
if step4_5_results["status"] == "success":
    print("STEP 4.5 COMPLETED")
    print("Rules library ready for Step 5")
else:
    print("STEP 4.5 FAILED")
    print("Rules library NOT ready for Step 5")
print("=" * 80)

COMPILING RULES FOR DEMONSTRATION
--------------------------------------------------
Rules compilation completed
   Total rules defined: 12
   Critical rules: 6
   Important rules: 6
   Optional rules: 0
   Valid rules: 12
   Invalid rules: 0
COMPILED RULES SUMMARY:
--------------------------------------------------
  1. [Critical] Table should have reasonable number of rows
  2. [Important] Table should have reasonable number of columns
  3. [Critical] HIERARCHY_ID column must exist
  4. [Critical] HIERARCHY_ID should not be null
  5. [Important] HIERARCHY_ID should be unique
  6. [Important] HIERARCHY_ID should follow standard format
  7. [Critical] RECORD_CREATE_DATE column must exist
  8. [Critical] RECORD_CREATE_DATE should not be null
  9. [Important] RECORD_CREATE_DATE should be within reasonable range
  10. [Important] STATUS column should be mostly non-null
  11. [Critical] LOGIC_TYPE should be almost always populated for DQ_LOGIC table
  12. [Important] ACTIVE_FLAG should use

In [0]:
# Simple JSON cleanup and return  
def clean_for_json(obj):
    """Convert non-serializable types to JSON-compatible types.

    Walks dicts, lists, tuples, sets and numpy arrays recursively and converts:
      * date / datetime / numpy.datetime64 values to their str() form,
      * numpy integer, floating and bool scalars to int, float and bool.
    Tuples, sets and arrays are returned as lists (JSON has no other sequence
    type). Dict keys are left untouched. Any other value is returned as-is.

    Args:
        obj: Arbitrarily nested value to clean.

    Returns:
        A new structure for containers; the converted or original value for leaves.
    """
    import datetime
    import numpy as np

    if isinstance(obj, dict):
        return {k: clean_for_json(v) for k, v in obj.items()}
    # Tuples and sets were previously returned untouched, which left any numpy
    # scalars inside them unconverted (and sets are not serializable at all).
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [clean_for_json(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields nested Python lists (or a scalar for 0-d arrays);
        # recurse so that date-like elements are stringified too.
        return clean_for_json(obj.tolist())
    if isinstance(obj, (datetime.date, datetime.datetime, np.datetime64)):
        return str(obj)
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Clean and return results
step4_5_results = clean_for_json(step4_5_results)
results_json = json.dumps(step4_5_results)

# Hand the JSON summary back as the notebook's exit value. Earlier cells
# explicitly support running outside Databricks, where `dbutils` is undefined;
# in that case print the summary instead of failing with NameError.
try:
    dbutils.notebook.exit(results_json)
except NameError:
    print(results_json)

{"status": "success", "rules_defined": 12, "rule_categories": {"Critical": 6, "Important": 6, "Optional": 0}, "custom_rules": [{"rule_id": "TBL_001", "category": "Critical", "expectation_type": "expect_table_row_count_to_be_between", "kwargs": {"min_value": 1, "max_value": 1000000}, "meta": {"description": "Table should have reasonable number of rows", "business_impact": "Empty tables or extremely large tables may indicate data pipeline issues", "remediation": "Check data pipeline and source systems"}}, {"rule_id": "TBL_002", "category": "Important", "expectation_type": "expect_table_column_count_to_be_between", "kwargs": {"min_value": 1, "max_value": 100}, "meta": {"description": "Table should have reasonable number of columns", "business_impact": "Schema changes may affect downstream systems", "remediation": "Verify table schema matches expectations"}}, {"rule_id": "HIE_001", "category": "Critical", "expectation_type": "expect_column_to_exist", "kwargs": {"column": "HIERARCHY_ID"}, "