## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

### 1. Data Profiling with Great Expectations
Profile a CSV dataset containing customer information to inspect the distribution patterns of the 'Age' and 'Income' columns.
- Load the dataset using Great Expectations and create a data context.
- Generate a data asset to inspect the summary statistics.
- View the generated expectation suite to analyze data distributions.

In [None]:
# write your code from here
#
# Profiles data/customer_data.csv with Great Expectations and saves the
# generated expectations as the "customer_profile_suite" suite.
#
# The whole cell uses the Batch Request API (RuntimeBatchRequest, Validator,
# Checkpoint), which is the API family the Datasource / PandasExecutionEngine /
# RuntimeDataConnector configuration below belongs to. Mixing it with the
# older BasicDatasetProfiler / validation-operator calls does not work.
import os

import great_expectations as ge
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler

# Step 1: Initialize or get existing GE context in current working directory
project_root_dir = os.getcwd()
context = ge.get_context(context_root_dir=project_root_dir)

# Step 2: Add datasource for CSV data (if not already added)
datasource_config = {
    "name": "customer_data_source",
    "class_name": "Datasource",
    "execution_engine": {"class_name": "PandasExecutionEngine"},
    "data_connectors": {
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_identifier_name"],
        }
    },
}
# Check for the datasource explicitly instead of wrapping the call in a
# blanket `except Exception`: the previous version swallowed a NameError
# (an un-imported `yaml`), so the datasource was silently never registered.
registered_datasources = {ds["name"] for ds in context.list_datasources()}
if datasource_config["name"] not in registered_datasources:
    context.add_datasource(**datasource_config)

# Step 3: Describe the CSV as a runtime batch. The path is resolved by the
# pandas execution engine when the batch is actually loaded.
batch_request = RuntimeBatchRequest(
    datasource_name="customer_data_source",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="customer_data_asset",
    runtime_parameters={"path": "data/customer_data.csv"},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

# Step 4: Create expectation suite and bind it to the batch via a Validator.
# overwrite_existing=True makes re-running the cell safe, so any exception
# raised here is a real problem and is allowed to propagate.
suite_name = "customer_profile_suite"
context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Step 5: Profile the dataset. The profiler inspects every column, so the
# resulting suite includes the expectations generated for 'Age' and 'Income'.
profiler = UserConfigurableProfiler(profile_dataset=validator)
expectation_suite = profiler.build_suite()
context.save_expectation_suite(expectation_suite, suite_name)

# Step 6: Load and print generated expectations for inspection
suite = context.get_expectation_suite(suite_name)
for exp in suite.expectations:
    print(exp)

# Optional: Validate the batch and build data docs.
# SimpleCheckpoint runs the default action list (store the validation result,
# update Data Docs), i.e. the Batch Request API counterpart of the legacy
# "action_list_operator" validation operator.
checkpoint = SimpleCheckpoint(
    name="customer_profile_checkpoint",
    data_context=context,
)
validation_result = checkpoint.run(
    run_name="profiling_validation_run",
    validations=[
        {"batch_request": batch_request, "expectation_suite_name": suite_name},
    ],
)
context.build_data_docs()
print("Validation Results:", validation_result)


### 2. Writing Validation Rules for Data Ingestion
Write validation rules for a CSV file to ensure the 'Date' column follows a specific date format.
- Utilize expect_column_values_to_match_regex to enforce date format validation.
- Run the validation and interpret the output.

In [None]:
# write your code from here
#
# Validates that every value in the 'Date' column of data/ingestion_data.csv
# matches a YYYY-MM-DD shaped pattern, then prints a summary of the outcome.
#
# Uses the Batch Request API (RuntimeBatchRequest, Validator, Checkpoint),
# matching the Datasource / RuntimeDataConnector configuration below.
import os
import re

import great_expectations as ge
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.core.batch import RuntimeBatchRequest

project_root_dir = os.getcwd()
context = ge.get_context(context_root_dir=project_root_dir)

# Make the cell runnable on its own: register the CSV datasource if no
# earlier cell or run has already added it to this context.
datasource_name = "customer_data_source"
registered_datasources = {ds["name"] for ds in context.list_datasources()}
if datasource_name not in registered_datasources:
    context.add_datasource(
        name=datasource_name,
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors={
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            }
        },
    )

# Load CSV batch for validation
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="data_asset_date_validation",
    runtime_parameters={"path": "data/ingestion_data.csv"},
    batch_identifiers={"default_identifier_name": "date_validation"},
)

# overwrite_existing=True makes re-running the cell safe, so any exception
# raised here is a real problem and is allowed to propagate.
suite_name = "date_validation_suite"
context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Define date regex, e.g. YYYY-MM-DD
# NOTE: this checks the shape only (four digits, two digits, two digits);
# it does not reject impossible dates such as 2024-13-45.
date_regex = r"^\d{4}-\d{2}-\d{2}$"

# Add expectation to validate 'Date' column against regex
validator.expect_column_values_to_match_regex(column="Date", regex=date_regex)

# Save the expectation suite.
# discard_failed_expectations=False is essential: by default, expectations
# that failed against the current batch are dropped on save, so a file with
# bad dates would produce an empty suite and the validation below would
# "succeed" without checking anything.
validator.save_expectation_suite(discard_failed_expectations=False)

# Validate batch against the suite. SimpleCheckpoint runs the default action
# list (store the validation result, update Data Docs), the Batch Request API
# counterpart of the legacy "action_list_operator" validation operator.
checkpoint = SimpleCheckpoint(
    name="date_format_validation_checkpoint",
    data_context=context,
)
results = checkpoint.run(
    run_name="date_format_validation_run",
    validations=[
        {"batch_request": batch_request, "expectation_suite_name": suite_name},
    ],
)

# Print summary of validation results
validation_result = results.list_validation_results()[0]
print(f"Validation Success: {validation_result.success}")
for result in validation_result.results:
    if result.expectation_config.expectation_type == "expect_column_values_to_match_regex":
        print(f"Expectation: {result.expectation_config.expectation_type}")
        print(f"Success: {result.success}")
        print(f"Unexpected Count: {result.result.get('unexpected_count', 0)}")
        print(f"Unexpected Values Sample: {result.result.get('partial_unexpected_list', [])}")

context.build_data_docs()
