## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement validation rules using a healthcare dataset to reduce errors in
predictive models by automating data quality checks.

In [1]:
# write your code from here
from typing import Any, Callable, Dict, List

import pandas as pd

def validate_healthcare_data(
    df: pd.DataFrame,
    required_columns: List[str],
    value_validators: Dict[str, Callable[[Any], bool]],
    *,
    max_error_rate: float = 0.01,
) -> Dict[str, Any]:
    """
    Run completeness and per-value checks on a dataset and report SLA status.

    For every required column the fraction of null cells is recorded; for
    every validated column the fraction of cells rejected by its validator is
    recorded. A rate strictly greater than ``max_error_rate`` is flagged as
    an SLA violation.

    Parameters:
        df (pd.DataFrame): The input healthcare dataset.
        required_columns (List[str]): Columns that must exist and are checked
            for nulls (e.g., 'age', 'diagnosis').
        value_validators (Dict[str, Callable]): Maps a column name to a
            function called once per cell; a truthy return means the value is
            valid (e.g., 'age': lambda x: x > 0). Entries whose column is not
            in ``df`` are skipped.
        max_error_rate (float): Largest tolerated missing/invalid rate.
            Keyword-only; defaults to 0.01 (1%).

    Returns:
        Dict[str, Any]: In insertion order:
            - ``missing_rate_<col>`` for each required column,
            - ``SLA_violation_missing_<col>`` (True) only when violated,
            - ``invalid_rate_<col>`` for each validated column present,
            - ``SLA_violation_invalid_<col>`` (True) only when violated,
            - ``total_violations`` (int) and ``SLA_status`` ("PASS"/"FAIL").

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        ValueError: If any required column is absent from ``df``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    results: Dict[str, Any] = {}
    total_rows = len(df)
    # Counted explicitly rather than by scanning result keys, so a column
    # whose own name contains "SLA_violation" cannot inflate the total.
    violation_count = 0

    # Completeness checks
    for col in required_columns:
        # The mean of an empty Series is NaN; report 0.0 for an empty frame.
        missing_rate = float(df[col].isnull().mean()) if total_rows else 0.0
        results[f"missing_rate_{col}"] = missing_rate
        if missing_rate > max_error_rate:
            results[f"SLA_violation_missing_{col}"] = True
            violation_count += 1

    # Value range & logic checks
    for col, check in value_validators.items():
        if col not in df.columns:
            continue
        if total_rows:
            # Coerce to bool before negating: `~` is a bitwise invert, so a
            # validator returning 0/1 (or other non-bool values) would
            # otherwise yield negative numbers and a meaningless rate.
            is_valid = df[col].apply(check).astype(bool)
            invalid_rate = float((~is_valid).mean())
        else:
            invalid_rate = 0.0
        results[f"invalid_rate_{col}"] = invalid_rate
        if invalid_rate > max_error_rate:
            results[f"SLA_violation_invalid_{col}"] = True
            violation_count += 1

    # Final status
    results["total_violations"] = violation_count
    results["SLA_status"] = "FAIL" if violation_count else "PASS"

    return results


# ---------------------------
# ✅ Example Usage
# ---------------------------

if __name__ == "__main__":

    def _plausible_age(x):
        """Accept only non-null int/float ages strictly between 0 and 120."""
        if not pd.notnull(x):
            return False
        if not isinstance(x, (int, float)):
            return False
        return 0 < x < 120

    def _plausible_blood_pressure(x):
        """Accept only non-null readings within 40..200 inclusive."""
        if not pd.notnull(x):
            return False
        return 40 <= x <= 200

    # Tiny synthetic patient table: each checked column carries one bad cell
    # (negative age, missing diagnosis, blood pressure of 300).
    data = {
        'patient_id': [1, 2, 3],
        'age': [34, -5, 70],
        'diagnosis': ['Diabetes', None, 'Hypertension'],
        'blood_pressure': [120, 85, 300],  # Note: 300 may be implausible
    }
    df = pd.DataFrame(data)

    required = ['age', 'diagnosis']
    validators = {
        'age': _plausible_age,
        'blood_pressure': _plausible_blood_pressure,
    }

    result = validate_healthcare_data(df, required, validators)

    # Assemble the whole report first, then emit it in one print call.
    report_lines = ["Healthcare Data Quality Report:"]
    report_lines.extend(f"{metric}: {value}" for metric, value in result.items())
    print("\n".join(report_lines))


Healthcare Data Quality Report:
missing_rate_age: 0.0
missing_rate_diagnosis: 0.3333333333333333
SLA_violation_missing_diagnosis: True
invalid_rate_age: 0.3333333333333333
SLA_violation_invalid_age: True
invalid_rate_blood_pressure: 0.3333333333333333
SLA_violation_invalid_blood_pressure: True
total_violations: 3
SLA_status: FAIL
