## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement automated data-quality validation rules for a healthcare dataset
so that invalid records are caught before they introduce errors into predictive models.

In [None]:
# write your code from here
import pandas as pd
import numpy as np

def validate_healthcare_data(df: pd.DataFrame) -> "tuple[list[str], pd.DataFrame]":
    """Run data-quality checks on a healthcare DataFrame.

    The checks, in order, are:

    1. Critical columns (``patient_id``, ``age``, ``outcome``) exist and have
       no missing values.
    2. Numeric columns (``age``, ``blood_pressure``, ``heart_rate``,
       ``glucose``) are coerced to numeric; values that cannot be parsed
       become NaN and are counted.
    3. ``age`` lies within 0-120 and ``blood_pressure`` within 60-200.
    4. ``patient_id`` values are unique.
    5. ``admission_date`` is not later than ``discharge_date``.

    Checks on columns other than the critical ones are skipped when the
    column is absent.

    Args:
        df: The raw dataset. It is not modified; all conversions are applied
            to a copy.

    Returns:
        A ``(errors, cleaned_df)`` tuple. ``errors`` is a list of
        human-readable issue descriptions (empty when every check passes).
        ``cleaned_df`` is a copy of ``df`` whose numeric columns have been
        coerced to numeric dtype.
    """
    errors = []
    # Work on a copy so the numeric coercion below never alters the caller's data.
    df = df.copy()

    # Check missing critical fields
    critical_fields = ['patient_id', 'age', 'outcome']
    for col in critical_fields:
        if col not in df.columns:
            # Report an absent column as an issue instead of raising KeyError.
            errors.append(f"Critical column '{col}' is missing from the dataset.")
            continue
        missing = int(df[col].isna().sum())
        if missing > 0:
            errors.append(f"Column '{col}' has {missing} missing values.")

    # Check data types and convert if necessary
    numeric_fields = ['age', 'blood_pressure', 'heart_rate', 'glucose']
    for col in numeric_fields:
        if col not in df.columns:
            continue
        # Count NaNs *before* coercion so that only values turned into NaN by
        # the conversion are reported, not values that were already missing.
        nan_before = int(df[col].isna().sum())
        df[col] = pd.to_numeric(df[col], errors='coerce')
        non_numeric = int(df[col].isna().sum()) - nan_before
        if non_numeric > 0:
            errors.append(f"Column '{col}' has {non_numeric} non-numeric entries converted to NaN.")

    # Range checks (comparisons against NaN are False, so NaNs are not counted)
    if 'age' in df.columns:
        out_of_range_age = int(((df['age'] < 0) | (df['age'] > 120)).sum())
        if out_of_range_age > 0:
            errors.append(f"Column 'age' has {out_of_range_age} values outside the range 0-120.")

    if 'blood_pressure' in df.columns:
        out_of_range_bp = int(((df['blood_pressure'] < 60) | (df['blood_pressure'] > 200)).sum())
        if out_of_range_bp > 0:
            errors.append(f"Column 'blood_pressure' has {out_of_range_bp} values outside the range 60-200.")

    # Check duplicate patient IDs
    if 'patient_id' in df.columns:
        duplicates = int(df['patient_id'].duplicated().sum())
        if duplicates > 0:
            errors.append(f"{duplicates} duplicate patient IDs found.")

    # Logical timestamp check
    # NOTE(review): assumes both date columns hold mutually comparable values
    # (e.g. already parsed as datetimes by the caller) - confirm at call sites.
    if {'admission_date', 'discharge_date'}.issubset(df.columns):
        invalid_dates = int((df['admission_date'] > df['discharge_date']).sum())
        if invalid_dates > 0:
            errors.append(f"{invalid_dates} records have admission_date after discharge_date.")

    return errors, df

# Usage Example:
# df = pd.read_csv('healthcare_data.csv', parse_dates=['admission_date', 'discharge_date'])
# errors, cleaned_df = validate_healthcare_data(df)
# if errors:
#     for err in errors:
#         print("Data Quality Issue:", err)
# else:
#     print("All data quality checks passed.")
