## Real-World Case Studies

### Finance - Fraud Detection Models:
**Description**: Analyze a financial dataset, define SLAs for data accuracy and
completeness, and ensure high data quality for fraud detection models.

In [2]:
import pandas as pd
import numpy as np

def load_data(filepath):
    """
    Read the CSV at ``filepath`` into a pandas DataFrame.

    On success a one-line summary (row and column counts) is printed and
    the DataFrame is returned. On failure an error line is printed and the
    original exception is re-raised unchanged, so callers still see
    FileNotFoundError for a missing file and the underlying pandas/OS
    error for anything else.
    """
    try:
        frame = pd.read_csv(filepath)
        row_count, column_count = len(frame), len(frame.columns)
        print(f"Loaded data with {row_count} rows and {column_count} columns.")
        return frame
    except Exception as exc:
        # One handler, two messages: a missing file gets its own wording.
        if isinstance(exc, FileNotFoundError):
            print(f"ERROR: File not found - {filepath}")
        else:
            print(f"ERROR: Failed to load data - {exc}")
        raise

def check_completeness(df, required_fields):
    """
    Compute the fraction of non-null values for each required field.

    The result is a Series indexed by field name whose values are
    ``1 - (null count / number of rows)``.
    """
    row_total = len(df)
    null_mask = df[required_fields].isnull()
    nulls_per_field = null_mask.sum()
    return 1 - (nulls_per_field / row_total)

def validate_transaction_amount(amount):
    """
    Check if a transaction amount is a positive, finite number.

    Accepts Python ints/floats and NumPy integer/floating scalars.
    Returns a plain ``bool``.

    Returns False for:
    - missing values (None, NaN, NaT, pd.NA),
    - booleans (``True`` would otherwise pass as the integer 1),
    - non-numeric or non-scalar values such as strings and lists,
    - zero, negative and infinite amounts.
    """
    # bool is a subclass of int, so it has to be rejected explicitly.
    if isinstance(amount, (bool, np.bool_)):
        return False
    # The type check runs first so that containers (lists, arrays) are
    # rejected here instead of reaching an ambiguous truth-value test.
    if not isinstance(amount, (int, float, np.integer, np.floating)):
        return False
    # NaN fails both comparisons, so no separate null check is needed;
    # the upper bound rules out +inf.
    return bool(0 < amount < np.inf)

def validate_transaction_date(date):
    """
    Check if a transaction date is valid and not in the future.

    Accepts a pandas Timestamp or a date string, with or without a
    timezone. Timezone-aware values are compared against the current UTC
    instant; naive values are compared against the local wall-clock time.

    Returns a plain ``bool``: False for missing or unparseable values and
    for dates later than "now", True otherwise.
    """
    try:
        dt = pd.to_datetime(date, errors='coerce')
        if pd.isnull(dt):
            return False
        # Naive and aware timestamps cannot be compared with each other,
        # so pick a reference "now" that matches the parsed value.
        if dt.tzinfo is not None:
            now = pd.Timestamp.now(tz="UTC")
        else:
            now = pd.Timestamp.today()
        return bool(dt <= now)
    except Exception:
        # Best-effort validator: anything that cannot be parsed or
        # compared counts as an invalid date rather than crashing the run.
        return False

def check_accuracy(df):
    """
    Count accuracy violations in a transactions DataFrame.

    Three checks are run, in this order:
    - 'invalid_amount': rows whose transaction_amount fails
      validate_transaction_amount,
    - 'invalid_dates': rows whose transaction_date fails
      validate_transaction_date,
    - 'duplicate_transaction_id': rows whose transaction_id repeats an
      earlier row's value.

    Returns a dict mapping each of those keys to its count.
    """
    def count_rejected(column, validator):
        # Flag every value the validator rejects, then total the flags.
        rejected = df[column].apply(lambda value: not validator(value))
        return rejected.sum()

    return {
        'invalid_amount': count_rejected('transaction_amount', validate_transaction_amount),
        'invalid_dates': count_rejected('transaction_date', validate_transaction_date),
        'duplicate_transaction_id': df['transaction_id'].duplicated().sum(),
    }

def calculate_sla(completeness, accuracy_errors, total_records, sla_thresholds):
    """
    Check SLA compliance based on thresholds.

    Parameters:
    - completeness: Series of per-field completeness ratios.
    - accuracy_errors: dict with the counts 'invalid_amount',
      'invalid_dates' and 'duplicate_transaction_id'.
    - total_records: number of records the counts were taken from.
    - sla_thresholds: dict with 'completeness' (minimum ratio every field
      must reach) and 'max_invalid_amount_ratio', 'max_invalid_date_ratio',
      'max_duplicate_txn_ratio' (maximum allowed error ratios).

    Returns a dict of plain ``bool`` pass/fail flags:
    'completeness_pass', 'invalid_amount_pass', 'invalid_dates_pass' and
    'duplicate_txn_pass'.

    When total_records is zero (or negative) no error ratio can be
    computed, so the three ratio checks are reported as failed instead of
    raising ZeroDivisionError.
    """
    has_records = total_records > 0

    def ratio_within_limit(error_key, threshold_key):
        # An empty dataset gives no evidence of compliance: fail the check.
        if not has_records:
            return False
        error_ratio = accuracy_errors[error_key] / total_records
        return bool(error_ratio <= sla_thresholds[threshold_key])

    meets_completeness = (completeness >= sla_thresholds['completeness']).all()

    return {
        'completeness_pass': bool(meets_completeness),
        'invalid_amount_pass': ratio_within_limit('invalid_amount', 'max_invalid_amount_ratio'),
        'invalid_dates_pass': ratio_within_limit('invalid_dates', 'max_invalid_date_ratio'),
        'duplicate_txn_pass': ratio_within_limit('duplicate_transaction_id', 'max_duplicate_txn_ratio'),
    }

def main(filepath="financial_transactions.csv"):
    """
    Run the data-quality checks on a transactions CSV and print a report.

    Parameters:
    - filepath: path of the CSV file to analyse. Defaults to
      "financial_transactions.csv", so calling ``main()`` with no
      arguments behaves as before.

    Steps: load the file, verify that the required columns exist, compute
    per-field completeness and accuracy error counts, evaluate them
    against the SLA thresholds, and print the summary.

    Raises:
    - whatever load_data raises when the file cannot be read,
    - ValueError if any required column is absent from the file.
    """
    required_fields = ['transaction_id', 'account_id', 'transaction_amount', 'transaction_date']
    sla_thresholds = {
        'completeness': 0.98,
        'max_invalid_amount_ratio': 0.01,
        'max_invalid_date_ratio': 0.005,
        'max_duplicate_txn_ratio': 0.001,
    }

    df = load_data(filepath)

    # Fail early with a clear message instead of a KeyError in the checks.
    missing_cols = set(required_fields) - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    completeness = check_completeness(df, required_fields)
    accuracy_errors = check_accuracy(df)
    sla_results = calculate_sla(completeness, accuracy_errors, len(df), sla_thresholds)

    print("\nData Quality Summary Report")
    print("----------------------------")
    print(f"Total records: {len(df)}")
    print(f"Completeness per field:\n{completeness}")
    print(f"Accuracy errors:\n{accuracy_errors}")
    print(f"SLA compliance:\n{sla_results}")

# Simple unit tests
def run_tests():
    """
    Smoke-test the amount and date validators.

    Each (input, expected) pair is checked with a bare ``assert``; the
    first mismatch raises AssertionError. A confirmation line is printed
    once every case has passed.
    """
    amount_cases = [
        (100, True),
        (-10, False),
        ('abc', False),
        (None, False),
    ]
    for raw_amount, expected in amount_cases:
        assert validate_transaction_amount(raw_amount) == expected

    date_cases = [
        ('2020-01-01', True),
        ('3000-01-01', False),
        ('invalid-date', False),
        (None, False),
    ]
    for raw_date, expected in date_cases:
        assert validate_transaction_date(raw_date) == expected

    print("All tests passed!")

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # Self-checks go first; a failed assertion stops the run before main().
    run_tests()
    main()


All tests passed!
ERROR: File not found - financial_transactions.csv


FileNotFoundError: [Errno 2] No such file or directory: 'financial_transactions.csv'