## Real-World Case Studies

### Finance - Fraud Detection Models:
**Description**: Analyze a financial transactions dataset, define SLAs for data
accuracy and completeness, and check the data against those SLAs so that fraud
detection models are built on high-quality data.

In [1]:
import pandas as pd
import numpy as np

def load_data(filepath):
    """Read the financial transactions CSV at ``filepath`` into a DataFrame.

    Prints the row and column counts on success. On failure a short
    message is printed and the original exception is re-raised, so the
    caller decides how to recover.
    """
    try:
        frame = pd.read_csv(filepath)
        n_rows, n_cols = frame.shape
        print(f"Loaded data with {n_rows} rows and {n_cols} columns.")
    except FileNotFoundError:
        # Reported separately so a missing file is easy to tell apart
        # from a parsing problem.
        print(f"File not found: {filepath}")
        raise
    except Exception as exc:
        print(f"Error loading data: {exc}")
        raise
    return frame

def check_completeness(df, required_fields):
    """Report, per required field, the fraction of rows that hold a value.

    Returns a Series indexed by field name, where 1.0 means the field has
    no missing values. The result is also printed.
    """
    required = df[required_fields]
    null_counts = required.isna().sum()
    row_total = len(df)
    completeness = 1 - (null_counts / row_total)

    print("Completeness per field:")
    print(completeness)
    return completeness

def check_accuracy(df):
    """Count rows that violate basic accuracy rules.

    Rules applied:
    - ``transaction_amount`` must be greater than 0
    - ``transaction_date`` must parse as a date and must not be in the future
    - ``transaction_id`` must be unique

    The input DataFrame is not modified; dates are parsed into a local
    Series instead of overwriting the caller's column.

    Returns:
        dict of plain ``int`` counts that always contains the keys
        ``invalid_amount``, ``invalid_dates`` and
        ``duplicate_transaction_id``. If date parsing itself raises,
        ``date_parse_fail`` is added and every row is counted under
        ``invalid_dates``.
    """
    total = len(df)
    errors = {}

    # transaction_amount > 0
    errors['invalid_amount'] = int((df['transaction_amount'] <= 0).sum())

    # transaction_date valid and <= now
    try:
        parsed_dates = pd.to_datetime(df['transaction_date'], errors='coerce')
    except Exception as e:
        # Best effort: record the failure but keep the result shape stable
        # so downstream SLA checks can still look up every key.
        print(f"Date parsing error: {e}")
        errors['date_parse_fail'] = total
        errors['invalid_dates'] = total
    else:
        # Compare against "now" in the column's own timezone; comparing
        # timezone-aware dates with a naive timestamp raises TypeError.
        dtype = parsed_dates.dtype
        tz = dtype.tz if isinstance(dtype, pd.DatetimeTZDtype) else None
        now = pd.Timestamp.now(tz=tz)
        bad_dates = parsed_dates.isna() | (parsed_dates > now)
        errors['invalid_dates'] = int(bad_dates.sum())

    # Unique transaction_id
    errors['duplicate_transaction_id'] = int(df['transaction_id'].duplicated().sum())

    print("Accuracy check errors:")
    print(errors)
    return errors

def calculate_sla(completeness, accuracy_errors, total_records, sla_thresholds):
    """
    Compare quality metrics against SLA thresholds.

    Args:
        completeness: per-field completeness fractions (a Series, array or
            a single number); every value must meet the threshold.
        accuracy_errors: dict with the counts ``invalid_amount``,
            ``invalid_dates`` and ``duplicate_transaction_id``.
        total_records: number of records the counts refer to.
        sla_thresholds: dict of limits, for example:
            {
                'completeness': 0.98,  # 98% completeness minimum
                'max_invalid_amount_ratio': 0.01,
                'max_invalid_date_ratio': 0.005,
                'max_duplicate_txn_ratio': 0.001
            }

    Returns:
        dict mapping each check name to a plain ``bool``. When
        ``total_records`` is 0 the error ratios are undefined, so the
        ratio checks are reported as failed rather than dividing by zero.
    """
    def _within_limit(error_key, threshold_key):
        # An empty dataset cannot demonstrate compliance.
        if total_records <= 0:
            return False
        ratio = accuracy_errors[error_key] / total_records
        return bool(ratio <= sla_thresholds[threshold_key])

    # np.all handles Series, arrays and scalars alike; bool() keeps numpy
    # scalar types out of the printed report.
    completeness_ok = bool(
        np.all(np.asarray(completeness) >= sla_thresholds['completeness'])
    )

    sla_results = {
        'completeness_pass': completeness_ok,
        'invalid_amount_pass': _within_limit('invalid_amount', 'max_invalid_amount_ratio'),
        'invalid_dates_pass': _within_limit('invalid_dates', 'max_invalid_date_ratio'),
        'duplicate_txn_pass': _within_limit('duplicate_transaction_id', 'max_duplicate_txn_ratio'),
    }

    print("SLA Compliance Results:")
    print(sla_results)
    return sla_results

def main(filepath="financial_transactions.csv", sla_thresholds=None):
    """Run the data quality checks on a transactions CSV and print a report.

    Args:
        filepath: path of the CSV to analyze. Defaults to
            ``financial_transactions.csv`` in the working directory.
        sla_thresholds: optional dict of SLA limits with the keys
            ``completeness``, ``max_invalid_amount_ratio``,
            ``max_invalid_date_ratio`` and ``max_duplicate_txn_ratio``.
            Example thresholds are used when omitted.

    Returns:
        The SLA compliance dict produced by ``calculate_sla``.

    Raises:
        Whatever ``load_data`` re-raises, e.g. ``FileNotFoundError`` when
        the CSV does not exist.
    """
    # Load data
    df = load_data(filepath)

    # Define required fields for completeness
    required_fields = ['transaction_id', 'account_id', 'transaction_amount', 'transaction_date']

    # Check completeness
    completeness = check_completeness(df, required_fields)

    # Check accuracy
    accuracy_errors = check_accuracy(df)

    # Define SLAs (example thresholds) unless the caller supplied their own
    if sla_thresholds is None:
        sla_thresholds = {
            'completeness': 0.98,
            'max_invalid_amount_ratio': 0.01,
            'max_invalid_date_ratio': 0.005,
            'max_duplicate_txn_ratio': 0.001,
        }

    # Calculate SLA compliance
    sla_results = calculate_sla(completeness, accuracy_errors, len(df), sla_thresholds)

    # Summary report
    print("\nData Quality Summary Report")
    print("----------------------------")
    print(f"Total records: {len(df)}")
    print(f"Completeness (per field):\n{completeness}")
    print(f"Accuracy errors:\n{accuracy_errors}")
    print(f"SLA compliance:\n{sla_results}")

    return sla_results

# Run the report only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


File not found: financial_transactions.csv


FileNotFoundError: [Errno 2] No such file or directory: 'financial_transactions.csv'