# Edgy Backfill Validation

This notebook validates the CashConnectedGraphChangeEvent backfill processing through the Edgy service by comparing the events generated against expected events computed from Duplograph's graph state.

## Overview

The validation process:
1. Takes a list of account holder tokens as input
2. For each account holder:
   - Queries their active (non-merged) accounts
   - Finds assets connected to these accounts
   - Identifies other account holders connected to the same assets via active (non-merged) accounts
   - Computes expected connection events based on shared assets
   - Validates generated event IDs against expected events to ensure all expected events are present, and no unexpected events are generated
   - Validates the source and target user labels attached to the events
   - Validates that each event's timestamp matches the earliest connection between the source and target user
3. Performs general sanity checks on the backfilled events

## Setup

First, install required dependencies and import necessary modules.

In [0]:
%pip install --extra-index-url https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple sq-pysnowflake

Python interpreter will be restarted.
Looking in indexes: https://pypi.org/simple, https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple
Python interpreter will be restarted.


In [0]:
import pandas as pd
import numpy as np
import csv
import fcntl
import json
import os
import time
from datetime import datetime
from typing import List, Set, Dict, Tuple, Optional
from pyspark.sql.functions import col
from pysnowflake import Session



## Helper Functions

### Query helpers

In [0]:
def snowflake_query(query: str) -> pd.DataFrame:
    """Run ``query`` on Snowflake and return every result row as a pandas DataFrame.

    A pysnowflake ``Session`` is opened as a context manager for the duration
    of the call, using the ``etl__2xlarge`` warehouse override.
    """
    warehouse_override = {'warehouse': 'etl__2xlarge'}
    with Session(query_tag="ioanna", connection_override_args=warehouse_override) as snowflake_session:
        return snowflake_session.execute(query).fetch_pandas_all()

def spark_query(query: str) -> pd.DataFrame:
    """Run ``query`` through Spark SQL and collect the result into a pandas DataFrame.

    Relies on a ``spark`` object already being available in the notebook's
    global scope.
    """
    spark_frame = spark.sql(query)
    return spark_frame.toPandas()
        
def to_sql_list(inputs):
    """Render a collection of values as the parenthesised list of a SQL IN clause.

    Each value is stringified and wrapped in single quotes; an empty or
    missing collection renders as ``()``. Values are not escaped.
    """
    if inputs:
        quoted_body = "','".join(map(str, inputs))
        return f"('{quoted_body}')"
    return "()"

### Validation Result Class
Tracks validation outcomes and error details for each account holder.

In [0]:
class ValidationResult:
    """Accumulates the outcome of validating one account holder's backfilled events.

    A result starts out successful. Every call to ``add_error`` flips
    ``success`` to False and files the detail under the matching category.
    """

    def __init__(self, account_holder: str):
        self.account_holder = account_holder
        # Free-form status message. When set, it replaces the pass/fail headline
        # in get_summary() and the PASS/FAIL value written to the CSV.
        self.validation_status: Optional[str] = None
        self.missing_events: Set[str] = set()
        self.unexpected_events: Set[str] = set()
        self.label_errors: List[str] = []
        self.timestamp_errors: List[str] = []
        self.success: bool = True

    def add_error(self, error_type: str, details: str):
        """Records an error of the specified type with details.

        Args:
            error_type: One of "validation_status", "missing_event",
                "unexpected_event", "label" or "timestamp". Any other value
                still marks the result as failed but stores no detail.
            details: The status message, event ID or error text to record.
        """
        self.success = False
        if error_type == "validation_status":
            self.validation_status = details
        elif error_type == "missing_event":
            self.missing_events.add(details)
        elif error_type == "unexpected_event":
            self.unexpected_events.add(details)
        elif error_type == "label":
            self.label_errors.append(details)
        elif error_type == "timestamp":
            self.timestamp_errors.append(details)

    def get_summary(self) -> str:
        """Generates a human-readable summary of validation results.

        The headline is the status message when one is set, otherwise a
        pass/fail line; each non-empty error category follows as a bullet list.
        """
        summary = []

        if self.validation_status:
            summary.append(f"ℹ️ {self.account_holder}: {self.validation_status}")
        elif self.success:
            summary.append(f"✅ {self.account_holder}: All validations passed")
        else:
            summary.append(f"❌ {self.account_holder}: Validation failed")

        if self.missing_events:
            summary.append(f"\nMissing events ({len(self.missing_events)}):")
            for event in sorted(self.missing_events):
                summary.append(f"  - {event}")
        if self.unexpected_events:
            summary.append(f"\nUnexpected events ({len(self.unexpected_events)}):")
            for event in sorted(self.unexpected_events):
                summary.append(f"  - {event}")
        if self.label_errors:
            summary.append(f"\nLabel errors ({len(self.label_errors)}):")
            for error in self.label_errors:
                summary.append(f"  - {error}")
        if self.timestamp_errors:
            summary.append(f"\nTimestamp errors ({len(self.timestamp_errors)}):")
            for error in self.timestamp_errors:
                summary.append(f"  - {error}")
        return "\n".join(summary)

    def write_to_csv(self, csv_path: str):
        """Appends this validation result as one row of a CSV file.

        The file is always opened in append mode and an exclusive ``flock`` is
        held while writing. Whether a header is needed is decided from the
        file size *after* the lock is acquired, so concurrent writers cannot
        truncate each other's rows or emit duplicate headers, and an existing
        but empty file still receives a header.

        Args:
            csv_path: Path to the CSV file
        """
        # Prepare the row data
        row_data = {
            'account_holder_token': self.account_holder,
            'validation_success': self.success,
            'validation_status': self.validation_status if self.validation_status else
                               ('PASS' if self.success else 'FAIL'),
            'missing_events': '|'.join(sorted(self.missing_events)) if self.missing_events else '',
            'unexpected_events': '|'.join(sorted(self.unexpected_events)) if self.unexpected_events else '',
            'label_errors': '|'.join(self.label_errors) if self.label_errors else '',
            'timestamp_errors': '|'.join(self.timestamp_errors) if self.timestamp_errors else ''
        }

        # Define the field names (column headers)
        fieldnames = [
            'account_holder_token',
            'validation_success',
            'validation_status',
            'missing_events',
            'unexpected_events',
            'label_errors',
            'timestamp_errors'
        ]

        # Append mode never truncates, so a writer that loses the race cannot
        # wipe rows another writer has already stored.
        with open(csv_path, 'a', newline='') as f:
            # Get an exclusive lock on the file
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
            try:
                writer = csv.DictWriter(f, fieldnames=fieldnames)

                # Checked under the lock: only the first writer sees an empty file
                if os.fstat(f.fileno()).st_size == 0:
                    writer.writeheader()

                # Write the row
                writer.writerow(row_data)

                # Ensure the write is flushed to disk
                f.flush()
                os.fsync(f.fileno())
            finally:
                # Release the lock
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)

### Event ID Generation
Replicates Edgy's event ID generation logic for consistency checking.

In [0]:
def connection_change_event_id(target_token: str, source_token: str, node_type: str) -> str:
    """Builds a ConnectionChange event_id in the format this notebook expects from edgy.

    The two tokens are placed in sorted order so the ID is the same whichever
    side is treated as source or target:
    - Format: "E/<token1>/<token2>/<nodeType>"
    - token1 <= token2 under Python string ordering
    """
    first_token, second_token = sorted((target_token, source_token))
    return f"E/{first_token}/{second_token}/{node_type}"

### Label Validation
Functions to validate label propagation in events.

In [0]:
def get_account_holder_labels(
    account_holder_accounts: Dict[str, Set[str]],
    effective_at: str
) -> pd.DataFrame:
    """Fetches the labels currently present on the accounts of the given account holders.

    For every (account, label) pair the most recent label event at or before
    ``effective_at`` is taken, and only pairs whose latest event has
    ``present = true`` are returned, joined to the holder -> account edge.

    Args:
        account_holder_accounts: Dict mapping account holder tokens to their active account tokens
        effective_at: Timestamp for point-in-time query

    Returns:
        DataFrame with columns ACCOUNT_HOLDER_TOKEN, ACCOUNT_TOKEN, LABEL,
        LABEL_EFFECTIVE_AT and ACCOUNT_MAPPING_EFFECTIVE_AT. When no accounts
        are supplied an empty frame with those columns is returned without
        querying Snowflake.
    """
    # Collect every account token across all holders
    all_accounts = set()
    for holder_accounts in account_holder_accounts.values():
        for account_token in holder_accounts:
            all_accounts.add(account_token)

    if len(all_accounts) == 0:
        empty_columns = [
            'ACCOUNT_HOLDER_TOKEN', 'ACCOUNT_TOKEN', 'LABEL',
            'LABEL_EFFECTIVE_AT', 'ACCOUNT_MAPPING_EFFECTIVE_AT'
        ]
        return pd.DataFrame(columns=empty_columns)

    accounts_in_clause = to_sql_list(all_accounts)
    holders_in_clause = to_sql_list(account_holder_accounts.keys())

    label_query = f"""
    WITH latest_label_events AS (
        SELECT node_token,
               label,
               present,
               effective_at,
               ROW_NUMBER() OVER (PARTITION BY node_token, label ORDER BY effective_at DESC) as rn
        FROM duplograph.public.label_events
        WHERE node_type = 'CASH_CUSTOMER'
        AND effective_at <= '{effective_at}'
        AND node_token IN {accounts_in_clause}
    )
    SELECT 
        m.from_token AS ACCOUNT_HOLDER_TOKEN,
        m.to_token AS ACCOUNT_TOKEN,
        l.label AS LABEL,
        l.effective_at AS LABEL_EFFECTIVE_AT,
        m.effective_at AS ACCOUNT_MAPPING_EFFECTIVE_AT
    FROM latest_label_events l
    JOIN duplograph.public.edges m 
        ON l.node_token = m.to_token
    WHERE l.rn = 1 
    AND l.present = true
    AND m.from_type = 'ACCOUNT_HOLDER_CASH_CUSTOMER'
    AND m.to_type = 'CASH_CUSTOMER'
    AND m.effective_at <= '{effective_at}'
    AND m.from_token IN {holders_in_clause}
    ORDER BY 
        m.from_token,
        m.to_token,
        l.label
    """

    return snowflake_query(label_query)

In [0]:
def _as_label_set(raw_labels) -> Set[str]:
    """Normalises an event's label field to a set, treating a null field as no labels."""
    # A null label array may surface in pandas as None or NaN (a float);
    # neither is iterable, so map both to the empty set instead of crashing.
    if raw_labels is None or isinstance(raw_labels, float):
        return set()
    return set(raw_labels)


def validate_labels(
    ccgce_df: pd.DataFrame,
    source_labels_df: pd.DataFrame,
    target_labels_df: pd.DataFrame,
    source_account_holder: str,
    validation_result: "ValidationResult"
) -> None:
    """Validates that labels are correctly propagated in CashConnectedGraphChangeEvents.

    This function checks:
    1. Source labels in events match the union of labels from all source account holder's accounts
    2. Target labels in events match the union of labels from all target account holder's accounts

    Note: ACCOUNT_DENYLISTED labels are excluded as they apply to accounts, not account holders.
    A null label field on an event is treated as an empty label set.

    Args:
        ccgce_df: DataFrame containing the CashConnectedGraphChangeEvents
        source_labels_df: DataFrame with source account holder's label details from all their accounts
        target_labels_df: DataFrame with target account holders' label details from all their accounts
        source_account_holder: The source account holder token being validated
        validation_result: ValidationResult object to store any errors

    The label DataFrames should contain:
        - ACCOUNT_HOLDER_TOKEN: The account holder
        - ACCOUNT_TOKEN: The account with the label
        - LABEL: The label name
        - LABEL_EFFECTIVE_AT: When the label became effective
        - ACCOUNT_MAPPING_EFFECTIVE_AT: When the account was mapped to the holder

    Returns:
        None. Every problem found is recorded on ``validation_result`` as a
        "label" error; a missing-column problem stops validation early.
    """
    # Verify expected columns exist
    required_ccgce_columns = ['event_id', 'source_user_token', 'target_user_token',
                            'source_user_labels', 'target_user_labels']
    required_label_columns = ['ACCOUNT_HOLDER_TOKEN', 'ACCOUNT_TOKEN', 'LABEL',
                            'LABEL_EFFECTIVE_AT', 'ACCOUNT_MAPPING_EFFECTIVE_AT']

    if not all(column in ccgce_df.columns for column in required_ccgce_columns):
        validation_result.add_error(
            "label",
            f"ccgce_df missing required columns. Expected: {required_ccgce_columns}"
        )
        return

    for df, name in [(source_labels_df, 'source_labels_df'),
                     (target_labels_df, 'target_labels_df')]:
        if not all(column in df.columns for column in required_label_columns):
            validation_result.add_error(
                "label",
                f"{name} missing required columns. Expected: {required_label_columns}"
            )
            return

    # Get source account holder's active labels, excluding ACCOUNT_DENYLISTED
    source_labels = set()
    if not source_labels_df.empty:
        source_labels = set(
            source_labels_df[source_labels_df['LABEL'] != 'ACCOUNT_DENYLISTED']['LABEL'].tolist()
        )

    # Create mapping of target account holders to their active labels.
    # Holders whose only label is ACCOUNT_DENYLISTED are absent from the map and
    # fall back to the empty set in the lookup below.
    target_holder_labels = {}
    if not target_labels_df.empty:
        holder_level_labels = target_labels_df[target_labels_df['LABEL'] != 'ACCOUNT_DENYLISTED']
        for account_holder, labels in holder_level_labels.groupby('ACCOUNT_HOLDER_TOKEN')['LABEL']:
            target_holder_labels[account_holder] = set(labels.tolist())

    # Validate each event
    for _, event in ccgce_df.iterrows():
        # Verify this is the correct source account holder
        if event['source_user_token'] != source_account_holder:
            validation_result.add_error(
                "label",
                f"Event {event['event_id']} has unexpected source_user_token: "
                f"{event['source_user_token']} (expected {source_account_holder})"
            )
            continue

        event_source_labels = _as_label_set(event['source_user_labels'])
        event_target_labels = _as_label_set(event['target_user_labels'])
        target_holder = event['target_user_token']
        expected_target_labels = target_holder_labels.get(target_holder, set())

        # Validate source labels
        if event_source_labels != source_labels:
            validation_result.add_error(
                "label",
                f"Source label mismatch for event {event['event_id']}:\n"
                f"  Expected: {sorted(source_labels)}\n"
                f"  Actual: {sorted(event_source_labels)}\n"
                f"  Source label details:\n{source_labels_df.to_string()}"
            )

        # Validate target labels
        if event_target_labels != expected_target_labels:
            target_df_for_holder = target_labels_df[
                target_labels_df['ACCOUNT_HOLDER_TOKEN'] == target_holder
            ]
            validation_result.add_error(
                "label",
                f"Target label mismatch for event {event['event_id']}:\n"
                f"  Target account holder: {target_holder}\n"
                f"  Expected: {sorted(expected_target_labels)}\n"
                f"  Actual: {sorted(event_target_labels)}\n"
                f"  Target label details for this holder:\n"
                f"{target_df_for_holder.to_string()}"
            )

### Timestamp validation

Validation of change event timestamps

In [0]:
def validate_timestamps(
    ccgce_df: pd.DataFrame,
    asset_connections: pd.DataFrame, 
    validation_result: ValidationResult
):
    """Checks each event's timestamp against the expected earliest connection time.

    For every event, the rows of ``asset_connections`` belonging to the event's
    target holder and changed node type are selected, and the smallest
    EARLIEST_CONNECTION_TIME among them is compared with the event's
    ``effective_at_millis`` (interpreted as epoch milliseconds).

    Args:
        ccgce_df: DataFrame containing the CashConnectedGraphChangeEvents
        asset_connections: DataFrame with asset connection details; this
            function reads the columns:
            - ACCOUNT_HOLDER_TOKEN: The account holder owning the connected account
            - ASSET_TYPE: Type of the shared asset
            - EARLIEST_CONNECTION_TIME: Expected effective connection time
        validation_result: ValidationResult object to store any errors
    """
    for _, row in ccgce_df.iterrows():
        target_token = row['target_user_token']
        source_token = row['source_user_token']
        node_type = row['changed_node_type']

        # Rows for this target holder restricted to the event's asset type
        holder_mask = asset_connections['ACCOUNT_HOLDER_TOKEN'] == target_token
        type_mask = asset_connections['ASSET_TYPE'] == node_type
        matching_connections = asset_connections[holder_mask & type_mask]

        if matching_connections.empty:
            validation_result.add_error(
                "timestamp",
                f"No shared assets found for connection between {source_token} and {target_token} via {node_type}"
            )
            continue

        earliest_raw = matching_connections['EARLIEST_CONNECTION_TIME'].min()
        expected_ts = pd.to_datetime(earliest_raw)
        actual_ts = pd.to_datetime(row['effective_at_millis'], unit='ms')

        if actual_ts != expected_ts:
            mismatch_details = (
                f"Timestamp mismatch for event {row['event_id']}:\n"
                f"  Expected (earliest effective connection): {expected_ts}\n"
                f"  Actual (event effective_at): {actual_ts}"
            )
            validation_result.add_error("timestamp", mismatch_details)

### Account Holder Connection Analysis
Functions to analyze account holders and their connections.

In [0]:
def get_all_active_accounts(
    account_holders: Set[str],
    effective_at: str
) -> Dict[str, Set[str]]:
    """Looks up the active (non-merged) accounts of each given account holder.

    An account is dropped when a CASH_CUSTOMER -> CASH_CUSTOMER edge leaving it
    exists at or before ``effective_at``; the query treats that as a merge.

    Args:
        account_holders: Set of account holder tokens to get accounts for
        effective_at: Timestamp for point-in-time query

    Returns:
        Dictionary mapping account holder tokens to their set of active (non-merged) account tokens.
        If the query returns no rows at all (or no holders were given) the
        dictionary is empty; otherwise every requested holder has an entry,
        which is an empty set when none of their accounts are active.
    """
    if not account_holders:
        return {}

    holders_in_clause = to_sql_list(account_holders)

    active_accounts_query = f"""
    WITH account_edges AS (
        -- Get all account mappings for the account holders
        SELECT 
            FROM_TOKEN AS account_holder_token,
            TO_TOKEN AS account_token,
            EFFECTIVE_AT AS mapping_effective_at
        FROM duplograph.public.edges
        WHERE FROM_TYPE = 'ACCOUNT_HOLDER_CASH_CUSTOMER'
        AND TO_TYPE = 'CASH_CUSTOMER'
        AND FROM_TOKEN IN {holders_in_clause}
        AND EFFECTIVE_AT <= '{effective_at}'
    )
    SELECT 
        ae.account_holder_token,
        ae.account_token
    FROM account_edges ae
    WHERE NOT EXISTS (
        -- Exclude merged accounts
        SELECT 1
        FROM duplograph.public.edges m
        WHERE m.FROM_TOKEN = ae.account_token
        AND m.FROM_TYPE = 'CASH_CUSTOMER'
        AND m.TO_TYPE = 'CASH_CUSTOMER'
        AND m.EFFECTIVE_AT <= '{effective_at}'
    )
    """

    active_df = snowflake_query(active_accounts_query)

    if active_df.empty:
        return {}

    # One entry per requested holder, built from that holder's rows
    return {
        holder: set(
            active_df.loc[active_df['ACCOUNT_HOLDER_TOKEN'] == holder, 'ACCOUNT_TOKEN'].tolist()
        )
        for holder in account_holders
    }

In [0]:
def get_connected_account_holders(
    source_account_holder: str,
    source_active_accounts: Set[str],
    effective_at: str,
    asset_types: Optional[Set[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame, Set[str]]:
    """Gets account holders connected to source accounts via shared assets.
    Note: Returns all connected accounts, including merged ones.
    Merged account filtering should be done by the caller.

    The query runs in four steps: (1) assets reachable from the source
    accounts, (2) other accounts attached to those assets, (3) the account
    holders owning those accounts, excluding the source holder itself, and
    (4) the join of 2 and 3 with a per-holder, per-asset-type earliest
    connection time.
    
    Args:
        source_account_holder: The source account holder token
        source_active_accounts: Set of active (non-merged) account tokens belonging to the source account holder
        effective_at: Timestamp for point-in-time query
        asset_types: Optional set of asset types to filter by. The filter is
            applied when selecting the source's assets; connected accounts are
            then found only through those assets.
        
    Returns:
        Tuple of (
            asset_connections: DataFrame with asset connection details including:
                - ACCOUNT_TOKEN: The connected account
                - ASSET_TOKEN: The shared asset
                - ASSET_TYPE: Type of the shared asset
                - ASSET_CONNECTION_TIME: Earliest time the connected account connected to the asset
                - SOURCE_CONNECTION_TIME: Earliest time any source account connected to the asset
                - ACCOUNT_HOLDER_TOKEN: The account holder owning the connected account
                - EARLIEST_CONNECTION_TIME: For each (account holder, asset type),
                  the minimum over all returned rows of the later of
                  ASSET_CONNECTION_TIME and SOURCE_CONNECTION_TIME. It is
                  computed in SQL, i.e. before the caller removes merged accounts.
            holder_mapping: DataFrame mapping account holders to their connected accounts:
                - ACCOUNT_HOLDER_TOKEN: The account holder
                - ACCOUNT_TOKEN: Their account that shares an asset
                - MAPPING_EFFECTIVE_AT: When the account was mapped to the holder
            connected_holders: Set of account holder tokens that share assets with the source
        )
        When there are no source accounts, or the query returns no rows, the
        two DataFrames are empty and the set is empty.
    """
    # Nothing to look up without source accounts; also avoids an empty IN () list
    if not source_active_accounts:
        return pd.DataFrame(), pd.DataFrame(), set()
        
    # "1=1" is a no-op predicate so the AND chain in the query stays valid
    # when no asset type filter is requested
    asset_type_clause = "1=1"
    if asset_types:
        asset_type_clause = f"TO_TYPE IN {to_sql_list(asset_types)}"
        
    query = f"""
    WITH source_assets AS (
        -- Get assets connected to source accounts
        SELECT DISTINCT 
            TO_TOKEN,
            TO_TYPE,
            MIN(EFFECTIVE_AT) as SOURCE_CONNECTION_TIME  -- When source first connected
        FROM duplograph.public.edges
        WHERE FROM_TYPE = 'CASH_CUSTOMER'
        AND FROM_TOKEN IN {to_sql_list(source_active_accounts)}
        AND TO_TYPE != 'CASH_CUSTOMER'
        AND {asset_type_clause}
        AND EFFECTIVE_AT <= '{effective_at}'
        GROUP BY TO_TOKEN, TO_TYPE
    ),
    connected_accounts AS (
        -- Get accounts connected to those assets, excluding source accounts
        SELECT DISTINCT
            e.FROM_TOKEN AS ACCOUNT_TOKEN,
            e.TO_TOKEN AS ASSET_TOKEN,
            e.TO_TYPE AS ASSET_TYPE,
            MIN(e.EFFECTIVE_AT) AS ASSET_CONNECTION_TIME,  -- When target first connected
            MIN(sa.SOURCE_CONNECTION_TIME) AS SOURCE_CONNECTION_TIME
        FROM duplograph.public.edges e
        JOIN source_assets sa ON e.TO_TOKEN = sa.TO_TOKEN 
            AND e.TO_TYPE = sa.TO_TYPE
        WHERE e.FROM_TYPE = 'CASH_CUSTOMER'
        AND e.FROM_TOKEN NOT IN {to_sql_list(source_active_accounts)}
        AND e.EFFECTIVE_AT <= '{effective_at}'
        GROUP BY e.FROM_TOKEN, e.TO_TOKEN, e.TO_TYPE
    ),
    account_holders AS (
        -- Get account holder relationships for connected accounts
        SELECT 
            e.FROM_TOKEN AS ACCOUNT_HOLDER_TOKEN,
            e.TO_TOKEN AS ACCOUNT_TOKEN,
            e.EFFECTIVE_AT AS MAPPING_EFFECTIVE_AT
        FROM duplograph.public.edges e
        WHERE e.FROM_TYPE = 'ACCOUNT_HOLDER_CASH_CUSTOMER'
        AND e.TO_TYPE = 'CASH_CUSTOMER'
        AND e.TO_TOKEN IN (SELECT ACCOUNT_TOKEN FROM connected_accounts)
        AND e.FROM_TOKEN != '{source_account_holder}'
        AND e.EFFECTIVE_AT <= '{effective_at}'
    ),
    connections AS (
        -- Join with account holders and get earliest connection times
        SELECT 
            ca.*,
            ah.ACCOUNT_HOLDER_TOKEN,
            ah.MAPPING_EFFECTIVE_AT,
            -- For each account holder, find the earliest effective connection time
            -- (the later of source connection and target connection)
            MIN(GREATEST(ca.ASSET_CONNECTION_TIME, ca.SOURCE_CONNECTION_TIME)) 
                OVER (PARTITION BY ah.ACCOUNT_HOLDER_TOKEN, ca.ASSET_TYPE) AS EARLIEST_CONNECTION_TIME
        FROM connected_accounts ca
        JOIN account_holders ah ON ca.ACCOUNT_TOKEN = ah.ACCOUNT_TOKEN
    )
    SELECT * FROM connections
    ORDER BY ACCOUNT_HOLDER_TOKEN, ASSET_TOKEN
    """
    
    df = snowflake_query(query)
    
    # No connected accounts: hand back the empty query result as asset_connections
    if df.empty:
        return df, pd.DataFrame(), set()
        
    # Extract unique account holder tokens
    connected_holders = set(df['ACCOUNT_HOLDER_TOKEN'].unique())
    
    # Create account holder mapping DataFrame
    holder_mapping = df[['ACCOUNT_HOLDER_TOKEN', 'ACCOUNT_TOKEN', 'MAPPING_EFFECTIVE_AT']].drop_duplicates()
    
    # Create asset connections DataFrame (drops MAPPING_EFFECTIVE_AT from the query result)
    asset_connections = df[[
        'ACCOUNT_TOKEN', 'ASSET_TOKEN', 'ASSET_TYPE', 'ASSET_CONNECTION_TIME',
        'SOURCE_CONNECTION_TIME', 'ACCOUNT_HOLDER_TOKEN', 'EARLIEST_CONNECTION_TIME'
    ]].drop_duplicates()
    
    return asset_connections, holder_mapping, connected_holders

In [0]:
def validate_no_events_condition(
    account_holder: str,
    actual_events: Set[str],
    status_message: str
) -> ValidationResult:
    """Builds the result for an account holder that should have produced no events.

    Args:
        account_holder: The account holder being validated
        actual_events: Set of actual event IDs found
        status_message: Status message explaining why no events are expected

    Returns:
        ValidationResult: Carries ``status_message`` as its status. It is
        successful when ``actual_events`` is empty; otherwise it is failed and
        every actual event ID is recorded as an unexpected event.
    """
    outcome = ValidationResult(account_holder)

    # Expected situation: nothing was emitted, so just attach the explanation
    if not actual_events:
        outcome.success = True
        outcome.validation_status = status_message
        return outcome

    # Events exist although none should: record the status as an error,
    # then flag each event individually
    outcome.add_error("validation_status", status_message)
    for unexpected_id in actual_events:
        outcome.add_error("unexpected_event", unexpected_id)

    return outcome

## Main Validation Logic

The core validation functions that orchestrate the validation process for each account holder.

In [0]:
def validate_account_holder(
    source_account_holder: str,
    effective_at: str,
    ccgce_table: str,
    asset_types: Optional[Set[str]] = None
) -> ValidationResult:
    """Validates CashConnectedGraphChangeEvents for a source account holder.
    
    The validation process:
    1. Gets all active (non-merged) accounts for the source account holder
    2. Finds all account holders connected to the source via shared assets
    3. Gets all active accounts for those connected account holders
    4. Filters out account holders that only have merged accounts
    5. Validates that:
       - Events exist for each shared asset type between source and target
       - No unexpected events exist
       - Labels are correctly propagated from all accounts
       - Event timestamps match the connection times

    Only BACKFILL events whose source_user_token is the given account holder
    are read. Label and timestamp checks run only on events whose ID is in the
    expected set; unexpected events are reported but not inspected further.
    
    Args:
        source_account_holder: The account holder token to validate
        effective_at: Timestamp for point-in-time validation
        ccgce_table: The Databricks table containing CashConnectedGraphChangeEvents
        asset_types: Optional set of asset types to filter by
        
    Returns:
        ValidationResult containing any validation errors found:
        - No events expected if source has no active accounts
        - No events expected if no shared assets found
        - No events expected if no target holders have active accounts
        - Missing events if expected connections not found
        - Unexpected events if additional connections found
        - Label errors if propagation incorrect
        - Timestamp errors if connection times don't match
    """
    result = ValidationResult(source_account_holder)

    # Get actual events for this account holder using Spark SQL
    # (target_user_token is selected here but only event_id is used below)
    query = f"""
    SELECT 
        cash_connected_graph_change_event.event_id as event_id,
        cash_connected_graph_change_event.target_user_token as target_user_token
    FROM {ccgce_table}
    WHERE cash_connected_graph_change_event.event_source_type = 'BACKFILL'
    AND cash_connected_graph_change_event.source_user_token = '{source_account_holder}'
    """
    actual_events_df = spark_query(query)
    # Collapsing to a set means duplicate rows with the same event_id are not detected
    actual_events = set(actual_events_df['event_id'])

    # Get source account holder's active accounts
    source_accounts_mapping = get_all_active_accounts({source_account_holder}, effective_at)
    # The mapping may be empty, so default to "no accounts" rather than indexing
    source_active_accounts = source_accounts_mapping.get(source_account_holder, set())
    
    # Check if source account holder has no active accounts
    if not source_active_accounts:
        return validate_no_events_condition(
            source_account_holder,
            actual_events,
            "Account holder has no active accounts. No change events expected."
        )

    # Get connected data using Snowflake
    # (holder_mapping is returned but not used in this function)
    asset_connections, holder_mapping, connected_holders = get_connected_account_holders(
        source_account_holder, source_active_accounts, effective_at, asset_types
    )
    
    # Check if there are no connected assets
    if asset_connections.empty:
        return validate_no_events_condition(
            source_account_holder,
            actual_events,
            "No other accounts connected to assets"
        )

    # Get all active accounts for target account holders
    target_accounts_mapping = get_all_active_accounts(connected_holders, effective_at)
    
    # Filter out target account holders that only have merged accounts
    target_active_holders = {
        holder for holder, accounts in target_accounts_mapping.items()
        if accounts
    }
    
    # Check if there are no active target holders
    if not target_active_holders:
        return validate_no_events_condition(
            source_account_holder,
            actual_events,
            "No target account holders with active accounts found"
        )
    
    # Create complete mapping including both source and target account holders
    all_account_mapping = {
        source_account_holder: source_active_accounts,
        **target_accounts_mapping
    }
    
    # Filter asset_connections to only include active (non-merged) accounts
    # NOTE(review): EARLIEST_CONNECTION_TIME was computed in SQL before this
    # filter, so it may reflect a merged account's connection — confirm intended
    active_accounts = set().union(*all_account_mapping.values())
    asset_connections = asset_connections[
        asset_connections['ACCOUNT_TOKEN'].isin(active_accounts)
    ]

    # Generate expected event IDs based on shared assets:
    # one event per (target holder, asset type shared through an active account)
    expected_events = set()
    for target_holder in target_active_holders:
        target_accounts = target_accounts_mapping[target_holder]
        target_assets = asset_connections[
            asset_connections['ACCOUNT_TOKEN'].isin(target_accounts)
        ]['ASSET_TYPE'].unique()
        
        for asset_type in target_assets:
            event_id = connection_change_event_id(
                target_holder, source_account_holder, asset_type
            )
            expected_events.add(event_id)
    
    # If no events are expected, verify there are no actual events
    if not expected_events:
        return validate_no_events_condition(
            source_account_holder,
            actual_events,
            "No shared assets with active target accounts found. No change events expected."
        )
    
    # Check for missing/unexpected events
    missing = expected_events - actual_events
    unexpected = actual_events - expected_events
    
    for event in missing:
        result.add_error("missing_event", event)
    for event in unexpected:
        result.add_error("unexpected_event", event)
        
    # Get full event details needed for label validation using Spark SQL
    # NOTE(review): the final predicate uses an unqualified `event_id`, unlike the
    # struct-qualified fields above — confirm it resolves to the intended column
    query = f"""
    SELECT 
        cash_connected_graph_change_event.event_id as event_id,
        cash_connected_graph_change_event.event_type as event_type,
        cash_connected_graph_change_event.target_user_token as target_user_token,
        cash_connected_graph_change_event.source_user_token as source_user_token,
        cash_connected_graph_change_event.effective_at_millis as effective_at_millis,
        cash_connected_graph_change_event.published_at_millis as published_at_millis,
        cash_connected_graph_change_event.user_type as user_type,
        cash_connected_graph_change_event.event_source_type as event_source_type,
        cash_connected_graph_change_event.connection_change.changed_node_type as changed_node_type,
        cash_connected_graph_change_event.connection_change.source_user_labels as source_user_labels,
        cash_connected_graph_change_event.connection_change.target_user_labels as target_user_labels
    FROM {ccgce_table}
    WHERE cash_connected_graph_change_event.event_source_type = 'BACKFILL'
    AND cash_connected_graph_change_event.source_user_token = '{source_account_holder}'
    AND event_id IN {to_sql_list(expected_events)}
    """
    ccgce_df = spark_query(query)

    # Get all labels in a single query using the complete account mapping
    source_and_target_labels = get_account_holder_labels(
        all_account_mapping, effective_at
    )
    
    # Split the combined label frame back into source rows and target rows
    source_labels = source_and_target_labels[
        source_and_target_labels['ACCOUNT_HOLDER_TOKEN'] == source_account_holder
    ]
    target_labels = source_and_target_labels[
        source_and_target_labels['ACCOUNT_HOLDER_TOKEN'].isin(target_active_holders)
    ]
    
    # Validate timestamps and labels; both record their findings on `result`
    validate_timestamps(
        ccgce_df=ccgce_df,
        asset_connections=asset_connections,
        validation_result=result
    )

    validate_labels(
        ccgce_df=ccgce_df,
        source_labels_df=source_labels,
        target_labels_df=target_labels,
        source_account_holder=source_account_holder,
        validation_result=result
    )

    return result

In [0]:
def validate_account_holders(
    account_holders: List[str],
    effective_at: str,
    ccgce_table: str,
    asset_types: Optional[Set[str]] = None,
    output_dir: Optional[str] = None,
    csv_path: Optional[str] = None
) -> List[ValidationResult]:
    """Validates edgy backfill processing for multiple account holders.

    Account holders are validated one after another with
    ``validate_account_holder``. Each result is handed to
    ``result.write_to_csv(csv_path)`` as soon as it is available, so partial
    progress survives an interrupted run.

    Args:
        account_holders: List of account holder tokens to validate
        effective_at: Timestamp for point-in-time validation
        ccgce_table: The table containing CashConnectedGraphChangeEvents
        asset_types: Optional set of asset types to filter by
        output_dir: Optional directory for output files. If not provided,
                   defaults to current directory
        csv_path: Optionally set the output csv file explicitly. If not
                  provided, it will be auto-generated

    Returns:
        List of ValidationResult objects, one per account holder (empty when
        ``account_holders`` is empty)
    """
    print(f"Validating {len(account_holders)} account holders:")
    print("\n".join(f"- {ah}" for ah in account_holders))

    # Create output directory if it doesn't exist
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    else:
        output_dir = '.'

    # Auto-generate a timestamped CSV filename unless the caller chose one
    if csv_path is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_path = os.path.join(output_dir, f'validation_results_{timestamp}.csv')

    results = []
    # One entry per validated holder. A list (rather than a dict keyed by
    # token) keeps the statistics correct when a token appears more than once.
    durations = []

    for holder in account_holders:
        start_time = time.time()
        result = validate_account_holder(
            holder, effective_at, ccgce_table, asset_types
        )
        results.append(result)

        # Write result to CSV as soon as it's available
        result.write_to_csv(csv_path)

        duration = time.time() - start_time
        durations.append(duration)

        # Print progress with timing and detailed results
        print(f"\nTime to validate {holder}: {duration:.2f} seconds")
        print(result.get_summary())
        print()

    # Print summary statistics
    success_count = sum(1 for r in results if r.success)
    print(f"\nValidation complete: {success_count}/{len(results)} passed")

    # With nothing validated there are no timings to aggregate, and this
    # function never called write_to_csv, so skip the statistics instead of
    # dividing by zero.
    if not durations:
        print("\nNo account holders were validated; nothing was written by this run.")
        return results

    # Timing statistics
    total_time = sum(durations)
    print(f"\nTiming statistics:")
    print(f"- Average time per account holder: {total_time / len(durations):.2f} seconds")
    print(f"- Fastest validation: {min(durations):.2f} seconds")
    print(f"- Slowest validation: {max(durations):.2f} seconds")
    print(f"- Total validation time: {total_time:.2f} seconds")
    print(f"\nResults written to: {csv_path}")

    return results

In [0]:
# Parallelized batch validation

from concurrent.futures import ThreadPoolExecutor
import concurrent
import math

def validate_account_holders_batch(
    account_holders: List[str],
    effective_at: str,
    ccgce_table: str,
    output_dir: str,
    batch_size: int = 5,
    max_workers: int = 3
):
    """Validates account holders in parallel batches.

    The account holders are split into consecutive batches and each batch is
    submitted to a thread pool as one ``validate_account_holders`` call. All
    batches are given the same ``csv_path``.
    NOTE(review): sharing one CSV across threads relies on
    ``ValidationResult.write_to_csv`` being safe for concurrent writers - confirm.

    A batch that raises is reported (with its account holders, so it can be
    re-run) and does not stop the remaining batches.

    Args:
        account_holders: List of account holder tokens to validate
        effective_at: Timestamp for point-in-time validation
        ccgce_table: The table containing CashConnectedGraphChangeEvents
        output_dir: Directory for output files
        batch_size: Number of account holders to process in each batch
        max_workers: Maximum number of parallel threads

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail fast and before any I/O: zero would divide by zero below, and a
    # negative size would silently produce no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Create a single CSV filename for all batches
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_path = os.path.join(output_dir, f'validation_results_{timestamp}.csv')

    # Split account holders into batches
    num_batches = math.ceil(len(account_holders) / batch_size)
    batches = [
        account_holders[i * batch_size:(i + 1) * batch_size]
        for i in range(num_batches)
    ]

    print(f"Processing {len(account_holders)} account holders in {num_batches} batches "
          f"of size {batch_size} using {max_workers} workers")
    print(f"Results will be written to: {csv_path}")

    # Process batches in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which batch each future belongs to: as_completed yields in
        # completion order, which is generally not submission order.
        future_to_batch_number = {
            executor.submit(
                validate_account_holders,
                batch,
                effective_at,
                ccgce_table,
                None,  # asset_types
                output_dir,
                csv_path  # Pass the specific CSV path
            ): batch_number
            for batch_number, batch in enumerate(batches, start=1)
        }

        # Process results as they complete
        finished = 0
        for future in concurrent.futures.as_completed(future_to_batch_number):
            batch_number = future_to_batch_number[future]
            finished += 1
            try:
                future.result()
                print(f"Completed batch {batch_number} ({finished}/{num_batches} finished)")
            except Exception as e:
                print(f"Batch {batch_number} failed with error: {str(e)} "
                      f"({finished}/{num_batches} finished)")
                print(f"Account holders in failed batch {batch_number}: {batches[batch_number - 1]}")

## Validation Execution

In [0]:
# CSV listing the account holder tokens that were part of the backfill
backfilled_acc_holders_csv = 'account_holders_20250509_071901.csv'

# Load the tokens to validate from the ACCOUNT_HOLDER_TOKEN column
account_holders_df = pd.read_csv(backfilled_acc_holders_csv)
account_holders_to_validate = account_holders_df['ACCOUNT_HOLDER_TOKEN'].to_list()

# Validate everything in parallel: 5 account holders per batch,
# with up to 20 batches in flight at once
validate_account_holders_batch(
    account_holders_to_validate,
    '2025-05-12 06:24:24.000000',
    'cash_banking_ml_eng.cash_connected_graph_change_event.edgy_test_20250512',
    'validation_results_20250515',
    batch_size=5,
    max_workers=20,
)

Processing 7141 account holders in 1429 batches of size 5 using 20 workers
Results will be written to: validation_results_20250515/validation_results_20250515_123151.csv
Validating 5 account holders:
- AH_jdyqqvemg
- AH_gbyq2n2t0
- AH_pzy8bcc6f
- AH_dvm895can
- AH_gpyqa52qh
Validating 5 account holders:
- AH_f0mhw3cqf
- AH_dpm0w1cq0
- AH_6smhxmej0
- AH_f9yhcxccg
- AH_z9m118ct0
Validating 5 account holders:
- AH_gbm01dejs
- AH_ktm81z2yg
- AH_02mhr82mf
- AH_5ey1j5can
- AH_3fya4ccs0
Validating 5 account holders:
- AH_s9m0rhctn
- AH_kzy0rcdan
- AH_znypdzd4n
- AH_tdmga3c4n
- AH_60mprnc6n
Validating 5 account holders:
- AH_bgmp1nd6s
- AH_ezm1pzean
- AH_kjyp4a2xg
- AH_seyawedmg
- AH_a1y1ea2xf
Validating 5 account holders:
- AH_swyq4e2as
- AH_3ky0et29n
- AH_bnyaehchn
- AH_73ygrh2x0
- AH_01m8kndq0
Validating 5 account holders:
- AH_rqmprp2n8
- AH_ngyhjc2t8
- AH_jvmg2weyn
- AH_03m1w9e60
- AH_fwmhcycys
Validating 5 account holders:
- AH_mxyqbecj0
- AH_6ym8pke0s
- AH_dzy8ev2h8
- AH_06mq1xdqs
- AH_

In [0]:
# Sequential (non-parallel) validation of a small, hand-picked selection of
# account holders - useful for spot checks and debugging.

# Take the first five tokens from the backfill input file
account_holders_df = pd.read_csv(backfilled_acc_holders_csv)
account_holders_to_validate = account_holders_df['ACCOUNT_HOLDER_TOKEN'].iloc[:5].to_list()

# Run validation. output_dir is left unset, so validate_account_holders
# falls back to the current directory for the results CSV.
results = validate_account_holders(
    account_holders_to_validate,
    '2025-04-17 06:05:05.000000',
    'cash_banking_ml_eng.cash_connected_graph_change_event.edgy_test',
)

Validating 5 account holders:
- AH_jdyqqvemg
- AH_gbyq2n2t0
- AH_pzy8bcc6f
- AH_dvm895can
- AH_gpyqa52qh

Starting validation...


Time to validate AH_jdyqqvemg: 118.05 seconds
✅ AH_jdyqqvemg: All validations passed


Time to validate AH_gbyq2n2t0: 84.77 seconds
✅ AH_gbyq2n2t0: All validations passed


Time to validate AH_pzy8bcc6f: 11.62 seconds
ℹ️ AH_pzy8bcc6f: Account holder has no active accounts. No change events expected.


Time to validate AH_dvm895can: 66.48 seconds
✅ AH_dvm895can: All validations passed


Time to validate AH_gpyqa52qh: 76.41 seconds
✅ AH_gpyqa52qh: All validations passed


Validation complete: 5/5 passed

Timing statistics:
- Average time per account holder: 71.47 seconds
- Fastest validation: 11.62 seconds
- Slowest validation: 118.05 seconds
- Total validation time: 357.33 seconds

Results written to: ./validation_results_20250509_072423.csv


## Validation Result Analysis

In [0]:
# Results CSV produced by the parallel batch validation run above
validation_results_csv = 'validation_results_20250515/validation_results_20250515_123151.csv'

validation_df = pd.read_csv(validation_results_csv)
validation_df

Unnamed: 0,account_holder_token,validation_success,validation_status,missing_events,unexpected_events,label_errors,timestamp_errors
0,AH_gbm01dejs,True,Account holder has no active accounts. No chan...,,,,
1,AH_2emha8cxn,True,Account holder has no active accounts. No chan...,,,,
2,AH_kamgcxctg,True,Account holder has no active accounts. No chan...,,,,
3,AH_s9m0rhctn,True,Account holder has no active accounts. No chan...,,,,
4,AH_rqmprp2n8,True,Account holder has no active accounts. No chan...,,,,
...,...,...,...,...,...,...,...
7136,AH_6nyhjxdag,True,PASS,,,,
7137,AH_c5yq4ddeh,True,PASS,,,,
7138,AH_qzm0wmc60,True,No other accounts connected to assets,,,,
7139,AH_j9ma9hce8,False,FAIL,,E/AH_22m0rt2y5/AH_j9ma9hce8/CASH_DEVICE|E/AH_2...,,


In [0]:
# Failed validations: keep only the token and the four error-detail columns
failed_validations_df = validation_df.loc[
    validation_df['validation_success'] == False,
    [
        'account_holder_token',
        'missing_events',
        'unexpected_events',
        'label_errors',
        'timestamp_errors',
    ],
]
failed_validations_df

Unnamed: 0,account_holder_token,missing_events,unexpected_events,label_errors,timestamp_errors
250,AH_4pypjpe05,,E/AH_4pypjpe05/AH_6cy1q82nf/CASH_BACKUP_TAG|E/...,,
432,AH_prmgbv2ts,,,,Timestamp mismatch for event E/AH_bbm0cnccg/AH...
680,AH_0rmgxxcag,,E/AH_0dmg42deh/AH_0rmgxxcag/CASH_BACKUP_TAG|E/...,,
1114,AH_qqmq1v2c8,,,,Timestamp mismatch for event E/AH_dymp48d6f/AH...
1464,AH_z8mqeyc45,,E/AH_9xy8rtech/AH_z8mqeyc45/CASH_BACKUP_TAG|E/...,,
1477,AH_8aygrrph5,,E/AH_34mg2a2ng/AH_8aygrrph5/CASH_BACKUP_TAG|E/...,,
1689,AH_hampjx2hs,,E/AH_hampjx2hs/AH_k8mpkt2c8/CASH_BACKUP_TAG|E/...,,
1864,AH_baypq8dqn,,,,Timestamp mismatch for event E/AH_3cmgp829f/AH...
1899,AH_4nmgxzc9n,,E/AH_4nmgxzc9n/AH_sbyhpcd6h/SSN,,
1930,AH_cey1k5dh0,E/AH_04ygrcdt0/AH_cey1k5dh0/CASH_DEVICE|E/AH_0...,,,


In [0]:
# Report how many distinct account holders failed, as a share of all validated
if not failed_validations_df.empty:
    failed_validation_tokens = failed_validations_df['account_holder_token'].unique().tolist()
    all_validated = validation_df['account_holder_token'].unique().tolist()
    num_failed = len(failed_validation_tokens)
    num_validated = len(all_validated)
    failure_pct = num_failed / num_validated * 100
    print(f"{num_failed} out of {num_validated} account holders had failed validations ({failure_pct:.2f}%):\n\n{failed_validation_tokens}")
else:
    print("All validations passed")

29 out of 7141 account holders had failed validations (0.41%):

['AH_4pypjpe05', 'AH_prmgbv2ts', 'AH_0rmgxxcag', 'AH_qqmq1v2c8', 'AH_z8mqeyc45', 'AH_8aygrrph5', 'AH_hampjx2hs', 'AH_baypq8dqn', 'AH_4nmgxzc9n', 'AH_cey1k5dh0', 'AH_vryh9aecf', 'AH_h7mgdccjh', 'AH_vyygc8cxn', 'AH_epya4dc00', 'AH_zay1phd40', 'AH_06m8wd2qn', 'AH_tkm1wscmn', 'AH_y3yqeceen', 'AH_snyaa2ee5', 'AH_9cypjad65', 'AH_cwya2p2tn', 'AH_3jm8dcdcs', 'AH_hxm8kpcqs', 'AH_q8mh1hdt0', 'AH_ddy0eyct8', 'AH_3xmpdz2x8', 'AH_mbm1ptct0', 'AH_5rygbhejh', 'AH_j9ma9hce8']


### ✅ Missing events

Account holders with expected events that were missing in the actual output.

 The account holders who failed validation have been manually checked. The reason for the missing events is a neighborhood size exceeding edgy's query limit of 5000.

In [0]:
# Keep only the account holders that have at least one missing event recorded
has_missing_events = validation_df['missing_events'].notna()
missing_events_df = validation_df[has_missing_events]

if missing_events_df.empty:
    print("No missing events found in the validation results")
else:
    print(f"Found {len(missing_events_df)} account holders with missing events:\n{missing_events_df['account_holder_token'].to_list()}\n")
    # missing_events is a pipe-separated list of event IDs per account holder
    for holder_token, raw_events in zip(
        missing_events_df['account_holder_token'], missing_events_df['missing_events']
    ):
        event_ids = raw_events.split('|')
        print(f"Account holder: {holder_token}")
        print(f"Number of missing events: {len(event_ids)}")
        print("Missing events:")
        print("\n".join(f"  - {event_id}" for event_id in event_ids))
        print()

Found 2 account holders with missing events:
['AH_cey1k5dh0', 'AH_hxm8kpcqs']

Account holder: AH_cey1k5dh0
Number of missing events: 105
Missing events:
  - E/AH_04ygrcdt0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_0ky0j8dhg/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_1hm0wbe0f/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_1jmhase08/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_1mm0x9c6f/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_2mmqx0e6g/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_3tmqx3cm0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_42y1ra2tg/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_43yhra2ng/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_4eyhjyde0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_4jyprhdch/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_4vygkedh0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_50yarddhh/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_51yprcden/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_54y04e2xn/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_5ay1rtdt0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_5cygkd2a0/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_5cyhkc2yh/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_5eygka2as/AH_cey1k5dh0/CASH_DEVICE
  - E/AH_5

### ✅ Unexpected events

Account holders who had actual events that were not expected based on the validation.

A sample of the account holders that failed validation have been manually checked. In all cases the reason for the discrepancy was that the edge that caused the unexpected events exists in Duplograph's database (checked via RPC), but it's missing in `duplograph.public.edges` in Snowflake.

In [0]:
# Keep only the account holders that have at least one unexpected event recorded
has_unexpected_events = validation_df['unexpected_events'].notna()
unexpected_events_df = validation_df[has_unexpected_events]

if unexpected_events_df.empty:
    print("No unexpected events found in the validation results")
else:
    print(f"Found {len(unexpected_events_df)} account holders with unexpected events: \n{unexpected_events_df['account_holder_token'].to_list()}\n")

    # unexpected_events is a pipe-separated list of event IDs per account holder
    for holder_token, raw_events in zip(
        unexpected_events_df['account_holder_token'], unexpected_events_df['unexpected_events']
    ):
        print(f"Account holder: {holder_token}")
        print("Unexpected events:")
        print("\n".join(f"  - {event_id}" for event_id in raw_events.split('|')))
        print()

Found 15 account holders with unexpected events: 
['AH_4pypjpe05', 'AH_0rmgxxcag', 'AH_z8mqeyc45', 'AH_8aygrrph5', 'AH_hampjx2hs', 'AH_4nmgxzc9n', 'AH_vryh9aecf', 'AH_h7mgdccjh', 'AH_vyygc8cxn', 'AH_tkm1wscmn', 'AH_snyaa2ee5', 'AH_hxm8kpcqs', 'AH_ddy0eyct8', 'AH_5rygbhejh', 'AH_j9ma9hce8']

Account holder: AH_4pypjpe05
Unexpected events:
  - E/AH_4pypjpe05/AH_6cy1q82nf/CASH_BACKUP_TAG
  - E/AH_4pypjpe05/AH_6cy1q82nf/CASH_DEVICE
  - E/AH_4pypjpe05/AH_6cy1q82nf/CASH_OPAQUE_APP_TOKEN
  - E/AH_4pypjpe05/AH_ywyaqzccf/CASH_BACKUP_TAG
  - E/AH_4pypjpe05/AH_ywyaqzccf/CASH_DEVICE
  - E/AH_4pypjpe05/AH_ywyaqzccf/CASH_OPAQUE_APP_TOKEN

Account holder: AH_0rmgxxcag
Unexpected events:
  - E/AH_0dmg42deh/AH_0rmgxxcag/CASH_BACKUP_TAG
  - E/AH_0dmg42deh/AH_0rmgxxcag/CASH_DEVICE
  - E/AH_0dmg42deh/AH_0rmgxxcag/CASH_OPAQUE_APP_TOKEN
  - E/AH_0rmgxxcag/AH_7ymp2xdn5/CASH_BACKUP_TAG
  - E/AH_0rmgxxcag/AH_7ymp2xdn5/CASH_DEVICE
  - E/AH_0rmgxxcag/AH_7ymp2xdn5/CASH_OPAQUE_APP_TOKEN
  - E/AH_0rmgxxcag/AH_8gm0d

### ✅ Label mismatches

Mismatches between expected and actual labels on the change event source and target account holders. Excludes `ACCOUNT_DENYLISTED` labels which don't apply to account holders.

In [0]:
# Keep only the account holders that have at least one label error recorded
has_label_errors = validation_df['label_errors'].notna()
label_errors_df = validation_df[has_label_errors]

def _text_between(message: str, marker: str, terminator: str) -> Optional[str]:
    """Return the stripped text after ``marker`` up to ``terminator`` (or end of message).

    Returns None when ``marker`` does not occur in ``message``.
    """
    marker_pos = message.find(marker)
    if marker_pos == -1:
        return None
    start = marker_pos + len(marker)
    end = message.find(terminator, start)
    if end == -1:
        end = len(message)
    return message[start:end].strip()


def parse_label_error(error_message: str) -> Optional[dict]:
    """Parse a label error message into its components.

    The message is expected to contain either 'Source label mismatch' or
    'Target label mismatch', an 'event <id>:' part, and 'Expected:' /
    'Actual:' lines holding Python list literals. Target errors also carry a
    'Target account holder:' line.

    Args:
        error_message: The label error message from validation

    Returns:
        Dictionary with keys 'type' ('source' or 'target'), 'event_id',
        'target_holder', 'expected' and 'actual' (sorted label lists).
        None if the message is not a label mismatch, cannot be parsed, or the
        only difference is an ACCOUNT_DENYLISTED label that is expected but
        absent from the actual labels.
    """
    # Local import keeps this cell self-contained.
    import ast

    if not isinstance(error_message, str):
        return None

    # Determine if this is source or target label error
    is_source = 'Source label mismatch' in error_message
    is_target = 'Target label mismatch' in error_message
    if not (is_source or is_target):
        return None

    # Extract event ID
    event_id = _text_between(error_message, 'event ', ':')

    # Extract target account holder for target label errors
    target_holder = None
    if is_target:
        target_holder = _text_between(error_message, 'Target account holder:', '\n')

    # Extract expected and actual labels
    expected_str = _text_between(error_message, 'Expected:', '\n')
    actual_str = _text_between(error_message, 'Actual:', '\n')
    if expected_str is None or actual_str is None:
        return None

    try:
        # literal_eval only accepts Python literals, so text read back from
        # the results CSV can never execute code (unlike eval).
        expected_labels = set(ast.literal_eval(expected_str))
        actual_labels = set(ast.literal_eval(actual_str))
        expected_sorted = sorted(expected_labels)
        actual_sorted = sorted(actual_labels)
    except (ValueError, SyntaxError, TypeError):
        # Not a literal, not iterable, or labels that cannot be ordered
        return None

    # Only ignore if the ACCOUNT_DENYLISTED label is in expected but not actual
    if ('ACCOUNT_DENYLISTED' in expected_labels and
        'ACCOUNT_DENYLISTED' not in actual_labels and
        expected_labels - {'ACCOUNT_DENYLISTED'} == actual_labels):
        return None

    return {
        'type': 'source' if is_source else 'target',
        'event_id': event_id,
        'target_holder': target_holder,
        'expected': expected_sorted,
        'actual': actual_sorted
    }

if label_errors_df.empty:
    print("No label errors found in the validation results")
else:
    print(f"Found {len(label_errors_df)} account holders with label errors:\n{label_errors_df['account_holder_token'].to_list()}\n")

    # Number of account holders that still have errors after filtering
    relevant_errors = 0
    for holder_token, raw_errors in zip(
        label_errors_df['account_holder_token'], label_errors_df['label_errors']
    ):
        # label_errors is pipe-separated; parse_label_error returns None for
        # messages that should not be reported
        parsed_errors = [
            parsed
            for parsed in map(parse_label_error, raw_errors.split('|'))
            if parsed is not None
        ]
        if not parsed_errors:
            continue

        relevant_errors += 1
        print(f"Account holder: {holder_token}")
        for error in parsed_errors:
            # Only the heading differs between source and target mismatches
            if error['type'] == 'source':
                print(f"  Source label mismatch (event: {error['event_id']}):")
            else:
                print(f"  Target label mismatch (event: {error['event_id']}, target: {error['target_holder']}):")
            print(f"    Expected: {error['expected']}")
            print(f"    Actual:   {error['actual']}")
        print()

    print(f"Found {relevant_errors} account holders with label errors")
    if relevant_errors < len(label_errors_df):
        print(f"({len(label_errors_df) - relevant_errors} account holders had only expected ACCOUNT_DENYLISTED differences)")

No label errors found in the validation results


### ✅ Timestamp errors

Account holders with mismatches between expected event timestamps (based on the earliest connection time between source/target via the given asset type) and the actual `effective_at_millis` in the generated change events.

A sample of account holders with timestamp errors was checked manually. In all cases the reason for the discrepancy was a different `effective_at` timestamp in Duplograph's database (verified via RPC calls) compared to the `effective_at` timestamp in `duplograph.public.edges` for the same edge (i.e. same `from_node` and `to_node`).

In [0]:
# Filter rows where timestamp_errors is not empty and not NaN
timestamp_errors_df = validation_df[
    validation_df['timestamp_errors'].notna() & (validation_df['timestamp_errors'] != '')
]

if timestamp_errors_df.empty:
    print("No timestamp errors found in the validation results.")
else:
    print(f"Found {len(timestamp_errors_df)} account holders with timestamp errors:\n{timestamp_errors_df['account_holder_token'].to_list()}\n")
    for _, row in timestamp_errors_df.iterrows():
        account_holder = row['account_holder_token']
        # Split the pipe-separated error strings
        error_messages = row['timestamp_errors'].split('|')

        print(f"Account holder: {account_holder}")
        print(f"Number of timestamp errors: {len(error_messages)}")
        print("Timestamp error details:")
        for error_detail in error_messages:
            # Turn an escaped newline (backslash followed by 'n') into a real
            # line break for readable printing. The previous replacement only
            # swapped one escaped form for another and never produced a
            # newline. Messages that already contain real newlines are left
            # untouched.
            readable_error_detail = error_detail.replace('\\n', '\n')
            print(f"  - {readable_error_detail}")
        print("\n") # Add a blank line for separation between account holders

Found 14 account holders with timestamp errors:
['AH_prmgbv2ts', 'AH_qqmq1v2c8', 'AH_baypq8dqn', 'AH_epya4dc00', 'AH_zay1phd40', 'AH_06m8wd2qn', 'AH_y3yqeceen', 'AH_9cypjad65', 'AH_cwya2p2tn', 'AH_3jm8dcdcs', 'AH_hxm8kpcqs', 'AH_q8mh1hdt0', 'AH_3xmpdz2x8', 'AH_mbm1ptct0']

Account holder: AH_prmgbv2ts
Number of timestamp errors: 1
Timestamp error details:
  - Timestamp mismatch for event E/AH_bbm0cnccg/AH_prmgbv2ts/CASH_DEVICE:
  Expected (earliest effective connection): 2023-12-31 07:20:15.872000
  Actual (event effective_at): 2023-12-31 07:20:15.798000


Account holder: AH_qqmq1v2c8
Number of timestamp errors: 1
Timestamp error details:
  - Timestamp mismatch for event E/AH_dymp48d6f/AH_qqmq1v2c8/CASH_DEVICE:
  Expected (earliest effective connection): 2024-01-08 03:59:10.678000
  Actual (event effective_at): 2024-01-08 03:59:10.626000


Account holder: AH_baypq8dqn
Number of timestamp errors: 1
Timestamp error details:
  - Timestamp mismatch for event E/AH_3cmgp829f/AH_baypq8dqn/EMA

## ✅ General backfill checks

Checks general properties of the backfilled event data set:
- Are there any events that have the same source_user_token and target_user_token?
- Are there duplicate events with the same event_id?
- Are there any events where event_time_millis is different to cash_connected_graph_change_event.effective_at_millis?
- Are there any events where cash_connected_graph_change_event.connection_change.changed_node_type is not in the event_id?
- Are there any events where the source_user_token and target_user_token are not in the event_id?

In [0]:
def parse_event_id(event_id: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Parse an event ID of format 'E/token1/token2/type' into its components.

    The ID must start with 'E/' and contain exactly three '/'-separated parts
    after that prefix. Anything else, including non-string input, is treated
    as malformed, so that the sanity checks built on this parser flag the
    event instead of accepting a partially matching ID.

    Returns:
        (token1, token2, type), or (None, None, None) for a malformed ID.
    """
    prefix = 'E/'
    if not isinstance(event_id, str) or not event_id.startswith(prefix):
        return None, None, None

    parts = event_id[len(prefix):].split('/')
    if len(parts) != 3:
        return None, None, None

    token1, token2, node_type = parts
    return token1, token2, node_type
    
def format_event(event: dict) -> str:
    """Format a single CashConnectedGraphChangeEvent in a readable way.

    Args:
        event: Mapping (dict or DataFrame row) providing 'event_id',
            'source_user_token', 'target_user_token', 'changed_node_type',
            'event_time_millis' and 'effective_at_millis'.

    Returns:
        One "Label: value" line per field; every line after the first is
        indented by two spaces.
    """
    output = [
        f"Event ID: {event['event_id']}",
        f"Source: {event['source_user_token']}",
        f"Target: {event['target_user_token']}",
        f"Node Type: {event['changed_node_type']}",
        f"Event Time: {event['event_time_millis']}",
        f"Effective At: {event['effective_at_millis']}"
    ]
    return "\n  ".join(output)

def format_validation_sample(df: pd.DataFrame, validation_type: str, max_samples: int = 10) -> str:
    """Format validation results with samples of problematic events.

    Args:
        df: Events to sample from; only the first ``max_samples`` rows are used.
        validation_type: Heading text printed in front of each sample number.
        max_samples: Maximum number of events to include.

    Returns:
        The formatted samples, numbered 1..n in row order (empty string for an
        empty frame).
    """
    output = []
    # Number the samples by position. The frame is usually a filtered subset,
    # so its index labels are arbitrary row labels of the parent frame and
    # may not even be numeric.
    for sample_number, (_, event) in enumerate(df.head(max_samples).iterrows(), start=1):
        output.append(f"\n{validation_type} {sample_number}:")
        output.append("  " + format_event(event))
    return "\n".join(output)

In [0]:
def validate_backfilled_events(ccgce_table: str):
    """Validate general properties of backfilled events and print a report.

    Loads every event with event_source_type = 'BACKFILL' from the table and
    checks for:
      1. events whose source and target user are the same
      2. event IDs that occur more than once
      3. events whose event_time_millis differs from effective_at_millis
      4. events whose changed_node_type differs from the type in the event ID
      5. events whose source/target tokens differ from the tokens in the event ID

    Each check prints its count and, when non-zero, a sample of offending
    events. Nothing is returned.

    Args:
        ccgce_table: Full name of the CashConnectedGraphChangeEvent table to validate
    """
    # Load all backfilled events into a DataFrame once
    query = f"""
    SELECT 
        event_time_millis,
        cash_connected_graph_change_event.event_id,
        cash_connected_graph_change_event.source_user_token,
        cash_connected_graph_change_event.target_user_token,
        cash_connected_graph_change_event.effective_at_millis,
        cash_connected_graph_change_event.connection_change.changed_node_type as changed_node_type,
        cash_connected_graph_change_event.event_source_type
    FROM {ccgce_table}
    WHERE cash_connected_graph_change_event.event_source_type = 'BACKFILL'
    """

    df = spark_query(query)
    total_events = len(df)
    print(f"Total backfilled events: {total_events}\n")

    # 1. Check for self-referential events
    self_ref_df = df[df['source_user_token'] == df['target_user_token']]
    print("\n1. Events with same source and target user:")
    print(f"Total count: {len(self_ref_df)}")
    if not self_ref_df.empty:
        print("\nSample of such events:")
        print(format_validation_sample(self_ref_df, "Self-referential event"))

    # 2. Check for duplicate event IDs
    event_counts = df['event_id'].value_counts()
    duplicates = event_counts[event_counts > 1]
    # Always print the section, like every other check, so a clean result is
    # visible rather than silently skipped.
    print("\n2. Events with duplicate event IDs:")
    print(f"Total count of unique event IDs with duplicates: {len(duplicates)}")
    if len(duplicates) > 0:
        print("\nSample of such events:")
        for idx, (event_id, count) in enumerate(duplicates.head(10).items(), 1):
            print(f"\nDuplicate Event {idx}:")
            print(f"  Event ID: {event_id}")
            print(f"  Count: {count}")
            print("  Instances:")
            duplicate_events = df[df['event_id'] == event_id]
            for i, event in enumerate(duplicate_events.to_dict('records'), 1):
                print(f"\n    Instance {i}:")
                print("      " + format_event(event).replace("\n  ", "\n      "))

    # 3. Check for mismatched timestamps
    timestamp_mismatch_df = df[df['event_time_millis'] != df['effective_at_millis']]
    print("\n\n3. Events with mismatched timestamps:")
    print(f"Total count: {len(timestamp_mismatch_df)}")
    if not timestamp_mismatch_df.empty:
        print("\nSample of such events:")
        print(format_validation_sample(timestamp_mismatch_df, "Mismatched timestamp"))

    # Parse every event ID once; checks 4 and 5 both need the components.
    parsed_ids = [parse_event_id(event_id) for event_id in df['event_id']]

    # 4. Check for node type mismatches
    node_type_mismatch_mask = np.array(
        [
            id_node_type != changed_node_type
            for (_, _, id_node_type), changed_node_type
            in zip(parsed_ids, df['changed_node_type'])
        ],
        dtype=bool,
    )
    node_type_mismatch_df = df[node_type_mismatch_mask]
    print("\n\n4. Events where node type doesn't match event ID:")
    print(f"Total count: {len(node_type_mismatch_df)}")
    if not node_type_mismatch_df.empty:
        print("\nSample of such events:")
        print(format_validation_sample(node_type_mismatch_df, "Node type not in event_id"))

    # 5. Check for user token mismatches
    # Compared as sets because the order of the two tokens inside the event ID
    # does not have to match the source/target order.
    token_mismatch_mask = np.array(
        [
            {token1, token2} != {source_token, target_token}
            for (token1, token2, _), source_token, target_token
            in zip(parsed_ids, df['source_user_token'], df['target_user_token'])
        ],
        dtype=bool,
    )
    token_mismatch_df = df[token_mismatch_mask]
    print("\n\n5. Events where user tokens don't match event ID:")
    print(f"Total count: {len(token_mismatch_df)}")
    if not token_mismatch_df.empty:
        print("\nSample of such events:")
        print(format_validation_sample(token_mismatch_df, "Source/target not in event_id"))

    # Print summary
    print("\n\nValidation Summary:")
    print(f"Total events validated: {total_events}")
    print(f"Self-referential events: {len(self_ref_df)}")
    # Number of distinct event IDs that occur more than once. This used to
    # read a variable that only exists inside the sample loop above, which
    # raised when there were no duplicates.
    print(f"Events with duplicate IDs: {len(duplicates)}")
    print(f"Events with timestamp mismatches: {len(timestamp_mismatch_df)}")
    print(f"Events with node type mismatches: {len(node_type_mismatch_df)}")
    print(f"Events with token mismatches: {len(token_mismatch_df)}")

The 2 events with duplicate IDs below are examples where the source and target account holder were both part of the backfill. We know this because their source/target user tokens are swapped. This is the intended behaviour in this case. Having an identical event_id and timestamp ensures they'll be handled as duplicates downstream, to avoid double-counting these connections in signals.

In [0]:
# Run the general sanity checks against the backfill output table
validate_backfilled_events(
    ccgce_table="cash_banking_ml_eng.cash_connected_graph_change_event.edgy_test_20250512"
)

Total backfilled events: 62197


1. Events with same source and target user:
Total count: 0

2. Events with duplicate event IDs:
Total count of unique event IDs with duplicates: 2

Sample of such events:

Duplicate Event 1:
  Event ID: E/AH_8mmppzec5/AH_zrmga8ee8/VERIFIED_SSN
  Count: 2
  Instances:

    Instance 1:
      Event ID: E/AH_8mmppzec5/AH_zrmga8ee8/VERIFIED_SSN
      Source: AH_zrmga8ee8
      Target: AH_8mmppzec5
      Node Type: VERIFIED_SSN
      Event Time: 1658663258000
      Effective At: 1658663258000

    Instance 2:
      Event ID: E/AH_8mmppzec5/AH_zrmga8ee8/VERIFIED_SSN
      Source: AH_8mmppzec5
      Target: AH_zrmga8ee8
      Node Type: VERIFIED_SSN
      Event Time: 1658663258000
      Effective At: 1658663258000

Duplicate Event 2:
  Event ID: E/AH_8mmppzec5/AH_zrmga8ee8/SSN
  Count: 2
  Instances:

    Instance 1:
      Event ID: E/AH_8mmppzec5/AH_zrmga8ee8/SSN
      Source: AH_zrmga8ee8
      Target: AH_8mmppzec5
      Node Type: SSN
      Event Time: 16586

## Appendix: Obtaining the list of backfilled account holders

In [0]:
# Fetch the unique account holder tokens corresponding to the backfilled customer tokens and save them to a file.
# (Run this once, then set backfilled_acc_holders_csv to the saved output_file.)

# # Read the CASH_CUSTOMER tokens from the CSV file
# cash_customer_df = spark.read.csv(
#     "dbfs:/FileStore/Users/johanz/2025_04_17_9_27am.csv",
#     header=True
# ).toPandas()

# # Get the list of CASH_CUSTOMER tokens
# cash_customer_tokens = cash_customer_df['CUSTOMER_TOKEN'].unique().tolist()

# # Query Snowflake to get the unique ACCOUNT_HOLDER_CASH_CUSTOMER nodes
# query = f"""
# SELECT DISTINCT e.FROM_TOKEN as ACCOUNT_HOLDER_TOKEN
# FROM duplograph.public.edges e
# WHERE e.FROM_TYPE = 'ACCOUNT_HOLDER_CASH_CUSTOMER'
# AND e.TO_TYPE = 'CASH_CUSTOMER'
# AND e.TO_TOKEN IN {to_sql_list(cash_customer_tokens)}
# """

# account_holders_df = snowflake_query(query)

# # Save to a local CSV file with timestamp
# timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# output_file = f'account_holders_{timestamp}.csv'
# account_holders_df.to_csv(output_file, index=False)

# print(f"Found {len(account_holders_df)} unique account holders")
# print(f"Results saved to: {output_file}")


Found 7141 unique account holders
Results saved to: account_holders_20250509_071901.csv
