In [2]:
import pandas as pd
from typing import Dict, List
import logging

In [3]:
def setup_logging():
    """Set up root-logger output for this notebook.

    Requests INFO as the root level and a
    ``<timestamp> - <level name> - <message>`` record layout via
    ``logging.basicConfig``.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=record_layout, level=logging.INFO)

In [4]:
"""Main execution function."""
setup_logging()

# Configuration
# CSV location of each dataset, keyed by dataset name. Iteration order of this
# dict is the order in which the datasets are processed.
file_paths = {
    "asset-management": "./dummy-data/asset_custdata.csv",
    "trustees-digital": "./dummy-data/digital_trustees_custdata.csv",
    "trustees-traditional": "./dummy-data/traditional_trustees_custdata.csv",
    # Disabled dataset, kept for reference — TODO confirm whether it should be re-enabled.
    #"insurance": "./dummy-data/individual_insurance_custdata.csv",
    "registrars": "./dummy-data/registrars_shold.csv",
    "securities": "./dummy-data/securities_custdata.csv",
}

# For each dataset, the source column names holding the record id ("id") and
# the BVN ("bvn"). Keys must match those of `file_paths`.
columns_mapping = {
    "asset-management": {"id": "CustAID", "bvn": "CustomerBVN"},
    "trustees-digital": {"id": "platformuserid", "bvn": "Bvn"},
    "trustees-traditional": {"id": "CustAID", "bvn": "CustomerBVN"},
    #"insurance": {"id": "Customer ID", "bvn": "userBVN"},
    "registrars": {"id": "Acctno", "bvn": "bvn"},
    "securities": {"id": "CustAID", "bvn": "CustomerBVN"},
}

# Fail here, with a clear message, rather than with a KeyError halfway through
# loading if one dict is edited without the other.
assert file_paths.keys() == columns_mapping.keys(), (
    "file_paths and columns_mapping must describe the same datasets"
)

In [5]:
# Accumulators for the per-dataset loop: frames of rows that have a BVN, and
# frames of rows that do not.
bvn_records: List[pd.DataFrame] = list()
missing_bvn_records: List[pd.DataFrame] = list()

logging.info("Starting BVN duplicate analysis")

2025-02-12 15:45:43,818 - INFO - Starting BVN duplicate analysis


In [6]:
# Show both accumulator lists, one per line.
print(bvn_records, missing_bvn_records, sep="\n")

[]
[]


In [7]:
# Start from empty accumulators so that re-running this cell does not append
# every dataset a second time.
bvn_records: List[pd.DataFrame] = []
missing_bvn_records: List[pd.DataFrame] = []

for dataset_name, file_path in file_paths.items():
    logging.info(f"Processing dataset: {dataset_name}")

    id_column = columns_mapping[dataset_name]["id"]
    bvn_column = columns_mapping[dataset_name]["bvn"]

    # Read only the two required columns, and read them as text. Numeric
    # parsing would drop leading zeros and would turn a column containing any
    # blank cell into floats, changing how identifiers compare and print.
    df = pd.read_csv(file_path, usecols=[id_column, bvn_column], dtype=str)

    # Rename columns for consistency
    df = df.rename(columns={id_column: "id", bvn_column: "BVN"})

    # Add dataset identifier
    df["dataset"] = dataset_name

    # Handle missing BVNs: keep a copy of those rows, tagged with a reason
    missing_mask = df["BVN"].isna()
    if missing_mask.any():
        missing_records = df[missing_mask].copy()
        missing_records["reason"] = "Missing BVN"
        missing_bvn_records.append(missing_records)
        logging.warning(f"Found {missing_mask.sum()} missing BVNs in {dataset_name}")

    # Remove rows with missing BVNs for main analysis
    df = df[~missing_mask]

    bvn_records.append(df)

2025-02-12 15:45:48,297 - INFO - Processing dataset: asset-management
2025-02-12 15:45:48,349 - INFO - Processing dataset: trustees-digital
2025-02-12 15:45:48,358 - INFO - Processing dataset: trustees-traditional
2025-02-12 15:45:48,366 - INFO - Processing dataset: registrars
2025-02-12 15:45:48,379 - INFO - Processing dataset: securities


In [10]:
# Sanity check: inspect the type of the second collected entry.
type(bvn_records[1])

pandas.core.frame.DataFrame

In [8]:
# `df` still refers to the frame from the final loop iteration; show its last rows.
df.tail()

Unnamed: 0,id,BVN,dataset
995,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities
996,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities
997,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities
998,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities
999,ef5832c9-1c94-4111-ac13-9dadf3771cfe,31769696006,securities


In [22]:
# Stack the per-dataset frames into one table, renumbering the index from 0
bvn_df = pd.concat(objs=bvn_records, ignore_index=True)
bvn_df.tail(5)

Unnamed: 0,id,BVN,dataset
4995,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities
4996,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities
4997,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities
4998,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities
4999,ef5832c9-1c94-4111-ac13-9dadf3771cfe,31769696006,securities


In [20]:
# 1-based serial number for every row of the combined frame. The count comes
# from the frame itself rather than a hard-coded 5000, so the assignment cannot
# fail with a length mismatch when the input size changes.
serial_id = list(range(1, len(bvn_df) + 1))
bvn_df['serial'] = serial_id


In [23]:
# Number the rows 1..len(bvn_df) directly from the frame's own length, so this
# cell does not depend on a pre-built list sized for exactly 5000 rows.
bvn_df['serial'] = list(range(1, len(bvn_df) + 1))

In [24]:
# Preview the first rows of the combined frame, including the `serial` column.
bvn_df.head()

Unnamed: 0,id,BVN,dataset,serial
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,36334337114,asset-management,1
1,23b8c1e9-3924-46de-beb1-3b9046685257,80347130822,asset-management,2
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,89813391321,asset-management,3
3,972a8469-1641-4f82-8b9d-2434e465e150,97383703538,asset-management,4
4,17fc695a-07a0-4a6e-8822-e8f36c031199,45229118845,asset-management,5


In [None]:

# Flag every row whose BVN occurs more than once; keep=False marks all
# occurrences, the first one included.
bvn_df["duplicated?"] = bvn_df["BVN"].duplicated(keep=False)

# Among the flagged rows, look up per BVN the id that GroupBy.first reports,
# then attach it to every row with that BVN ("" for BVNs that are not repeated).
duplicate_mapping = (
    bvn_df.loc[bvn_df["duplicated?"]]
    .groupby("BVN")["id"]
    .first()
    .to_dict()
)
bvn_df["duplicated_id"] = bvn_df["BVN"].map(lambda bvn: duplicate_mapping.get(bvn, ""))

# Combine the missing-BVN frames; None when none were collected
if missing_bvn_records:
    missing_bvn_df = pd.concat(missing_bvn_records, ignore_index=True)
else:
    missing_bvn_df = None

# Write the results to CSV files in the working directory
bvn_df.to_csv("bvn_comparison.csv", index=False)
logging.info("BVN comparison saved to 'bvn_comparison.csv'")

if missing_bvn_df is not None:
    missing_bvn_df.to_csv("missing_bvns.csv", index=False)
    logging.info("Missing BVNs saved to 'missing_bvns.csv'")

In [None]:
def analyze_bvn_duplicates(file_paths: Dict[str, str], columns_mapping: Dict[str, Dict[str, str]]) -> tuple:
    """
    Analyze BVN duplicates across multiple datasets.

    For each dataset, only its id and BVN columns are read from the CSV. They
    are renamed to ``id`` / ``BVN`` and a ``dataset`` column is added. Rows with
    a missing BVN are set aside. The remaining rows of all datasets are
    concatenated and flagged for repeated BVNs.

    Both identifier columns are read as text. Numeric parsing would strip
    leading zeros and would turn a column containing any blank cell into
    floats, which changes how the identifiers print and compare.

    A dataset that cannot be loaded (missing file, missing column, missing
    mapping entry, ...) is logged and skipped.

    Args:
        file_paths: Dictionary mapping dataset names to file paths
        columns_mapping: Dictionary mapping dataset names to their column
            mappings; each mapping provides the source column names under the
            keys ``"id"`` and ``"bvn"``

    Returns:
        tuple: (DataFrame with BVN analysis, DataFrame with missing BVNs).
        The analysis frame adds ``duplicated?`` (True when the BVN occurs more
        than once) and ``duplicated_id`` (the id ``GroupBy.first`` reports for
        that BVN among the repeated rows, ``""`` for non-repeated BVNs). The
        second element is ``None`` when no missing BVNs were found; otherwise
        it carries an extra ``reason`` column.

    Raises:
        ValueError: If no dataset could be loaded.
    """
    logging.info("Starting BVN duplicate analysis")

    bvn_records: List[pd.DataFrame] = []
    missing_bvn_records: List[pd.DataFrame] = []

    # Process each dataset
    for dataset_name, file_path in file_paths.items():
        try:
            logging.info(f"Processing dataset: {dataset_name}")

            id_column = columns_mapping[dataset_name]["id"]
            bvn_column = columns_mapping[dataset_name]["bvn"]

            # Read dataset with only required columns, keeping identifiers as text
            df = pd.read_csv(file_path, usecols=[id_column, bvn_column], dtype=str)

            # Rename columns for consistency
            df = df.rename(columns={id_column: "id", bvn_column: "BVN"})

            # Add dataset identifier
            df["dataset"] = dataset_name

            # Handle missing BVNs
            missing_mask = df["BVN"].isna()
            if missing_mask.any():
                missing_records = df[missing_mask].copy()
                missing_records["reason"] = "Missing BVN"
                missing_bvn_records.append(missing_records)
                logging.warning(f"Found {missing_mask.sum()} missing BVNs in {dataset_name}")

            # Keep only rows that have a BVN for the main analysis
            bvn_records.append(df[~missing_mask])

        except FileNotFoundError:
            logging.error(f"File not found: {file_path}")
        except Exception as e:
            # Deliberately broad: one unreadable dataset must not abort the
            # others. The failure is logged and the dataset is skipped.
            logging.error(f"Error processing {dataset_name}: {str(e)}")

    if not bvn_records:
        raise ValueError("No valid data found in any dataset")

    # Combine all valid records
    bvn_df = pd.concat(bvn_records, ignore_index=True)

    # Process duplicates; keep=False flags every occurrence, including the first
    bvn_df["duplicated?"] = bvn_df["BVN"].duplicated(keep=False)

    # Map each repeated BVN to the first id reported for it
    duplicate_mapping = bvn_df[bvn_df["duplicated?"]].groupby("BVN")["id"].first().to_dict()
    bvn_df["duplicated_id"] = bvn_df["BVN"].map(lambda x: duplicate_mapping.get(x, ""))

    # Create missing BVNs DataFrame if any were found
    missing_bvn_df = pd.concat(missing_bvn_records, ignore_index=True) if missing_bvn_records else None

    return bvn_df, missing_bvn_df

In [None]:
def main():
    """Main execution function.

    Configures logging, runs ``analyze_bvn_duplicates`` on the notebook-level
    ``file_paths`` and ``columns_mapping`` configuration, and writes the
    combined analysis to ``bvn_comparison.csv``. When any rows lacked a BVN,
    they are written to ``missing_bvns.csv`` as well.
    """
    setup_logging()

    bvn_df, missing_bvn_df = analyze_bvn_duplicates(file_paths, columns_mapping)

    # Save results
    bvn_df.to_csv("bvn_comparison.csv", index=False)
    logging.info("BVN comparison saved to 'bvn_comparison.csv'")

    # The second result is None when no dataset had a missing BVN
    if missing_bvn_df is not None:
        missing_bvn_df.to_csv("missing_bvns.csv", index=False)
        logging.info("Missing BVNs saved to 'missing_bvns.csv'")

In [None]:
# Run the pipeline only when this code executes as the entry-point module.
if __name__ == "__main__":
    main()