In [1]:
import pandas as pd
from typing import Dict, List
import logging
from itertools import combinations

In [2]:
def setup_logging():
    """Configure logging for the script."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

In [3]:
"""Main execution function."""
setup_logging()
    
# Configuration
file_paths = {
    "asset-management": "./dummy-data/asset_custdata.csv",
    "trustees-digital": "./dummy-data/digital_trustees_custdata.csv",
    "trustees-traditional": "./dummy-data/traditional_trustees_custdata.csv",
    #"insurance": "./dummy-data/individual_insurance_custdata.csv",
    "registrars": "./dummy-data/registrars_shold.csv",
    "securities" : "./dummy-data/securities_custdata.csv"
}

columns_mapping = {
    "asset-management": {"id": "CustAID", "bvn": "CustomerBVN"},
    "trustees-digital": {"id": "platformuserid", "bvn": "Bvn"},
    "trustees-traditional": {"id": "CustAID", "bvn": "CustomerBVN"},
    #"insurance": {"id": "Customer ID", "bvn": "userBVN"},
    "registrars": {"id": "Acctno", "bvn": "bvn"},
    "securities": {"id": "CustAID", "bvn": "CustomerBVN"}
}

In [4]:
logging.info("Starting BVN duplicate analysis")

bvn_records: List[pd.DataFrame] = []
missing_bvn_records: List[pd.DataFrame] = []

2025-02-13 15:15:10,380 - INFO - Starting BVN duplicate analysis


In [5]:
print(bvn_records)
print(missing_bvn_records)

[]
[]


In [7]:
for dataset_name, file_path in file_paths.items():
    logging.info(f"Processing dataset: {dataset_name}")
    
    # Read dataset with only required columns
    df = pd.read_csv(
        file_path,
        usecols=[
            columns_mapping[dataset_name]["id"],
            columns_mapping[dataset_name]["bvn"]
        ]
    )
    
    # Rename columns for consistency
    df.rename(columns={
        columns_mapping[dataset_name]["id"]: "id",
        columns_mapping[dataset_name]["bvn"]: "BVN"
    }, inplace=True)
    
    # Add dataset identifier
    df["entity"] = dataset_name
    
    # Handle missing BVNs
    missing_mask = (df["BVN"].isna()) | (df["BVN"]=="-")
    if missing_mask.any():
        missing_records = df[missing_mask].copy()
        missing_records["reason"] = "Missing BVN"
        missing_bvn_records.append(missing_records)
        logging.warning(f"Found {missing_mask.sum()} missing BVNs in {dataset_name}")
    
    # Remove rows with missing BVNs for main analysis
    df = df[~missing_mask]
    
    bvn_records.append(df)

2025-02-13 15:15:31,264 - INFO - Processing dataset: asset-management
2025-02-13 15:15:31,320 - INFO - Processing dataset: trustees-digital
2025-02-13 15:15:31,330 - INFO - Processing dataset: trustees-traditional
2025-02-13 15:15:31,348 - INFO - Processing dataset: registrars
2025-02-13 15:15:31,363 - INFO - Processing dataset: securities


In [8]:
type(bvn_records[1])

pandas.core.frame.DataFrame

In [9]:
df.tail()

Unnamed: 0,id,BVN,entity
995,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities
996,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities
997,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities
998,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities
999,ef5832c9-1c94-4111-ac13-9dadf3771cfe,31769696006,securities


In [10]:
# Combine all valid records
bvn_df = pd.concat(bvn_records, ignore_index=True)
bvn_df.tail()

Unnamed: 0,id,BVN,entity
4985,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities
4986,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities
4987,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities
4988,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities
4989,ef5832c9-1c94-4111-ac13-9dadf3771cfe,31769696006,securities


In [14]:
bvn_df[(bvn_df["entity"]=="trustees-traditional") | (bvn_df["entity"]=="trustees-digital")]["entity"] = "trustees"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bvn_df[(bvn_df["entity"]=="trustees-traditional") | (bvn_df["entity"]=="trustees-digital")]["entity"] = "trustees"


In [22]:
bvn_df.columns

Index(['id', 'BVN', 'entity'], dtype='object')

In [30]:
bvn_df.loc[(bvn_df.entity == "trustees-digital") | (bvn_df.entity == "trustees-traditional"), bvn_df.columns[2:]] = "trustees"

In [31]:
bvn_df[(bvn_df["entity"]=="trustees-traditional") | (bvn_df["entity"]=="trustees-digital")]

Unnamed: 0,id,BVN,entity


In [61]:
missing_bvn_records

[                                      id BVN            entity       reason
 7   6b65a6a4-8b81-48f6-b38a-088ca65ed389   -  asset-management  Missing BVN
 9   c241330b-01a9-471f-9e8a-774bcf36d58b   -  asset-management  Missing BVN
 14  43b7a3a6-9a8d-4a03-980d-7b71d8f56413   -  asset-management  Missing BVN
 21  fc377a4c-4a15-444d-85e7-ce8a3a578a8e   -  asset-management  Missing BVN,
                                       id  BVN      entity       reason
 37  b33cf9d7-e2f2-409b-ad3e-3d29b73a54d7    -  securities  Missing BVN
 53  c91645ab-9a8c-4a39-bbd7-bbd222e58336    -  securities  Missing BVN
 65  462c1066-7843-4846-a1f3-be268fc7f55c    -  securities  Missing BVN
 81  b73c5769-c20e-4172-8bbd-e737d28498fe  NaN  securities  Missing BVN
 83  1a33856a-7bb5-4b6e-9adc-07189f12e5b3  NaN  securities  Missing BVN
 98  2c0185b4-6324-4423-b1a7-9ab72bb7606f  NaN  securities  Missing BVN]

In [10]:
serial_id = []
for i in range(len(bvn_df)):
    serial_id.append(i+1)

In [57]:
bvn_df["serial_no"] = bvn_df.index + 1

In [62]:
bvn_df.head()

Unnamed: 0,id,BVN,entity,serial_no
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,36334337114,asset-management,1
1,23b8c1e9-3924-46de-beb1-3b9046685257,80347130822,asset-management,2
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,89813391321,asset-management,3
3,972a8469-1641-4f82-8b9d-2434e465e150,97383703538,asset-management,4
4,17fc695a-07a0-4a6e-8822-e8f36c031199,45229118845,asset-management,5


In [63]:
len(bvn_df)

4990

In [64]:
# Process duplicates
bvn_df["duplicated?"] = bvn_df["BVN"].duplicated(keep=False)

In [68]:
bvn_df.tail()

Unnamed: 0,id,BVN,entity,serial_no,duplicated?
4985,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities,4986,False
4986,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities,4987,False
4987,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities,4988,False
4988,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities,4989,False
4989,ef5832c9-1c94-4111-ac13-9dadf3771cfe,31769696006,securities,4990,False


In [69]:
bvn_df[bvn_df["duplicated?"]==True].describe()

Unnamed: 0,serial_no
count,642.0
mean,3480.191589
std,376.573324
min,23.0
25%,3241.25
50%,3488.0
75%,3755.5
max,4118.0


In [70]:
bvn_df[bvn_df["duplicated?"]==True].head()

Unnamed: 0,id,BVN,entity,serial_no,duplicated?
22,aefcfad8-efc8-4849-b3aa-7efe4458a885,24788491720,asset-management,23,True
1009,88efe82b-1f3e-4a66-af6c-6f61cfefa2a2,24788491720,trustees-digital,1010,True
1024,10fd7164-819f-4393-82ce-ef41b2cfba8b,24788491720,trustees-digital,1025,True
1026,53a71719-bcdf-4302-8c37-e264d03cd20a,24788491720,trustees-digital,1027,True
1031,93683dd6-5050-427e-8e38-5d03a0d80651,24788491720,trustees-digital,1032,True


In [71]:
bvn_df[bvn_df["BVN"]=="indeed"]

Unnamed: 0,id,BVN,entity,serial_no,duplicated?
2996,19b51f73-939d-4b36-b310-49663273c75e,indeed,registrars,2997,True
3037,628a313c-46b4-4460-baa4-960b4b8becda,indeed,registrars,3038,True


In [72]:
# Find first occurrence of duplicated IDs
duplicate_mapping = bvn_df[bvn_df["duplicated?"]].groupby("BVN")["serial_no"].first().to_dict()
duplicate_mapping

{24788491720: 1010,
 '24788491720': 23,
 'Democrat': 3118,
 'TV': 3211,
 'a': 3184,
 'according': 3305,
 'action': 3183,
 'add': 3338,
 'adult': 3264,
 'affect': 2999,
 'against': 3870,
 'age': 3275,
 'agency': 3317,
 'almost': 3643,
 'already': 3150,
 'amount': 3147,
 'approach': 3259,
 'area': 3019,
 'article': 3358,
 'as': 3244,
 'ask': 3341,
 'attack': 3403,
 'authority': 3056,
 'baby': 3231,
 'bad': 3391,
 'bag': 3353,
 'ball': 3281,
 'base': 3661,
 'become': 3510,
 'between': 3020,
 'big': 3246,
 'bit': 3361,
 'black': 3480,
 'board': 3333,
 'book': 3162,
 'both': 3552,
 'box': 3733,
 'budget': 3090,
 'but': 3667,
 'buy': 3159,
 'camera': 3095,
 'care': 3582,
 'case': 3101,
 'catch': 3039,
 'center': 3018,
 'chance': 3023,
 'charge': 3248,
 'choice': 3034,
 'church': 3098,
 'coach': 3104,
 'collection': 3770,
 'color': 3348,
 'commercial': 3014,
 'compare': 3204,
 'consider': 3504,
 'contain': 3568,
 'could': 3498,
 'cover': 3109,
 'crime': 3602,
 'culture': 3062,
 'dark': 3053,


In [73]:
bvn_df["duplicated_serial_no"] = bvn_df["BVN"].map(lambda x: duplicate_mapping.get(x, ""))

In [74]:
bvn_df.tail(100)

Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no
4890,e05b89bf-f610-45e4-bc3f-3acb9d0281d6,99380462806,securities,4891,False,
4891,cfa4c114-32d1-4de0-9140-df7717b611d5,43758911403,securities,4892,False,
4892,761bcd2c-a6df-4696-8c07-fad01d2b1c91,41433112707,securities,4893,False,
4893,bd65abbe-3baf-4a24-a13c-5923381b4cbd,85881941624,securities,4894,False,
4894,912606da-1a13-423c-93a0-188a312e673b,18192237948,securities,4895,False,
...,...,...,...,...,...,...
4985,dafbe7d3-7b53-4efd-bc17-aed273dba4c2,98233245278,securities,4986,False,
4986,bb019bdb-294b-4bf9-b385-f522dbe12a19,43017346108,securities,4987,False,
4987,3317f584-69e1-4e25-a927-932c23ae9eb9,82115310834,securities,4988,False,
4988,872b39c8-f15e-4191-830e-b2c6e6d1c2fd,17806845015,securities,4989,False,


In [75]:
bvn_df[bvn_df["duplicated_serial_no"]!=""]

Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no
22,aefcfad8-efc8-4849-b3aa-7efe4458a885,24788491720,asset-management,23,True,23
1009,88efe82b-1f3e-4a66-af6c-6f61cfefa2a2,24788491720,trustees-digital,1010,True,1010
1024,10fd7164-819f-4393-82ce-ef41b2cfba8b,24788491720,trustees-digital,1025,True,1010
1026,53a71719-bcdf-4302-8c37-e264d03cd20a,24788491720,trustees-digital,1027,True,1010
1031,93683dd6-5050-427e-8e38-5d03a0d80651,24788491720,trustees-digital,1032,True,1010
...,...,...,...,...,...,...
3995,013f34a6-c49d-4d2f-8a84-cb5d110a98b7,article,registrars,3996,True,3358
4011,d6f810b5-3075-4c55-879d-53fcdf38e1dc,24788491720,securities,4012,True,23
4024,b0bc733a-ac3a-4219-a163-d059a9b49c8d,24788491720,securities,4025,True,23
4102,7d5ad5c5-6131-449d-a6b3-195f2c54cbcb,24788491720,securities,4103,True,23


In [76]:
bvn_df[(bvn_df["duplicated?"]==False)&(bvn_df["duplicated_serial_no"]!="")]

Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no


In [77]:
# Create missing BVNs DataFrame if any were found
missing_bvn_df = pd.concat(missing_bvn_records, ignore_index=True) if missing_bvn_records else None

In [78]:
missing_bvn_df

Unnamed: 0,id,BVN,entity,reason
0,6b65a6a4-8b81-48f6-b38a-088ca65ed389,-,asset-management,Missing BVN
1,c241330b-01a9-471f-9e8a-774bcf36d58b,-,asset-management,Missing BVN
2,43b7a3a6-9a8d-4a03-980d-7b71d8f56413,-,asset-management,Missing BVN
3,fc377a4c-4a15-444d-85e7-ce8a3a578a8e,-,asset-management,Missing BVN
4,b33cf9d7-e2f2-409b-ad3e-3d29b73a54d7,-,securities,Missing BVN
5,c91645ab-9a8c-4a39-bbd7-bbd222e58336,-,securities,Missing BVN
6,462c1066-7843-4846-a1f3-be268fc7f55c,-,securities,Missing BVN
7,b73c5769-c20e-4172-8bbd-e737d28498fe,,securities,Missing BVN
8,1a33856a-7bb5-4b6e-9adc-07189f12e5b3,,securities,Missing BVN
9,2c0185b4-6324-4423-b1a7-9ab72bb7606f,,securities,Missing BVN


In [79]:
bvn_df.head()

Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,36334337114,asset-management,1,False,
1,23b8c1e9-3924-46de-beb1-3b9046685257,80347130822,asset-management,2,False,
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,89813391321,asset-management,3,False,
3,972a8469-1641-4f82-8b9d-2434e465e150,97383703538,asset-management,4,False,
4,17fc695a-07a0-4a6e-8822-e8f36c031199,45229118845,asset-management,5,False,


In [80]:
bvn_df.to_csv("bvn_comparison.csv")

In [81]:
# Load the CSV file
file_path = "bvn_comparison.csv"  # Change this to your actual file path
df = pd.read_csv(file_path, dtype={"BVN": str, "customer_id": str, "entity": str, "serial_no": str})

In [82]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no
0,0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,36334337114,asset-management,1,False,
1,1,23b8c1e9-3924-46de-beb1-3b9046685257,80347130822,asset-management,2,False,
2,2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,89813391321,asset-management,3,False,
3,3,972a8469-1641-4f82-8b9d-2434e465e150,97383703538,asset-management,4,False,
4,4,17fc695a-07a0-4a6e-8822-e8f36c031199,45229118845,asset-management,5,False,


In [83]:
df[df["BVN"].isnull()]

Unnamed: 0.1,Unnamed: 0,id,BVN,entity,serial_no,duplicated?,duplicated_serial_no


In [84]:
# Ensure BVN column is not empty
df = df.dropna(subset=["BVN"])

In [85]:
df.groupby("entity")["BVN"].nunique().reset_index()

Unnamed: 0,entity,BVN
0,asset-management,996
1,registrars,632
2,securities,991
3,trustees-digital,997
4,trustees-traditional,1000


In [86]:
# Unique BVNs per entity
unique_bvn_per_entity = df.groupby("entity")["BVN"].nunique().reset_index()
unique_bvn_per_entity

Unnamed: 0,entity,BVN
0,asset-management,996
1,registrars,632
2,securities,991
3,trustees-digital,997
4,trustees-traditional,1000


In [87]:
unique_bvn_per_entity.columns = ["Entity", "Unique_BVN_Count"]
unique_bvn_per_entity

Unnamed: 0,Entity,Unique_BVN_Count
0,asset-management,996
1,registrars,632
2,securities,991
3,trustees-digital,997
4,trustees-traditional,1000


In [88]:
# Identify BVNs appearing in multiple entities
bvn_entity_counts = df.groupby("BVN")["entity"].nunique().reset_index()
bvn_entity_counts

Unnamed: 0,BVN,entity
0,10027715804,1
1,10038334599,1
2,10054887150,1
3,10216834659,1
4,10231471601,1
...,...,...
4609,yeah,1
4610,yes,1
4611,yet,1
4612,you,1


In [91]:
# Cross-entity BVNs
cross_entity = df.groupby('BVN').agg(
    entity_count=('entity', 'nunique'),
    entities=('entity', lambda x: ', '.join(sorted(x.unique())))
).reset_index()
cross_entity.head()

Unnamed: 0,BVN,entity_count,entities
0,10027715804,1,asset-management
1,10038334599,1,trustees-traditional
2,10054887150,1,asset-management
3,10216834659,1,securities
4,10231471601,1,securities


In [93]:
# Generate all possible entity combinations and their counts
unique_entities = sorted(df['entity'].unique())
entity_combinations = []

In [98]:
unique_entities

['asset-management',
 'registrars',
 'securities',
 'trustees-digital',
 'trustees-traditional']

In [95]:
for i in range(2, len(unique_entities) + 1):
    print(i)

2
3
4
5


In [99]:
# For each possible number of entities (2 through total number of entities)
for i in range(2, len(unique_entities) + 1):
    # Generate all possible combinations of that size
    for combo in combinations(unique_entities, i):
        # Find BVNs that appear in all entities in this combination
        mask = cross_entity['entities'].apply(
            lambda x: all(entity in x.split(', ') for entity in combo)
        )
        bvns_in_combo = cross_entity[mask]['BVN'].tolist()
        
        if bvns_in_combo:  # Only add if there are matching BVNs
            entity_combinations.append({
                'Combination Size': i,
                'Entities': ' & '.join(combo),
                'BVN Count': len(bvns_in_combo),
                'BVNs': ', '.join(bvns_in_combo)
            })

In [100]:
entity_combinations

[{'Combination Size': 2,
  'Entities': 'asset-management & securities',
  'BVN Count': 1,
  'BVNs': '24788491720'},
 {'Combination Size': 2,
  'Entities': 'asset-management & trustees-digital',
  'BVN Count': 1,
  'BVNs': '24788491720'},
 {'Combination Size': 2,
  'Entities': 'securities & trustees-digital',
  'BVN Count': 1,
  'BVNs': '24788491720'},
 {'Combination Size': 3,
  'Entities': 'asset-management & securities & trustees-digital',
  'BVN Count': 1,
  'BVNs': '24788491720'}]

In [101]:
entity_combinations_df = pd.DataFrame(entity_combinations)
entity_combinations_df

Unnamed: 0,Combination Size,Entities,BVN Count,BVNs
0,2,asset-management & securities,1,24788491720
1,2,asset-management & trustees-digital,1,24788491720
2,2,securities & trustees-digital,1,24788491720
3,3,asset-management & securities & trustees-digital,1,24788491720


In [102]:
if not entity_combinations_df.empty:
    entity_combinations_df = entity_combinations_df.sort_values(
        ['Combination Size', 'BVN Count'], 
        ascending=[True, False]
    )

In [105]:
not entity_combinations_df.empty

True

In [103]:
entity_combinations_df

Unnamed: 0,Combination Size,Entities,BVN Count,BVNs
0,2,asset-management & securities,1,24788491720
1,2,asset-management & trustees-digital,1,24788491720
2,2,securities & trustees-digital,1,24788491720
3,3,asset-management & securities & trustees-digital,1,24788491720


In [107]:
merged_details = pd.merge(
            df[['BVN', 'entity', 'id', 'serial_no', 'duplicated?', 'duplicated_serial_no']],
            cross_entity[['BVN', 'entity_count', 'entities']],
            on='BVN',
            how='right'
        ).sort_values(['BVN', 'entity'])
merged_details

Unnamed: 0,BVN,entity,id,serial_no,duplicated?,duplicated_serial_no,entity_count,entities
0,10027715804,asset-management,e916da57-f248-43ab-977a-8a5f6ffe33b3,681,False,,1,asset-management
1,10038334599,trustees-traditional,bb66e104-01f5-4557-8859-e181ece2a489,2493,False,,1,trustees-traditional
2,10054887150,asset-management,069f14f1-4018-4c6e-9a8c-fa3c5283aac7,356,False,,1,asset-management
3,10216834659,securities,ecf35c83-4855-4330-9b4c-4132dca47da8,4008,False,,1,securities
4,10231471601,securities,4efa6160-cb76-4244-9da1-7db303b6418d,4954,False,,1,securities
...,...,...,...,...,...,...,...,...
4985,yeah,registrars,e2575b99-2421-4817-8fb0-970828dd2e89,3792,False,,1,registrars
4986,yes,registrars,5d066099-1231-48b8-bba4-6b47cb899eae,3682,False,,1,registrars
4987,yet,registrars,6f05d8fb-52bb-4f4b-83fd-3b41969b4652,3637,False,,1,registrars
4988,you,registrars,5e6601de-9ff7-486b-a422-c6ccb84736f2,3401,False,,1,registrars


In [108]:
merged_details[merged_details["entity_count"]>1]

Unnamed: 0,BVN,entity,id,serial_no,duplicated?,duplicated_serial_no,entity_count,entities
655,24788491720,asset-management,aefcfad8-efc8-4849-b3aa-7efe4458a885,23,True,23.0,3,"asset-management, securities, trustees-digital"
660,24788491720,securities,d6f810b5-3075-4c55-879d-53fcdf38e1dc,4012,True,23.0,3,"asset-management, securities, trustees-digital"
661,24788491720,securities,b0bc733a-ac3a-4219-a163-d059a9b49c8d,4025,True,23.0,3,"asset-management, securities, trustees-digital"
662,24788491720,securities,7d5ad5c5-6131-449d-a6b3-195f2c54cbcb,4103,True,23.0,3,"asset-management, securities, trustees-digital"
663,24788491720,securities,f6b138da-40b0-400e-94e0-c931340e9439,4118,True,23.0,3,"asset-management, securities, trustees-digital"
656,24788491720,trustees-digital,88efe82b-1f3e-4a66-af6c-6f61cfefa2a2,1010,True,1010.0,3,"asset-management, securities, trustees-digital"
657,24788491720,trustees-digital,10fd7164-819f-4393-82ce-ef41b2cfba8b,1025,True,1010.0,3,"asset-management, securities, trustees-digital"
658,24788491720,trustees-digital,53a71719-bcdf-4302-8c37-e264d03cd20a,1027,True,1010.0,3,"asset-management, securities, trustees-digital"
659,24788491720,trustees-digital,93683dd6-5050-427e-8e38-5d03a0d80651,1032,True,1010.0,3,"asset-management, securities, trustees-digital"


In [45]:

multi_entity_bvns = bvn_entity_counts[bvn_entity_counts["entity"] > 1]

In [None]:
# Get details of BVNs appearing in multiple entities
bvn_entity_mapping = df[df["BVN"].isin(multi_entity_bvns["BVN"])].groupby("BVN")["entity"].unique().reset_index()


In [46]:
df[df["BVN"].isin(multi_entity_bvns["BVN"])].groupby("BVN")["entity"].unique().reset_index()

Unnamed: 0,BVN,entity


In [None]:
bvn_entity_mapping.columns = ["BVN", "Entities"]

In [None]:
# Display results
print("\n📊 Unique BVN Count Per Entity:\n", unique_bvn_per_entity)
print("\n🔍 BVNs Found in Multiple Entities:\n", bvn_entity_mapping)

In [None]:
# Save the results to CSV
unique_bvn_per_entity.to_csv("unique_bvn_per_entity.csv", index=False)
bvn_entity_mapping.to_csv("bvn_in_multiple_entities.csv", index=False)

print("\n✅ Analysis completed. Results saved to CSV files.")


In [None]:
# if __name__ == "__main__":
#     main()

In [12]:
import pandas as pd

df = pd.read_csv("/home/diyiola/learning/projects/data-profiling/cross-entity/dummy-data/digital_trustees_custdata.csv")

In [13]:
df.head()

Unnamed: 0,platformuserid,Bvn,IdentificationType,IdentificationNumber,IdentificationIssueDate,IdentificationExpiryDate,Gender,RMID,IsPep,CustomerId
0,88917cf5-7d88-43d2-8e51-208396e83922,32531467689,NIN,890834863,2018-03-15,2025-09-27,Male,5e9a35cf-0699-476d-9288-b06ad51f03b1,middle,reach
1,1ceb08fe-f2de-4798-b31a-25b55fecaa6e,81410007737,Passport,762172263,2021-10-07,2026-10-08,Female,f2a83faa-f11a-41f5-8826-80f934010b60,audience,quickly
2,79900a27-6c7c-4eb7-8bb5-f24251057c35,32784064626,Passport,638064264,2023-10-22,2025-06-08,Male,71e007b0-bb6a-4647-ad2f-458d9237e145,cover,sound
3,47198f5c-f69b-4596-bf10-938caec7f267,18463059478,Driver License,272899544,2019-11-18,2030-03-18,Male,742f790e-d4be-44e4-a04d-3f5585594d77,not,know
4,2518ec09-efdd-4a87-88c6-21bee1228e75,27908188087,Driver License,163219122,2020-06-12,2030-12-19,Female,b75c3825-1c0f-487f-ab01-bba74b814a84,generation,bar


In [19]:
# Pivot the data
df_transformed = df.pivot_table(index="platformuserid", columns="IdentificationType", values="IdentificationNumber", aggfunc="first")

df_transformed.head()

IdentificationType,Driver License,NIN,Passport
platformuserid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01afb939-c4df-4b8d-84eb-8e5d6a6e94cf,,417429646.0,
01d3a033-3586-4e6f-9b24-955cb4fbdaf2,,295104998.0,
0265a223-71cf-4772-b167-d947efadf069,,,213275094.0
0273be68-e6c5-4969-8118-71cf79fdcd33,194352372.0,,
054046cc-16d5-4b84-a61f-7117dd926cbf,,506195741.0,


In [20]:
# Reset index to turn multi-index into columns
df_transformed.reset_index(inplace=True)
df_transformed

IdentificationType,platformuserid,Driver License,NIN,Passport
0,01afb939-c4df-4b8d-84eb-8e5d6a6e94cf,,417429646.0,
1,01d3a033-3586-4e6f-9b24-955cb4fbdaf2,,295104998.0,
2,0265a223-71cf-4772-b167-d947efadf069,,,213275094.0
3,0273be68-e6c5-4969-8118-71cf79fdcd33,194352372.0,,
4,054046cc-16d5-4b84-a61f-7117dd926cbf,,506195741.0,
...,...,...,...,...
491,fbf3a106-260b-43e0-8ce6-929f7a44bd05,,,423414243.0
492,fe60afde-4200-41aa-bc82-6fbe395e54f0,926950354.0,,
493,ff1b654d-6c65-459d-a2ef-1e217fb32568,,,966485850.0
494,ff7d39f7-9563-4c97-b65f-982b95eef45c,,892351849.0,


In [21]:
# Rename columns for clarity (optional)
df_transformed.columns.name = None  # Remove multi-index name

df_transformed.head()

Unnamed: 0,platformuserid,Driver License,NIN,Passport
0,01afb939-c4df-4b8d-84eb-8e5d6a6e94cf,,417429646.0,
1,01d3a033-3586-4e6f-9b24-955cb4fbdaf2,,295104998.0,
2,0265a223-71cf-4772-b167-d947efadf069,,,213275094.0
3,0273be68-e6c5-4969-8118-71cf79fdcd33,194352372.0,,
4,054046cc-16d5-4b84-a61f-7117dd926cbf,,506195741.0,


In [22]:
df_transformed.iloc[:,1:]

Unnamed: 0,Driver License,NIN,Passport
0,,417429646.0,
1,,295104998.0,
2,,,213275094.0
3,194352372.0,,
4,,506195741.0,
...,...,...,...
491,,,423414243.0
492,926950354.0,,
493,,,966485850.0
494,,892351849.0,


In [23]:
df_transformed.iloc[:,1:] = df_transformed.iloc[:,1:].apply(pd.to_numeric, errors='coerce').astype('Int64')
df_transformed

Unnamed: 0,platformuserid,Driver License,NIN,Passport
0,01afb939-c4df-4b8d-84eb-8e5d6a6e94cf,,417429646,
1,01d3a033-3586-4e6f-9b24-955cb4fbdaf2,,295104998,
2,0265a223-71cf-4772-b167-d947efadf069,,,213275094
3,0273be68-e6c5-4969-8118-71cf79fdcd33,194352372,,
4,054046cc-16d5-4b84-a61f-7117dd926cbf,,506195741,
...,...,...,...,...
491,fbf3a106-260b-43e0-8ce6-929f7a44bd05,,,423414243
492,fe60afde-4200-41aa-bc82-6fbe395e54f0,926950354,,
493,ff1b654d-6c65-459d-a2ef-1e217fb32568,,,966485850
494,ff7d39f7-9563-4c97-b65f-982b95eef45c,,892351849,


In [18]:
merged_details = pd.merge(
            df, df_transformed,
            on='platformuserid',
            how='right'
        ).sort_values(['platformuserid'])
merged_details

Unnamed: 0,platformuserid,Bvn,IdentificationType,IdentificationNumber,IdentificationIssueDate,IdentificationExpiryDate,Gender,RMID,IsPep,CustomerId,Driver License,NIN,Passport
0,01afb939-c4df-4b8d-84eb-8e5d6a6e94cf,90397031295,NIN,417429646,2016-11-29,2034-12-29,Male,49e785c7-cbd8-415e-ae98-b722cf97b016,we,usually,,417429646,
1,01d3a033-3586-4e6f-9b24-955cb4fbdaf2,34898665269,NIN,295104998,2019-11-14,2030-07-25,Male,78e00f78-2cda-40b2-bd43-c4fedfe91357,hope,off,,295104998,
2,0265a223-71cf-4772-b167-d947efadf069,78741322166,Passport,213275094,2018-04-24,2033-06-04,Male,05d86c95-f20b-4199-a8a5-7ee7bb6e0d06,your,few,,,213275094
3,0273be68-e6c5-4969-8118-71cf79fdcd33,27902876703,Driver License,194352372,2016-06-03,2028-03-16,Male,676a504c-dbeb-41e2-b9a0-1a0b64657945,rise,thing,194352372,,
4,054046cc-16d5-4b84-a61f-7117dd926cbf,43204897426,NIN,506195741,2016-11-18,2034-10-24,Female,9055e085-5f23-4322-a3b9-3a349d7739b2,week,and,,506195741,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,fbf3a106-260b-43e0-8ce6-929f7a44bd05,14252989355,Passport,423414243,2022-09-07,2033-10-10,Male,c4596172-9fdf-4b1b-9e05-b8b14e45a736,offer,agent,,,423414243
492,fe60afde-4200-41aa-bc82-6fbe395e54f0,48644704878,Driver License,926950354,2018-06-10,2033-08-17,Female,a3506764-37af-463b-82ac-1c61c9eaa5b9,reduce,simply,926950354,,
493,ff1b654d-6c65-459d-a2ef-1e217fb32568,89073134526,Passport,966485850,2018-03-23,2025-07-04,Female,f47bffba-f561-4747-82d2-2c7408ea6671,trip,site,,,966485850
494,ff7d39f7-9563-4c97-b65f-982b95eef45c,27388554601,NIN,892351849,2019-07-11,2032-08-17,Female,696f9074-4084-4290-be50-a000c474e1c0,onto,despite,,892351849,
