In [1]:
# import libraries
import numpy as np
import pandas as pd
from pandas import ExcelWriter
import time
from datetime import datetime
import datetime as dt
import warnings
# NOTE: "ignore" with no category silences every warning for the rest of the
# session, including pandas' SettingWithCopyWarning and deprecation notices.
warnings.simplefilter("ignore")

In [2]:
# Output file name: fixed prefix + today's date (YYYYMMDD) + .xlsx extension.
current_date = time.strftime("%Y%m%d")
output_file = f'natural_persons_main{current_date}.xlsx'

In [3]:
# import data
# The path is relative, so it resolves against the kernel's working directory.
file_path = '../data/report_2.xlsx'
# No sheet_name is passed, so pandas reads the first sheet of the workbook.
df = pd.read_excel(file_path)

***

In [None]:
# filter for individuals
# .copy() makes df_natural an independent frame. New columns are assigned to it
# further down; without the copy those writes target a slice of df, which
# pandas reports as SettingWithCopyWarning.
df_natural = df[df['Party Type'] == 'Natural Person'].copy()

In [4]:
# Preprocessing party name: lower-case it and strip punctuation
# (every character that is neither a word character nor whitespace).
# The substitution has to go through the .str accessor with regex=True.
# Series.replace(pattern, "") without regex=True only replaces cells whose
# entire value equals the pattern text, so it never removed any punctuation.
df_natural['Party Name_pre'] = (
    df_natural['Party Name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# Preprocessing 'Identification Document Number': lower-case it and strip
# punctuation (every character that is neither a word character nor whitespace).
# The substitution has to go through the .str accessor with regex=True.
# Series.replace(pattern, "") without regex=True only replaces cells whose
# entire value equals the pattern text, so it never changed anything.
# NOTE(review): the pattern keeps whitespace, although the original heading of
# this cell said "remove space" - confirm whether spaces should be stripped too.
df_natural['Identification Document Number_pre'] = (
    df_natural['Identification Document Number']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# Flag parties whose only role is 'Beneficiary'.
# Step 1: one row per Party ID holding the set of distinct roles on its rows.
bene_only = (
    df_natural
    .groupby('Party ID')['Relationship (Party Role)']
    .agg(set)
    .reset_index(name='role_set')
)

# Step 2: keep the parties whose role set is exactly {'Beneficiary'}.
is_bene_only = bene_only['role_set'] == {'Beneficiary'}
bene_only = bene_only[is_bene_only]
bene_only_list = bene_only['Party ID'].tolist()

# Step 3: True on every row that belongs to a beneficiary-only party.
df_natural['bene_only'] = df_natural['Party ID'].isin(bene_only_list)

In [None]:
# dates: parse 'Date of Birth' as YYYY-MM-DD; values that cannot be parsed
# become NaT instead of raising (errors='coerce').
dob_parsed = pd.to_datetime(df_natural['Date of Birth'], format='%Y-%m-%d', errors='coerce')
# Keep only the calendar date, dropping the time component.
df_natural['Date of Birth'] = dob_parsed.dt.date

***

In [5]:
# Working copy for this check. The two passes below write 'key' and
# 'unique_count' onto it in turn; each pass then filters into its own frame.
tab24 = df_natural.copy()

# NOTE(review): astype(str) turns missing values into literal text, so rows
# with missing fields still receive a key and can be grouped together -
# confirm that this is intended.

# 2.1 key = preprocessed party name + date of birth
tab24['key'] = tab24['Party Name_pre'].astype(str) + tab24['Date of Birth'].astype(str)

# number of distinct Party IDs that share each key
tab24['unique_count'] = tab24.groupby('key')['Party ID'].transform('nunique')

# Keep only keys shared by more than one Party ID. Boolean filtering returns a
# new frame, so tab24_1 keeps the 2.1 keys when tab24 is overwritten below.
tab24_1 = tab24[tab24['unique_count'] > 1]

# 2.2 key = preprocessed party name + ID document type + ID document number
# NOTE(review): this uses the raw 'Identification Document Number', not the
# '_pre' column - confirm that is intended.
tab24['key'] = (
    tab24['Party Name_pre'].astype(str)
    + tab24['Identification Document Type'].astype(str)
    + tab24['Identification Document Number'].astype(str)
)

# number of distinct Party IDs that share each key
tab24['unique_count'] = tab24.groupby('key')['Party ID'].transform('nunique')

# keep only keys shared by more than one Party ID
tab24_2 = tab24[tab24['unique_count'] > 1]

# combine: stack the rows found by both passes
tab24_columns = pd.concat([tab24_1, tab24_2])

In [6]:
# Operating roles: a duplicate group is reported only if it holds at least one.
party_roles = {
    'Account Holder',
    'Authorised Representative',
    'General Power of Attorney',
    'Limited Power of Attorney',
    'Settlor or Asset Contributor',
    'Founder',
    'Policy Owner',
    'Premium Payer',
    'Subscriber',
}

# For every duplicate group (key), collect the distinct
# [Relationship (Party Role)] values found on its rows into a set.
tab24_roles = (
    tab24_columns
    .groupby('key')['Relationship (Party Role)']
    .agg(set)
    .reset_index(name='role_set')
)
# The same role sets as a plain list, in tab24_roles row order.
role_set = tab24_roles['role_set'].tolist()

def check_roles(role_set, operating_roles=None):
    """Return the operating roles present in ``role_set``, or ``'discard'``.

    Parameters
    ----------
    role_set : set
        Roles collected for one duplicate group.
    operating_roles : iterable, optional
        Roles that count as operating roles. When omitted, the module-level
        ``party_roles`` set is used (looked up at call time), which is the
        behaviour existing one-argument callers rely on.

    Returns
    -------
    set or str
        The non-empty intersection of ``role_set`` and the operating roles,
        or the string ``'discard'`` when the group holds none of them.
    """
    if operating_roles is None:
        operating_roles = party_roles
    result = role_set.intersection(operating_roles)
    return result if result else 'discard'
    
# Tag each group with its operating roles (or the 'discard' marker), then split.
tab24_roles['check_roles'] = tab24_roles['role_set'].apply(check_roles)
is_discarded = tab24_roles['check_roles'] == 'discard'
tab24_error = tab24_roles[is_discarded]   # groups without any operating role
tab24_keep = tab24_roles[~is_discarded]   # groups with at least 1 operating role

# Keep only the rows of the combined table whose group is in the keep list.
tab24_keep_list = tab24_keep['key'].tolist()
tab24_columns['exist'] = tab24_columns['key'].isin(tab24_keep_list)
tab24_columns = tab24_columns[tab24_columns['exist']]

# Attach to every row the full role set of its duplicate group (looked up by key).
key_list = tab24_roles['key'].tolist()
roles_list = tab24_roles['role_set'].tolist()
roles_dict = dict(zip(key_list, roles_list))
tab24_columns['role_set'] = tab24_columns['key'].map(roles_dict)

# Label every remaining row with the inconsistency type being reported.
tab24_columns['Inconsistency'] = 'Duplicate Parties with at least 1 operating role'

# One row per (Party ID, key): drop_duplicates keeps the first occurrence.
# Then order the rows by key and Party ID.
tab24_columns = (
    tab24_columns
    .drop_duplicates(subset=['Party ID', 'key'])
    .sort_values(by=['key', 'Party ID'])
)

# make numerical index: number the distinct keys 1..N in sorted order
tab24_index_list = sorted(set(tab24_columns['key'].tolist()))
# enumerate() assigns the positions in a single pass. Calling list.index()
# for every key rescanned the list each time (quadratic in the number of keys).
tab24_index_dict = {key: position for position, key in enumerate(tab24_index_list, start=1)}
tab24_columns['Index'] = tab24_columns['key'].map(tab24_index_dict)

***

In [7]:
# Remove the helper columns that were only needed while building the report.
helper_columns = [
    'Party Name_pre',
    'Identification Document Number_pre',
    'bene_only',
    'unique_count',
    'exist',
]
tab24_columns = tab24_columns.drop(columns=helper_columns)

# Column order of the final report, one name per line.
new_index = [
    'Index',
    'key',
    'Relationship (Party Role)',
    'role_set',
    'Party ID',
    'Party Name',
    'Party Type',
    'Date of Birth',
    'Place of Birth',
    'Identification Document Type',
    'Identification Document Number',
    'Identification Document Issue Country',
    'Identification Document Expiry Date',
    'Identification Date of Issue',
    'Resident pass holder of',
    'Address Type',
    'Address - Country',
    'Nationality',
    'Booking Centre Officer',
    'Countries of Business / Income Source for this Party',
    'RM UID.1',
    'RM Name.1',
    'RM Location (Code)',
    'RM Team Name.1',
    'RM Market Name',
    'Is Public',
    'Vulnerable Client',
    'Accredited Investor (AI) - Status',
    'Override Expiry',
    'Override Expiry Reason',
    'Override Expiry Justification',
    'Date of Incorporation',
    'Business Registration No',
    'Country of Incorporation',
    'Booking Centre',
    'Portfolio Number',
    'Portfolio Name',
    'Current Status',
    'Relationship Type',
    'Relationship Sub-Type',
    'Category',
    'Main Portfolio',
    'Portfolio Creation Date',
    'RM UID',
    'RM Name',
    'RM Location Name',
    'RM Team Name',
    'Market Name',
    'Booking Center Officer Name',
    'Managed By EAM / EIA / Finder',
    'EAM / EIA Name',
    'EAM / EIA Code',
    'Finder Name',
    'Finder Code',
    'Collab Type',
    'Incoming Channel',
    'Collaboration Account Number',
    'Consolidated Performance CCY',
    'Asset Classification',
    'Service Model',
    'Staff Relationship',
    'Inconsistency',
]

# Select the report columns in new_index order. reindex adds an all-NaN column
# for any listed name that tab24_columns does not have.
# The '.1'-suffixed RM columns are then given 'Party RM ...' names.
party_rm_names = {
    'RM UID.1': 'Party RM UID',
    'RM Name.1': 'Party RM Name',
    'RM Team Name.1': 'Party RM Team Name',
}
output = tab24_columns.reindex(columns=new_index).rename(columns=party_rm_names)

In [8]:
# Write the report as CSV into the working directory, without the index column.
# NOTE(review): the dated `output_file` name built earlier in the notebook is
# not used by this call - confirm the fixed CSV file name is intended.
output.to_csv("duplicate party id for same client.csv", index=False)