## Party Data Validation

In [None]:
import pandas as pd
import numpy as np
from pandas import ExcelWriter
import time
from datetime import datetime
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# includes any pandas warnings the cells below would otherwise emit. Consider
# narrowing the filter to specific categories when debugging.
warnings.simplefilter("ignore")

In [None]:
# Date-stamped names for the two files this notebook writes into the current
# working directory: the Excel workbook with the findings and a small text
# file that records which input report was used.
current_date = time.strftime("%Y%m%d")  # local date, formatted YYYYMMDD
output_file = f"legal_entities_main_{current_date}.xlsx"
output_file_text = f"natural_persons_main_{current_date}.txt"

In [None]:
# get data source
# Ask the user, interactively, for the path of the "report 2" workbook.
# The answer is passed to pd.read_excel in the next cell and is also written
# to the companion text file so the run can be traced back to its input.
file_path = input("input file path of report 2: ")

In [None]:
# read data
# Load the workbook into a DataFrame using read_excel's default options.
# Later cells expect (at least) these columns: 'Party ID', 'Party Name',
# 'Party Type', 'Relationship (Party Role)', 'RM UID', 'RM Name', 'RM UID.1',
# 'RM Name.1', 'Date of Incorporation', 'Country of Incorporation',
# 'Business Registration No', 'Address Type' and 'Address - Country'.
df = pd.read_excel(file_path)

In [None]:
# create text file for links used
# Record the path of the input report next to the output workbook so the run
# can be traced back to its source. The file is overwritten on every run.
with open(output_file_text, 'w') as f:
    f.write('Report path:' + file_path)
# No explicit f.close() is needed: leaving the with-block closes the file.

## For non-individuals (parties that are not natural persons)

In [None]:
# Keep every party that is NOT a natural person (the opposite of "individuals").
# Rows whose 'Party Type' is missing also satisfy the != comparison and are kept.
# .copy() makes this an independent frame: the following cells add several
# columns to it, and those writes should never be treated as writes into a
# slice of df.
df_non_natural = df[df['Party Type'] != 'Natural Person'].copy()

### Preprocessing of Party Name & Business Registration No

In [None]:
def _normalise_text(values, drop):
    """Return *values* lower-cased with every literal string in *drop* removed.

    ``regex=False`` is passed explicitly so that "." always means a literal
    dot; the default of ``regex`` is not the same in every pandas release.
    Missing values stay missing.
    """
    cleaned = values
    for token in drop:
        cleaned = cleaned.str.replace(token, "", regex=False)
    return cleaned.str.lower()


# Normalised party name used as a matching key by the scenarios below:
# surrounding whitespace trimmed, then dots, spaces, tabs and commas removed,
# then lower-cased. The original 'Party Name' column is left untouched.
df_non_natural['Party Name_pre'] = _normalise_text(
    df_non_natural['Party Name'].str.strip(), [".", " ", "\t", ","])

# Normalised business registration number: dots, spaces and commas removed,
# then lower-cased.
# NOTE(review): the .str accessor yields NaN for cells that are not strings,
# so registration numbers that Excel stored as numbers would end up missing
# here -- confirm the column is read as text.
df_non_natural['Business Registration No_pre'] = _normalise_text(
    df_non_natural['Business Registration No'], [".", " ", ","])

# beneficiary only list
# Collect the set of roles seen for each Party ID and flag the parties whose
# only role is 'Beneficiary'.
bene_only = df_non_natural.groupby('Party ID')['Relationship (Party Role)'].agg(set).reset_index(name='role_set')
bene_only = bene_only[bene_only['role_set']=={'Beneficiary'}]
bene_only_list = bene_only['Party ID'].to_list()
df_non_natural['bene_only'] = df_non_natural['Party ID'].isin(bene_only_list)

# dates
# Parse as YYYY-MM-DD and keep only the date part; values that cannot be
# parsed become missing (errors='coerce') instead of raising.
df_non_natural['Date of Incorporation'] = pd.to_datetime(
    df_non_natural['Date of Incorporation'], format='%Y-%m-%d', errors='coerce').dt.date

In [None]:
# Columns that every output sheet carries.
# 'Batch' is typed in by the user and applied to all rows.
df_non_natural['Batch'] = input("Year, Quarter: (eg. format: 2023 Q1)")

# The tracking columns start out as empty strings.
for _tracking_col in ['Remarks', 'Action Required', 'Action Team',
                      'Status of Cleanup', 'Completion Date', 'Valid Exception']:
    df_non_natural[_tracking_col] = ""

### Scenario1: Identify records with Same Party Name, Same Date of Incorporation, Same Party RM UID but different Party ID

In [None]:
# Scenario 1: the same (normalised name, date of incorporation, party RM UID)
# combination appears under more than one Party ID.
tab1 = df_non_natural.copy()

# Matching key: the three fields rendered as text and glued together.
tab1['concat'] = (
    tab1['Party Name_pre'].astype(str)
    + tab1['Date of Incorporation'].astype(str)
    + tab1['RM UID.1'].astype(str)
)

tab1_unique_partyID = tab1.copy()
# Number of distinct Party IDs sharing each key.
tab1_unique_partyID['Unique Party ID Count'] = (
    tab1_unique_partyID.groupby('concat')['Party ID'].transform('nunique')
)
# Keep keys that map to more than one Party ID, then one row per Party ID.
shared_key = tab1_unique_partyID['Unique Party ID Count'] > 1
tab1_unique_partyID = tab1_unique_partyID[shared_key].drop_duplicates(subset='Party ID')

# Order by normalised name and label the finding.
tab1_test = tab1_unique_partyID.sort_values(by='Party Name_pre').assign(
    Inconsistency='Same Name & DOI & RM, diff Party ID')

### Scenario2: Relationship (Party Role) == “Account Holder” OR “Beneficial Owner”. Identify records with Same Party ID but different Portfolio RM UID


In [None]:
# Scenario 2 only considers rows whose party role is "Account Holder" or
# "Beneficial Owner".
wanted_roles = ['Account Holder', 'Beneficial Owner']
tab2_fil = df_non_natural[df_non_natural['Relationship (Party Role)'].isin(wanted_roles)]

# One row per distinct (Party ID, portfolio RM UID) pair.
tab2_fil = tab2_fil.drop_duplicates(subset=['Party ID', 'RM UID'])

# For each Party ID, count the remaining rows that carry a portfolio RM UID.
tab2_fil['Portfolio RM Count'] = tab2_fil.groupby('Party ID')['RM UID'].transform('count')

# Keep the Party IDs tied to more than one portfolio RM.
tab2_dup = tab2_fil[tab2_fil['Portfolio RM Count'] > 1]

# Order by normalised name and label the finding.
tab2_test = tab2_dup.sort_values(by='Party Name_pre').assign(
    Inconsistency='Party ID tagged to 2 portfolio RMs (Party Role = AH & BO only)')


### Scenario4: Identify records with same Date of Incorporation, same Business Registration No, same Country of Incorporation, but different Party Name

In [None]:
# Scenario 4: same date of incorporation, business registration number and
# country of incorporation, but more than one (normalised) party name.
tab4 = df_non_natural.copy()

# Matching key: DOI + normalised registration number + COI, as text.
tab4['concat'] = (
    tab4['Date of Incorporation'].astype(str)
    + tab4['Business Registration No_pre'].astype(str)
    + tab4['Country of Incorporation'].astype(str)
)

# Number of distinct normalised names per key.
tab4['Unique party name count'] = tab4.groupby('concat')['Party Name_pre'].transform('nunique')

# Keep keys with several names; sorting by key puts matching rows together.
several_names = tab4['Unique party name count'] > 1
tab4 = tab4[several_names].sort_values(by=['concat', 'Unique party name count'])

# One row per Party ID, labelled with the finding.
tab4_test = tab4.drop_duplicates(subset='Party ID').assign(
    Inconsistency='Same DOI, Biz Reg No. & COI, diff Party Name')

### Scenario5: Identify records with Same Party Name, Date of Incorporation but different Country of Incorporation

In [None]:
# Scenario 5: same normalised party name and date of incorporation, but more
# than one country of incorporation.
tab5_dup = df_non_natural.copy()

# Text key of name + DOI.
# NOTE(review): this column is built here but the grouping below uses the two
# raw columns instead of 'concat'; scenarios 7 and 8 group on the text key.
# Confirm which treatment of missing dates is intended.
tab5_dup['concat'] = (
    tab5_dup['Party Name_pre'].astype(str)
    + tab5_dup['Date of Incorporation'].astype(str)
)

# One row per Party ID before counting.
tab5_dup = tab5_dup.drop_duplicates(subset='Party ID')

# Number of distinct countries of incorporation per (DOI, name) group.
by_doi_and_name = tab5_dup.groupby(['Date of Incorporation', 'Party Name_pre'])
tab5_dup['Unique COI Count'] = by_doi_and_name['Country of Incorporation'].transform('nunique')

# Keep the groups with several countries and label the finding.
tab5_test = tab5_dup[tab5_dup['Unique COI Count'] > 1].assign(
    Inconsistency='Same Party Name & DOI, diff COI')


### Scenario6: Identify records with Same Party Name, Country of Incorporation but different Date of Incorporation

In [None]:
# Scenario 6: same normalised party name and country of incorporation, but
# more than one date of incorporation.
tab6 = df_non_natural.copy()

# Text key of name + COI.
tab6['concat'] = (
    tab6['Party Name_pre'].astype(str)
    + tab6['Country of Incorporation'].astype(str)
)

# The date column is turned into text in place before counting.
# NOTE(review): because of this, the rows this scenario contributes to the
# output carry the date as a string rather than a date object -- confirm
# this is wanted.
tab6['Date of Incorporation'] = tab6['Date of Incorporation'].astype(str)
tab6['Unique DOI Count'] = tab6.groupby('concat')['Date of Incorporation'].transform('nunique')

# Keep keys with several dates, one row per Party ID, labelled with the finding.
tab6_diff = tab6[tab6['Unique DOI Count'] > 1]
tab6_test = tab6_diff.drop_duplicates(subset='Party ID').assign(
    Inconsistency='Same Party Name & COI, diff DOI')

### Scenario7: Identify records with Address Type == Registered Business Address, Same Party Name, Date of Incorporation but different Address - Country

In [None]:
# Scenario 7: among registered business addresses, the same normalised party
# name and date of incorporation appear with more than one address country.
is_registered_address = df_non_natural['Address Type'] == 'Registered Business Address'
tab7_fil = df_non_natural[is_registered_address]

# Text key of name + DOI, and the number of distinct address countries per key.
tab7_fil['concat'] = (
    tab7_fil['Party Name_pre'].astype(str)
    + tab7_fil['Date of Incorporation'].astype(str)
)
tab7_fil['Unique Add Ctry Count'] = tab7_fil.groupby('concat')['Address - Country'].transform('nunique')

# Keep keys with several countries, one row per Party ID, labelled with the finding.
tab7_diff = tab7_fil[tab7_fil['Unique Add Ctry Count'] > 1]
tab7_test = tab7_diff.drop_duplicates(subset='Party ID').assign(
    Inconsistency='Same Name, DOI & Add Type, diff Add Ctry')


In [None]:
# extract columns for tab
#tab7_test = tab7_diff[['Party ID', 'Party Name', 'RM UID.1','RM Name.1', 'Date of Incorporation', 'Country of Incorporation','Address Type','Address - Country','Business Registration No']] # extract certain columns
#tab7_test.columns = ['Party ID', 'Party Name', 'Party RM UID','Party RM Name', 'Date of Incorporation', 'Country of Incorporation','Address Type','Address - Country','Business Registration No'] # rename columns

# add in error type col
#tab7_test['Inconsistency'] = 'Same Name, DOI & Add Type, diff Add Ctry'
#tab7_test

### Scenario8: Identify records with Same Party Name, Date of Incorporation but different Business Registration No

In [None]:
# Scenario 8: same normalised party name and date of incorporation, but more
# than one (normalised) business registration number.
tab8_dup = df_non_natural.copy()

# Text key of name + DOI, and the number of distinct registration numbers per key.
tab8_dup['concat'] = (
    tab8_dup['Party Name_pre'].astype(str)
    + tab8_dup['Date of Incorporation'].astype(str)
)
tab8_dup['Unique Biz Reg Count'] = (
    tab8_dup.groupby('concat')['Business Registration No_pre'].transform('nunique')
)

# Keep keys with several registration numbers, one row per Party ID.
several_reg_nos = tab8_dup['Unique Biz Reg Count'] > 1
tab8_diff = tab8_dup[several_reg_nos].drop_duplicates(subset='Party ID')

# Order by normalised name and label the finding.
tab8_test = tab8_diff.sort_values(by='Party Name_pre').assign(
    Inconsistency='Same Name Same DOI, Diff Biz Reg No.')

In [None]:
# extract columns for tab
#tab8_test = tab8_diff[['Party ID', 'Party Name', 'RM UID.1','RM Name.1', 'Date of Incorporation', 'Country of Incorporation','Address Type','Address - Country','Business Registration No']] # extract certain columns
#tab8_test.columns = ['Party ID', 'Party Name', 'Party RM UID','Party RM Name', 'Date of Incorporation', 'Country of Incorporation','Address Type','Address - Country','Business Registration No'] # rename columns

# add in error type col
#tab8_test['Inconsistency'] = 'Same Name Same DOI, Diff Biz Reg No.'
#tab8_test

### Scenario9: Invalid Biz Reg No.

In [None]:
# Scenario 9: placeholder / empty business registration numbers.
# Rows whose party role is "Beneficiary" are left out of this check.
not_beneficiary = df_non_natural['Relationship (Party Role)'] != 'Beneficiary'
tab9_fil = df_non_natural[not_beneficiary]

# Values treated as "no real registration number". Matching is exact and
# case-sensitive against the raw column, so new placeholder spellings have to
# be added to this list by hand.
invalid_bizreg_values = [
    '[]', '()', np.nan, 'NA in iCare', 'N.A In Icare', 'n/a', 'na', 'N/A',
    'N.A.', 'N.A', 'indeterminata', 'Indeterminata', ' ', '', '.', '-', '0',
    '0000000', '000000', 'not available', 'not applicable', 'NIL', 'xx',
    'not', 'XX', 'Not available',
]
tab9_fil['invalid_bizregno'] = tab9_fil['Business Registration No'].isin(invalid_bizreg_values)

# Keep the flagged rows, one per Party ID, labelled with the finding.
tab9 = tab9_fil[tab9_fil['invalid_bizregno']]
tab9_test = tab9.drop_duplicates(subset='Party ID').assign(
    Inconsistency='Invalid Biz Reg No.')


### Scenario10: Invalid Country of Incorp

In [None]:
# Scenario 10: placeholder / empty country of incorporation.
# Rows whose party role is "Beneficiary" are left out of this check.
not_beneficiary = df_non_natural['Relationship (Party Role)'] != 'Beneficiary'
tab10_fil = df_non_natural[not_beneficiary]

# Values treated as "no real country". Matching is exact and case-sensitive.
# Per the original author's note, hits are expected to come from legacy
# accounts only.
invalid_country_values = [
    '[]', '()', np.nan, 'NA in iCare', 'N.A In Icare', 'n/a', 'NA', 'N/A',
    'N.A.', 'N.A', 'indeterminata', 'Indeterminata', ' ', '', '.', '-', '0',
    '0000000', '000000', 'not available', 'not applicable', 'NIL', 'xx',
    'not', 'XX', 'Not available', 'NBD/1717',
]
tab10_fil['invalid_ctry'] = tab10_fil['Country of Incorporation'].isin(invalid_country_values)

# Keep every flagged row (no de-duplication here) and label the finding.
tab10_test = tab10_fil[tab10_fil['invalid_ctry']].assign(
    Inconsistency='Invalid Country of Incorp')


### Scenario11: Invalid Address-Country

In [None]:
# Scenario 11: placeholder / empty address country.
# Open question from the original author: this check may be a candidate for
# decommissioning if the field is mandatory -- to be confirmed.
# Rows whose party role is "Beneficiary" are left out of this check.
not_beneficiary = df_non_natural['Relationship (Party Role)'] != 'Beneficiary'
tab11_fil = df_non_natural[not_beneficiary]

# Values treated as "no real country". Matching is exact and case-sensitive.
# Per the original author's note, hits are expected to come from legacy
# accounts only.
invalid_address_values = [
    '[]', '()', np.nan, 'NA in iCare', 'N.A In Icare', 'n/a', 'NA', 'N/A',
    'N.A.', 'N.A', 'indeterminata', 'Indeterminata', ' ', '', '.', '-', '0',
    '0000000', '000000', 'not available', 'not applicable', 'NIL', 'xx',
    'not', 'XX', 'Not available', 'NBD/1717',
]
tab11_fil['invalid_add'] = tab11_fil['Address - Country'].isin(invalid_address_values)

# Keep every flagged row (no de-duplication here) and label the finding.
tab11_test = tab11_fil[tab11_fil['invalid_add']].assign(
    Inconsistency='Invalid Address-Country')


### Merge for output

In [None]:
# Scenario outputs grouped by the sheet they end up on.
df_1 = [tab1_test, tab5_test, tab6_test, tab7_test, tab8_test]
df_2 = [tab2_test, tab9_test, tab10_test, tab11_test]

# solution 1
# Stack scenarios 1, 5, 6, 7 and 8 and order the rows by normalised name,
# then finding, then Party ID.
result_1 = pd.concat(df_1, join='outer', axis=0).sort_values(
    by=['Party Name_pre', 'Inconsistency', 'Party ID'])

# Columns written to the sheet, in output order.
sheet1_columns = [
    'Batch', 'Inconsistency', 'Remarks', 'Action Required', 'Action Team',
    'Status of Cleanup', 'Completion Date', 'Valid Exception',
    'Party ID', 'Party Name', 'Party Type', 'RM UID.1', 'RM Name.1',
    'Date of Incorporation', 'Country of Incorporation', 'Address Type',
    'Address - Country', 'Business Registration No', 'bene_only',
]
# The ".1" RM columns are shown to the reader as the party RM.
result_1 = result_1[sheet1_columns].rename(columns={
    'RM UID.1': 'Party RM UID',
    'RM Name.1': 'Party RM Name',
})


#display(result_1)

# solution 2
# Stack scenarios 2, 9, 10 and 11 and order the rows by normalised name,
# then finding, then Party ID.
result_2 = pd.concat(df_2, join='outer', axis=0).sort_values(
    by=['Party Name_pre', 'Inconsistency', 'Party ID'])

# Columns written to the sheet, in output order.
sheet2_columns = [
    'Batch', 'Inconsistency', 'Remarks', 'Action Required', 'Action Team',
    'Status of Cleanup', 'Completion Date', 'Valid Exception',
    'Party ID', 'Relationship (Party Role)', 'Party Name', 'Party Type',
    'RM UID.1', 'RM Name.1', 'RM UID', 'RM Name',
    'Business Registration No', 'Country of Incorporation', 'Address Type',
    'Address - Country', 'bene_only',
]
# Reader-facing headers: the ".1" RM columns are the party RM, the plain RM
# columns are the portfolio RM.
result_2 = result_2[sheet2_columns].rename(columns={
    'Relationship (Party Role)': 'Party Role',
    'RM UID.1': 'Party RM UID',
    'RM Name.1': 'Party RM Name',
    'RM UID': 'Portfolio RM UID',
    'RM Name': 'Portfolio RM Name',
})



# solution 3
# Scenario 4 gets a sheet of its own. Columns written, in output order:
sheet3_columns = [
    'Batch', 'Inconsistency', 'Remarks', 'Action Required', 'Action Team',
    'Status of Cleanup', 'Completion Date', 'Valid Exception',
    'Party ID', 'Party Name', 'RM UID.1', 'RM Name.1',
    'Date of Incorporation', 'Business Registration No',
    'Country of Incorporation', 'bene_only',
]
# The ".1" RM columns are shown to the reader as the party RM.
result_3 = tab4_test[sheet3_columns].rename(columns={
    'RM UID.1': 'Party RM UID',
    'RM Name.1': 'Party RM Name',
})
# Sort so that parties sharing the same static details sit next to each other.
result_3 = result_3.sort_values(by=[
    'Date of Incorporation', 'Country of Incorporation',
    'Business Registration No', 'Inconsistency', 'Party ID',
])
#display(result_3)



# Write the three result frames to one workbook, one sheet each.
# The writer is used as a context manager so the workbook is saved and the
# file handle released when the block ends, even if a to_excel call raises.
# This also avoids ExcelWriter.save(), which newer pandas releases no longer
# provide. sheet_name is passed by keyword rather than by position.
with ExcelWriter(output_file, mode='w') as writer:
    result_1.to_excel(writer, sheet_name='Same_name_diff_static_info', index=False)  # scenarios 1, 5, 6, 7, 8
    result_2.to_excel(writer, sheet_name='incomplete_fields_or_2RM', index=False)  # scenarios 2, 9, 10, 11
    result_3.to_excel(writer, sheet_name='Same_details_diff_name', index=False)  # scenario 4

In [None]:
# Signal to the person running the notebook that the last cell has been reached.
print("Completed")