In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import json
import csv

# KYC Preprocessing

The purpose of this notebook is to add occupation-specific indicators to the KYC data. Each indicator is a binary flag representing whether or not a client's occupation belongs to a specific subset of occupations. These subsets were determined using FINTRAC and Financial Crime Academy guidance on illegal wildlife trade, mainly:

Per the [Fintrac Operational Alert](https://fintrac-canafe.canada.ca/intel/operation/oai-wildlife-eng), one indicator is "An individual is the owner, operator, employee or associated with an industry that could be used to facilitate illegal wildlife trade (e.g., import/export of goods, fisheries wholesaler, pet store, freight company, animal control)." (**G**)

The [Financial Crime Academy](https://financialcrimeacademy.org/wildlife-trade-risk-indicators-financial/?fbclid=IwAR1XSw09Vtl4mjOOQj_eTFuqZ_GKqM-SPsCJwQKcyFb-XWU4O6nO8zBo3JU) adds that another "indicator relates to activity involving politically exposed persons and wealthy businessmen/women, particularly those with environmental, game, or forestry oversight or environmental or wildlife-related businesses." (**G**)

We have therefore come up with the following indicators based on KYC occupation data:
- `occ_wealth` 
    - binary 
    - 1 if the client's occupation involves frequent exposure to wealthy people, 0 otherwise.
- `occ_animal` 
    - binary 
    - 1 if the client's occupation involves working with animals, 0 otherwise.
- `occ_int` 
    - binary 
    - 1 if the client works in international trade, 0 otherwise.
- `occ_shipping`
    - binary
    - 1 if the client works in shipping/postal/cargo services, 0 otherwise.
    - *there are no examples of this in our data*

## Merging Occupation and Customer Data

In [2]:
# Load the raw KYC table and the occupation lookup, then attach the
# occupation-based indicator columns to every KYC row.
datapth = Path('../data/')
raw = pd.read_csv(datapth / 'raw' / 'kyc.csv')
occ = pd.read_csv(datapth / 'processed' / 'occupation_list.csv')

# Left join on `occupation` so every raw KYC row is kept
merged = pd.merge(raw, occ, how='left', on='occupation')

# Move `label` to the last column position; all other columns keep their order
cols = merged.columns.tolist()
label_pos = cols.index('label')
cols = cols[:label_pos] + cols[label_pos + 1:] + ['label']
merged = merged[cols]

## Merging Task 3 Data

In [3]:
# Work on a copy so `merged` itself stays unchanged for the later
# transaction-merging section, which copies it again.
kyc_df = merged.copy()

# Load the per-name metadata (a mapping of name -> metadata dict; the scoring
# cell below reads each entry's `case_name_score`). The path is built from
# `datapth` like every other file in this notebook, and the encoding is fixed
# so that parsing does not depend on the platform's default locale.
with open(datapth / 'processed' / 'names_metadata.json', 'r', encoding='utf-8') as f:
    names = json.load(f)

In [4]:
# Spot-check: show the KYC row(s) whose customer ID contains this value
is_target = kyc_df['cust_id'].str.contains('CUST82758793')
kyc_df.loc[is_target]

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label
0,JENNIFER WELLS,female,Architect,45.0,13.0,CUST82758793,0,0,0,0


In [5]:
# Build the `named_trafficker` feature: for every name in the metadata, find
# the clients whose (lower-cased) name contains it and split that name's score
# evenly between them. Clients matching no name keep the default 0.0. If a
# client matches several names, the last one processed wins.
kyc_df['named_trafficker'] = 0.0

# Lower-cased client names, kept as a standalone Series so no temporary
# column has to be added to (and later dropped from) `kyc_df`.
parsed_names = kyc_df['name'].str.lower()

for name, data in names.items():

    score = float(data['case_name_score'])

    # Literal substring match: `regex=False` stops characters such as '.'
    # in a name from acting as regex wildcards, and `na=False` makes missing
    # client names count as "no match" instead of producing a NaN mask.
    # The key is lower-cased because the client names were lower-cased too.
    is_match = parsed_names.str.contains(name.lower(), regex=False, na=False)

    # Number of matching clients
    n_matches = int(is_match.sum())

    # A name that matches nobody has nothing to distribute; skipping it also
    # avoids dividing by zero below.
    if n_matches == 0:
        continue

    # Distribute the score evenly across the matches, scaled down by 100
    # (presumably the score is a percentage -- TODO confirm).
    kyc_df.loc[is_match, 'named_trafficker'] = score / (100 * n_matches)

In [6]:
# Export
# Show a few randomly chosen rows as a sanity check before writing to disk
preview = kyc_df.sample(3)
display(preview)

# Persist the enriched KYC table without the DataFrame index
out_path = datapth / 'processed' / 'kyc.parquet'
kyc_df.to_parquet(out_path, index=False)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,named_trafficker
166332,DAWN COMBS,female,Hotelier,34.0,0.0,CUST10732782,0,0,0,0,0.0
98722,CYNTHIA HUMPHREY,female,Security Guard,43.0,1.0,CUST91891237,0,0,0,0,0.0
175440,ZHAO CHAO,female,Graphologist,36.0,12.0,CUST91609141,0,0,0,0,0.0


## Merging KYC and Transactional Data

In [7]:
def _attach_kyc(trxn_df, kyc, parties):
    """Left-join the KYC indicator columns onto a transaction table once per party.

    Parameters
    ----------
    trxn_df : pandas.DataFrame
        Transaction table containing a `cust_id_<party>` column for each
        entry in `parties`.
    kyc : pandas.DataFrame
        KYC table with a `cust_id` column; every column is suffixed with
        `_<party>` before joining.
    parties : iterable of str
        Party names (e.g. 'receiver', 'sender'), joined in the given order.

    Returns
    -------
    pandas.DataFrame
        `trxn_df` with the suffixed KYC columns appended for each party.
        Rows whose customer ID has no KYC match keep NaN in those columns
        (left join).
    """
    out = trxn_df
    for party in parties:
        out = out.merge(
            kyc.add_suffix(f'_{party}', axis=1),
            on=f'cust_id_{party}',
            how='left',
        )
    return out


# Start again from `merged`, so columns added to the earlier `kyc_df`
# (e.g. `named_trafficker`) are not carried into the transaction tables.
# Only the join key plus the indicator/label columns are kept.
kyc_df = merged.copy()
kyc_df = kyc_df.drop(['occupation', 'gender', 'name', 'age', 'tenure'], axis=1)

# Wire Transfer
wire_df = pd.read_csv(datapth / 'raw' / 'wire.csv')
wire_df = _attach_kyc(wire_df, kyc_df, ('receiver', 'sender'))
wire_df.to_parquet(datapth / 'processed' / 'wire.parquet', index=False)
display(wire_df.sample(3))

# Emt
emt_df = pd.read_csv(datapth / 'raw' / 'emt.csv')
emt_df = _attach_kyc(emt_df, kyc_df, ('receiver', 'sender'))
emt_df.to_parquet(datapth / 'processed' / 'emt.parquet', index=False)
display(emt_df.sample(3))

# Cash (single party per transaction, so a plain join on `cust_id`)
cash_df = pd.read_csv(datapth / 'raw' / 'cash.csv')
cash_df = cash_df.merge(kyc_df, on='cust_id', how='left')
cash_df.to_parquet(datapth / 'processed' / 'cash.parquet', index=False)
display(cash_df.sample(3))

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
60216,EXTERNAL267637,CUST47257879,NICHOLAS SCHULTZ,KIRSTEN RAY,5280.0,CA,CA,QTRE82973392,0.0,0.0,1.0,1.0,,,,
21131,EXTERNAL139781,CUST81244085,TIFFANY WALKER,DR.PHILLIP MCCARTHY,2804.5,CA,CA,HSHD21194531,0.0,0.0,0.0,1.0,,,,
67587,CUST93583129,CUST83447103,LUO NING,DR.MICHAEL AGUILAR,1960.0,CA,CA,ZZEA57214593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
83851,CUST48493273,EXTERNAL418853,MATTHEW REEVES,THOMAS RODRIGUEZ,,240.0,DSRS83508886,0,,,,,0.0,0.0,0.0,0.0
275819,EXTERNAL884231,CUST74813371,RYAN RODRIGUEZ,AMANDA MEDINA,,232.0,PCDU80506685,0,0.0,0.0,0.0,0.0,,,,
357668,EXTERNAL721015,CUST67595518,RACHEL VARGAS,DAVID GOLDEN,,260.0,WTHD97968094,0,0.0,0.0,0.0,0.0,,,,


Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label
158250,CUST49184208,5805,deposit,MJTO99178972,0,0,0,0
154098,CUST44067123,3665,withdrawal,KOQO31827357,0,0,0,0
155191,CUST55338360,6390,deposit,WTRU99755479,1,0,1,1
