In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import json
import csv

# KYC Preprocessing

The purpose of this notebook is to add occupation-specific indicators to the KYC data. Indicators are binary flags representing whether or not a client's occupation belongs to a specific subset of occupations. These subsets were determined mainly from FINTRAC and Financial Crime Academy recommendations:

Per the [Fintrac Operational Alert](https://fintrac-canafe.canada.ca/intel/operation/oai-wildlife-eng), one indicator is "An individual is the owner, operator, employee or associated with an industry that could be used to facilitate illegal wildlife trade (e.g., import/export of goods, fisheries wholesaler, pet store, freight company, animal control)." (**G**)

The [Financial Crime Academy](https://financialcrimeacademy.org/wildlife-trade-risk-indicators-financial/?fbclid=IwAR1XSw09Vtl4mjOOQj_eTFuqZ_GKqM-SPsCJwQKcyFb-XWU4O6nO8zBo3JU) adds that another "indicator relates to activity involving politically exposed persons and wealthy businessmen/women, particularly those with environmental, game, or forestry oversight or environmental or wildlife-related businesses." (**G**)

We have therefore come up with the following indicators based on KYC occupation data:
- `occ_wealth` 
    - binary 
    - 1 if the client's occupation involves frequent exposure to wealthy people, 0 otherwise.
- `occ_animal` 
    - binary 
    - 1 if the client's occupation involves working with animals, 0 otherwise.
- `occ_int` 
    - binary 
    - 1 if the client works in international trade, 0 otherwise.
- `occ_shipping`
    - binary
    - 1 if the client works in shipping/postal/cargo services, 0 otherwise.
    - *there are no examples of this in our data*

## Merging Occupation and Customer Data

In [2]:
# Attach the occupation-based indicator columns to the raw KYC records.
datapth = Path('../data/')
raw = pd.read_csv(datapth / 'raw' / 'kyc.csv')
occ = pd.read_csv(datapth / 'processed' / 'occupation_list.csv')

# Left join so every raw KYC row is kept, whether or not its occupation
# appears in the occupation list.
merged = raw.merge(occ, how='left', on='occupation')

# Move the `label` column to the last position; all other columns keep
# their relative order.
cols = merged.columns.tolist()
label_pos = cols.index('label')
cols = cols[:label_pos] + cols[label_pos + 1:] + ['label']
merged = merged[cols]

## Merging Task 3 Data

In [3]:
# Work on a copy so the occupation-merged frame stays untouched for later cells.
kyc_df = merged.copy()

# Load the per-name metadata (a mapping of name -> record; the scoring cell
# reads each record's `case_name_score`).
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which can mis-decode non-ASCII names on some systems.
with open(datapth / 'processed' / 'names_metadata.json', 'r', encoding='utf-8') as f:
    names = json.load(f)

In [4]:
# Spot-check: show the row(s) whose customer id contains this id.
is_target = kyc_df['cust_id'].str.contains('CUST82758793')
kyc_df[is_target]

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label
0,JENNIFER WELLS,female,Architect,45.0,13.0,CUST82758793,0,0,0,0


In [5]:
# Build the `named_trafficker` feature.
# For each entry in `names`, its `case_name_score` (scaled by 1/100) is split
# evenly across every customer whose lower-cased name contains the entry's key.
kyc_df['named_trafficker'] = 0.0

# Lower-cased names are kept in a local Series rather than a temporary column,
# so there is nothing to drop from `kyc_df` afterwards.
parsed_names = kyc_df['name'].str.lower()

for name, data in names.items():
    score = float(data['case_name_score'])

    # Compute the match mask once per name.
    # regex=False: the key is matched literally, so characters such as '.' or
    #              '(' in a name are not interpreted as regex syntax.
    # na=False:    customers with a missing name never match (a NaN in the mask
    #              would make the .loc assignment below fail).
    is_match = parsed_names.str.contains(name.lower(), regex=False, na=False)
    n_matches = int(is_match.sum())

    # No customer carries this name: there is nothing to distribute, and
    # dividing by zero matches would raise ZeroDivisionError.
    if n_matches == 0:
        continue

    # Distribute the score. This is an assignment, not an accumulation: if a
    # customer matches several entries, the last one processed wins.
    kyc_df.loc[is_match, 'named_trafficker'] = score / (100 * n_matches)

In [6]:
# Preview three random rows, then write the enriched KYC table to parquet
# without the index.
kyc_out = datapth / 'processed' / 'kyc.parquet'
display(kyc_df.sample(n=3))
kyc_df.to_parquet(kyc_out, index=False)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,named_trafficker
16811,LARRY WALKER,male,Librarian,55.0,11.0,CUST89740513,0,0,0,0,0.0
40473,WESLEY SCHAEFER,male,Librarian Assistant,55.0,6.0,CUST32346578,0,0,0,0,0.0
112977,JOS VILLALOBOS AREVALO,female,Property Manager,28.0,10.0,CUST58277635,0,0,0,0,0.0


## Merging KYC and Transactional Data

In [7]:
def _attach_kyc_flags(trxn_df, kyc_flags):
    """Left-join the KYC columns onto a two-party transaction table.

    The join is done twice: first on `cust_id_receiver` (KYC columns suffixed
    `_receiver`), then on `cust_id_sender` (KYC columns suffixed `_sender`).
    Transactions whose party id is absent from `kyc_flags` are kept, with NaN
    in the corresponding KYC columns.
    """
    joins = (
        ('_receiver', 'cust_id_receiver'),
        ('_sender', 'cust_id_sender'),
    )
    for suffix, key in joins:
        trxn_df = trxn_df.merge(kyc_flags.add_suffix(suffix, axis=1), on=key, how='left')
    return trxn_df


# Start again from the occupation-merged frame and drop the descriptive
# columns that are not joined onto the transaction tables.
# `drop` returns a new frame, so `merged` itself is not modified.
kyc_df = merged.drop(columns=['occupation', 'gender', 'name', 'age', 'tenure'])

# Wire Transfer
wire_df = _attach_kyc_flags(pd.read_csv(datapth / 'raw' / 'wire.csv'), kyc_df)
wire_df.to_parquet(datapth / 'processed' / 'wire.parquet', index=False)
display(wire_df.sample(3))

# Emt
emt_df = _attach_kyc_flags(pd.read_csv(datapth / 'raw' / 'emt.csv'), kyc_df)
emt_df.to_parquet(datapth / 'processed' / 'emt.parquet', index=False)
display(emt_df.sample(3))

# Cash (single-party table: one direct join on `cust_id`, no suffixes)
cash_df = pd.read_csv(datapth / 'raw' / 'cash.csv').merge(kyc_df, on='cust_id', how='left')
cash_df.to_parquet(datapth / 'processed' / 'cash.parquet', index=False)
display(cash_df.sample(3))

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
57629,EXTERNAL754132,CUST87347462,KARA WILEY,MARIE GÉLINAS,1107.5,CA,CA,YWHV83356547,0.0,0.0,0.0,0.0,,,,
63150,EXTERNAL156069,CUST65894188,PAUL WILLIAMS,BARRY TURNER,1220.0,CA,CA,BGIP35021985,0.0,0.0,0.0,0.0,,,,
44348,CUST30961757,CUST83907021,BENJAMIN LÉVESQUE,DR.RICKY TUCKER,1035.5,CA,CA,GBAW53753561,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
88378,CUST15121592,CUST24763722,LAUREN BUTLER,BENJAMIN CLARK,,159.5,UOMI80762145,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
353796,CUST97642545,EXTERNAL490081,ISAIAH BARR,JAMIE SMITH,,768.0,BEJY16697884,0,,,,,0.0,1.0,0.0,0.0
250861,CUST36347221,CUST65411836,VALERIE BOYD,KAYLA SCHMIDT,,286.0,FJZE39792673,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label
149789,CUST28550596,19650,deposit,REUA11692807,1,0,0,1
21287,CUST92659507,4965,deposit,ZOLC20994150,0,0,1,1
92266,CUST87648085,960,withdrawal,BZMW74490478,0,0,0,0
