In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import json

# KYC Preprocessing

The purpose of this notebook is to add occupation-specific indicators to the KYC data. Indicators are binary flags representing whether or not a client's occupation is part of a specific subset of occupations. These subsets were determined using Fintrac and financial crime recommendations. Mainly:

Per the [Fintrac Operational Alert](https://fintrac-canafe.canada.ca/intel/operation/oai-wildlife-eng), one indicator is "An individual is the owner, operator, employee or associated with an industry that could be used to facilitate illegal wildlife trade (e.g., import/export of goods, fisheries wholesaler, pet store, freight company, animal control)." (**G**)

The [Financial Crime Academy](https://financialcrimeacademy.org/wildlife-trade-risk-indicators-financial/?fbclid=IwAR1XSw09Vtl4mjOOQj_eTFuqZ_GKqM-SPsCJwQKcyFb-XWU4O6nO8zBo3JU) adds that another "indicator relates to activity involving politically exposed persons and wealthy businessmen/women, particularly those with environmental, game, or forestry oversight or environmental or wildlife-related businesses." (**G**)

We have therefore come up with the following indicators based on KYC occupation data:
- `occ_wealth` 
    - binary 
    - 1 if the client's occupation involves frequent exposure to wealthy people, 0 otherwise.
- `occ_animal` 
    - binary 
    - 1 if the client's occupation involves working with animals, 0 otherwise.
- `occ_int` 
    - binary 
    - 1 if the client works in international trade, 0 otherwise.
- `occ_shipping`
    - binary
    - 1 if the client works in shipping/postal/cargo services, 0 otherwise.
    - *there are no examples of this in our data*

## Merging Occupation and Customer Data

In [2]:
# Attach the occupation-based indicator columns to the raw KYC records.
DATAPATH = Path('../data/')
raw = pd.read_csv(DATAPATH / 'raw' / 'kyc.csv')
occ = pd.read_csv(DATAPATH / 'processed' / 'occupation_list.csv')

# Left join keeps every KYC row. `validate` makes pandas raise a MergeError if
# an `occupation` value is repeated in the lookup table, which would otherwise
# silently duplicate customer rows in `merged`.
merged = raw.merge(occ, how='left', on='occupation', validate='many_to_one')

# Re-order columns so that `label` comes last.
cols = [col for col in merged.columns if col != 'label'] + ['label']
merged = merged[cols]

## Getting All Customers

## Merging Task 3 Data

In [3]:
# Start the customer-level table from a separate (deep) copy of `merged`.
kyc_df = merged.copy(deep=True)

# Loading of the names metadata is currently disabled:
# with open('../data/processed/names_metadata.json', 'r') as f:
#     names = json.load(f)

In [4]:
# Disabled ad-hoc lookup of the rows for a single customer id:
# kyc_df.loc[kyc_df['cust_id'].str.contains('CUST82758793')]

In [5]:
# Disabled step: for each entry in `names`, spread its `case_name_score` over
# the customers whose lower-cased name contains that entry, storing the result
# in a `named_trafficker` column.
# NOTE(review) before re-enabling:
#   - `l` is 0 when an entry matches no customer, so `score/(100*l)` would
#     raise ZeroDivisionError;
#   - `str.contains` treats its pattern as a regular expression by default, so
#     entries containing regex metacharacters may not match literally;
#   - it depends on `names`, whose loading is also commented out.
# kyc_df['named_trafficker']=0.0
# kyc_df['parsed_name']=kyc_df['name'].str.lower()

# for name, data in names.items():

#     score = float(data['case_name_score'])
    
#     #get number of matches
#     l = len(kyc_df.loc[kyc_df['parsed_name'].str.contains(name)])
    
#     #distribute score
#     kyc_df.loc[kyc_df['parsed_name'].str.contains(name), 'named_trafficker'] = score/(100*l)    
    
# kyc_df.drop(columns=['parsed_name'], inplace=True)

In [7]:
# Show three randomly drawn rows as a sanity check, then write the
# customer-level table to parquet (without the index).
kyc_preview = kyc_df.sample(3)
display(kyc_preview)

kyc_out_path = DATAPATH / 'processed' / 'kyc.parquet'
kyc_df.to_parquet(kyc_out_path, index=False)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label
63251,BAIJU RAVEL,male,Retail Salesperson,26.0,4.0,CUST83352324,0,0,0,0
124238,CAROLYN RIVERA,other,Ichthyologist,39.0,5.0,CUST83627775,0,1,0,0
38506,SCOTT JENNINGS,male,Postal Worker,37.0,10.0,CUST99596443,0,0,1,0


## Merging KYC and Transactional Data

In [9]:
def _attach_kyc(trxn_df, kyc, roles=None):
    """Left-join the KYC columns onto a transaction table.

    Parameters
    ----------
    trxn_df : pd.DataFrame
        Transaction table. It must contain `cust_id` when `roles` is None,
        otherwise one `cust_id_<role>` column per role.
    kyc : pd.DataFrame
        Customer-level table keyed by `cust_id`.
    roles : iterable of str, optional
        Party roles (e.g. 'receiver', 'sender'). For each role the KYC columns
        are suffixed with `_<role>` and joined on `cust_id_<role>`, in the
        order given.

    Returns
    -------
    pd.DataFrame
        `trxn_df` with the KYC columns appended. Rows whose customer id is not
        present in `kyc` get missing values in those columns.

    Raises
    ------
    pandas.errors.MergeError
        If `cust_id` is not unique in `kyc`; without this check such
        duplicates would silently multiply transaction rows.
    """
    if roles is None:
        return trxn_df.merge(kyc, on='cust_id', how='left', validate='many_to_one')

    enriched = trxn_df
    for role in roles:
        enriched = enriched.merge(
            kyc.add_suffix(f'_{role}', axis=1),
            on=f'cust_id_{role}',
            how='left',
            validate='many_to_one',
        )
    return enriched


# Keep only the customer id, the occupation indicators and the label.
# NOTE: this rebinds `kyc_df` to a reduced frame; re-running the export cell
# after this one would write the reduced table to kyc.parquet.
kyc_df = merged.drop(columns=['occupation', 'gender', 'name', 'age', 'tenure'])

# Receiver columns are attached first, then sender columns.
PARTY_ROLES = ('receiver', 'sender')

# Wire Transfer
wire_df = _attach_kyc(pd.read_csv(DATAPATH / 'raw' / 'wire.csv'), kyc_df, roles=PARTY_ROLES)
wire_df.to_parquet(DATAPATH / 'processed' / 'wire.parquet', index=False)
display(wire_df.sample(3))

# Emt
emt_df = _attach_kyc(pd.read_csv(DATAPATH / 'raw' / 'emt.csv'), kyc_df, roles=PARTY_ROLES)
emt_df.to_parquet(DATAPATH / 'processed' / 'emt.parquet', index=False)
display(emt_df.sample(3))

# Cash (single party, joined directly on `cust_id`)
cash_df = _attach_kyc(pd.read_csv(DATAPATH / 'raw' / 'cash.csv'), kyc_df)
cash_df.to_parquet(DATAPATH / 'processed' / 'cash.parquet', index=False)
display(cash_df.sample(3))

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
25845,CUST68759034,CUST50001412,DAVID CARROLL,EMMANUEL MORIN,3930.0,CA,CA,ILID90074457,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9475,CUST17613186,CUST41718931,HUGUES-TIMOTHÉE BÉDARD,BRYAN ANDERSON,1950.0,CA,CA,COPA96049848,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
12821,CUST20215129,EXTERNAL776874,BHAMINI DUTT,BRIANNA HARRIS,3044.0,CA,CA,ZBBY52797636,,,,,0.0,0.0,0.0,1.0


Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender
447831,EXTERNAL458917,CUST35522952,CHRISTOPHER REED,DR.UMANG KATA,,201.5,QWON79108863,0,0.0,0.0,1.0,0.0,,,,
302027,EXTERNAL197281,CUST86724325,DR.MARCEL LACHANCE,TONY WILSON,,1520.0,FTPG74286728,0,0.0,0.0,0.0,1.0,,,,
210024,CUST26394675,EXTERNAL995248,RUBEN BELL,ADRIAN ESTES,,40.0,QYCX33749356,0,,,,,0.0,0.0,0.0,0.0


Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label
171088,CUST13760966,5885,deposit,PRJZ89275311,1,0,1,0
115151,CUST45205687,2590,withdrawal,PGSR76298392,0,0,0,0
132148,CUST18617152,12580,deposit,KFBA48145130,1,0,1,0
