In [1]:
import pandas as pd
import numpy as np

# Transaction Preprocessing

## E-transfer Transaction Features
- `e_at_risk`
    - General indicator
    - Probability that a message references a species at risk, or parts thereof
    - *could also be a binary flag*
    - *can look at CITES for this list*
- `e_cad_at_risk`
    - Export indicator
    - Probability that message references a species at risk in Canada, or parts thereof (bear, geese, etc.)
    - *could also be a binary flag*
    - *get animal list from fintrac doc and papers*
- `e_role`
    - General indicator
    - Probability that a message references an IWT related role (e.g. poacher, coordinator, supplier, breeder, trader)
    - *could also be a binary flag*
- `e_trad_med`
    - Import indicator
    - Probability that a message references traditional medicine
    - *could also be a binary flag*
 

*Do we want to separate these into TO and FROM?*

## Wire Transfer Transaction Features
- `w_to_country`
    - Import indicator
    - Binary flag
    - 1 if the wire transfer is to a jurisdiction of concern:
        - China
        - Hong Kong
        - South Africa
        - Australia
        - ...
- `w_from_country`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if from a jurisdiction of concern, 0 otherwise
- `w_external_to_animal`
    - Binary flag
    - 1 if from a jurisdiction of concern to an animal related business, 0 otherwise

## Cash Transaction Features
- `c_large_dep`
    - General indicator
    - Binary flag
    - 1 if large deposit from someone involved in international trade or wildlife-related business
    - *tbd what large means*
- `c_large_wthd`
    - General indicator
    - Binary flag
    - 1 if large withdrawal from someone involved in international trade or wildlife-related business
    - *tbd what large means*

*The financial crime academy has some additional specifications for these features*

## General Transaction Features
- `t_to_animal`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business
- `t_from_animal`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business
- `t_to_animal_large`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_from_animal_large`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_to_shipping`
    - Export indicator
    - Binary flag
    - 1 if the transaction is to someone in shipping/postal/cargo
    - *no label for this type of occupation yet*


In [1]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

In [3]:
# Read emt.csv, wire.csv, cash.csv and kyc.csv (in that order) from the raw
# data folder, one DataFrame per file.
emt_data, wire_data, cash_data, kyc_data = map(
    pd.read_csv,
    (
        '../data/raw/emt.csv',
        '../data/raw/wire.csv',
        '../data/raw/cash.csv',
        '../data/raw/kyc.csv',
    ),
)

In [4]:
# Bring the three transaction tables onto one column vocabulary
# (tx_amount, cust_id_sender/receiver, name_sender/receiver, ...) and tag
# every row with the channel it belongs to in a new `tx_type` column.
cash_data = (
    cash_data
    .rename(columns={'amount': 'tx_amount'})
    .assign(tx_type='cash')
)

emt_data = (
    emt_data
    .rename(columns={
        'emt value': 'tx_amount',
        'id sender': 'cust_id_sender',
        'id receiver': 'cust_id_receiver',
        'name sender': 'name_sender',
        'name receiver': 'name_receiver',
        'emt message': 'tx_message',
    })
    .assign(tx_type='emt')
)

wire_data = (
    wire_data
    .rename(columns={
        'wire value': 'tx_amount',
        'id sender': 'cust_id_sender',
        'id receiver': 'cust_id_receiver',
        'name sender': 'name_sender',
        'name receiver': 'name_receiver',
        'country sender': 'country_sender',
        'country receiver': 'country_receiver',
    })
    .assign(tx_type='wire')
)

# Build two views of every e-transfer and wire: one keyed on the sender
# (`direction == 'sender'`) and one keyed on the receiver
# (`direction == 'receiver'`). In each view the party's id/name become
# `cust_id` / `name`, and the other party's id/name columns are dropped.

# E-transfers. The country_* rename entries only take effect if emt_data
# carries such columns; rename() ignores keys that are absent.
emt_data_sender = (
    emt_data
    .rename(columns={
        'cust_id_sender': 'cust_id',
        'name_sender': 'name',
        'country_sender': 'country',
    })
    .assign(direction='sender')
    .drop(columns=['cust_id_receiver', 'name_receiver'])
)
emt_data_receiver = (
    emt_data
    .rename(columns={
        'cust_id_receiver': 'cust_id',
        'name_receiver': 'name',
        'country_receiver': 'country',
    })
    .assign(direction='receiver')
    .drop(columns=['cust_id_sender', 'name_sender'])
)

# Wires. Here `country` is the *counterparty's* country: the sender view keeps
# country_receiver (where the money goes) and the receiver view keeps
# country_sender (where the money comes from); the party's own country column
# is dropped.
wire_data_sender = (
    wire_data
    .rename(columns={
        'cust_id_sender': 'cust_id',
        'name_sender': 'name',
        'country_receiver': 'country',
    })
    .assign(direction='sender')
    .drop(columns=['cust_id_receiver', 'name_receiver', 'country_sender'])
)
wire_data_receiver = (
    wire_data
    .rename(columns={
        'cust_id_receiver': 'cust_id',
        'name_receiver': 'name',
        'country_sender': 'country',
    })
    .assign(direction='receiver')
    .drop(columns=['cust_id_sender', 'name_sender', 'country_receiver'])
)

# Drop rows where cust_id doesn't begin with 'CUST'.
# na=False makes a missing cust_id count as "does not begin with 'CUST'", so
# the row is dropped. Without it the mask contains NaN for those rows and
# boolean indexing raises ValueError instead of filtering.
emt_data_sender = emt_data_sender[emt_data_sender['cust_id'].str.startswith('CUST', na=False)]
emt_data_receiver = emt_data_receiver[emt_data_receiver['cust_id'].str.startswith('CUST', na=False)]
wire_data_sender = wire_data_sender[wire_data_sender['cust_id'].str.startswith('CUST', na=False)]
wire_data_receiver = wire_data_receiver[wire_data_receiver['cust_id'].str.startswith('CUST', na=False)]

# Stack the sender and receiver views back into a single table per channel
# (sender rows first, index renumbered from 0) and discard the `name` column.
emt_data = (
    pd.concat([emt_data_sender, emt_data_receiver], ignore_index=True)
    .drop(columns=['name'])
)
wire_data = (
    pd.concat([wire_data_sender, wire_data_receiver], ignore_index=True)
    .drop(columns=['name'])
)

In [6]:
# General transaction features: binary 0/1 flags added to emt_data, wire_data
# and cash_data.
#
# Row-wise conditions on pandas Series must be combined with `&`, with each
# comparison in its own parentheses. Python's `and` asks for the truth value of
# the whole Series and raises "ValueError: The truth value of a Series is
# ambiguous".
#
# NOTE(review): `occ_animal` and `occ_shipping` are not created in the cells
# visible here - presumably they come from a merge with kyc_data; confirm they
# exist on all three frames before running this cell.
# NOTE(review): for emt/wire, rows with direction == 'sender' feed the `t_to_*`
# flags and rows with direction == 'receiver' feed the `t_from_*` flags.
# Confirm whose occupation the occ_* columns describe (the row's cust_id or the
# counterparty) so that "to"/"from" are not inverted.

# - `t_to_animal`
#     - Import indicator
#     - Binary flag
#     - 1 if the transaction is to an animal related business
emt_data['t_to_animal'] = np.where(
    (emt_data['occ_animal'] == 1) & (emt_data['direction'] == 'sender'), 1, 0)
wire_data['t_to_animal'] = np.where(
    (wire_data['occ_animal'] == 1) & (wire_data['direction'] == 'sender'), 1, 0)
# cash_data has no direction in this cell, so only the occupation flag is used.
cash_data['t_to_animal'] = np.where(cash_data['occ_animal'] == 1, 1, 0)

# - `t_from_animal`
#     - *not listed in fintrac doc, but could be a useful flag as well*
#     - Binary flag
#     - 1 if the transaction is from an animal related business
emt_data['t_from_animal'] = np.where(
    (emt_data['occ_animal'] == 1) & (emt_data['direction'] == 'receiver'), 1, 0)
wire_data['t_from_animal'] = np.where(
    (wire_data['occ_animal'] == 1) & (wire_data['direction'] == 'receiver'), 1, 0)
cash_data['t_from_animal'] = 0  # always 0 for cash

# Dollar cut-off shared by both *_large flags below.
threshold = 1000 # TODO: look at distribution of tx_amount for animal related transactions

# - `t_to_animal_large`
#     - Import indicator
#     - Binary flag
#     - 1 if the transaction is to an animal related business and is above a
#       certain dollar amount, 0 otherwise
#     - *tbd the dollar amount*
emt_data['t_to_animal_large'] = np.where(
    (emt_data['occ_animal'] == 1)
    & (emt_data['direction'] == 'sender')
    & (emt_data['tx_amount'] > threshold),
    1, 0)
wire_data['t_to_animal_large'] = np.where(
    (wire_data['occ_animal'] == 1)
    & (wire_data['direction'] == 'sender')
    & (wire_data['tx_amount'] > threshold),
    1, 0)
cash_data['t_to_animal_large'] = np.where(
    (cash_data['occ_animal'] == 1) & (cash_data['tx_amount'] > threshold), 1, 0)

# - `t_from_animal_large`
#     - *not listed in fintrac doc, but could be a useful flag as well*
#     - Binary flag
#     - 1 if the transaction is from an animal related business and is above a
#       certain dollar amount, 0 otherwise
#     - *tbd the dollar amount*
emt_data['t_from_animal_large'] = np.where(
    (emt_data['occ_animal'] == 1)
    & (emt_data['direction'] == 'receiver')
    & (emt_data['tx_amount'] > threshold),
    1, 0)
wire_data['t_from_animal_large'] = np.where(
    (wire_data['occ_animal'] == 1)
    & (wire_data['direction'] == 'receiver')
    & (wire_data['tx_amount'] > threshold),
    1, 0)
cash_data['t_from_animal_large'] = 0  # always 0 for cash

# - `t_to_shipping`
#     - Export indicator
#     - Binary flag
#     - 1 if the transaction is to someone in shipping/postal/cargo
#     - *no label for this type of occupation yet*
emt_data['t_to_shipping'] = np.where(
    (emt_data['occ_shipping'] == 1) & (emt_data['direction'] == 'sender'), 1, 0)
wire_data['t_to_shipping'] = np.where(
    (wire_data['occ_shipping'] == 1) & (wire_data['direction'] == 'sender'), 1, 0)
cash_data['t_to_shipping'] = np.where(cash_data['occ_shipping'] == 1, 1, 0)

# - `t_from_shipping`
#     - *not listed in fintrac doc, but could be a useful flag as well*
#     - Binary flag
#     - 1 if the transaction is from someone in shipping/postal/cargo
#     - *no label for this type of occupation yet*
emt_data['t_from_shipping'] = np.where(
    (emt_data['occ_shipping'] == 1) & (emt_data['direction'] == 'receiver'), 1, 0)
wire_data['t_from_shipping'] = np.where(
    (wire_data['occ_shipping'] == 1) & (wire_data['direction'] == 'receiver'), 1, 0)
cash_data['t_from_shipping'] = 0  # always 0 for cash

Unnamed: 0,cust_id,tx_message,tx_amount,trxn_id,tx_type,direction
0,CUST26232205,for the bike u lent me,154.0,WFEZ76031047,emt,sender
1,CUST35533148,,518.0,XQJS86205330,emt,sender
2,CUST59096559,,46.0,WPXP45854083,emt,sender
3,CUST69049633,,570.0,OIRZ70883325,emt,sender
4,CUST27403977,,480.0,TRNT55099512,emt,sender
...,...,...,...,...,...,...
704566,CUST74979363,,119.0,USHN74907347,emt,receiver
704567,CUST68693554,,208.0,VXES44436032,emt,receiver
704568,CUST90504001,Fox racing motocross gear,150.0,LTUK21435620,emt,receiver
704569,CUST99824006,,270.0,OIRO35201076,emt,receiver
