In [11]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import regex

# Transaction Preprocessing

## E-transfer Transaction Features
- `e_at_risk`
    - General indicator
    - Binary flag: 1 if message references a species at risk, or parts thereof, 0 o.w.
- `e_role`
    - General indicator
    - Binary flag: 1 if message references an IWT role (e.g. poacher, breeder), 0 o.w.
    - EDA Note: regex search for these words did not yield anything, nor did semantic search for poacher, nor semantic search for breeder
- `e_trad_med`
    - Import indicator
    - Binary flag: 1 if message references traditional medicine terms (e.g. animal or plant ingredients listed in the source linked below), 0 o.w.
    - EDA Note: nothing found during EDA semantic search.
    - *could also be a binary flag*
    - Taken from [Canadian animal/plant ingredients for traditional medicine](https://www.canada.ca/en/environment-climate-change/services/convention-international-trade-endangered-species/publications/animal-plant-ingredients-traditional-medicine.html#_03)

In [18]:
# Location of the processed e-transfer table.
DATAPATH = Path('../data')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')

# Load the table and replace missing messages with empty strings.
emt_df = (
    pd.read_parquet(EMTPATH)
    .assign(trxn_message=lambda frame: frame['trxn_message'].fillna(''))
)

In [19]:
def classify_message(messages: pd.Series, positives: list, negatives=None):
    """Flags messages that match positive flags and don't match negative flags

    Matching is case-insensitive. Each positive flag is wrapped in word
    boundaries, so it has to match as a whole word/phrase; negative flags
    may match anywhere in the message. Missing messages never match.

    Blank flags (e.g. empty lines read from a flag file) are ignored: an empty
    alternative would otherwise match practically every message.

    Args:
        messages: a pandas series containing e-transfer messages
        positives: a list of regex flags to match
        negatives: (optional) a list of regex flags that result in non-matches

    Returns:
        A pandas series containing {0,1}, indexed like `messages`
    """
    positive_terms = [p for p in positives if str(p).strip()]
    negative_terms = [n for n in (negatives or []) if str(n).strip()]

    # Nothing to search for: flag nothing, instead of building the empty
    # pattern '' that matches every message.
    if not positive_terms:
        return pd.Series(0, index=messages.index, name=messages.name, dtype=int)

    # The non-capturing group keeps both \b anchors attached to the whole flag
    # even when the flag itself contains an alternation such as 'tusk|horn'.
    positive_pattern = '|'.join(f'\\b(?:{p})\\b' for p in positive_terms)
    classification = messages.str.contains(positive_pattern, case=False, na=False)

    if negative_terms:
        negative_pattern = '|'.join(f'{n}' for n in negative_terms)
        is_excluded = messages.str.contains(negative_pattern, case=False, na=False)
        classification = classification & ~is_excluded

    return classification.astype(int)

In [21]:
# Classifying messages related to animal trafficking
# Each flag file is read as one regex flag per line.
positive_flags = Path('./flags/animal_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/animal_neg.txt').read_text().splitlines()

#Labelling data
emt_df['e_at_risk'] = classify_message(
    messages=emt_df['trxn_message'],
    positives=positive_flags,
    negatives=negative_flags,
)

In [22]:
# Classifying messages containing IWT roles
# Each flag file is read as one regex flag per line.
positive_flags = Path('./flags/role_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/role_neg.txt').read_text().splitlines()

#Labelling data
emt_df['e_role'] = classify_message(
    messages=emt_df['trxn_message'],
    positives=positive_flags,
    negatives=negative_flags,
)

In [23]:
# Classifying messages containing traditional medicine flags
# Each flag file is read as one regex flag per line.
positive_flags = Path('./flags/med_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/med_neg.txt').read_text().splitlines()

#Labelling data
emt_df['e_trad_med'] = classify_message(
    messages=emt_df['trxn_message'],
    positives=positive_flags,
    negatives=negative_flags,
)

In [24]:
# Write the labelled e-transfers back to the file they were loaded from
# (the row index is not stored), then display three random rows.
emt_df.to_parquet(path=EMTPATH, index=False)
emt_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,...,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,e_agg,g_agg,score
447715,CUST87325435,CUST77864856,TIMOTHY TODD,THERESA DAVIS,,271.0,WFWN25057576,1.0,0.0,1.0,...,emt,0,0,0,0,1,0,0,0.5,0.5
277,EXTERNAL460693,CUST18777827,MICHAEL KIM,MX. LORI LOPEZ,,80.0,LTFA10333792,0.0,0.0,0.0,...,emt,0,0,0,0,0,0,0,0.0,0.0
220236,EXTERNAL759465,CUST40076474,SHARI JOYCE,MEGAN BRYANT,,212.0,HZRI51773644,0.0,0.0,0.0,...,emt,0,0,0,0,0,0,0,0.0,0.0


## Wire Transfer Transaction Features
- `w_to_country`
    - Import indicator
    - Binary flag
    - 1 if the wire transfer is to a jurisdiction of concern:
        - China
        - Hong Kong
        - South Africa
        - Australia
        - ...
- `w_from_country`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if from a jurisdiction of concern, 0 otherwise
- `w_external_to_animal`
    - Binary flag
    - 1 if the sender is outside Canada (`country_sender != 'CA'`) and the receiver is an animal related business, 0 otherwise

In [12]:
WIREPATH = DATAPATH / 'processed' / 'wire.parquet'
KYCPATH = DATAPATH / 'processed' / 'kyc.parquet'
EMTPATH = DATAPATH / 'processed' / 'emt.parquet'

# Jurisdictions of concern named in the feature notes for `w_to_country`:
# China, Hong Kong, South Africa, Australia.
# NOTE(review): assumes the country columns hold two-letter codes for every
# country (the sample rows show 'CA' and 'AU') - confirm 'HK' and 'ZA' are
# encoded the same way in the wire data.
SUSCOUNTRIES = ['CN', 'HK', 'ZA', 'AU']

In [13]:
#Load Data
# kyc_df is loaded here but is not read by the wire features in this cell.
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occ_animal']]

wire_df = pd.read_parquet(WIREPATH)

#w_to_country
# 1 when the receiving country is in SUSCOUNTRIES; missing countries give 0.
wire_df['w_to_country'] = wire_df['country_receiver'].isin(SUSCOUNTRIES).astype(int)

#w_from_country
# 1 when the sending country is in SUSCOUNTRIES; missing countries give 0.
wire_df['w_from_country'] = wire_df['country_sender'].isin(SUSCOUNTRIES).astype(int)

#w_external_to_animal
# Vectorised form of the row-wise test: the sender country is anything other
# than 'CA' and the receiver is flagged as an animal-related occupation.
# NOTE(review): a missing country_sender compares as "not 'CA'" and therefore
# counts as external - confirm that is intended.
sender_is_external = wire_df['country_sender'] != 'CA'
receiver_is_animal = wire_df['occ_animal_receiver'] == 1
wire_df['w_external_to_animal'] = (sender_is_external & receiver_is_animal).astype(int)

In [14]:
# Write the wire transfers, including the new w_* flags, back to the file
# they were loaded from (the row index is not stored), then display three
# random rows.
wire_df.to_parquet(path=WIREPATH, index=False)
wire_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,w_to_country,w_from_country,w_external_to_animal
37141,CUST12773804,CUST65989424,BRENT THOMPSON,SUN GUI ZHEN,1072.0,CA,CA,WQHQ49990011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
26040,CUST56627949,EXTERNAL816843,JESSICA LEONARD,ERICA RODRIGUEZ,10118.0,CA,AU,CLBC72453505,,,,,0.0,0.0,0.0,0.0,1,0,0
51524,CUST57073542,CUST58945465,KANG JIAN,PATRICK TANGUAY-PILON,2017.0,CA,CA,KLBA27666770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0


## Cash Transaction Features
- `c_large`
    - General indicator
    - Binary flag
    - 1 if a deposit or withdrawal is an outlier (above Q3+1.5*IQR)
    - To determine outliers, data is grouped by occupation and by transaction type (deposit/withdrawal)

In [15]:
# Locations of the processed cash and KYC tables.
DATAPATH = Path('../data/')
CASHPATH = DATAPATH.joinpath('processed', 'cash.parquet')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')

# Multiplier k in the outlier fence Q3 + k*IQR: transaction amounts above the
# fence are classified as outliers.
UPPERBOUND = 1.5

In [16]:
#Load Data
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occupation']]

# The notebook later writes cash_df back to CASHPATH with the columns added
# here. Dropping them first keeps this cell re-runnable: otherwise the merge
# would produce occupation_x/occupation_y and the groupby below would fail.
cash_df = pd.read_parquet(CASHPATH).drop(columns=['occupation', 'c_large'], errors='ignore')
cash_df = cash_df.merge(kyc_df, on='cust_id', how='left')

#Get IQR for Outlier Test
# Quartiles are computed per (transaction type, occupation) group.
amounts_by_group = cash_df.groupby(['type', 'occupation'])['trxn_amount']
q3 = amounts_by_group.quantile(0.75)
q1 = amounts_by_group.quantile(0.25)
iqr = q3-q1
fence = q3+UPPERBOUND*iqr #upper bound for outlier tests

#Test outliers
# Look up each row's group fence with a join instead of a row-wise apply.
# Rows without a matching group (e.g. missing occupation) get a NaN fence,
# which compares as False, so they are labelled 0 instead of raising KeyError.
row_fence = cash_df.join(fence.rename('_fence'), on=['type', 'occupation'])['_fence']
cash_df['c_large'] = (cash_df['trxn_amount'] > row_fence).astype(int)
print(f'Classified {cash_df.c_large.sum()} of {cash_df.c_large.count()} as outliers')

Classified 9774 of 212532 as outliers


In [17]:
# Write the cash transactions, including occupation and c_large, back to the
# file they were loaded from (the row index is not stored), then display
# three random rows.
cash_df.to_parquet(path=CASHPATH, index=False)
cash_df.sample(n=3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
150217,CUST21200622,6195,withdrawal,MGTW69411361,0,0,0,0,Property Manager,0
42736,CUST96302373,2360,deposit,VSQV71199035,0,0,0,0,Tattoo Artist,0
29944,CUST73707764,340,withdrawal,VANW47481352,0,0,0,0,Software Developer,0


# General Transaction Features
- `t_to_animal`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business
- `t_from_animal`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business
- `t_to_animal_large`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_from_animal_large`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_to_shipping`
    - Export indicator
    - Binary flag
    - 1 if the transaction is to someone in shipping/postal/cargo
    - *no label for this type of occupation yet*


In [18]:
# Locations of the processed KYC, e-transfer and wire tables.
DATAPATH = Path('../data/')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')
WIREPATH = DATAPATH.joinpath('processed', 'wire.parquet')

In [19]:
# Load the three processed tables in one pass (e-transfers, wires, KYC).
emt_data, wire_data, kyc_data = (
    pd.read_parquet(source) for source in (EMTPATH, WIREPATH, KYCPATH)
)

# Transaction value above which the *_large flags are raised.
threshold = 1000

In [20]:
# Add a new column to each dataframe to indicate the type of transaction
emt_data['trxn_type'] = 'emt'
wire_data['trxn_type'] = 'wire'

cols = ['gender', 'occupation', 'age', 'tenure', 'cust_id', 'occ_wealth', 'occ_animal', 'occ_int']


def _add_party_flags(frame, amount_col):
    """Add the t_* binary flag columns to `frame` in place.

    Args:
        frame: transaction table with occ_animal_* and occ_int_* columns for
            both sender and receiver
        amount_col: name of the column holding the transaction value, which is
            compared against the global `threshold`
    """
    receiver_is_animal = frame['occ_animal_receiver'] == 1
    sender_is_animal = frame['occ_animal_sender'] == 1
    is_large = frame[amount_col] > threshold

    frame['t_to_animal'] = np.where(receiver_is_animal, 1, 0)
    frame['t_from_animal'] = np.where(sender_is_animal, 1, 0)
    frame['t_to_animal_large'] = np.where(receiver_is_animal & is_large, 1, 0)
    frame['t_from_animal_large'] = np.where(sender_is_animal & is_large, 1, 0)
    frame['t_to_int'] = np.where(frame['occ_int_receiver'] == 1, 1, 0)
    frame['t_from_int'] = np.where(frame['occ_int_sender'] == 1, 1, 0)


# Only e-transfers and wires are flagged here; the cash_data equivalents of
# these assignments were commented out in this cell.
_add_party_flags(emt_data, 'emt_value')
_add_party_flags(wire_data, 'trxn_value')

In [21]:
# Save through the shared path constants rather than repeating the path
# strings, and leave the row index out of the files like the other saves in
# this notebook do.
emt_data.to_parquet(EMTPATH, index=False)
wire_data.to_parquet(WIREPATH, index=False)