In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import regex

# Transaction Preprocessing

## E-transfer Transaction Features
- `e_at_risk`
    - General indicator
    - Binary flag: 1 if message references a species at risk, or parts thereof, 0 o.w.
- `e_role`
    - General indicator
    - Binary flag: 1 if message references an IWT role (e.g. poacher, breeder), 0 o.w.
    - EDA Note: regex search for these words did not yield anything, nor did semantic search for poacher, nor semantic search for breeder
- `e_trad_med`
    - Import indicator
    - Binary flag: 1 if message references traditional medicine terms (e.g. bear bile, rhino horn), 0 o.w.
    - EDA Note: nothing found during EDA semantic search.
    - *could also be a binary flag*
    - Taken from [Canadian animal/plant ingredients for traditional medicine](https://www.canada.ca/en/environment-climate-change/services/convention-international-trade-endangered-species/publications/animal-plant-ingredients-traditional-medicine.html#_03)

In [2]:
# Locate the processed e-transfer table and read it into memory
DATAPATH = Path('../data')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')
emt_df = pd.read_parquet(EMTPATH)

# Transfers without a message get an empty string so the text searches below see only strings
emt_df = emt_df.assign(trxn_message=emt_df['trxn_message'].fillna(''))

In [3]:
def classify_message(messages: pd.Series, positives: list, negatives=None):
    """Flags messages that match positive flags and don't match negative flags

    Matching is case-insensitive. Each positive flag must match as a whole
    word (it is wrapped in word boundaries); negative flags match anywhere
    in the message. Missing messages are never flagged.

    Blank flags (e.g. an empty line in a flags file) are ignored: an empty
    regex alternative matches every string, which would otherwise flag every
    message (blank positive) or veto every message (blank negative).

    Args:
        messages: a pandas series containing e-transfer messages
        positives: a list of regex flags to match
        negatives: (optional) a list of regex flags that result in non-matches

    Returns:
        A pandas series containing {0,1}, indexed like ``messages``
    """
    usable_positives = [p for p in positives if str(p).strip()]
    usable_negatives = [n for n in (negatives or []) if str(n).strip()]

    # With nothing to look for, nothing can be flagged
    if not usable_positives:
        return pd.Series(0, index=messages.index, dtype=int)

    # The non-capturing group keeps the word boundaries attached to the whole
    # flag even when the flag itself contains a top-level '|'
    positive_pattern = '|'.join(f'\\b(?:{p})\\b' for p in usable_positives)
    classification = messages.str.contains(positive_pattern, case=False, na=False)

    if usable_negatives:
        negative_pattern = '|'.join(f'(?:{n})' for n in usable_negatives)
        vetoed = messages.str.contains(negative_pattern, case=False, na=False)
        classification = classification & ~vetoed

    return classification.astype(int)

In [4]:
# Species-at-risk indicator: read the positive / negative flag lists (one regex flag per line)
positive_flags = Path('./flags/animal_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/animal_neg.txt').read_text().splitlines()

# Store the 0/1 label for every e-transfer message
emt_df['e_at_risk'] = classify_message(
    emt_df['trxn_message'],
    positive_flags,
    negative_flags,
)

In [5]:
# IWT-role indicator: read the positive / negative flag lists (one regex flag per line)
positive_flags = Path('./flags/role_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/role_neg.txt').read_text().splitlines()

# Store the 0/1 label for every e-transfer message
emt_df['e_role'] = classify_message(
    emt_df['trxn_message'],
    positive_flags,
    negative_flags,
)

In [6]:
# Traditional-medicine indicator: read the positive / negative flag lists (one regex flag per line)
positive_flags = Path('./flags/med_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/med_neg.txt').read_text().splitlines()

# Store the 0/1 label for every e-transfer message
emt_df['e_trad_med'] = classify_message(
    emt_df['trxn_message'],
    positive_flags,
    negative_flags,
)

In [7]:
# Overwrite the processed e-transfer file with the labelled frame (row index not stored),
# then show three randomly chosen rows as a sanity check
emt_df.to_parquet(path=EMTPATH, index=False)
emt_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,e_at_risk,e_role,e_trad_med
295399,EXTERNAL554027,CUST79589583,GREGORY HUERTA,OCÉANE RODRIGUE,,115.0,LHSV26016199,0,0.0,0.0,0.0,0.0,,,,,0,0,0
339184,EXTERNAL316916,CUST44600228,RYAN MENDEZ,DR.WILLIAM MEYER,,141.0,ZDSV88071740,0,1.0,0.0,1.0,1.0,,,,,0,0,0
200315,EXTERNAL602459,CUST65944127,SUI HONG,XIE MIN,tix money,46.0,FZMR51495186,0,0.0,0.0,0.0,0.0,,,,,0,0,0


## Wire Transfer Transaction Features
- `w_to_country`
    - Import indicator
    - Binary flag
    - 1 if the wire transfer is to a jurisdiction of concern:
        - China
        - Hong Kong
        - South Africa
        - Australia
        - ...
- `w_from_country`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if from a jurisdiction of concern, 0 otherwise
- `w_external_to_animal`
    - Binary flag
    - 1 if sent from outside Canada (sender country is not `CA`) to an animal-related business, 0 otherwise

In [8]:
# Processed tables used in the wire-transfer section, all under DATAPATH/processed
WIREPATH = DATAPATH.joinpath('processed', 'wire.parquet')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')

# Country codes of the jurisdictions of concern named in the feature description:
# China, Hong Kong, South Africa, Australia.
# NOTE(review): assumes the wire data uses two-letter ISO 3166-1 codes (as in 'CA', 'US') — confirm.
SUSCOUNTRIES = ['CN', 'HK', 'ZA', 'AU']

In [9]:
# Read the KYC table, keeping only the customer id and the animal-occupation flag
kyc_df = pd.read_parquet(KYCPATH).loc[:, ['cust_id', 'occ_animal']]

# Read the processed wire transfers
wire_df = pd.read_parquet(WIREPATH)

# All three flags are computed column-wise (vectorised) rather than with
# per-row Python lambdas; the resulting 0/1 values are the same.

#w_to_country: receiver is in a jurisdiction of concern
wire_df['w_to_country'] = wire_df['country_receiver'].isin(SUSCOUNTRIES).astype('int64')

#w_from_country: sender is in a jurisdiction of concern
wire_df['w_from_country'] = wire_df['country_sender'].isin(SUSCOUNTRIES).astype('int64')

#w_external_to_animal: sender country is anything other than 'CA' and the
# receiver is flagged as an animal-related occupation
sender_is_external = wire_df['country_sender'] != 'CA'
receiver_is_animal = wire_df['occ_animal_receiver'] == 1
wire_df['w_external_to_animal'] = (sender_is_external & receiver_is_animal).astype('int64')

In [10]:
# Overwrite the processed wire file with the flagged frame (row index not stored),
# then show three randomly chosen rows as a sanity check
wire_df.to_parquet(path=WIREPATH, index=False)
wire_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,w_to_country,w_from_country,w_external_to_animal
58222,EXTERNAL753090,CUST37972909,ANTHONY BRADY,DR.LOUIS CHAN,6350.0,CA,CA,ATSP38506427,0.0,0.0,0.0,1.0,,,,,0,0,0
25811,EXTERNAL724321,CUST20998362,RYAN SMITH,ROBERT TRAN,3782.0,US,CA,YTYN91941450,0.0,0.0,0.0,0.0,,,,,0,0,0
28184,CUST84652008,CUST26980206,PATRICK TAYLOR,DR.MARY YOUNG,1224.5,CA,CA,YHGQ43780793,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0


## Cash Transaction Features
- `c_large`
    - General indicator
    - Binary flag
    - 1 if a deposit or withdrawal is an outlier (above Q3+1.5*IQR)
    - To determine outliers, data is grouped by occupation and by transaction type (deposit/withdrawal)

In [11]:
# Input tables for the cash-transaction features
DATAPATH = Path('../data/')
CASHPATH = DATAPATH.joinpath('processed', 'cash.parquet')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')

# IQR multiplier: amounts above Q3 + UPPERBOUND*IQR are classified as outliers
UPPERBOUND = 1.5

In [12]:
#Load Data
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occupation']]

cash_df = pd.read_parquet(CASHPATH)
# cash.parquet is overwritten at the end of this section with the 'occupation'
# and 'c_large' columns added. Drop them if they are already present so that
# re-running the notebook does not merge into occupation_x / occupation_y
# (which would break the groupby on 'occupation' below).
cash_df = cash_df.drop(columns=['occupation', 'c_large'], errors='ignore')
cash_df = cash_df.merge(kyc_df, on='cust_id', how='left')

# Quartiles of the transaction amount within each (transaction type, occupation) group
amount_by_group = cash_df.groupby(['type', 'occupation'])['trxn_amount']
q1 = amount_by_group.quantile(0.25)
q3 = amount_by_group.quantile(0.75)

# Upper outlier fence per group: Q3 + UPPERBOUND * IQR
iqr = q3 - q1
fence = q3 + UPPERBOUND * iqr
fence_dict = fence.to_dict()  # keyed by (type, occupation) tuples

#Test outliers
def outlier_test(row, fence):
    """Return 1 if the row's amount is above its group's outlier fence, else 0.

    Args:
        row: an object exposing ``trxn_amount``, ``type`` and ``occupation``
        fence: a mapping from ``(type, occupation)`` to the upper fence amount

    Returns:
        1 when ``row.trxn_amount`` is strictly greater than the fence for the
        row's group; 0 otherwise, including when the group has no fence
        (e.g. the occupation is missing), since no outlier test is possible.
    """
    try:
        upper = fence[(row.type, row.occupation)]
    except KeyError:
        # No fence was computed for this group, so the row cannot be tested
        return 0

    return 1 if row.trxn_amount > upper else 0

# Look up each row's group fence with one join instead of a Python call per row.
# Rows whose (type, occupation) has no fence get NaN, compare False and are labelled 0.
row_fence = cash_df[['type', 'occupation']].join(fence.rename('fence'), on=['type', 'occupation'])['fence']
cash_df['c_large'] = (cash_df['trxn_amount'] > row_fence).astype('int64')
print(f'Classified {cash_df.c_large.sum()} of {cash_df.c_large.count()} as outliers')

Classified 9774 of 212532 as outliers


In [13]:
# Overwrite the processed cash file with the flagged frame (row index not stored),
# then show three randomly chosen rows as a sanity check
cash_df.to_parquet(path=CASHPATH, index=False)
cash_df.sample(n=3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
46974,CUST76948662,1510,withdrawal,SKJR30698790,0,0,0,0,Wedding Planner,0
152046,CUST80832205,23175,deposit,BDDW79703356,1,0,0,0,Casino Operator,1
8094,CUST60109527,125,withdrawal,XOEX64495406,0,0,0,0,Marine Engineer,0


# General Transaction Features
- `t_to_animal`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business
- `t_from_animal`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business
- `t_to_animal_large`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal-related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_from_animal_large`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal-related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_to_shipping`
    - Export indicator
    - Binary flag
    - 1 if the transaction is to someone in shipping/postal/cargo
    - *no label for this type of occupation yet*


In [14]:
# Processed tables used for the general transaction features
DATAPATH = Path('../data/')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')
WIREPATH = DATAPATH.joinpath('processed', 'wire.parquet')

In [15]:
# Read the three processed tables in one pass
emt_data, wire_data, kyc_data = (
    pd.read_parquet(table_path) for table_path in (EMTPATH, WIREPATH, KYCPATH)
)

# Transaction value above which the *_large flags below are set
threshold = 1000

In [16]:
def classify_external_as_shipping(cust_names: pd.Series, positives: list, negatives=None):
    """Flags external customers that are shipping businesses.

    Matching is case-insensitive. Each positive flag must match as a whole
    word (it is wrapped in word boundaries); negative flags match anywhere
    in the name. Missing names are never flagged. Blank flags are ignored,
    because an empty regex alternative matches every string.

    Args:
        cust_names: a pandas series containing customer names
        positives: a list of regex flags to match
        negatives: (optional) a list of regex flags that result in non-matches

    Returns:
        A pandas series containing {0,1}, indexed like ``cust_names``
    """
    usable_positives = [p for p in positives if str(p).strip()]
    usable_negatives = [n for n in (negatives or []) if str(n).strip()]

    # With nothing to look for, nothing can be flagged
    if not usable_positives:
        return pd.Series(0, index=cust_names.index, dtype=int)

    # The non-capturing group keeps the word boundaries attached to the whole
    # flag even when the flag itself contains a top-level '|'
    positive_pattern = '|'.join(f'\\b(?:{p})\\b' for p in usable_positives)
    classification = cust_names.str.contains(positive_pattern, case=False, na=False)

    if usable_negatives:
        negative_pattern = '|'.join(f'(?:{n})' for n in usable_negatives)
        vetoed = cust_names.str.contains(negative_pattern, case=False, na=False)
        classification = classification & ~vetoed

    return classification.astype(int)

In [17]:
def _add_occupation_flags(frame, value_col, channel):
    """Add the transaction-type tag and the occupation-based 0/1 flags to ``frame`` in place.

    Args:
        frame: transaction dataframe with the occ_*_receiver / occ_*_sender columns
        value_col: name of the column holding the transaction value
        channel: label written to the 'trxn_type' column
    """
    # Tag every row with the kind of transaction it is
    frame['trxn_type'] = channel

    to_animal = frame['occ_animal_receiver'] == 1
    from_animal = frame['occ_animal_sender'] == 1
    is_large = frame[value_col] > threshold

    # t_to_animal / t_from_animal
    frame['t_to_animal'] = np.where(to_animal, 1, 0)
    frame['t_from_animal'] = np.where(from_animal, 1, 0)

    # t_to_animal_large / t_from_animal_large
    frame['t_to_animal_large'] = np.where(to_animal & is_large, 1, 0)
    frame['t_from_animal_large'] = np.where(from_animal & is_large, 1, 0)

    # t_to_int / t_from_int
    frame['t_to_int'] = np.where(frame['occ_int_receiver'] == 1, 1, 0)
    frame['t_from_int'] = np.where(frame['occ_int_sender'] == 1, 1, 0)


cols = ['gender', 'occupation', 'age', 'tenure', 'cust_id', 'occ_wealth', 'occ_animal', 'occ_int']

# Same flags for both transaction tables; only the value column and the tag differ
# cash_data['trxn_type'] = 'cash'
_add_occupation_flags(emt_data, 'emt_value', 'emt')
_add_occupation_flags(wire_data, 'trxn_value', 'wire')

# t_to_shipping: match the receiver's name against the shipping flag lists
positive_flags = Path('./flags/shipping_pos.txt').read_text().splitlines()
negative_flags = Path('./flags/shipping_neg.txt').read_text().splitlines()

emt_data['t_to_shipping'] = classify_message(emt_data['name_receiver'], positive_flags, negative_flags)
wire_data['t_to_shipping'] = classify_message(wire_data['name_receiver'], positive_flags, negative_flags)

In [18]:
# Save through the path constants defined for this section, and without the row index,
# to match how the other processed tables are written in this notebook
emt_data.to_parquet(EMTPATH, index=False)
wire_data.to_parquet(WIREPATH, index=False)