In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import regex

# Transaction Preprocessing

## E-transfer Transaction Features
- `e_at_risk`
    - General indicator
    - Probability that a message references a species at risk, or parts thereof
    - *could also be a binary flag*
    - *can look at CITES for this list*
- `e_cad_at_risk`
    - Export indicator
    - Probability that message references a species at risk in Canada, or parts thereof (bear, geese, etc.)
    - *could also be a binary flag*
    - *get animal list from fintrac doc and papers*
    - Maaz: I don't think e_at_risk as defined above is useful. CITES list is very long. I will just focus on the Canadian at-risk. And I will call it e_at_risk.
        - VVCP12477508, BMIY44394974, ZGAN21196261, MXBU42708741, VYOC17525287, XGAL12621643 -- for sure
        - IBCA52577297 -- 'moose meat'. David Wilkerson sent 978 to Steven Cruz. Wilkerson is also the guy who bought the bear fangs and skin. idk what to write for this.
- `e_role`
    - General indicator
    - Probability that a message references an IWT related role (e.g. poacher, coordinator, supplier, breeder, trader)
    - *could also be a binary flag*
    - Maaz: regex search for these words did not yield anything, nor did semantic search for poacher, nor semantic search for breeder
- `e_trad_med`
    - Import indicator
    - Probability that a message references traditional medicine
    - *could also be a binary flag*
    - Maaz: semantic search for traditional medicine only brought up one possible hit: lion's mane. But, this is a plant. So idk.
 

*Do we want to separate these into TO and FROM?*

In [2]:
# Locations of the data folder and of the processed e-transfer table.
DATAPATH = Path('../data')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')

# Load the e-transfers and replace missing messages with empty strings so
# that the message column holds a string on every row.
emt_df = pd.read_parquet(EMTPATH)
emt_df = emt_df.assign(trxn_message=emt_df['trxn_message'].fillna(''))

In [3]:
def classify_message_animal_trafficking(messages):
    """Flag messages that mention an at-risk animal or an animal part.

    A message is flagged when it contains at least one keyword as a whole
    word (case-insensitive) and none of the known false-positive phrases.

    Parameters
    ----------
    messages : iterable of str
        Transaction messages. Missing values never match.

    Returns
    -------
    pandas.Series of int
        1 where the message is flagged, 0 otherwise, aligned with the input.
    """
    messages = pd.Series(messages)  # gives access to the vectorised .str API

    # Each entry is a regex fragment; it is wrapped in \b...\b below so only
    # whole words match (e.g. "bear" but not "beard").
    keywords = [
        "bears?",
        "cougars?",
        "goose",
        "geese",
        "lynx",
        "moose",
        "crabs?",  # was "crab?", which matched "cra"/"crab" but never "crabs"
        "eel",
        "elver",
        "lobsters?",
        "turtles?",
        "sharks?",
        "wolf",
        "wolves",
        "ivory",
        "tusks?",
        "scales?",
        "fins?",
        "tortoise",
        "gecko",
        "salamander",
        "pangolin",
        "musk deer",
        "macaque",
        "gall",
        "bile",
        "gall ?stones?",
        "tiger",
        "leopard",
        "bones?",
        "rhinoceros",
        "rhino",
        "shell",
        "seahorses?",
        "alligator",
        "crocodile",
        "horns?",
        "skin",
        "fangs?"
    ]
    pattern = '|'.join(r'\b{}\b'.format(k) for k in keywords)

    # Phrases that veto a keyword hit. These are matched as plain substrings
    # (no word boundaries), so prefixes such as 'bear market inves' work.
    negatives = [
        'shark week',
        'shark ?skin',
        'shark tank',
        'lone wolf',
        'bear grylls',
        'fortnite',
        'fornite',
        'snake ?skin',
        'drank ur bear',
        'wolf-brand',
        'bear market inves',
        'wolf of wall street',
        'electric eel',
        'goose island',
        'shark vacuum',
        'lobster dinner',
        'lobster fest',
        'crab shack',
        'bulldog skin care',
        'turtle wax',
        'goose down',
        'phone skin',
        'fitness club',
        'oil change',
        'duck duck goose',
        'kayak',
        'king crab',
        'soap',
        'phone case',
        'pizzeria',
        'kettle',
        'steakhouse',
        'leopard print',
        # Anchored on purpose: a bare 'art' substring also matched "parts",
        # "party", "heart", ... and therefore vetoed genuine hits such as
        # "bear parts".
        r'\bart(?:work)?s?\b',
        'tiger balm',
        'leopard spot',
        'tiger woods',
        'tiger king',
        'screen protector',
        'donation',
        'seahorse earrings',
        'tickets',
        'power bank',
        'beach towel',
        'diving service',
        'bedliner',
        'camera'
    ]
    negative_pattern = '|'.join(negatives)

    has_keyword = messages.str.contains(pattern, case=False, na=False)
    has_negative = messages.str.contains(negative_pattern, case=False, na=False)
    classification = has_keyword & ~has_negative
    return classification.astype(int)

In [4]:
# fuzzy matching with regex example
# regexpr = regex.compile(r"(\bbear\b){e<=1}|(\bfox\b){e<=1}")
# emt_df['e_at_risk'] = emt_df['trxn_message'].map(lambda m : bool(regexpr.search(m)))

# if you want to implement this in the function I wrote, just change how `pattern` is constructed
# pattern = regex.compile('|'.join(['(\\b{}\\b){{e<=1}}'.format(k) for k in keywords]))

# one challenge is e.g. fox would match to for, which gets you lots of false positives
# read https://pypi.org/project/regex/ and you can see that you can specify which character sets to allow for the fuzzy match
# e.g., {s<=2:[a-z]} at most 2 substitutions, which must be in the character set [a-z]
# so to fuzzy match fox but not for, use something like that to exclude matching the `r` character?

# and of course would have to change the str.contains to a map and lambda function

In [5]:
# Attach the at-risk species flag computed from each transaction message.
emt_df = emt_df.assign(
    e_at_risk=classify_message_animal_trafficking(emt_df['trxn_message'])
)

In [6]:
# I have left this here but I am not sure if this is useful. the animals are already covered in the previous function.
# based on: https://www.canada.ca/en/environment-climate-change/services/convention-international-trade-endangered-species/publications/animal-plant-ingredients-traditional-medicine.html#_03
def classify_message_trad_medicine_plant(messages):
    """Flag messages that mention a traditional-medicine plant ingredient.

    Parameters
    ----------
    messages : iterable of str
        Transaction messages. Missing values never match.

    Returns
    -------
    pandas.Series of bool
        True where a keyword occurs as a whole word (case-insensitive) and
        no negative phrase occurs, aligned with the input.
    """
    messages = pd.Series(messages)  # gives access to the vectorised .str API

    keywords = [
        "aucklandia",
        "costus",
        "root",
        "eagle ?wood",
        "fern",
        "golden haired dog",
        "golden hair dog",
        "nard"
    ]
    pattern = '|'.join(r'\b{}\b'.format(k) for k in keywords)

    # Phrases that veto a keyword hit; none have been identified yet.
    negatives = []

    classification = messages.str.contains(pattern, case=False, na=False)

    # Only apply the veto when there is something to veto. Joining an empty
    # list yields '', and an empty regex matches every string, which used to
    # negate every hit and make the function return False for all messages.
    if negatives:
        negative_pattern = '|'.join(negatives)
        classification = classification & ~messages.str.contains(negative_pattern, case=False, na=False)

    return classification

In [7]:
def classify_messsage_iwt_role(messages):
    """Placeholder IWT-role classifier: no message is ever flagged.

    Returns a plain list holding one 0 per input message.
    """
    return [0] * len(pd.Series(messages))

In [8]:
# Attach the (currently placeholder) IWT-role flag.
emt_df = emt_df.assign(
    e_role=classify_messsage_iwt_role(emt_df['trxn_message'])
)

In [9]:
def classify_messsage_trad_med(messages):
    """Placeholder traditional-medicine classifier: always answers 0.

    Returns a plain list holding one 0 per input message.
    """
    n_messages = pd.Series(messages).size
    return [0 for _ in range(n_messages)]

In [10]:
# Attach the (currently placeholder) traditional-medicine flag.
emt_df = emt_df.assign(
    e_trad_med=classify_messsage_trad_med(emt_df['trxn_message'])
)

In [11]:
# Overwrite the processed e-transfer file with the added feature columns
# (the row index is not stored), then preview three random rows.
emt_df.to_parquet(EMTPATH, index=False)
emt_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,e_at_risk,e_role,e_trad_med
478085,CUST77870413,CUST76948563,JENNIFER EDWARDS,ALEX DESCHÊNES,,130.0,QAYU16126501,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0
405587,CUST27646189,EXTERNAL187875,ÉLÉONORE-JULIETTE BOUCHARD,THOMAS SANDOVAL,,2854.0,XULI36665750,,,,,0.0,0.0,0.0,0.0,0,0,0
225433,CUST61448860,EXTERNAL772776,DR.DEBORAH ROGERS,LUKE HANSON,,130.0,USFO20741362,,,,,0.0,0.0,0.0,0.0,0,0,0


## Wire Transfer Transaction Features
- `w_to_country`
    - Import indicator
    - Binary flag
    - 1 if the wire transfer is to a jurisdiction of concern:
        - China
        - Hong Kong
        - South Africa
        - Australia
        - ...
- `w_from_country`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if from a jurisdiction of concern, 0 otherwise
- `w_external_to_animal`
    - Binary flag
    - 1 if from a jurisdiction of concern to an animal related business, 0 otherwise

In [12]:
# Processed input/output tables.
WIREPATH = DATAPATH / 'processed' / 'wire.parquet'
KYCPATH = DATAPATH / 'processed' / 'kyc.parquet'
EMTPATH = DATAPATH / 'processed' / 'emt.parquet'

# Two-letter country codes of the jurisdictions of concern named in the
# feature description: China, Australia, Hong Kong, South Africa.
# 'HK' and 'ZA' were missing although the description lists them.
# NOTE(review): the description ends with "...", so this list may still be
# incomplete - confirm against the FINTRAC source.
SUSCOUNTRIES = ['CN', 'AU', 'HK', 'ZA']

In [13]:
# Load data
# NOTE(review): kyc_df is not used by the wire features computed in this
# cell; the load is kept so the notebook's variables stay as they were.
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occ_animal']]

wire_df = pd.read_parquet(WIREPATH)

# w_to_country: 1 when the receiving country is a jurisdiction of concern.
# isin() replaces the per-element lambda; missing countries still map to 0.
wire_df['w_to_country'] = wire_df['country_receiver'].isin(SUSCOUNTRIES).astype(int)

# w_from_country: 1 when the sending country is a jurisdiction of concern.
wire_df['w_from_country'] = wire_df['country_sender'].isin(SUSCOUNTRIES).astype(int)

# w_external_to_animal: 1 when the sender country is anything other than 'CA'
# and the receiver is flagged occ_animal. Vectorised instead of a row-wise
# apply, which is slow and ill-defined on an empty frame.
# NOTE(review): the feature description says "from a jurisdiction of concern",
# but the implemented rule is "sender country != 'CA'" - confirm which one is
# intended.
sender_is_external = wire_df['country_sender'] != 'CA'
receiver_is_animal = wire_df['occ_animal_receiver'] == 1
wire_df['w_external_to_animal'] = (sender_is_external & receiver_is_animal).astype(int)

In [14]:
# Overwrite the processed wire file with the added feature columns
# (the row index is not stored), then preview three random rows.
wire_df.to_parquet(WIREPATH, index=False)
wire_df.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,w_to_country,w_from_country,w_external_to_animal
40643,EXTERNAL537623,CUST84170829,WEN FEI,JOSE REYNOLDS,510.0,CN,CA,RZYL22511752,0.0,0.0,0.0,0.0,,,,,0,1,0
41220,EXTERNAL222191,CUST23667861,NADIA BENITO CEDILLO,DANIELLE HOLDER,20087.0,MX,CA,ULNI98238987,0.0,0.0,0.0,0.0,,,,,0,0,0
41423,EXTERNAL509275,CUST13366393,LI GANG,CHARLES DUNCAN,5920.0,CN,CA,HHSU85248158,0.0,0.0,0.0,0.0,,,,,0,1,0


## Cash Transaction Features
- `c_large`
    - General indicator
    - Binary flag
    - 1 if a deposit or withdrawal is an outlier (above Q3+1.5*IQR)
    - To determine outliers, data is grouped by occupation and by transaction type (deposit/withdrawal)

In [15]:
# Data folder and the processed tables used for the cash features.
DATAPATH = Path('../data/')
CASHPATH = DATAPATH.joinpath('processed', 'cash.parquet')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')

# Multiplier k of the upper outlier fence Q3 + k*IQR: amounts above the fence
# are classified as outliers.
UPPERBOUND = 1.5

In [16]:
# Load data
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occupation']]

cash_df = pd.read_parquet(CASHPATH)
# The notebook later writes cash_df (including `occupation`) back to CASHPATH.
# On a re-run the file therefore already has that column, the merge would
# produce occupation_x / occupation_y, and the groupby below would fail.
# Drop any stale copy first so the cell is idempotent.
cash_df = cash_df.drop(columns=['occupation'], errors='ignore')
cash_df = cash_df.merge(kyc_df, on='cust_id', how='left')

# Get IQR for the outlier test, per (transaction type, occupation) group
amount_by_group = cash_df.groupby(['type', 'occupation'])['trxn_amount']
q3 = amount_by_group.quantile(0.75)
q1 = amount_by_group.quantile(0.25)
iqr = q3 - q1
fence = q3 + UPPERBOUND * iqr  # upper bound for outlier tests
fence_dict = fence.to_dict()


# Test outliers
def outlier_test(row, fence):
    """Return 1 if the row's amount exceeds the fence of its group, else 0.

    Parameters
    ----------
    row : object with `type`, `occupation` and `trxn_amount` attributes
        One cash transaction.
    fence : mapping
        Upper fence keyed by (type, occupation).

    Rows whose (type, occupation) has no fence - e.g. a missing occupation,
    which groupby leaves out - are reported as 0 instead of raising KeyError.
    """
    limit = fence.get((row.type, row.occupation))
    if limit is None:
        return 0

    return 1 if row.trxn_amount > limit else 0


cash_df['c_large'] = cash_df.apply(lambda row: outlier_test(row, fence_dict), axis=1)
print(f'Classified {cash_df.c_large.sum()} of {cash_df.c_large.count()} as outliers')

Classified 9774 of 212532 as outliers


In [17]:
# Overwrite the processed cash file with the added columns (the row index is
# not stored), then preview three random rows.
cash_df.to_parquet(CASHPATH, index=False)
cash_df.sample(n=3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
104583,CUST87013328,6315,withdrawal,QOZN63774842,1,0,1,0,Real Estate Broker,0
108944,CUST53242926,1020,withdrawal,DGVA70883497,0,0,0,0,Pharmacist,0
64816,CUST65285916,3200,deposit,RUHG92075258,0,0,0,0,Business Owner,0


## General Transaction Features
- `t_to_animal`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business
- `t_from_animal`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business
- `t_to_animal_large`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_from_animal_large`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_to_shipping`
    - Export indicator
    - Binary flag
    - 1 if the transaction is to someone in shipping/postal/cargo
    - *no label for this type of occupation yet*


In [18]:
# Data folder and the processed tables used for the general features.
DATAPATH = Path('../data/')
KYCPATH = DATAPATH.joinpath('processed', 'kyc.parquet')
EMTPATH = DATAPATH.joinpath('processed', 'emt.parquet')
WIREPATH = DATAPATH.joinpath('processed', 'wire.parquet')

In [19]:
# Read the processed e-transfer, wire and KYC tables (in that order).
emt_data, wire_data, kyc_data = (
    pd.read_parquet(source) for source in (EMTPATH, WIREPATH, KYCPATH)
)

# Transaction values strictly above this amount feed the *_large flags.
threshold = 1000

In [20]:
# Tag every row with the kind of transaction it came from.
emt_data['trxn_type'] = 'emt'
wire_data['trxn_type'] = 'wire'

cols = ['gender', 'occupation', 'age', 'tenure', 'cust_id', 'occ_wealth', 'occ_animal', 'occ_int']

# The same six binary flags are derived for both tables; only the name of the
# value column differs. Flags are added in the order: t_to_animal,
# t_from_animal, t_to_animal_large, t_from_animal_large, t_to_int, t_from_int.
# NOTE: cash_data is not processed here. The disabled cash variants set the
# t_to_* flags from the customer's own occ_* columns (with trxn_amount for the
# large flag) and every t_from_* flag to 0.
for frame, value_col in ((emt_data, 'emt_value'), (wire_data, 'trxn_value')):
    to_animal = frame['occ_animal_receiver'] == 1
    from_animal = frame['occ_animal_sender'] == 1
    is_large = frame[value_col] > threshold

    frame['t_to_animal'] = np.where(to_animal, 1, 0)
    frame['t_from_animal'] = np.where(from_animal, 1, 0)
    frame['t_to_animal_large'] = np.where(to_animal & is_large, 1, 0)
    frame['t_from_animal_large'] = np.where(from_animal & is_large, 1, 0)
    frame['t_to_int'] = np.where(frame['occ_int_receiver'] == 1, 1, 0)
    frame['t_from_int'] = np.where(frame['occ_int_sender'] == 1, 1, 0)

In [21]:
# Write the enriched tables back to the files they were read from. The path
# constants are reused instead of repeating the literal paths, and the row
# index is not stored, matching how the other save cells write these files.
emt_data.to_parquet(EMTPATH, index=False)
wire_data.to_parquet(WIREPATH, index=False)