In [1]:
from fractions import Fraction
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import yaml

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, eg. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure). 

The purpose of this notebook is to aggregate these factors (and the indicators they contain) into an explainable total suspicion score. Because graph structures are complicated and involve international and external customers (of which we only see a small fraction), we focus only on the first two reasons and assume that suspicious graph structures will emerge downstream during PageRank.

If you want to change the weights, edit the `weights.yml` file in the current folder.

In [2]:
# Location of the YAML file holding the node and edge indicator weights.
WEIGHTPATH = Path('./weights.yml')

# safe_load parses the YAML document into plain Python containers.
with WEIGHTPATH.open('r') as weights_file:
    weights = yaml.safe_load(weights_file)

# Display the parsed weights so the reader can see what is being used.
weights

{'NODE_WEIGHTS': {'NAMED_TRAFFICKER': 10,
  'LABEL': 1,
  'OCC_INT': '1/6',
  'OCC_WEALTH': '1/6',
  'OCC_ANIMAL': '1/6',
  'C_LARGE': '1/2'},
 'EDGE_WEIGHTS': {'E_AT_RISK': 10,
  'E_ROLE': 1,
  'E_TRAD_MED': 1,
  'W_TO_COUNTRY': '1/3',
  'W_FROM_COUNTRY': '1/3',
  'W_EXTERNAL_TO_ANIMAL': '1/3',
  'T_ANIMAL': '1/4',
  'T_ANIMAL_LARGE': '1/4',
  'T_INT': '1/4',
  'T_TO_SHIPPING': '1/4'}}

# Node Suspicion Score

In [3]:
# Processed inputs used for the node-level score.
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH.joinpath('kyc.parquet')
CASHPATH = DATAPATH.joinpath('cash.parquet')

# Convert every node weight to an exact Fraction.
node_weights = {
    name: Fraction(raw)
    for name, raw in weights['NODE_WEIGHTS'].items()
}

## KYC Aggregation
We first combine the KYC features `named_trafficker`, `label`, `occ_int`, `occ_wealth`, `occ_animal` into one score using the weights in `weights.yml`.

In [4]:
def kyc_agg(r, node_weights):
    """Combine one customer's KYC indicators into a single weighted score.

    Parameters
    ----------
    r
        Row-like object exposing the attributes ``label``, ``occ_int``,
        ``occ_wealth``, ``occ_animal`` and ``named_trafficker``.
    node_weights
        Mapping with the keys ``LABEL``, ``OCC_INT``, ``OCC_WEALTH``,
        ``OCC_ANIMAL`` and ``NAMED_TRAFFICKER``.

    Returns
    -------
    float
        The weighted sum of the five indicators.
    """
    # (weight key, row attribute) pairs, summed left to right in this order.
    weighted_fields = (
        ('LABEL', 'label'),
        ('OCC_INT', 'occ_int'),
        ('OCC_WEALTH', 'occ_wealth'),
        ('OCC_ANIMAL', 'occ_animal'),
        ('NAMED_TRAFFICKER', 'named_trafficker'),
    )
    terms = [node_weights[key] * getattr(r, field) for key, field in weighted_fields]

    total = terms[0]
    for term in terms[1:]:
        total += term

    return float(total)

# Load the KYC table and score every customer row with kyc_agg.
kyc_df = pd.read_parquet(KYCPATH)

kyc_df['kyc_agg'] = kyc_df.apply(kyc_agg, axis=1, args=(node_weights,))
kyc_df.sample(n=3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,named_trafficker,kyc_agg
102765,AMANDA SMITH,female,School Teacher,36.0,4.0,CUST51156361,0,0,0,0,0.0,0.0
176009,DR.RAPHAËL SMITH,male,"Freelancer (e.g., Graphic Designer, Writer)",35.0,8.0,CUST35974717,0,0,0,0,0.0,0.0
69048,JOSEPH CHAREST-LAPLANTE,male,Chemist,35.0,2.0,CUST92971100,0,0,0,0,0.0,0.0


## Cash Aggregation
For the purposes of this project, we treat a cash transaction as involving only one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (everyone is connected to the bank, so everyone would be a 2-hop neighbour).

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is currently only one cash feature, `c_large`. `c_agg` is therefore defined as the proportion of a customer's cash transactions that have been flagged as unusually large.

In [5]:
# Load the cash transactions and preview a few random rows.
cash_df = pd.read_parquet(CASHPATH)
cash_df.sample(n=3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
144695,CUST27453472,350,deposit,VVVQ87832593,0,0,0,0,Student,0
155608,CUST20460391,2735,withdrawal,GZMJ17015642,1,0,0,0,Musician,0
199697,CUST17019522,585,withdrawal,EOPT63909716,0,0,0,0,Research Scientist,0


In [6]:
# Per-customer mean of the c_large flag, stored under the name c_agg.
cash_df = pd.read_parquet(CASHPATH)
cash_agg_df = (
    cash_df
    .groupby('cust_id')['c_large']
    .mean()
    .rename('c_agg')
    .reset_index()
)
cash_agg_df.sample(n=3)

Unnamed: 0,cust_id,c_agg
4277,CUST16300829,0.0
1136,CUST11675082,0.0
56684,CUST93830500,0.0


## Node Score Calculation
The total node suspicion score is a weighted sum of the aggregate KYC suspicion score and the aggregate cash suspicion score above: `score = kyc_agg + C_LARGE * c_agg`, where `C_LARGE` is the cash weight from `weights.yml`.

In [7]:
# Join KYC and cash data. The left join keeps every KYC customer; those with
# no row in cash_agg_df get NaN from the merge, which is replaced by 0.
merged = kyc_df.merge(cash_agg_df, on='cust_id', how='left')
merged['c_agg'] = merged['c_agg'].fillna(0)

# Aggregate: score = kyc_agg + C_LARGE * c_agg.
# The weight is a Fraction; it is cast to float before touching the Series so
# the arithmetic stays in a numeric dtype instead of depending on how
# numpy/pandas handle an arbitrary Python object as a scalar operand.
node_df = merged[['cust_id', 'kyc_agg', 'c_agg']].copy()
cash_weight = float(node_weights['C_LARGE'])
node_df['score'] = node_df['kyc_agg'] + cash_weight * node_df['c_agg']

# Clean: keep only the columns that are exported.
node_df = node_df[['cust_id', 'score']]

# Export. Create the output folder first so a fresh checkout can run the cell.
node_scores_path = DATAPATH / 'pagerank' / 'input_node_scores.parquet'
node_scores_path.parent.mkdir(parents=True, exist_ok=True)
node_df.to_parquet(node_scores_path, index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
59304,CUST76986222,10.904762
12819,CUST60968343,10.733333
98666,CUST73079564,10.125
170931,CUST62443175,10.0
187972,CUST54548565,10.0
...,...,...
73768,CUST26647065,0.0
73769,CUST54834228,0.0
73770,CUST13638671,0.0
73771,CUST65292775,0.0


# Edge Suspicion Score
Features ending in `_agg` (`e_agg`, `w_agg`, `g_agg`) are aggregates of pre-computed transaction features.

In [8]:
# Processed inputs used for the edge-level score.
DATAPATH = Path('../data/processed')
EPATH = DATAPATH.joinpath('emt.parquet')
WIREPATH = DATAPATH.joinpath('wire.parquet')

# Convert every edge weight to an exact Fraction.
edge_weights = {
    name: Fraction(raw)
    for name, raw in weights['EDGE_WEIGHTS'].items()
}

## E-transfer aggregation

In [9]:
def e_transfer_agg(r, edge_weights):
    """Combine one e-transfer's indicators into a single weighted score.

    Parameters
    ----------
    r
        Row-like object exposing the attributes ``e_role``, ``e_at_risk``
        and ``e_trad_med``.
    edge_weights
        Mapping with the keys ``E_ROLE``, ``E_AT_RISK`` and ``E_TRAD_MED``.

    Returns
    -------
    float
        The weighted sum of the three indicators.
    """
    role_term = edge_weights['E_ROLE'] * r.e_role
    at_risk_term = edge_weights['E_AT_RISK'] * r.e_at_risk
    trad_med_term = edge_weights['E_TRAD_MED'] * r.e_trad_med

    total = role_term + at_risk_term + trad_med_term
    return float(total)

# Load the e-transfers and score every row with e_transfer_agg.
edf = pd.read_parquet(EPATH)
edf['e_agg'] = edf.apply(e_transfer_agg, axis=1, args=(edge_weights,))
edf.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,...,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,e_agg,g_agg,score
486798,EXTERNAL133366,CUST79886744,MARY RODRIGUEZ,ISABEL LOPEZ,,211.5,KUDN22039182,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
63371,CUST46218264,CUST65776788,DR. DOUGLAS BOYD,TINA CRUZ,,782.0,LKFC77942785,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
226366,CUST19280042,EXTERNAL357180,MAURICE MARTIN,JAKE HERNANDEZ,,202.0,RAPH27248372,0,,,...,0,0,0,0,0,1,0,0.0,0.125,0.125


## Wire transfer aggregation
The aggregate wire transfer score is a weighted sum of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal` (a plain average under the default weights of 1/3 each).

In [10]:
def wire_transfer_agg(r, edge_weights):
    """Combine one wire transfer's indicators into a single weighted score.

    Parameters
    ----------
    r
        Row-like object exposing the attributes ``w_to_country``,
        ``w_from_country`` and ``w_external_to_animal``.
    edge_weights
        Mapping with the keys ``W_TO_COUNTRY``, ``W_FROM_COUNTRY`` and
        ``W_EXTERNAL_TO_ANIMAL``.

    Returns
    -------
    float
        The weighted sum of the three indicators.
    """
    to_country_term = edge_weights['W_TO_COUNTRY'] * r.w_to_country
    from_country_term = edge_weights['W_FROM_COUNTRY'] * r.w_from_country
    external_animal_term = edge_weights['W_EXTERNAL_TO_ANIMAL'] * r.w_external_to_animal

    total = to_country_term + from_country_term + external_animal_term
    return float(total)

# Load the wire transfers and score every row with wire_transfer_agg.
wdf = pd.read_parquet(WIREPATH)
wdf['w_agg'] = wdf.apply(wire_transfer_agg, axis=1, args=(edge_weights,))
wdf.sample(n=3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,...,w_external_to_animal,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,w_agg
61907,CUST36459557,CUST49028237,MICHAEL SMITH,KELLY LEE,1219.0,CA,CA,CWNL41900460,0.0,0.0,...,0,wire,0,0,0,0,1,0,0,0.0
53819,CUST74327086,EXTERNAL260509,DR.PARI DYAL,SUSAN CAMPBELL,9123.0,CA,CA,HIJM55229860,,,...,0,wire,0,0,0,0,0,0,0,0.0
9060,CUST33398337,CUST57584886,MARIA SANDOVAL,MICHÈLE OUELLET,1717.0,CA,CA,VJLW44610996,1.0,0.0,...,0,wire,0,0,0,0,1,0,0,0.0


## General Score Calculation

In [11]:
def compute_score(r, edge_weights, receiver=False):
    """Weighted sum of the general transaction indicators for one side of an edge.

    The indicators are read by position, so ``r`` must be ordered as
    ``[animal, animal_large, int]`` for the sender side and
    ``[animal, animal_large, int, shipping]`` for the receiver side.

    Parameters
    ----------
    r
        Row-like object supporting positional access through ``r.iloc[i]``.
    edge_weights
        Mapping with the keys ``T_ANIMAL``, ``T_ANIMAL_LARGE``, ``T_INT`` and,
        when ``receiver`` is true, ``T_TO_SHIPPING``.
    receiver
        When true, the shipping indicator at position 3 is added as well.

    Returns
    -------
    float
        The weighted sum of the indicators.
    """
    # Each weight is paired with the indicator of the same name: position 0 is
    # the animal flag and position 1 the animal_large flag. Swapping them is
    # invisible only while both weights happen to be equal.
    score = (
        edge_weights['T_ANIMAL'] * r.iloc[0]
        + edge_weights['T_ANIMAL_LARGE'] * r.iloc[1]
        + edge_weights['T_INT'] * r.iloc[2]
    )

    if receiver:
        score += edge_weights['T_TO_SHIPPING'] * r.iloc[3]

    return float(score)

In [12]:
# Column names. The order matters: compute_score reads these by position.
sender_cols = ['t_from_animal', 't_from_animal_large', 't_from_int']
receiver_cols = ['t_to_animal', 't_to_animal_large', 't_to_int', 't_to_shipping']


def _general_agg(df, edge_weights):
    """Return, per row of ``df``, the mean of the sender and receiver general scores.

    The two intermediate score Series are kept local, so ``df`` itself is not
    modified by this helper.
    """
    score_sender = df[sender_cols].apply(compute_score, axis=1, args=(edge_weights, False))
    score_receiver = df[receiver_cols].apply(compute_score, axis=1, args=(edge_weights, True))
    return (score_sender + score_receiver) / 2


# g_agg is the plain average of the two sides; no further normalisation is
# applied. The same calculation is used for e-transfers and wire transfers.
edf['g_agg'] = _general_agg(edf, edge_weights)
wdf['g_agg'] = _general_agg(wdf, edge_weights)

## Edge Score Calculation

In [13]:
# Total edge score = general transaction score + channel-specific score.
edf['score'] = edf['g_agg'] + edf['e_agg']
wdf['score'] = wdf['g_agg'] + wdf['w_agg']

# Exporting.
# NOTE(review): EPATH and WIREPATH are the same files the frames were loaded
# from, so these writes overwrite the inputs and later runs read back the
# columns added here — confirm this is intended.
edf.to_parquet(EPATH, index=False)
wdf.to_parquet(WIREPATH, index=False)

edge_cols = ['cust_id_sender', 'cust_id_receiver', 'score']
s1 = edf[edge_cols].assign(type='e')
s2 = wdf[edge_cols].assign(type='w')

# Create the output folder first so a fresh checkout can run the cell.
edge_scores_path = DATAPATH / 'pagerank' / 'input_edge_scores.parquet'
edge_scores_path.parent.mkdir(parents=True, exist_ok=True)

# ignore_index / index=False: the two frames carry overlapping row labels, so
# the combined index has no meaning and is not written out, matching the other
# exports in this notebook.
pd.concat([s1, s2], ignore_index=True).to_parquet(edge_scores_path, index=False)