In [1]:
from fractions import Fraction
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import yaml

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, e.g. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure). 

The purpose of this notebook is to aggregate those three factors (and the indicators contained therein) into an explainable total node suspicion score. Because graph structures are complicated and involve international and external customers (of which we only see a small fraction), we focus only on the first two reasons and assume that suspicious graph structures will emerge downstream during pagerank.

To change the weights, edit the `weights.yml` file in the current folder.

In [2]:
WEIGHTPATH = Path('./weights.yml')

# Parse the indicator weights from YAML; safe_load builds plain Python
# containers only.
with WEIGHTPATH.open('r') as weights_file:
    weights = yaml.safe_load(weights_file)

weights

{'NODE_WEIGHTS': {'NAMED_TRAFFICKER': 2,
  'LABEL': 1,
  'OCC_INT': '1/6',
  'OCC_WEALTH': '1/6',
  'OCC_ANIMAL': '1/6',
  'C_LARGE': '1/2'},
 'EDGE_WEIGHTS': {'E_AT_RISK': 1,
  'E_ROLE': 1,
  'E_TRAD_MED': 1,
  'W_TO_COUNTRY': '1/3',
  'W_FROM_COUNTRY': '1/3',
  'W_EXTERNAL_TO_ANIMAL': '1/3',
  'T_ANIMAL': '1/4',
  'T_ANIMAL_LARGE': '1/4',
  'T_INT': '1/4',
  'T_TO_SHIPPING': '1/4'}}

# Node Suspicion Score

In [3]:
# Locations of the processed inputs used for the node-level score.
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH.joinpath('kyc.parquet')
CASHPATH = DATAPATH.joinpath('cash.parquet')

# Convert every node weight (an int or a 'p/q' string in weights.yml) into
# an exact Fraction.
node_weights = dict(
    (indicator, Fraction(raw_weight))
    for indicator, raw_weight in weights['NODE_WEIGHTS'].items()
)

## KYC Aggregation
We first combine the KYC features `named_trafficker`, `label`, `occ_int`, `occ_wealth`, `occ_animal` into one score using the weights in `weights.yml`.

In [4]:
def kyc_agg(r, node_weights):
    """Combine one customer's KYC indicators into a single weighted score.

    Parameters
    ----------
    r : row-like object exposing the attributes ``label``, ``occ_int``,
        ``occ_wealth``, ``occ_animal`` and ``named_trafficker``.
    node_weights : mapping with the keys ``LABEL``, ``OCC_INT``,
        ``OCC_WEALTH``, ``OCC_ANIMAL`` and ``NAMED_TRAFFICKER``.

    Returns
    -------
    float
        Sum of ``weight * indicator`` over the five indicators.
    """
    # (weight key, row attribute) pairs, listed in summation order.
    pairs = (
        ('LABEL', 'label'),
        ('OCC_INT', 'occ_int'),
        ('OCC_WEALTH', 'occ_wealth'),
        ('OCC_ANIMAL', 'occ_animal'),
        ('NAMED_TRAFFICKER', 'named_trafficker'),
    )
    contributions = [node_weights[key] * getattr(r, field) for key, field in pairs]

    # Accumulate left to right, starting from the first term.
    total = contributions[0]
    for part in contributions[1:]:
        total = total + part

    return float(total)

kyc_df = pd.read_parquet(KYCPATH)

# Score every customer row; node_weights is forwarded to kyc_agg as its
# second positional argument.
kyc_df['kyc_agg'] = kyc_df.apply(kyc_agg, axis=1, args=(node_weights,))
kyc_df.sample(3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,named_trafficker,kyc_agg
39180,DR.JENNIFER WASHINGTON,female,Postal Worker,44.0,9.0,CUST96760751,0,0,1,0,0.0,0.166667
178736,CRISTINA NELSON,female,Lawyer,36.0,0.0,CUST15287594,1,0,0,0,0.0,0.166667
75987,MUHIN LUCHEZAR GORDEEVICH,male,"Freelancer (e.g., Graphic Designer, Writer)",29.0,1.0,CUST99202100,0,0,0,0,0.0,0.0


## Cash Aggregation
For the purposes of this project, we consider cash transactions as involving only one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (everyone is connected to the bank, so everyone would be a 2-hop neighbour).

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is currently only one cash feature, `c_large`. `cash_agg` is therefore defined as the proportion of a customer's cash transactions that have been flagged as unusually large.

In [5]:
# Load the processed cash transactions and preview three random rows.
cash_df = pd.read_parquet(CASHPATH)
cash_df.sample(3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
145126,CUST40235379,1265,deposit,DHUA77517152,0,0,0,0,Auto Dealer,0
192449,CUST71750993,8010,deposit,IGQA64731028,1,0,1,1,Shell Company Operator,0
44969,CUST17988852,605,withdrawal,WPTI90656283,0,0,0,0,Sound Engineer,0


In [6]:
cash_df = pd.read_parquet(CASHPATH)

# Per-customer mean of the c_large flag, stored as column `c_agg`.
cash_agg_df = (
    cash_df
    .groupby('cust_id')['c_large']
    .mean()
    .reset_index()
    .rename(columns={'c_large': 'c_agg'})
)
cash_agg_df.sample(3)

Unnamed: 0,cust_id,c_agg
55621,CUST92285655,0.0
10290,CUST25271040,0.0
30931,CUST55942636,0.0


## Node Score Calculation
We compute the total node suspicion score as a weighted sum: the aggregate KYC suspicion score plus the aggregate cash suspicion score scaled by the `C_LARGE` weight from `weights.yml`.

In [7]:
# Join KYC and cash data. The left join keeps every KYC customer; those with
# no cash transactions get a missing c_agg, which is filled with 0.
merged = kyc_df.merge(cash_agg_df, on='cust_id', how='left')
merged['c_agg'] = merged['c_agg'].fillna(0)

# Aggregate: KYC score plus the C_LARGE-weighted cash score.
# The weight is a Fraction; it is converted to float before touching the
# Series so the arithmetic stays in float64 (a Fraction operand can push the
# result to object dtype).
node_df = merged[['cust_id', 'kyc_agg', 'c_agg']].copy()
c_large_weight = float(node_weights['C_LARGE'])
node_df['score'] = node_df['kyc_agg'] + c_large_weight * node_df['c_agg']

# Clean: keep only the identifier and the final score.
node_df = node_df[['cust_id', 'score']]

# Export: make sure the output folder exists so a fresh checkout can run
# the notebook end to end.
pagerank_dir = DATAPATH / 'pagerank'
pagerank_dir.mkdir(parents=True, exist_ok=True)
node_df.to_parquet(pagerank_dir / 'input_node_scores.parquet', index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
59304,CUST76986222,3.304762
12819,CUST60968343,3.213333
98666,CUST73079564,2.925
77174,CUST71422073,2.193333
129270,CUST34711252,2.193333
...,...,...
73768,CUST26647065,0.0
73769,CUST54834228,0.0
73770,CUST13638671,0.0
73771,CUST65292775,0.0


# Edge Suspicion Score
Features ending in `_agg` (e.g. `e_agg`, `w_agg`, `g_agg`) are aggregates of pre-computed transaction features.

In [8]:
# Locations of the processed inputs used for the edge-level score.
DATAPATH = Path('../data/processed')
EPATH = DATAPATH.joinpath('emt.parquet')
WIREPATH = DATAPATH.joinpath('wire.parquet')

# Convert every edge weight (an int or a 'p/q' string in weights.yml) into
# an exact Fraction.
edge_weights = dict(
    (indicator, Fraction(raw_weight))
    for indicator, raw_weight in weights['EDGE_WEIGHTS'].items()
)

## E-transfer aggregation

In [9]:
def e_transfer_agg(r, edge_weights):
    """Combine one e-transfer's indicators into a single weighted score.

    Parameters
    ----------
    r : row-like object exposing the attributes ``e_role``, ``e_at_risk``
        and ``e_trad_med``.
    edge_weights : mapping with the keys ``E_ROLE``, ``E_AT_RISK`` and
        ``E_TRAD_MED``.

    Returns
    -------
    float
        Sum of ``weight * indicator`` over the three indicators.
    """
    # (weight key, row attribute) pairs, listed in summation order.
    pairs = (
        ('E_ROLE', 'e_role'),
        ('E_AT_RISK', 'e_at_risk'),
        ('E_TRAD_MED', 'e_trad_med'),
    )
    contributions = [edge_weights[key] * getattr(r, field) for key, field in pairs]

    # Accumulate left to right, starting from the first term.
    total = contributions[0]
    for part in contributions[1:]:
        total = total + part

    return float(total)

# Load the e-transfers and score every row; edge_weights is forwarded to
# e_transfer_agg as its second positional argument.
edf = pd.read_parquet(EPATH)
edf['e_agg'] = edf.apply(e_transfer_agg, axis=1, args=(edge_weights,))
edf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,...,e_trad_med,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,e_agg
268620,CUST73853048,EXTERNAL583432,DR.PAM EDWARDS DVM,PATRICK FISHER,,216.5,YPYT29191389,0,,,...,0,emt,0,0,0,0,0,0,0,0.0
364562,EXTERNAL458899,CUST79476760,JENNIFER SULLIVAN,CHARLES FOURNIER,,100.0,CXZH93948445,0,0.0,0.0,...,0,emt,0,0,0,0,0,0,0,0.0
333369,CUST96136647,CUST67730951,TONY PERKINS,LAI JIAN JUN,,1135.0,SJTA57399949,0,1.0,0.0,...,0,emt,0,0,0,0,0,0,0,0.0


## Wire transfer aggregation
The aggregate wire transfer score is a weighted sum of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal` (with the default weights of 1/3 each, this is their average).

In [10]:
def wire_transfer_agg(r, edge_weights):
    """Combine one wire transfer's indicators into a single weighted score.

    Parameters
    ----------
    r : row-like object exposing the attributes ``w_to_country``,
        ``w_from_country`` and ``w_external_to_animal``.
    edge_weights : mapping with the keys ``W_TO_COUNTRY``,
        ``W_FROM_COUNTRY`` and ``W_EXTERNAL_TO_ANIMAL``.

    Returns
    -------
    float
        Sum of ``weight * indicator`` over the three indicators.
    """
    # (weight key, row attribute) pairs, listed in summation order.
    pairs = (
        ('W_TO_COUNTRY', 'w_to_country'),
        ('W_FROM_COUNTRY', 'w_from_country'),
        ('W_EXTERNAL_TO_ANIMAL', 'w_external_to_animal'),
    )
    contributions = [edge_weights[key] * getattr(r, field) for key, field in pairs]

    # Accumulate left to right, starting from the first term.
    total = contributions[0]
    for part in contributions[1:]:
        total = total + part

    return float(total)

# Load the wire transfers and score every row; edge_weights is forwarded to
# wire_transfer_agg as its second positional argument.
wdf = pd.read_parquet(WIREPATH)
wdf['w_agg'] = wdf.apply(wire_transfer_agg, axis=1, args=(edge_weights,))
wdf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,...,w_external_to_animal,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,w_agg
44833,EXTERNAL299553,CUST74398909,GEORGES BOULANGER,KYLE WELLS,3103.0,CA,CA,PLMV36877615,0.0,0.0,...,0,wire,0,0,0,0,0,0,0,0.0
56562,CUST24246843,EXTERNAL361450,CENG JIE,KATHY DEAN,2616.0,CA,CA,ZJCS91888907,,,...,0,wire,0,0,0,0,0,0,0,0.0
38848,EXTERNAL617628,CUST26477463,SR(A). MIGUEL ÁNGEL MADERA,NANCY MCCLAIN,7278.5,CA,CA,OTKL12372719,1.0,0.0,...,0,wire,0,0,0,0,1,0,0,0.0


## General Score Calculation

In [11]:
def compute_score(r, edge_weights, receiver=False):
    """Weighted score of the general transaction indicators for one side of an edge.

    The row is read by position, so the caller must pass the indicator
    columns in this order:

    0. animal indicator        (weighted by ``T_ANIMAL``)
    1. large-animal indicator  (weighted by ``T_ANIMAL_LARGE``)
    2. international indicator (weighted by ``T_INT``)
    3. shipping indicator      (weighted by ``T_TO_SHIPPING``; read only
       when ``receiver`` is true)

    Parameters
    ----------
    r : pandas.Series
        Indicator values in the positional order above.
    edge_weights : mapping
        Must contain ``T_ANIMAL``, ``T_ANIMAL_LARGE`` and ``T_INT``, plus
        ``T_TO_SHIPPING`` when ``receiver`` is true.
    receiver : bool, default False
        When true, the shipping indicator at position 3 is added.

    Returns
    -------
    float
        The weighted sum of the indicators.
    """
    # Each weight is paired with the position of its own indicator, so the
    # result stays correct if T_ANIMAL and T_ANIMAL_LARGE ever differ.
    score = \
        edge_weights['T_ANIMAL']*r.iloc[0] +\
        edge_weights['T_ANIMAL_LARGE']*r.iloc[1] +\
        edge_weights['T_INT']*r.iloc[2]

    if receiver:
        score += edge_weights['T_TO_SHIPPING']*r.iloc[3]

    return float(score)

In [12]:
# Indicator columns for each side of a transaction. compute_score reads the
# row by position (iloc), so the order of these lists matters.
sender_cols = ['t_from_animal', 't_from_animal_large', 't_from_int']
receiver_cols = ['t_to_animal', 't_to_animal_large', 't_to_int', 't_to_shipping']

# E-transfers first, then wire transfers: score the sender side and the
# receiver side of every transaction and store their mean as `g_agg`.
# No min-max rescaling is applied to the result.
for txn_df in (edf, wdf):
    sender_score = txn_df[sender_cols].apply(compute_score, axis=1, args=(edge_weights, False))
    receiver_score = txn_df[receiver_cols].apply(compute_score, axis=1, args=(edge_weights, True))
    txn_df['g_agg'] = (sender_score + receiver_score) / 2

## Edge Score Calculation

In [13]:
# Final edge score = general-indicator aggregate + channel-specific aggregate.
edf['score'] = edf['g_agg'] + edf['e_agg']
wdf['score'] = wdf['g_agg'] + wdf['w_agg']

# Exporting: write the enriched tables back over the processed input files.
edf.to_parquet(EPATH, index=False)
wdf.to_parquet(WIREPATH, index=False)

# Keep only what the combined export needs and tag each row with its channel.
edge_cols = ['cust_id_sender', 'cust_id_receiver', 'score']
s1 = edf[edge_cols].assign(type='e')
s2 = wdf[edge_cols].assign(type='w')

# Make sure the output folder exists so a fresh checkout can run the notebook
# end to end.
pagerank_dir = DATAPATH / 'pagerank'
pagerank_dir.mkdir(parents=True, exist_ok=True)

# ignore_index gives the stacked frame one clean 0..n-1 index instead of
# repeating each source frame's labels; index=False matches the other
# exports in this notebook.
pd.concat([s1, s2], ignore_index=True).to_parquet(
    pagerank_dir / 'input_edge_scores.parquet', index=False
)