In [13]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, e.g. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure)

The purpose of this notebook is to aggregate those three factors (and the indicators contained therein) to come up with an explainable total node suspicion score. 

TODO:

- ~Aggregate node indicators (JWB)~
- Aggregate e-transfer features (MM)
- Aggregate wire-transfer features (JWB)
- ~Aggregate cash-transfer features (JWB)~
- Aggregate general features (AGG)

# Node Suspicion Score

In [14]:
# Directory of processed inputs, plus the two files read by the node-score section.
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH.joinpath('kyc.parquet')    # read by the KYC aggregation
CASHPATH = DATAPATH.joinpath('cash.parquet')  # read by the cash aggregation

## KYC Aggregation
We first combine the KYC features `label`, `occ_int`, `occ_wealth`, `occ_animal` into one KYC score. To do this, we assume that the occupation flags are additive, and then take the max of the occupation aggregate score with the `label` feature. 

Basically: if they have been flagged for money laundering, their node should have the highest suspicion score; otherwise we take a weighted average of the occupation flags.

`kyc_agg`$=\max$(`label`, $\texttt{avg}$(`occ_int`, `occ_wealth`, `occ_animal`))


In [15]:
# Optional feature weights for the occupation flags used by the KYC aggregate.
# All three flags are weighted equally and the weights sum to 1.
W_OCC_INT = W_OCC_WEALTH = W_OCC_ANIMAL = 1/3

In [16]:
kyc_df = pd.read_parquet(KYCPATH)

def kyc_agg(r):
    """Return the aggregate KYC suspicion score: max(label, occupation score).

    The occupation score is the weighted sum of ``occ_int``, ``occ_wealth``
    and ``occ_animal`` using the module-level ``W_OCC_INT``, ``W_OCC_WEALTH``
    and ``W_OCC_ANIMAL`` weights.

    NOTE: ``W_OCC_INT`` and ``W_OCC_ANIMAL`` are reassigned further down in
    this notebook (general score weights), so re-running this cell after
    that one silently uses different weights.

    Parameters
    ----------
    r : pandas.Series or pandas.DataFrame
        Either a single KYC row or the whole KYC frame; it must provide
        ``label``, ``occ_int``, ``occ_wealth`` and ``occ_animal``.

    Returns
    -------
    A scalar when ``r`` is a row, or a Series aligned with ``r`` when it is
    a frame.
    """
    occ_agg = (
        W_OCC_INT * r['occ_int']
        + W_OCC_WEALTH * r['occ_wealth']
        + W_OCC_ANIMAL * r['occ_animal']
    )
    # np.maximum is element-wise, so the same code handles one row or all rows.
    return np.maximum(r['label'], occ_agg)

# Score every customer in one vectorized call rather than a row-wise apply.
kyc_df['kyc_agg'] = kyc_agg(kyc_df)
kyc_df.sample(3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,kyc_agg
132702,ANGELA EDWARDS,female,Auto Dealer,48.0,12.0,CUST16218455,0,0,0,1,1.0
21789,TYLER WEBB,male,Plumber,44.0,20.0,CUST22732842,0,0,0,0,0.0
25200,ANDREW DAVIS JR.,male,Physicist,42.0,1.0,CUST21475601,0,0,0,0,0.0


## Cash Aggregation
For the purposes of this project, we consider each cash transaction as involving only one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (if everyone were connected to the bank, everyone would be a 2-hop neighbour). 

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is only one cash feature currently, `c_large`. `c_agg` is therefore defined as the proportion of a customer's cash transactions that have been flagged as unusually large.

In [17]:
cash_df = pd.read_parquet(CASHPATH)

# Per-customer mean of the `c_large` indicator, exposed as column `c_agg`.
cash_agg_df = (
    cash_df
    .groupby('cust_id')['c_large']
    .mean()
    .rename('c_agg')
    .reset_index()
)
cash_agg_df.sample(3)

Unnamed: 0,cust_id,c_agg
16529,CUST34606741,0.111111
22942,CUST44240453,0.0
51376,CUST85926594,1.0


## Node Score Calculation
We assume that the total node suspicion score is the average of the aggregate KYC suspicion score and the aggregate cash suspicion score above. Customers with no cash transactions are given a cash score of 0.

In [18]:
# Left-join so every KYC customer is kept; a customer with no cash rows has
# no `c_agg` after the join and is given a cash score of 0.
merged = kyc_df.merge(cash_agg_df, how='left', on='cust_id')
merged['c_agg'] = merged['c_agg'].fillna(0)

# Node score = mean of the KYC and cash aggregates; keep only the id and score.
node_df = (
    merged[['cust_id', 'kyc_agg', 'c_agg']]
    .assign(score=lambda d: (d['kyc_agg'] + d['c_agg'])/2)
    .loc[:, ['cust_id', 'score']]
)

# Persist the node scores, then display them from most to least suspicious.
node_df.to_parquet(DATAPATH/'node_score.parquet', index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
114317,CUST47228718,1.0
192094,CUST75958300,1.0
44269,CUST75045024,1.0
140795,CUST22362657,1.0
4293,CUST13082674,1.0
...,...,...
73748,CUST49137654,0.0
73749,CUST76627038,0.0
73750,CUST81879435,0.0
73751,CUST28203470,0.0


# Edge Suspicion Score
Features ending in `_agg` (e.g. `e_agg`, `w_agg`, `g_agg`) are aggregates of pre-computed transaction features.

In [46]:
# Same processed-data directory as the node section, plus the two edge inputs.
DATAPATH = Path('../data/processed')
EPATH = DATAPATH.joinpath('emt.parquet')      # read by the e-transfer aggregation
WIREPATH = DATAPATH.joinpath('wire.parquet')  # read by the wire-transfer aggregation

## E-transfer aggregation

In [53]:
edf = pd.read_parquet(EPATH)

# E-transfer aggregate: sum of the three e-transfer indicators, computed
# column-wise instead of with a Python-level apply over every row.
# skipna=False keeps a missing indicator as NaN, as the built-in sum() did.
# NOTE(review): this is a sum, whereas w_agg is an average of its three
# indicators, so the two may be on different scales - confirm this is intended.
edf['e_agg'] = edf[['e_at_risk', 'e_role', 'e_trad_med']].sum(axis=1, skipna=False)
edf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,trxn_type,gender_sender,occupation_sender,...,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,e_at_risk,e_role,e_trad_med,e_agg
435598,CUST79389623,CUST88946146,RICHARD BARNES,SHANE HANSEN,,501.5,KNPW74258424,emt,male,Jewelry Dealer,...,0,0,0,0,0,1,0,0,0,0
236636,EXTERNAL153009,CUST95091229,GUO MIN,DR.ADAM PARRISH,,104.0,UGTH71633092,emt,,,...,0,0,0,0,0,0,0,0,0,0
412804,CUST97512508,CUST36654584,PATRICK LACHANCE,BRENDA GAY,,142.0,RKXB74749742,emt,male,Distiller,...,0,0,0,0,0,0,0,0,0,0


## Wire transfer aggregation
The aggregate wire transfer score is an average of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal`.

In [48]:
wdf = pd.read_parquet(WIREPATH)

# Wire aggregate: average of the three wire indicators.
# The columns are read from `wdf`, the frame loaded above; `wire_df` is not
# defined in this notebook and raised NameError on a fresh kernel.
wdf['w_agg'] = (wdf['w_to_country'] + wdf['w_from_country'] + wdf['w_external_to_animal'])/3
wdf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,trxn_type,gender_sender,...,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,w_to_country,w_from_country,w_external_to_animal,w_agg
20942,CUST96870219,CUST47725965,ANTHONY JENKINS,CLÉMENCE THIBAULT-LALIBERTÉ,1626.0,CA,CA,SFNY50964463,wire,male,...,0,0,0,0,0,0,0,0,0,0.0
57917,EXTERNAL204726,CUST27546771,DR SEAN PICKERING,DIANE LEONARD,4770.0,UK,CA,HJWZ88644125,wire,,...,0,0,0,0,1,0,0,0,0,0.0
24940,CUST92128375,CUST44980145,PRANAY VASA,DR.WILLIAM MARSHALL,5429.5,CA,CA,MKCS29778357,wire,female,...,0,0,0,0,0,1,0,0,0,0.0


## General Score Calculation

In [49]:
# Weights for the general transaction indicators; they sum to 1.
# NOTE: W_OCC_INT and W_OCC_ANIMAL reuse the names of the KYC weights defined
# earlier in the notebook, so after this cell runs, re-running the KYC
# aggregation picks up these values instead of the original 1/3 weights.
W_OCC_INT = 0.5
W_OCC_ANIMAL_L = W_OCC_ANIMAL = 0.25

In [50]:
def compute_score(r, w_occ_int, w_occ_animal_l, w_occ_animal):
    """Weighted sum of the first three positional values of a row.

    The row is read by position, so the caller decides which columns are
    scored by the order in which it selects them:
    position 0 is weighted by ``w_occ_animal``, position 1 by
    ``w_occ_animal_l`` and position 2 by ``w_occ_int``.
    """
    animal, animal_large, international = r.iloc[0], r.iloc[1], r.iloc[2]
    return w_occ_int*international + w_occ_animal_l*animal_large + w_occ_animal*animal

In [55]:
#Col names
# Order matters: compute_score reads each row by position.
sender_cols = ['t_from_animal', 't_from_animal_large', 't_from_int']
receiver_cols = ['t_to_animal', 't_to_animal_large', 't_to_int']

def _general_agg(df):
    """Return the min-max normalised general score for every row of ``df``.

    The raw score is the sender score plus the receiver score, each computed
    by ``compute_score`` with the W_OCC_INT / W_OCC_ANIMAL_L / W_OCC_ANIMAL
    weights. The raw score is then rescaled to [0, 1] over the rows of ``df``.

    When ``df`` is empty or every raw score is identical, the min-max range
    is zero and the rescaling would be 0/0 (NaN for every row); in that case
    a score of 0.0 is returned for every row instead.
    """
    if df.empty:
        return pd.Series(0.0, index=df.index)

    weights = (W_OCC_INT, W_OCC_ANIMAL_L, W_OCC_ANIMAL)
    raw = (
        df[sender_cols].apply(compute_score, axis=1, args=weights)
        + df[receiver_cols].apply(compute_score, axis=1, args=weights)
    )

    lowest, highest = raw.min(), raw.max()
    if not highest > lowest:
        # Zero (or undefined) range: avoid producing an all-NaN column.
        return pd.Series(0.0, index=df.index)
    return (raw - lowest)/(highest - lowest)

# Same calculation for both transfer types; each frame is normalised on its own.
#***E-TRANSFER**
edf['g_agg'] = _general_agg(edf)

#**WIRE-TRANSFER
wdf['g_agg'] = _general_agg(wdf)

## Edge Score Calculation

In [61]:
# Edge score = general aggregate + the transfer-type-specific aggregate.
edf['score'] = edf['g_agg'] + edf['e_agg']
wdf['score'] = wdf['g_agg'] + wdf['w_agg']

#exporting
# NOTE: these two writes overwrite the input files this notebook read
# (EPATH / WIREPATH) with the added aggregate and score columns.
edf.to_parquet(EPATH, index=False)
wdf.to_parquet(WIREPATH, index=False)

# Combined edge list, tagged by transfer type ('e' = e-transfer, 'w' = wire).
edge_cols = ['cust_id_sender', 'cust_id_receiver', 'score']
s1 = edf[edge_cols].assign(type='e')
s2 = wdf[edge_cols].assign(type='w')

# Both frames carry their own row labels, so a plain concat would repeat
# labels; renumber the rows and do not store the index, matching the
# node_score export.
pd.concat([s1, s2], ignore_index=True).to_parquet(DATAPATH / 'edge_score.parquet', index=False)