In [9]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, e.g. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure)

The purpose of this notebook is to aggregate those three factors (and the indicators contained therein) to come up with an explainable total node suspicion score. 

TODO:

- ~Aggregate node indicators (JWB)~
- Aggregate e-transfer features (MM)
- Aggregate wire-transfer features (JWB)
- ~Aggregate cash-transfer features (JWB)~
- Aggregate general features (AGG)

# Node Suspicion Score

In [15]:
# Processed-data directory (relative to the notebook) and the two input files
# used by the node-score section.
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH.joinpath('kyc.parquet')
CASHPATH = DATAPATH.joinpath('cash.parquet')

## KYC Aggregation
We first combine the KYC features `label`, `occ_int`, `occ_wealth`, `occ_animal` into one KYC score. To do this, we assume that the occupation flags are additive, and then take the maximum of the aggregate occupation score and the `label` feature. 

Basically: if a customer has been flagged for money laundering, their node should have the highest suspicion score; otherwise we take a weighted average of the occupation flags.

`kyc_agg`$=\max$(`label`, $\texttt{avg}$(`occ_int`, `occ_wealth`, `occ_animal`))


In [16]:
# Optional feature weights for the three occupation flags used by kyc_agg.
# All three are equal, so the weighted sum is a plain average of the flags.
W_OCC_INT = W_OCC_WEALTH = W_OCC_ANIMAL = 1/3

In [17]:
kyc_df = pd.read_parquet(KYCPATH)


def kyc_agg(r):
    """Return the aggregate KYC suspicion score.

    The occupation flags are combined as a weighted sum (weights
    ``W_OCC_INT``, ``W_OCC_WEALTH`` and ``W_OCC_ANIMAL``, looked up when the
    function is called), and the result is the larger of that sum and
    ``label``.

    Parameters
    ----------
    r : pandas.Series or pandas.DataFrame
        A single customer row or the whole KYC frame. Must provide the keys
        ``label``, ``occ_int``, ``occ_wealth`` and ``occ_animal``.

    Returns
    -------
    scalar or pandas.Series
        One score for a row, or one score per row for a frame.
    """
    occ_agg = (
        W_OCC_INT * r['occ_int']
        + W_OCC_WEALTH * r['occ_wealth']
        + W_OCC_ANIMAL * r['occ_animal']
    )
    # np.maximum works element-wise, so the same code scores one row or a
    # whole frame. Unlike the built-in max(), it propagates NaN regardless of
    # argument order instead of silently keeping whichever value came first.
    return np.maximum(r['label'], occ_agg)


# One vectorised call instead of a Python-level apply over every row.
kyc_df['kyc_agg'] = kyc_agg(kyc_df)
kyc_df.sample(3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,kyc_agg
187958,YU RONG,other,Grocery Store Clerk,30.0,3.0,CUST56569983,0,0,0,0,0.0
28024,YANG XIN,female,Postal Worker,32.0,5.0,CUST77566742,0,0,1,0,0.333333
103030,TRISTAN TURNER,male,Therapist,42.0,3.0,CUST33250651,0,0,0,0,0.0


## Cash Aggregation
For the purposes of this project, we consider cash transactions as involving only one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (if everyone were connected to the bank, everyone would be a 2-hop neighbour). 

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is only one cash feature currently, `c_large`. `c_agg` is therefore defined as the proportion of a customer's cash transactions that have been flagged as unusually large.

In [18]:
# Load the processed cash transactions from CASHPATH.
cash_df = pd.read_parquet(CASHPATH)
# Preview three random rows. No random_state is passed, so the rows shown
# differ from run to run.
cash_df.sample(3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
137500,CUST53716443,1830,withdrawal,PPKV15023537,1,0,1,0,Real Estate Agent,0
172856,CUST71291865,2475,deposit,LDCA70083273,1,0,0,0,Musician,0
57728,CUST88784855,5055,withdrawal,POIA78096126,0,0,0,0,Property Manager,0


In [19]:
# Re-read the cash table, then take the per-customer mean of c_large and
# expose it as c_agg (one row per cust_id).
cash_df = pd.read_parquet(CASHPATH)
cash_agg_df = (
    cash_df
    .groupby('cust_id')['c_large']
    .mean()
    .reset_index()
    .rename(columns={'c_large': 'c_agg'})
)
cash_agg_df.sample(3)

Unnamed: 0,cust_id,c_agg
13896,CUST30615977,0.0
50943,CUST85259986,0.0
17351,CUST35794847,0.0


## Node Score Calculation
We assume that the total node suspicion score is the average of the aggregate KYC suspicion score (`kyc_agg`) and the aggregate cash suspicion score (`c_agg`) above.

In [20]:
# Join KYC and cash data. The left join keeps every KYC customer; customers
# with no row in cash_agg_df get c_agg = 0.
merged = (
    kyc_df
    .merge(cash_agg_df, how='left', on='cust_id')
    .assign(c_agg=lambda d: d['c_agg'].fillna(0))
)

# Aggregate: the node score is the plain mean of the two component scores.
node_df = (
    merged.loc[:, ['cust_id', 'kyc_agg', 'c_agg']]
    .assign(score=lambda d: (d['kyc_agg'] + d['c_agg'])/2)
)
# NOTE(review): this writes the component scores to 'kyc_2.df' in the current
# working directory (parquet content despite the extension) - confirm it is
# still needed.
node_df.to_parquet('kyc_2.df')

# Clean: keep only the identifier and the final score.
node_df = node_df[['cust_id', 'score']]

# Export, then show the customers ordered from most to least suspicious.
node_df.to_parquet(DATAPATH/'pr_node_score.parquet', index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
114317,CUST47228718,1.0
192094,CUST75958300,1.0
44269,CUST75045024,1.0
140795,CUST22362657,1.0
4293,CUST13082674,1.0
...,...,...
73748,CUST49137654,0.0
73749,CUST76627038,0.0
73750,CUST81879435,0.0
73751,CUST28203470,0.0


# Edge Suspicion Score
Features ending in **`_agg`** (`e_agg`, `w_agg`, `g_agg`) are aggregates of pre-computed transaction features.

In [21]:
# Processed-data directory (re-declared here with the same value as in the
# node section) and the two transaction files used by the edge-score section.
DATAPATH = Path('../data/processed')
EPATH = DATAPATH.joinpath('emt.parquet')
WIREPATH = DATAPATH.joinpath('wire.parquet')

## E-transfer aggregation

In [22]:
edf = pd.read_parquet(EPATH)

# Aggregate e-transfer score: the sum of the three e-transfer indicators,
# computed column-wise instead of with a per-row Python lambda.
# skipna=False keeps the behaviour of the built-in sum(): a missing indicator
# yields NaN rather than being treated as 0.
# NOTE(review): this is a sum, whereas w_agg divides its three indicators
# by 3 - confirm the different scales are intended.
e_indicator_cols = ['e_at_risk', 'e_role', 'e_trad_med']
edf['e_agg'] = edf[e_indicator_cols].sum(axis=1, skipna=False)
edf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,...,e_role,e_trad_med,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,e_agg
398545,EXTERNAL571758,CUST18456269,CATHERINE ROBINSON,WANG YU,,300.0,ZBPV41101179,0.0,0.0,0.0,...,0,0,emt,0,0,0,0,0,0,0
10998,CUST95297429,CUST19702299,MICHAEL MAYS,CHRISTOPHER ROBERTSON,,365.0,KQIX54475972,0.0,0.0,0.0,...,0,0,emt,0,0,0,0,0,0,0
121405,CUST66650337,EXTERNAL112331,DR.MILEN VASIL'EVICH EGOROV,JOSHUA MOORE,,914.0,QVUH68135709,,,,...,0,0,emt,0,0,0,0,0,0,0


## Wire transfer aggregation
The aggregate wire transfer score is an average of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal`.

In [26]:
wdf = pd.read_parquet(WIREPATH)

# Aggregate wire score: mean of the three wire-transfer indicators.
wdf['w_agg'] = (
    wdf['w_to_country']
    .add(wdf['w_from_country'])
    .add(wdf['w_external_to_animal'])
    .div(3)
)
wdf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,...,w_from_country,w_external_to_animal,trxn_type,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,w_agg
64727,CUST61215958,EXTERNAL983318,DANIELLE JACQUES,BENJAMIN MURRAY MD,1826.0,CA,AU,BBHG42618210,,,...,0,0,wire,0,0,0,0,0,0,0.333333
12403,EXTERNAL831504,CUST50456434,JAMES OWENS,NATASHA KELLY,18359.0,CA,CA,PYSW47288614,0.0,0.0,...,0,0,wire,0,0,0,0,0,0,0.0
54489,EXTERNAL382531,CUST71152603,MICHAEL MORRIS,KATHLEEN GOULD,1597.5,CA,CA,IEYW14153870,0.0,0.0,...,0,0,wire,0,0,0,0,0,0,0.0


## General Score Calculation

In [27]:
# Weights passed to compute_score for the general transaction indicators.
# NOTE: W_OCC_INT and W_OCC_ANIMAL rebind names that were set to the KYC
# weights earlier in the notebook. kyc_agg reads those names when it is
# called, so re-running the KYC cell after this one would use these values.
W_OCC_INT = 1/2
W_OCC_ANIMAL_L = W_OCC_ANIMAL = 1/4

In [28]:
def compute_score(r, w_occ_int, w_occ_animal_l, w_occ_animal):
    """Return a weighted sum of the first three values of ``r``.

    Values are read by position, not by label.

    Parameters
    ----------
    r : pandas.Series
        Row whose position 0 is weighted by ``w_occ_animal``, position 1 by
        ``w_occ_animal_l`` and position 2 by ``w_occ_int``.
    w_occ_int, w_occ_animal_l, w_occ_animal : float
        Weights applied to positions 2, 1 and 0 respectively.

    Returns
    -------
    The weighted sum of the three values.
    """
    animal_value, animal_large_value, int_value = r.iloc[0], r.iloc[1], r.iloc[2]
    int_term = w_occ_int * int_value
    animal_large_term = w_occ_animal_l * animal_large_value
    animal_term = w_occ_animal * animal_value
    return int_term + animal_large_term + animal_term

In [29]:
#Col names (order matters: compute_score reads the values by position)
sender_cols = ['t_from_animal', 't_from_animal_large', 't_from_int']
receiver_cols = ['t_to_animal', 't_to_animal_large', 't_to_int']


def _general_agg(df):
    """Return the min-max normalised general score for each row of ``df``.

    The sender-side and receiver-side indicator columns are each scored with
    ``compute_score`` (weights ``W_OCC_INT``, ``W_OCC_ANIMAL_L``,
    ``W_OCC_ANIMAL``), the two scores are added, and the sum is rescaled to
    the range [0, 1] over the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction frame containing every column named in ``sender_cols``
        and ``receiver_cols``.

    Returns
    -------
    pandas.Series
        One normalised score per row, aligned with ``df``'s index.
    """
    weights = (W_OCC_INT, W_OCC_ANIMAL_L, W_OCC_ANIMAL)
    score_sender = df[sender_cols].apply(compute_score, axis=1, args=weights)
    score_receiver = df[receiver_cols].apply(compute_score, axis=1, args=weights)

    raw = score_sender + score_receiver
    lowest, highest = raw.min(), raw.max()
    if highest == lowest:
        # Every row has the same raw score (e.g. no indicator fired at all).
        # Dividing by the zero range would turn the whole column into NaN,
        # so return the shifted values, which are all 0.
        return raw - lowest
    return (raw - lowest) / (highest - lowest)


#***E-TRANSFER**
edf['g_agg'] = _general_agg(edf)

#**WIRE-TRANSFER
wdf['g_agg'] = _general_agg(wdf)

## Edge Score Calculation

In [30]:
# Edge score = general (occupation-based) score + transaction-type score.
edf['score'] = edf['g_agg'] + edf['e_agg']
wdf['score'] = wdf['g_agg'] + wdf['w_agg']

#exporting
# NOTE: these two writes overwrite the input files with the frames that now
# also carry the aggregate and score columns.
edf.to_parquet(EPATH, index=False)
wdf.to_parquet(WIREPATH, index=False)

# Combined edge list, tagged with the transaction type ('e' or 'w').
s1 = edf[['cust_id_sender', 'cust_id_receiver', 'score']].copy()
s2 = wdf[['cust_id_sender', 'cust_id_receiver', 'score']].copy()
s1['type'] = 'e'
s2['type'] = 'w'

# ignore_index gives the combined frame fresh row labels: the labels of s1 and
# s2 come from two unrelated frames and would otherwise repeat. index=False
# matches the other exports in this notebook.
pd.concat([s1, s2], ignore_index=True).to_parquet(DATAPATH / 'pr_edge_score.parquet', index=False)