In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, eg. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure)

The purpose of this notebook is to aggregate those three factors (and the indicators contained therein) to come up with an explainable total node suspicion score. 

TODO:

- ~Aggregate node indicators (JWB)~
- Aggregate e-transfer features (MM)
- Aggregate wire-transfer features (JWB)
- ~Aggregate cash-transfer features (JWB)~
- Aggregate general features (AGG)
- PageRank (MM)
- Find a suspicious network (AGG)

# Node Suspicion Score

In [28]:
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH / 'kyc.parquet'
CASHPATH = DATAPATH / 'cash.parquet'

## KYC Aggregation
We first combine the KYC features `label`, `occ_int`, `occ_wealth`, `occ_animal` into one one KYC score. To do this, we assume that the occupation flags are additive, and then take the max of the occupation aggregate score with the `label` feature. 

Basically: If they have been flagged for money laundering, their node should have the heighest suspision score, otherwise we take a weighted average of the occupation flags.

`kyc_agg`$=\max$(`label`, $\texttt{avg}$(`occ_int`, `occ_wealth`, `occ_animal`))


In [34]:
#Optional feature weights
W_OCC_INT = 1/3
W_OCC_WEALTH = 1/3
W_OCC_ANIMAL = 1/3

In [38]:
kyc_df = pd.read_parquet(KYCPATH)

def kyc_agg(r): 
    occ_agg = W_OCC_INT*r.occ_int + W_OCC_WEALTH*r.occ_wealth + W_OCC_ANIMAL*r.occ_animal
    return max(r.label, occ_agg)

kyc_df['kyc_agg'] = kyc_df.apply(lambda r: kyc_agg(r), axis=1)
kyc_df.sample(3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,kyc_agg
21989,AMY JOHNSON,female,Pet Groomer,33.0,8.0,CUST68389688,0,1,0,0,0.333333
62717,JOHN WATKINS,male,Gardener,34.0,3.0,CUST72834415,0,0,0,0,0.0
68089,DR.KELLY CARTER,female,Food Critic,33.0,7.0,CUST80817667,0,0,0,0,0.0


## Cash Aggregation
For the purposes of this project, we consider cash transactions as only occuring between one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (everyone is connected to the bank, so everyone is a 2-hop neighbour). 

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is only one cash feature currently, `c_large`. `cash_agg` is therefore defined as the proportion of a customers cash transactions that have been defined as being unusually large.

In [54]:
cash_df = pd.read_parquet(CASHPATH)
cash_agg_df = cash_df.groupby('cust_id')['c_large'].mean().reset_index()
cash_agg_df = cash_agg_df.rename(columns={'c_large':'c_agg'})
cash_agg_df.sample(3)

Unnamed: 0,cust_id,c_agg
13739,CUST30370806,0.0
60855,CUST99887405,0.0
26105,CUST48967972,0.0


## Node Score Calculation
We assume that the total node suspicion score is an average of the aggregate kyc suspicion score  and the aggregate cash suspicion score above.

In [62]:
#Join KYC and Cash Data
merged = kyc_df.merge(cash_agg_df, on='cust_id', how='left')
merged['c_agg'] = merged['c_agg'].fillna(0)

#Aggregate 
node_df = merged[['cust_id', 'kyc_agg', 'c_agg']].copy()
node_df['score'] = (node_df['kyc_agg'] + node_df['c_agg'])/2

#Clean
node_df = node_df[['cust_id', 'score']]


#Export
node_df.to_parquet(DATAPATH/'node_score.parquet', index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
150089,CUST87731042,1.0
165103,CUST51289421,1.0
110190,CUST21142684,1.0
180831,CUST55502501,1.0
57930,CUST27210892,1.0
...,...,...
73441,CUST87917518,0.0
73442,CUST10383819,0.0
73443,CUST81898171,0.0
73444,CUST48514678,0.0


# Edge Suspicion Score
Features starting with **agg** are aggregates of pre-computed transaction features.

In [63]:
DATAPATH = Path('../data/processed')
WIREPATH = DATAPATH / 'wire.parquet'

## E-transfer aggregation

In [None]:
#Maaz code here

## Wire transfer aggregation
The aggregate wire transfer score is an average of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal`.

In [69]:
wire_df = pd.read_parquet(WIREPATH)
wire_df['w_agg'] = (wire_df['w_to_country'] + wire_df['w_from_country'] + wire_df['w_external_to_animal'])/3
wire_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,w_to_country,w_from_country,w_external_to_animal,w_agg
67027,CUST47410529,CUST56821551,JOHN MILLS,KAREN ALLEN,1626.5,CA,CA,VTTG41908592,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0,0.0
9599,CUST34075912,EXTERNAL176213,SAMANTHA FERGUSON,ANNA JONES,1544.5,CA,CA,LFUF84927552,,,,,0.0,0.0,0.0,0.0,0,0,0,0.0
39203,EXTERNAL959844,CUST35676669,RENEE BATTA,HARRY BROWN,10900.0,IN,CA,FFMZ50593813,1.0,0.0,1.0,0.0,,,,,0,0,0,0.0


## Edge Score Calculation

In [19]:
#Alex to compute total scores for wire transfer and e-transfer inc. his general flags
#Alex to export to a csv with columns: cust_id_sender, cust_id_reciever, score, trxn_type, trxn_id

# Temp

In [9]:
DATAPATH = Path('../data/processed')
EPATH = DATAPATH / 'emt.parquet'
WIREPATH = DATAPATH / 'wire.parquet'

In [8]:
edf = pd.read_parquet(EPATH)
edf = edf[['cust_id_sender', 'cust_id_receiver', 'trxn_id']].copy()
edf['trxn_type'] = 'emt'
edf['score'] = np.random.uniform(0,1, size=edf.shape[0])

wdf = pd.read_parquet(WIREPATH)
wdf = wdf[['cust_id_sender', 'cust_id_receiver', 'trxn_id']].copy()
wdf['trxn_type'] = 'wire'
wdf['score'] = np.random.uniform(0,1, size=edf.shape[0])

outdf = pd.concat([edf,wdf], index=True)
outdf.to_parquet(DATAPATH / 'temp_edge_score.parquet', index=False)

ValueError: Length of values (506451) does not match length of index (67872)