In [1]:
from fractions import Fraction
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import yaml
import json

# Suspicion Score

There are three reasons why a node might be suspicious: 
1. The KYC data (node data, e.g. occupation) is suspicious
2. The transactions (edges, e.g. e-transfer messages) around that node are suspicious
3. The graph structure around that node is suspicious (e.g. in a 'mule' structure). 

The purpose of this notebook is to aggregate those three factors (and the indicators contained therein) into an explainable total node suspicion score. Because graph structures are complicated and involve international and external customers (of which we only see a small fraction), we focus only on the first two reasons and assume that suspicious graph structures will emerge downstream during PageRank.

To change the weights, edit the `weights.yml` file in the current folder.

In [2]:
# Location of the YAML file holding the tunable indicator weights
WEIGHTPATH = Path('./weights.yml')

# Parse the weights; fractional weights are stored as strings such as '1/6'
with WEIGHTPATH.open('r') as weights_file:
    weights = yaml.safe_load(weights_file)

# Show the parsed weights as the cell output
weights

{'NODE_WEIGHTS': {'NAMED_TRAFFICKER': 10,
  'LABEL': 1,
  'OCC_INT': '1/6',
  'OCC_WEALTH': '1/6',
  'OCC_ANIMAL': '1/6',
  'C_LARGE': '1/2'},
 'EDGE_WEIGHTS': {'E_AT_RISK': 10,
  'E_ROLE': 1,
  'E_TRAD_MED': 1,
  'W_TO_COUNTRY': '1/3',
  'W_FROM_COUNTRY': '1/3',
  'W_EXTERNAL_TO_ANIMAL': '1/3',
  'T_ANIMAL': '1/4',
  'T_ANIMAL_LARGE': '1/4',
  'T_INT': '1/4',
  'T_TO_SHIPPING': '1/4'}}

# Node Suspicion Score

In [3]:
# Processed input datasets
DATAPATH = Path('../data/processed')
KYCPATH = DATAPATH.joinpath('kyc.parquet')
CASHPATH = DATAPATH.joinpath('cash.parquet')

# Edge datasets are just used to get external nodes
EPATH = DATAPATH.joinpath('emt.parquet')
WPATH = DATAPATH.joinpath('wire.parquet')

# Convert every node weight (an int or an 'a/b' string) to an exact Fraction
node_weights = {
    indicator: Fraction(raw_weight)
    for indicator, raw_weight in weights['NODE_WEIGHTS'].items()
}

## Building Node List
At this point we should build the complete node list, which includes EXTERNAL customers

In [4]:
# Load both edge lists (the source of external customers) and the KYC table
edf = pd.read_parquet(EPATH)
wdf = pd.read_parquet(WPATH)
kyc_df = pd.read_parquet(KYCPATH)

# Get all customers from edge lists: one (cust_id, name) frame per edge endpoint.
# The concat order is kept stable because drop_duplicates keeps first occurrences.
sender_name_map = {'cust_id_sender': 'cust_id', 'name_sender': 'name'}
receiver_name_map = {'cust_id_receiver': 'cust_id', 'name_receiver': 'name'}
n1 = edf[list(sender_name_map)].rename(columns=sender_name_map)
n2 = edf[list(receiver_name_map)].rename(columns=receiver_name_map)
n3 = wdf[list(sender_name_map)].rename(columns=sender_name_map)
n4 = wdf[list(receiver_name_map)].rename(columns=receiver_name_map)
cust_names = pd.concat([n1, n2, n3, n4])

# Keep distinct (cust_id, name) pairs whose id contains "EXT"
cleaned = cust_names.drop_duplicates()
cleaned = cleaned[cleaned['cust_id'].str.contains("EXT")]

# Get additional countries from wiretransfer data
sender_country_map = {'cust_id_sender': 'cust_id', 'country_sender': 'country'}
receiver_country_map = {'cust_id_receiver': 'cust_id', 'country_receiver': 'country'}
s1 = wdf[list(sender_country_map)].rename(columns=sender_country_map)
s2 = wdf[list(receiver_country_map)].rename(columns=receiver_country_map)
countries = pd.concat([s1, s2]).drop_duplicates()
cleaned = cleaned.merge(countries, how='left', on='cust_id')

# Add dummy columns for concat (the KYC attributes are not filled in for these rows)
cleaned = cleaned.assign(
    occ_wealth=0,
    occ_animal=0,
    occ_int=0,
    label=0,
    gender=None,
    age=np.nan,
    tenure=np.nan,
)

# Merging with KYC data: every KYC customer is tagged with country 'CA'
kyc_df = kyc_df.assign(country='CA')
kyc_df = pd.concat([kyc_df, cleaned])

# Some extra cleaning: keep the first row seen for each cust_id
kyc_df = kyc_df.drop_duplicates(subset=['cust_id'])

# Sanity check against the expected total number of nodes
assert len(kyc_df) == 300008

## Merging Task 3 Data
We also need to add the task 3 data, after which we save the dataset as `kyc_full.parquet`

In [5]:
# Load the task 3 name metadata: a mapping from name to a record that holds
# 'case_name_score'
with open(DATAPATH.parents[1] / 'task_3' / 'names_metadata.json', 'r') as f:
    names = json.load(f)

kyc_df['named_trafficker'] = 0.0
# Lower-cased copy of the customer name, used only for matching below
kyc_df['parsed_name'] = kyc_df['name'].str.lower()

for name, data in names.items():

    score = float(data['case_name_score'])

    # Build the match mask once and reuse it for counting and for assignment
    is_match = kyc_df['parsed_name'] == name
    n_matches = int(is_match.sum())

    # A name that matches no customer has nothing to score; skipping it also
    # avoids the division by zero the distribution step would otherwise hit
    if n_matches == 0:
        continue

    # Distribute the score (divided by 100) evenly across all customers sharing the name
    kyc_df.loc[is_match, 'named_trafficker'] = score / (100 * n_matches)

# The helper column is no longer needed once matching is done
kyc_df = kyc_df.drop(columns=['parsed_name'])

In [6]:
# Inspect the result: rows with the highest named_trafficker value come first
kyc_df.sort_values(by='named_trafficker', ascending=False)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,country,named_trafficker
3734,RYAN BLAIR,,,,,EXTERNAL690137,0,0,0,0,,1.0
37632,STÉPHANE THERRIEN,male,Import/Export Dealer,28.0,5.0,CUST79371439,1,0,1,0,CA,1.0
143719,GREG ANDERSON,male,Historian,60.0,8.0,CUST74861302,0,0,0,0,CA,1.0
96713,RYAN MILLAR,male,Unknown,44.0,3.0,CUST23634360,0,0,0,0,CA,1.0
64439,JACK MURPHY,,,,,EXTERNAL753690,0,0,0,0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
100041,VICTORIA FORBES,female,Auctioneer,43.0,13.0,CUST62262298,1,0,1,0,CA,0.0
100040,LAWRENCE WEAVER,male,Banker,24.0,5.0,CUST17460485,1,0,0,0,CA,0.0
100039,HUGUES LAPOINTE,male,Stock Broker,31.0,2.0,CUST19106236,1,0,0,0,CA,0.0
100038,PARINAAZ KAR,male,Fitness Trainer,28.0,9.0,CUST21156762,0,0,0,0,CA,0.0


In [7]:
# Save the combined node table (including named_trafficker) next to the other
# processed datasets; index=False leaves the DataFrame index out of the file
kyc_df.to_parquet(DATAPATH / 'kyc_full.parquet', index=False)

## KYC Aggregation
We first combine the KYC features `named_trafficker`, `label`, `occ_int`, `occ_wealth`, `occ_animal` into one score using the weights in `weights.yml`

In [8]:
def kyc_agg(r, node_weights):
    """Return the weighted sum of the KYC indicators of one customer row.

    Parameters
    ----------
    r : row-like
        Must expose `label`, `occ_int`, `occ_wealth`, `occ_animal` and
        `named_trafficker` as attributes.
    node_weights : mapping
        Must provide the keys 'LABEL', 'OCC_INT', 'OCC_WEALTH', 'OCC_ANIMAL'
        and 'NAMED_TRAFFICKER'.

    Returns
    -------
    float
        The weighted sum, converted to a float.
    """
    label_part = node_weights['LABEL'] * r.label
    int_part = node_weights['OCC_INT'] * r.occ_int
    wealth_part = node_weights['OCC_WEALTH'] * r.occ_wealth
    animal_part = node_weights['OCC_ANIMAL'] * r.occ_animal
    trafficker_part = node_weights['NAMED_TRAFFICKER'] * r.named_trafficker

    # Accumulate left to right, in the same order as the indicators are listed above
    total = label_part + int_part + wealth_part + animal_part + trafficker_part
    return float(total)

# Score every customer row-wise; node_weights is forwarded as kyc_agg's second argument
kyc_df['kyc_agg'] = kyc_df.apply(kyc_agg, axis=1, args=(node_weights,))
kyc_df.sample(3)

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,country,named_trafficker,kyc_agg
122237,IND.JENNIFER MURRAY,other,Retired,63.0,11.0,CUST87061155,0,0,0,0,CA,0.0,0.0
30195,CASEY SMITH,,,,,EXTERNAL632783,0,0,0,0,,0.0,0.0
31565,JOHN MORALES,male,Nurse,31.0,3.0,CUST98885567,0,0,0,0,CA,0.0,0.0


## Cash Aggregation
For the purposes of this project, we consider cash transactions as involving only one customer (i.e. the bank does *not* count as a customer). This prevents graph neighbourhoods from being too dense (if everyone were connected to the bank, everyone would be a 2-hop neighbour). 

For practical purposes, this means that indicators related to deposit/withdrawal size become node scores, rather than edge scores.

There is only one cash feature currently, `c_large`. `cash_agg` is therefore defined as the proportion of a customer's cash transactions that have been flagged as unusually large.

In [9]:
# Load the processed cash transactions and preview three random rows
cash_df = pd.read_parquet(CASHPATH)
cash_df.sample(3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
82593,CUST69878864,450,withdrawal,STVD82604805,0,0,0,0,Architect,0
29474,CUST50357613,5115,withdrawal,LBNA70301132,0,0,0,0,"Freelancer (e.g., Graphic Designer, Writer)",0
73183,CUST56353102,560,withdrawal,YFUL59202990,1,0,0,0,Curator,0


In [10]:
cash_df = pd.read_parquet(CASHPATH)

# Per-customer mean of c_large, exposed as the column c_agg
cash_agg_df = (
    cash_df
    .groupby('cust_id')['c_large']
    .mean()
    .rename('c_agg')
    .reset_index()
)
cash_agg_df.sample(3)

Unnamed: 0,cust_id,c_agg
26710,CUST49833994,0.0
26677,CUST49785891,0.0
24501,CUST46514689,0.5


## Node Score Calculation
We assume that the total node suspicion score is the aggregate KYC suspicion score plus the aggregate cash suspicion score above, with the cash score weighted by `C_LARGE` from `weights.yml`.

In [11]:
#Join KYC and Cash Data
merged = kyc_df.merge(cash_agg_df, on='cust_id', how='left')
merged['c_agg'] = merged['c_agg'].fillna(0)

#Aggregate 
node_df = merged[['cust_id', 'kyc_agg', 'c_agg']].copy()
node_df['score'] = (node_df['kyc_agg'] + node_weights['C_LARGE']*node_df['c_agg'])

#Clean
node_df = node_df[['cust_id', 'score']]

#Export
node_df.to_parquet(DATAPATH/ 'pagerank' / 'input_node_scores.parquet', index=False)
node_df.sort_values('score', ascending=False)

Unnamed: 0,cust_id,score
59304,CUST76986222,10.904762
12819,CUST60968343,10.733333
37632,CUST79371439,10.333333
98666,CUST73079564,10.125
201885,EXTERNAL898204,10.0
...,...,...
113281,CUST89727420,0.0
113282,CUST67136315,0.0
113283,CUST94455332,0.0
113285,CUST51952245,0.0


# Edge Suspicion Score
Features ending in **agg** (e.g. `e_agg`, `w_agg`, `g_agg`) are aggregates of pre-computed transaction features.

In [12]:
# Processed edge datasets (e-transfers and wire transfers)
DATAPATH = Path('../data/processed')
EPATH = DATAPATH.joinpath('emt.parquet')
WIREPATH = DATAPATH.joinpath('wire.parquet')

# Convert every edge weight (an int or an 'a/b' string) to an exact Fraction
edge_weights = {
    indicator: Fraction(raw_weight)
    for indicator, raw_weight in weights['EDGE_WEIGHTS'].items()
}

## E-transfer aggregation

In [13]:
def e_transfer_agg(r, edge_weights):
    """Return the weighted sum of the e-transfer indicators of one transaction row.

    Parameters
    ----------
    r : row-like
        Must expose `e_role`, `e_at_risk` and `e_trad_med` as attributes.
    edge_weights : mapping
        Must provide the keys 'E_ROLE', 'E_AT_RISK' and 'E_TRAD_MED'.

    Returns
    -------
    float
        The weighted sum, converted to a float.
    """
    role_part = edge_weights['E_ROLE'] * r.e_role
    at_risk_part = edge_weights['E_AT_RISK'] * r.e_at_risk
    trad_med_part = edge_weights['E_TRAD_MED'] * r.e_trad_med

    return float(role_part + at_risk_part + trad_med_part)

# Reload the e-transfer edges and score every transaction row-wise;
# edge_weights is forwarded as e_transfer_agg's second argument
edf = pd.read_parquet(EPATH)
edf['e_agg'] = edf.apply(e_transfer_agg, axis=1, args=(edge_weights,))
edf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,regex_flag,occ_wealth_receiver,occ_animal_receiver,...,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,e_agg,g_agg,score
499680,CUST34618336,CUST73663192,JENNIFER GONZALEZ,JACOB GLASS,,303.0,ZZVR50144166,0,1.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
198397,CUST91057939,CUST99225587,NEHMAT KONDA,DANA VASQUEZ,,372.0,GQCD68576631,0,0.0,0.0,...,0,0,0,0,1,0,0,0.0,0.166667,0.166667
85916,CUST52661970,EXTERNAL451243,ANNA WHITE,LOGAN COOK,,225.0,AXYQ41376766,0,,,...,0,0,0,0,0,0,0,0.0,0.0,0.0


## Wire transfer aggregation
The aggregate wire transfer score is an average of the wire transfer indicators `w_to_country`, `w_from_country`, `w_external_to_animal`.

In [14]:
def wire_transfer_agg(r, edge_weights):
    """Return the weighted sum of the wire-transfer indicators of one transaction row.

    Parameters
    ----------
    r : row-like
        Must expose `w_to_country`, `w_from_country` and
        `w_external_to_animal` as attributes.
    edge_weights : mapping
        Must provide the keys 'W_TO_COUNTRY', 'W_FROM_COUNTRY' and
        'W_EXTERNAL_TO_ANIMAL'.

    Returns
    -------
    float
        The weighted sum, converted to a float.
    """
    to_country_part = edge_weights['W_TO_COUNTRY'] * r.w_to_country
    from_country_part = edge_weights['W_FROM_COUNTRY'] * r.w_from_country
    ext_animal_part = edge_weights['W_EXTERNAL_TO_ANIMAL'] * r.w_external_to_animal

    return float(to_country_part + from_country_part + ext_animal_part)

# Reload the wire-transfer edges and score every transaction row-wise;
# edge_weights is forwarded as wire_transfer_agg's second argument
wdf = pd.read_parquet(WIREPATH)
wdf['w_agg'] = wdf.apply(wire_transfer_agg, axis=1, args=(edge_weights,))
wdf.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,...,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int,t_to_shipping,w_agg,g_agg,score
54377,EXTERNAL658304,CUST25378011,TERESA PETERSON,SCOTT COOPER,4311.0,CA,CA,FGZN74781968,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
21924,CUST73806951,CUST73482358,SANDRA CLAY,NICHOLAS LOPEZ,2710.0,CA,CA,XKMW42295178,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
49549,EXTERNAL405586,CUST54130897,SUN XIANG,DR.JAMES LOWE,12855.0,CN,CA,BLQJ75192480,0.0,0.0,...,0,0,0,0,0,0,0,0.333333,0.0,0.333333


## General Score Calculation

In [15]:
def compute_score(r, edge_weights, receiver=False):
    """Return the weighted sum of the general transaction indicators for one side of an edge.

    The indicators are read by position, so `r` must hold them in the order
    used by `sender_cols` / `receiver_cols`:
    animal, animal_large, int and - only when `receiver` is True - shipping.

    Parameters
    ----------
    r : pandas.Series
        One row restricted to the indicator columns of a single side.
    edge_weights : mapping
        Must provide 'T_ANIMAL', 'T_ANIMAL_LARGE' and 'T_INT', plus
        'T_TO_SHIPPING' when `receiver` is True.
    receiver : bool, default False
        When True, the fourth value of `r` is added with the
        'T_TO_SHIPPING' weight.

    Returns
    -------
    float
        The weighted sum, converted to a float.
    """
    # Position 0 is the animal flag and position 1 the animal_large flag.
    # Pairing each with its own weight matters as soon as T_ANIMAL and
    # T_ANIMAL_LARGE stop being equal.
    animal = r.iloc[0]
    animal_large = r.iloc[1]
    international = r.iloc[2]

    score = (
        edge_weights['T_INT'] * international
        + edge_weights['T_ANIMAL'] * animal
        + edge_weights['T_ANIMAL_LARGE'] * animal_large
    )

    if receiver:
        score += edge_weights['T_TO_SHIPPING'] * r.iloc[3]

    return float(score)

In [16]:
#Col names
sender_cols = ['t_from_animal', 't_from_animal_large', 't_from_int']
receiver_cols = ['t_to_animal', 't_to_animal_large', 't_to_int', 't_to_shipping']

#***E-TRANSFER**
edf['score_sender'] = edf[sender_cols].apply(compute_score, axis=1, args=(edge_weights, False))
edf['score_receiver'] = edf[receiver_cols].apply(compute_score, axis=1, args=(edge_weights, True))

#add score_sender and score_receiver and normalize the score to be between 0 and 1
edf['g_agg'] = (edf['score_sender'] + edf['score_receiver'])/2
# edf['g_agg'] = (edf['g_agg'] - edf['g_agg'].min())/(edf['g_agg'].max() - edf['g_agg'].min())
edf.drop(columns=['score_sender', 'score_receiver'], inplace=True)

#**WIRE-TRANSFER
wdf['score_sender'] = wdf[sender_cols].apply(compute_score, axis=1, args=(edge_weights, False))
wdf['score_receiver'] = wdf[receiver_cols].apply(compute_score, axis=1, args=(edge_weights, True))

#add score_sender and score_receiver and normalize the score to be between 0 and 1
wdf['g_agg'] = (wdf['score_sender'] + wdf['score_receiver'])/2
# wdf['g_agg'] = (wdf['g_agg'] - wdf['g_agg'].min())/(wdf['g_agg'].max() - wdf['g_agg'].min())
wdf.drop(columns=['score_sender', 'score_receiver'], inplace=True)

## Edge Score Calculation

In [17]:
# Total edge score = general indicators + channel-specific indicators
edf['score'] = edf['g_agg'] + edf['e_agg']
wdf['score'] = wdf['g_agg'] + wdf['w_agg']

# exporting: the scored frames overwrite the parquet files they were loaded from
edf.to_parquet(EPATH, index=False)
wdf.to_parquet(WIREPATH, index=False)

# Slim (sender, receiver, score) edge lists, tagged by channel: 'e' or 'w'
edge_cols = ['cust_id_sender', 'cust_id_receiver', 'score']
s1 = edf[edge_cols].assign(type='e')
s2 = wdf[edge_cols].assign(type='w')

pd.concat([s1, s2]).to_parquet(DATAPATH / 'pagerank' / 'input_edge_scores.parquet')