In [1]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_array
from pathlib import Path
import networkx as nx
import pandas as pd
import numpy as np
import itertools

# 2. Custom Method

This method adapts the pagerank technique outlined in [Zhang *et al.,* 2022](https://arxiv.org/abs/2104.02764). In that work, the random walk uses relative edge weights to assign probabilities of traversing edges from a given node. This means edges are weighted *locally* - around a given node - but not *globally*. 

In this method, I use the weighted pagerank method from Zhang *et al.,*, but I adjust the gamma parameter - the probability of randomly teleporting from any node - to be *node specific* and based on the total edge suspicion of a given node.

Mainly: 

Let $M$ be a transition matrix with: 
$$m_{ij} = \begin{cases}
    \theta w_{ji}/s_j^{out} + (1-\theta)a_{ji}/d_j^{out} && \texttt{if}\; d_j^{out}\neq 0\\
    0 && \texttt{if}\; d_j^{out}=0
\end{cases}$$
Where: 
- $\theta$ is a tunable parameter that represents how important edge weights should be in the pagerank alg
- $w_{ji}$ is the weight of an edge from node $j \rightarrow i$
- $s_j^{out} = \sum_{v \in V|j \rightarrow v}w_{jv}$ is the "strength" of outgoing edges from node $j$
- $a_{ji}$ = $1 \:\texttt{if}\: j \rightarrow i \:\texttt{else}\: 0$
- $d_j^{out} = \sum_{v \in V|j \rightarrow v}a_{jv}$
- $\beta_i$ is the node importance score
  


Then the pagerank can be calculated with power iterations on
$$P=\boldsymbol{\gamma} \odot MP + (1-\boldsymbol{\gamma}) \odot \boldsymbol{\beta}/||\boldsymbol{\beta}||_1$$

Where:
- $1-\boldsymbol{\gamma}$ is a vector of hyperparameters representing the probability of randomly teleporting from a given node
- $\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$ is the vectorized version of $\beta_{i}/\sum_{i\in V}\beta_i$

---
*note the other formulation in the paper results in a dense transition matrix that is nxn.... too big*

*note that for $m_{ij}$, $i$ is the target and $j$ is the source*

In [22]:
THETA = 1
REVERSE = True

DATAPATH = Path('../data/processed/pagerank')
OUTPATH = DATAPATH.parent / 'nodes_custom_rev_new.parquet'

n_df = pd.read_parquet(DATAPATH / 'input_node_scores.parquet')
e_df = pd.read_parquet(DATAPATH / 'input_edge_scores.parquet')
e_df = e_df[['cust_id_sender', 'cust_id_receiver', 'score']].copy()

In [23]:
#reversing edges
if REVERSE:
    e2_df = pd.DataFrame()
    e2_df['cust_id_receiver'] = e_df['cust_id_sender'].copy()
    e2_df['cust_id_sender'] = e_df['cust_id_receiver'].copy()
    e2_df['score'] = e_df['score']

    e_df = pd.concat([e_df, e2_df])
    e_df.sample(3)

## Constructing $\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$

In [24]:
# Get all nodes in the graph
node_list = []
node_list.extend(e_df.cust_id_sender.tolist() + e_df.cust_id_receiver.tolist() + n_df.cust_id.tolist())
node_list = list(set(node_list)) #hack to remove duplicate"

In [25]:
# Construct B
b_df = pd.DataFrame(data={'cust_id':node_list})
b_df = b_df.merge(n_df, on='cust_id', how='left')
b_df = b_df.rename(columns={'score':'b_i'})
b_df = b_df.fillna(0)

one_norm = abs(b_df['b_i']).sum()

b_df['b_i'] = b_df['b_i'] / one_norm

b_df.sort_values('b_i', ascending=False).head(3)

Unnamed: 0,cust_id,b_i
45369,CUST76986222,0.000672
297352,CUST60968343,0.000662
78822,CUST73079564,0.000624


## Node Encoding
This section encodes the cust_ids ordinally.

In [26]:
le = LabelEncoder()
le.fit(node_list)

e_df['cust_id_sender'] = le.transform(e_df['cust_id_sender'])
e_df['cust_id_receiver'] = le.transform(e_df['cust_id_receiver'])

b_df['cust_id'] = le.transform(b_df['cust_id'])

node_enc = le.transform(node_list)

e_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,score
358723,198730,129150,0.0
448030,3402,60617,0.0
259499,277289,88040,0.0


## Constructing $M$
*note that for $m_{ij}$, $i$ is the target node and $j$ is the source node*

In [27]:
e_df['score'] += 0.01 #No edges can have 0 weight, so we add a relatively small value to all edges

#Calculate w_ji
w_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False)['score'].sum()
w_df = w_df.rename(columns={'score':'w_ji'})
e_df = e_df.merge(w_df, on=['cust_id_sender', 'cust_id_receiver'], how='left')

#Calculate a_ji
a_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False)['score'].count()
# a_df = e_df.groupby 
a_df = a_df.rename(columns={'score':'a_ji'})
e_df = e_df.merge(a_df, on=['cust_id_sender', 'cust_id_receiver'], how='left')

#Calculate s_j
s_df = e_df.groupby(['cust_id_sender'], as_index=False)['score'].sum()
s_df = s_df.rename(columns={'score':'s_j'})
e_df = e_df.merge(s_df, on='cust_id_sender', how='left')

#Calculate d_j
d_df = e_df.groupby(['cust_id_sender'], as_index=False)['score'].count()
d_df = d_df.rename(columns={'score':'d_j'})
e_df = e_df.merge(d_df, on='cust_id_sender', how='left')

#Remove duplicate edges... taking max is just a hack
e_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False).max()

e_df.sample(10)

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j
569448,140594,183582,0.01,0.01,1,0.42,17
858449,216270,144765,0.01,0.01,1,0.215,9
1103521,287614,49949,0.01,0.01,1,0.355,23
1097844,285925,129779,0.01,0.01,1,0.02,2
1081536,281039,151106,0.135,0.135,1,0.61,11
31587,7857,215306,0.01,0.01,1,0.5,25
372084,91841,239067,0.135,0.135,1,0.665,4
1070585,277865,17866,0.01,0.01,1,0.01,1
353888,87326,35919,0.01,0.01,1,0.03,3
125377,30478,31986,0.135,0.135,1,0.145,2


In [28]:
def calc_m(r): 
    m = THETA*r.w_ji/r.s_j + (1-THETA)*r.a_ji/r.d_j
    return m

#calculate m_ij for sender j, receiver i
e_df['m'] = e_df.apply(lambda r: calc_m(r), axis=1)

In [29]:
#Verify matrix is column stochastic... m should all be 1.0
e_df.groupby('cust_id_sender').sum().sort_values('m', ascending=True).head(3)

Unnamed: 0_level_0,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,m
cust_id_sender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
91885,354058,0.155,0.155,3,0.465,9,1.0
288107,237774,0.155,0.155,3,0.465,9,1.0
207348,249041,0.155,0.155,3,0.465,9,1.0


## Calculate $\boldsymbol{\gamma}$

In [30]:
# Calculate gammas
gamma_df = e_df.groupby(['cust_id_sender'], as_index=False)['s_j'].max() #trick, all s_j are the same
gamma_df['gamma'] = np.exp(gamma_df['s_j'])/(0.8+np.exp(gamma_df['s_j']))
gamma_df = gamma_df.rename(columns={'cust_id_sender': 'cust_id'})

# Create gamma array
gamma_df = gamma_df.merge(b_df, on='cust_id', how='right') #trick to get all node ids
gamma_df = gamma_df.fillna(np.exp(0)/(0.8+np.exp(0)))
gamma_df = gamma_df.sort_values(['cust_id'])
gamma = gamma_df['gamma'].values
gamma

array([0.57031674, 0.59101032, 0.56048818, ..., 0.56048818, 0.56048818,
       0.56048818])

## Pagerank

In [31]:
def custom_pagerank(M, B, P, gamma, tol=1e-10, maxit=10000):
    """Computes the node and edge weighted pagerank using a custom algorithm
    
    This function takes in a transition matrix, node weights, and an initial pagerank vector
    and computes the weighted pagerank, stopping when the difference between iterations is < 
    tolerence or when the maximum # iterations is reached.
    
    Args:
        M: The transition matrix according to Zhang et al. should be a scipy sparse coordinate matrix (NxN)
        B: A numpy array of relative node weights (Nx1)
        P: A numpy array vector of initial pageranks, normally just uniform (Nx1) 
        gamma: An array of probabilities of randomly teleporting from a given node (Nx1)
        tol: iteration tolerance - when the maximum RELATIVE change in a node score is below this value, iterations stop
        maxit: maximum iterations
    
    Returns
        P: The final pagerank vector, min-max scaled to be in [0-1]
    
    """
    
    itcount = 0
    max_diff = 1000 #placeholder large value
    diff_list = []
    while (itcount <= maxit) and (max_diff >= tol):
           
        #Pagerank iter
        P_int = np.multiply(gamma,M.dot(P)) + np.multiply((1-gamma),B)
                
        #Adding leaked pagerank back
        #need to do this since we don't preprocess to remove dead ends.
        leak = np.sum(P_int)
        P_int = P_int + (1-leak)/len(P)
        
        max_diff = (np.absolute(P-P_int)/P).max()
        diff_list.append(max_diff)
        itcount += 1
        P = P_int
        
    
    print(f'Final max (relative) error: {max_diff:.3}')
    print(f'Final iterations: {itcount}')
    # Scaling
    P = (P - P.min())/(P.max() - P.min())
    
    return P, diff_list

In [32]:
#Construct necessary matrices
i = e_df['cust_id_receiver'].values
j = e_df['cust_id_sender'].values
m = e_df['m'].values

N = b_df.shape[0]
B = b_df.sort_values('cust_id')['b_i'].values
M = coo_array((m,(i,j)), shape=(N,N))
P = np.full(N, 1/N)

#Run Pagerank
P, dl = custom_pagerank(M, B, P, gamma)

Final max (relative) error: 9.98e-11
Final iterations: 4460


## Exporting for Webapp
The pagerank data needs to be converted back into KYC-esque data, with customer names, countries, etc. but this needs to include *external customers*

In [33]:
def clean_pagerank_results(pagerank_df):
    """Takes in a [id, score] dataframe and turns it back into KYC-style data"""
    
    datapath = Path('../data/processed/')
    
    kyc_df = pd.read_parquet(datapath / 'kyc.parquet')
    wdf = pd.read_parquet(datapath / 'wire.parquet')
    edf = pd.read_parquet(datapath / 'emt.parquet')
    
    cleaned = pagerank_df.copy()
    
    # Join with kyc data and add country column
    kyc_df['country'] = 'CA'
    cleaned = pd.merge(cleaned, kyc_df, how='left', on='cust_id')
    
    # Add external customer names
    n1 = edf[['cust_id_sender', 'name_sender']].rename(columns={'cust_id_sender': 'cust_id', 'name_sender':'name'})
    n2 = edf[['cust_id_receiver', 'name_receiver']].rename(columns={'cust_id_receiver': 'cust_id', 'name_receiver':'name'})
    n3 = wdf[['cust_id_sender', 'name_sender']].rename(columns={'cust_id_sender': 'cust_id', 'name_sender':'name'})
    n4 = wdf[['cust_id_receiver', 'name_receiver']].rename(columns={'cust_id_receiver': 'cust_id', 'name_receiver':'name'})
    cust_names = pd.concat([n1,n2,n3,n4])
    cust_names = cust_names.drop_duplicates()
    cust_names = dict(zip(cust_names.cust_id.to_list(), cust_names.name.to_list()))
    
    def _map_cust_names(row): 
        if row['name'] == None:  
            return cust_names[row.cust_id]
        else:
            return row['name']
    
    cleaned['name'] = cleaned.apply(lambda row: _map_cust_names(row), axis=1) 

    # Get additional countries from wiretransfer data
    s1 = wdf[['cust_id_sender', 'country_sender']].copy().rename(columns={'cust_id_sender':'cust_id', 'country_sender':'country'})
    s2 = wdf[['cust_id_receiver', 'country_receiver']].copy().rename(columns={'cust_id_receiver':'cust_id', 'country_receiver':'country'})
    countries = pd.concat([s1,s2])
    countries = countries.drop_duplicates()

    cleaned = pd.merge(cleaned, countries, how='left', on='cust_id')
    cleaned['country_x'] = cleaned['country_x'].combine_first(cleaned['country_y'])
    cleaned['country'] = cleaned['country_x']
    cleaned = cleaned.drop(columns=['country_x', 'country_y'])

    return cleaned

In [6]:
import pandas as pd
from pathlib import Path
datapath = Path('../data/processed/')

In [15]:
wdf = pd.read_parquet(datapath / 'wire.parquet')
edf = pd.read_parquet(datapath / 'emt.parquet')

n1 = edf[['cust_id_sender', 'name_sender']].rename(columns={'cust_id_sender': 'cust_id', 'name_sender':'name'})
n2 = edf[['cust_id_receiver', 'name_receiver']].rename(columns={'cust_id_receiver': 'cust_id', 'name_receiver':'name'})
n3 = wdf[['cust_id_sender', 'name_sender']].rename(columns={'cust_id_sender': 'cust_id', 'name_sender':'name'})
n4 = wdf[['cust_id_receiver', 'name_receiver']].rename(columns={'cust_id_receiver': 'cust_id', 'name_receiver':'name'})
cust_names = pd.concat([n1,n2,n3,n4])
cust_names = cust_names.drop_duplicates()
cust_names = dict(zip(cust_names.cust_id.to_list(), cust_names.name.to_list()))
cust_names

{'CUST26232205': 'JASON GARRISON',
 'EXTERNAL623153': 'GINA WISE',
 'CUST35533148': 'ANTHONY ROBERSON',
 'CUST59096559': 'KEVIN PARK',
 'CUST69049633': 'ZHU FENG LAN',
 'CUST27403977': 'IND.DAVID DUNLAP JR.',
 'CUST76959536': 'BAI JUN',
 'CUST65275585': 'EDUARDO PONCE VILLAREAL',
 'EXTERNAL483303': 'JAMES RUBIO',
 'CUST95489575': 'MARY HARPER',
 'CUST17422161': 'LIU KUN',
 'EXTERNAL502917': 'MARTHA JACINTO CEDILLO',
 'CUST43516538': 'DR.JAMES CHAMBERS',
 'CUST34620664': 'DR.YU YING',
 'CUST99353313': 'GABRIEL LAROSE',
 'EXTERNAL210567': 'ANGELA THOMPSON',
 'CUST22347636': 'ANTONIO BROWN',
 'CUST40190988': 'KATIE RODGERS',
 'EXTERNAL837633': 'ESHANI CHAD',
 'EXTERNAL145500': 'GILBERT FOURNIER',
 'CUST56819898': 'ÉLISABETH HAMEL',
 'CUST16249013': 'GUY THOMAS',
 'CUST98454879': 'CHARLOTTE VÉZINA',
 'CUST73958429': 'PARINAAZ BHASIN',
 'CUST13944316': 'DR.LAWRENCE FLOWERS',
 'CUST50125727': 'DR.JESSICA SMITH',
 'CUST66995537': 'BRAD BERG',
 'CUST54683641': 'IND.ROBERT REYES',
 'CUST9006173

In [34]:
P_df = pd.DataFrame(data=P, columns=['score'])
P_df['cust_id'] = P_df.index
P_df['cust_id'] = le.inverse_transform(P_df['cust_id']) #transform back to actual IDS
P_df = P_df[['cust_id','score']]
P_df = clean_pagerank_results(P_df)
P_df.to_parquet(OUTPATH)