In [1]:
import itertools
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.sparse import coo_array
from sklearn.preprocessing import LabelEncoder
# from cdlib import algorithms

# TODO 
- May need to find a better way of incorporating # of transactions
- Method 2 (multiplicative)
- CHANGE EDGE DATASETS TO USE ACTUAL DATA
- In 1. Add starting suspicions
- In 3. Add reverse edges

# 1. Appending Node Scores
1. Calculate pagerank with edge score only
2. Scale pageranks by min and max pageranks
3. Add node scores to pageranks

In [2]:
# Folder holding the pre-computed node and edge score tables.
DATAPATH = Path('../data/processed')

# Node scores first, edge scores second.
n_df, e_df = (
    pd.read_parquet(DATAPATH / fname)
    for fname in ('pr_node_score.parquet', 'pr_edge_score.parquet')
)

In [3]:
# Parallel edges between the same (sender, receiver) pair are merged by summing their scores
edge_keys = ['cust_id_sender', 'cust_id_receiver']
e_df = e_df.groupby(edge_keys, as_index=False)['score'].sum()

# Directed graph with the summed score stored as the 'score' edge attribute
G = nx.from_pandas_edgelist(
    e_df,
    source='cust_id_sender',
    target='cust_id_receiver',
    edge_attr='score',
    create_using=nx.DiGraph(),
)

# Edge-weighted pagerank (node scores are not used here)
pr = nx.pagerank(G, weight='score')

In [4]:
# Turn the {cust_id: pagerank} mapping into a two-column frame
pr_df = pd.Series(pr, name='score').rename_axis('cust_id').reset_index()

# Min-max scale the pageranks into [0, 1]
min_score, max_score = pr_df['score'].agg(['min', 'max'])
pr_df['score'] = pr_df['score'].sub(min_score).div(max_score - min_score)

# Adding node scores
# pr_df = pr_df.merge(n_df, on='cust_id', how='left')
# pr_df['score_y'] = pr_df['score_y'].fillna(0)
# pr_df['score'] = pr_df['score_x']+pr_df['score_y']
pr_df = pr_df.loc[:, ['cust_id', 'score']]

# Exporting
pr_df.to_parquet(DATAPATH / 'pagerank_1.parquet')
pr_df.sample(3)

Unnamed: 0,cust_id,score
55833,EXTERNAL907254,0.0
243378,EXTERNAL459865,0.0
22720,CUST82287762,0.14215


# 2. Cdlib

In [5]:
DATAPATH = Path('../data/processed')

# Node scores, indexed by customer id
n_df = pd.read_parquet(DATAPATH / 'node_score.parquet').set_index('cust_id')
# Edge scores
e_df = pd.read_parquet(DATAPATH / 'temp_edge_score.parquet')

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\processed\\temp_edge_score.parquet'

In [None]:
# for some reason e_df had nodes which were not in n_df, so keep only the edges
# whose sender and receiver both appear in the n_df index
known_ids = n_df.index
e_df = e_df[e_df['cust_id_sender'].isin(known_ids) & e_df['cust_id_receiver'].isin(known_ids)]

In [58]:
# Undirected graph over the filtered edges; the edge score is kept as the 'score' attribute
G = nx.from_pandas_edgelist(
    e_df,
    source='cust_id_sender',
    target='cust_id_receiver',
    edge_attr='score',
    create_using=nx.Graph(),
)

In [63]:
# Ran this for an hour and it did not finish.
# The other option from cdlib is algorithms.ilouvain, but when I tried running it, it reported needing about 40 GB of memory.

# communities = algorithms.eva(G, n_df.to_dict(orient='index'))

# 3. Zhang et al. Method
This method uses the pagerank technique outlined in [Zhang *et al.,* 2022](https://arxiv.org/abs/2104.02764). 

Mainly: 

Let $M$ be a transition matrix with: 
$$m_{ij} = \begin{cases}
    \theta w_{ji}/s_j^{out} + (1-\theta)a_{ji}/d_j^{out} && \texttt{if}\; d_j^{out}\neq 0\\
    0 && \texttt{if}\; d_j^{out}=0
\end{cases}$$
Where: 
- $\theta$ is a tunable parameter in $[0, 1]$ that controls how much the edge weights (first term) count relative to the unweighted adjacency (second term) in the PageRank algorithm
- $w_{ji}$ is the weight of an edge from node $j \rightarrow i$
- $s_j^{out} = \sum_{v \in V|j \rightarrow v}w_{jv}$ is the "strength" of outgoing edges from node $j$
- $a_{ji}$ = $1 \:\texttt{if}\: j \rightarrow i \:\texttt{else}\: 0$
- $d_j^{out} = \sum_{v \in V|j \rightarrow v}a_{jv}$
- $\beta_i$ is the node importance score
  


Then the pagerank can be calculated with power iterations on
$$P=\gamma MP + (1-\gamma)\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$$

Where:
- $\gamma$ is a tunable parameter (typically 0.8-0.9) representing the probability of continuing the random walk along an edge, so $1-\gamma$ is the probability of restarting the walk
- $\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$ is the vectorized version of $\beta_{i}/\sum_{i\in V}\beta_i$

---
*Note: the other formulation in the paper results in a dense $n \times n$ transition matrix, which is too big to store here.*

*note that for $m_{ij}$, $i$ is the target and $j$ is the source*

In [54]:
# Hyperparameters of the weighted pagerank
THETA = 0.5  # share of the edge-weight term in m_ij (the rest goes to the adjacency term)
GAMMA = 0.9  # multiplier of M.P in the power iteration; (1 - GAMMA) multiplies the node weights

DATAPATH = Path('../data/processed')

n_df = pd.read_parquet(DATAPATH / 'pr_node_score.parquet')

# Only the endpoints and the score of each edge are needed from here on
edge_columns = ['cust_id_sender', 'cust_id_receiver', 'score']
e_df = pd.read_parquet(DATAPATH / 'pr_edge_score.parquet').loc[:, edge_columns].copy()

In [39]:
# Reversing edges: swap the sender/receiver roles of every edge, keeping its score
swapped_names = {'cust_id_sender': 'cust_id_receiver', 'cust_id_receiver': 'cust_id_sender'}
e2_df = e_df.rename(columns=swapped_names)[['cust_id_receiver', 'cust_id_sender', 'score']]

# Append the reversed edges to the original ones
e_df = pd.concat([e_df, e2_df])

## Constructing $\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$

In [55]:
# Every id that appears as a sender, as a receiver, or in the node table
id_columns = (e_df['cust_id_sender'], e_df['cust_id_receiver'], n_df['cust_id'])
all_ids = itertools.chain.from_iterable(column.tolist() for column in id_columns)

# A set drops the duplicates
node_list = list(set(all_ids))

In [56]:
# One row per node; nodes without a score in n_df get 0
b_df = (
    pd.DataFrame({'cust_id': node_list})
    .merge(n_df, on='cust_id', how='left')
    .rename(columns={'score': 'b_i'})
    .fillna(0)
)

# Divide by the L1 norm so that the |b_i| sum to 1
one_norm = b_df['b_i'].abs().sum()
b_df['b_i'] = b_df['b_i'].div(one_norm)

b_df.sort_values('b_i', ascending=False).head(3)

Unnamed: 0,cust_id,b_i
287469,CUST43500432,8.3e-05
212462,CUST22549711,8.3e-05
74782,CUST46707759,8.3e-05


## Node Encoding
This section encodes the cust_ids ordinally.

In [57]:
# Fit the encoder on every known node id (fit returns the encoder itself)
le = LabelEncoder().fit(node_list)

# Replace the string ids on both edge endpoints by their integer codes
for col in ('cust_id_sender', 'cust_id_receiver'):
    e_df[col] = le.transform(e_df[col])

# Same encoding for the node-weight table and for the node list
b_df['cust_id'] = le.transform(b_df['cust_id'])
node_enc = le.transform(node_list)

e_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,score
208666,15675,297113,0.0
445619,36789,78696,0.5
391075,198200,114169,0.0


## Constructing $M$
*note that for $m_{ij}$, $i$ is the target node and $j$ is the source node*

In [58]:
# No edges can have 0 weight, so we add a relatively small value to all edges
e_df['score'] = e_df['score'] + 0.01

pair_keys = ['cust_id_sender', 'cust_id_receiver']
sender_key = ['cust_id_sender']


def _score_stat(frame, keys, how, name):
    """Aggregate the 'score' column of `frame` per `keys` with `how`, naming the result `name`."""
    return frame.groupby(keys, as_index=False)['score'].agg(how).rename(columns={'score': name})


# w_ji: summed score per (sender, receiver) pair
w_df = _score_stat(e_df, pair_keys, 'sum', 'w_ji')
e_df = e_df.merge(w_df, on=pair_keys, how='left')

# a_ji: number of rows per (sender, receiver) pair
a_df = _score_stat(e_df, pair_keys, 'count', 'a_ji')
e_df = e_df.merge(a_df, on=pair_keys, how='left')

# s_j: summed score over all rows of a sender
s_df = _score_stat(e_df, sender_key, 'sum', 's_j')
e_df = e_df.merge(s_df, on=sender_key, how='left')

# d_j: number of rows of a sender
d_df = _score_stat(e_df, sender_key, 'count', 'd_j')
e_df = e_df.merge(d_df, on=sender_key, how='left')

# Keep a single row per (sender, receiver) pair... taking the max is just a hack
e_df = e_df.groupby(pair_keys, as_index=False).max()

e_df.sample(10)

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j
69751,38431,145225,0.51,0.51,1,1.52,2
244906,136075,94739,0.01,0.01,1,0.02,2
28662,15645,178065,0.51,0.51,1,3.57,7
289019,161149,180957,0.51,0.51,1,1.1,10
115273,64013,213691,0.01,0.01,1,0.03,3
25998,14226,166548,0.01,0.01,1,0.05,5
507446,269032,163361,0.01,0.01,1,1.57,7
216980,120487,160091,0.01,0.01,1,0.02,2
429959,233065,161535,0.01,0.01,1,1.58,8
6253,3476,134897,0.01,0.01,1,0.02,2


In [59]:
def calc_m(r, theta=None):
    """Compute the transition weight m_ij for an edge from sender j to receiver i.

    Implements ``theta * w_ji / s_j + (1 - theta) * a_ji / d_j``.

    Args:
        r: Object exposing ``w_ji``, ``s_j``, ``a_ji`` and ``d_j`` as attributes.
            Only element-wise arithmetic is used, so this can be a single
            DataFrame row or a whole DataFrame.
        theta: Weight of the edge-weight term; ``1 - theta`` goes to the
            adjacency term. Defaults to the notebook-level ``THETA`` constant.

    Returns:
        The transition weight(s), with the same shape as the attributes of ``r``.
    """
    if theta is None:
        # Fall back to the notebook-wide setting so existing calls keep working.
        theta = THETA
    m = theta*r.w_ji/r.s_j + (1-theta)*r.a_ji/r.d_j
    return m

# Calculate m_ij for sender j, receiver i.
# calc_m only does element-wise arithmetic on the w_ji / s_j / a_ji / d_j columns,
# so it is evaluated once on the whole frame instead of row by row through
# DataFrame.apply(axis=1), which builds a Series for every edge.
e_df['m'] = calc_m(e_df)
e_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,m
408775,223204,86871,0.51,0.51,1,0.52,2,0.740385
262886,146266,7909,0.01,0.01,1,0.02,2,0.5
234397,130275,51615,0.01,0.01,1,0.53,3,0.176101


In [60]:
# Inspect the edges ordered by a_ji (the number of rows that each sender/receiver pair had).
e_df.sort_values('a_ji')

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,m
0,0,8526,0.01,0.01,1,0.04,4,0.250000
382303,210856,39252,0.01,0.01,1,0.01,1,1.000000
382302,210855,148045,0.01,0.01,1,0.02,2,0.500000
382301,210855,90694,0.01,0.01,1,0.02,2,0.500000
382300,210854,135277,0.01,0.01,1,0.01,1,1.000000
...,...,...,...,...,...,...,...,...
534080,281449,6484,0.51,2.04,4,2.56,6,0.731771
238675,132629,213711,0.01,0.04,4,1.63,13,0.166116
187492,104180,264034,0.01,0.04,4,1.10,10,0.218182
41668,22720,147177,0.51,2.04,4,4.58,8,0.472707


In [61]:
# Sanity check: sum every column per sender. The `m` total should be 1.0 for each
# sender, i.e. the outgoing transition weights of a node add up to one.
e_df.groupby('cust_id_sender').sum()

Unnamed: 0_level_0,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,m
cust_id_sender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,544065,0.04,0.04,4,0.16,16,1.0
2,353019,0.02,0.02,2,0.04,4,1.0
3,264496,0.01,0.01,1,0.01,1,1.0
4,128894,0.01,0.01,1,0.01,1,1.0
6,1613240,6.61,6.61,11,72.71,121,1.0
...,...,...,...,...,...,...,...
300002,505540,0.03,0.03,3,0.09,9,1.0
300003,116999,0.01,0.01,1,0.01,1,1.0
300005,137886,0.01,0.01,1,0.01,1,1.0
300006,159644,0.02,0.02,2,0.04,4,1.0


In [29]:
# Spot-check the outgoing edges of a single encoded sender id.
e_df[e_df.cust_id_sender==201855]

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,m
363144,201855,93325,0.01,0.01,1,0.02,2,0.5
363145,201855,148228,0.01,0.01,1,0.02,2,0.5


In [49]:
# Column totals over the edges of one encoded sender id; the `m` entry is the sum
# of that sender's transition weights.
e_df[e_df["cust_id_sender"]==164781].sum()

cust_id_sender      4119525.0
cust_id_receiver    4993275.0
score                     4.0
w_ji                      6.8
a_ji                     30.0
s_j                     170.0
d_j                     750.0
m                         1.0
dtype: float64

In [203]:
# Map an encoded node id back to its original cust_id string.
le.inverse_transform([83495])

array(['CUST48479576'], dtype='<U14')

## Pagerank

In [62]:
def weighted_pagerank(M, B, P, gamma=0.85, tol=0.00001, maxit=1000):
    """Computes the node and edge weighted pagerank by power iteration.

    Each iteration evaluates ``gamma * M.dot(P) + (1 - gamma) * B`` and then spreads
    whatever is missing for the vector to sum to 1 uniformly over all nodes.
    Iterations stop when the largest relative per-node change is below ``tol`` or
    when ``maxit`` iterations have been run.

    Args:
        M: The transition matrix according to Zhang et al. (NxN), e.g. a scipy sparse
            coordinate array; only ``M.dot`` is used.
        B: A numpy array of relative node weights (length N)
        P: A numpy array of initial pageranks, normally just uniform (length N)
        gamma: A hyperparam representing the probability of a random surfer NOT making
            a random jump to a new node
        tol: iteration tolerance - when the maximum RELATIVE change in a node score is
            below this value, iterations stop
        maxit: maximum number of iterations

    Returns:
        A tuple ``(P, diff_list)``:
            P: The final pagerank vector, min-max scaled to be in [0-1]. If every node
                ends up with the same score, the scaled vector is all zeros.
            diff_list: The maximum relative change recorded at each iteration.
    """

    itcount = 0
    max_diff = np.inf  # placeholder so that the first iteration always runs
    diff_list = []
    while (itcount < maxit) and (max_diff >= tol):

        # Pagerank iter (uses the `gamma` argument, not a notebook-level constant)
        P_int = gamma*M.dot(P) + (1-gamma)*B

        # Adding leaked pagerank back
        # need to do this since we don't preprocess to remove dead ends.
        leak = np.sum(P_int)
        P_int = P_int + (1-leak)/len(P)

        abs_change = np.absolute(P-P_int)
        with np.errstate(divide='ignore', invalid='ignore'):
            rel_change = abs_change/P
        # A score that is unchanged has zero relative change, even when it is 0
        # (0/0 would otherwise give NaN and make the `>= tol` test fail silently).
        rel_change[abs_change == 0] = 0.0

        max_diff = rel_change.max()
        diff_list.append(max_diff)
        itcount += 1
        P = P_int

    print(max_diff)
    print(itcount)

    # Scaling
    lowest, highest = P.min(), P.max()
    if highest == lowest:
        # Constant vector: min-max scaling would divide by zero.
        P = np.zeros_like(P, dtype=float)
    else:
        P = (P - lowest)/(highest - lowest)

    return P, diff_list

In [63]:
#Construct necessary matrices
i = e_df['cust_id_receiver'].values
j = e_df['cust_id_sender'].values
m = e_df['m'].values

N = b_df.shape[0]
B = b_df.sort_values('cust_id')['b_i'].values
M = coo_array((m,(i,j)), shape=(N,N))
# M = M + M.transpose() #line to duplicate edges
P = np.full(N, 1/N)

#Run Pagerank
P, dl = weighted_pagerank(M, B, P, GAMMA)

# Verify that data is stored correctly
test_i = 35453
test_j = 238577
print(f'M at i={test_i} and j={test_j} is {M.tocsr()[test_i,test_j]}')

9.227331524212615e-06
97
M at i=35453 and j=238577 is 0.0


In [72]:
# Check that the pagerank vector has one entry per node.
print(N)
print(len(P))

300008
300008


In [64]:
# Convergence trace of the power iteration: maximum relative change per
# iteration, on a log scale.
fig = px.scatter(
    y=dl,
    x=list(range(len(dl))),
    log_y=True,
    title='Weighted pagerank convergence',
    labels={'x': 'iteration', 'y': 'max relative change'},
)

fig.show()

In [65]:
# Exporting
P_df = pd.DataFrame(data=P, columns=['score'])
P_df['cust_id'] = P_df.index
P_df['cust_id'] = le.inverse_transform(P_df['cust_id']) #transform back to actual IDS
P_df = P_df[['cust_id','score']]
P_df.to_parquet(DATAPATH / 'pagerank_no_rev_edge.parquet')

# Vis

In [66]:
# Plotting imports and global plotly configuration.
# NOTE(review): the convergence-plot cell earlier in the notebook already uses `px`,
# so these imports belong in the import cell at the top of the notebook.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import kaleido 
print('kaleido version:', kaleido.__version__)

# Render figures inline through the 'iframe' renderer
pio.renderers.default = 'iframe'

# Theming
# mcolors = px.colors.qualitative.Dark24

# Custom template: outside ticks, no grid lines, visible axis lines
pio.templates['custom'] = go.layout.Template(
    layout=dict(
        xaxis=dict(ticks='outside', tickcolor='lightgray', showgrid=False, showline=True),
        yaxis=dict(ticks='outside', tickcolor='lightgray', showgrid=False, showline=True, mirror=True),
        yaxis2=dict(ticks='outside', tickcolor='lightgray', showgrid=False,),
        # colorway=mcolors,
    )

)

# Layer the custom template on top of the built-in 'plotly_white' template
pio.templates.default = 'plotly_white+custom'

kaleido version: 0.2.1


In [73]:
# Load previously exported pagerank scores for the plots below.
# NOTE(review): this reads 'pagerank_rev_edge.parquet', whereas the export cell of
# this notebook writes 'pagerank_no_rev_edge.parquet' - presumably this file comes
# from a separate run with reverse edges; confirm it exists before running.
P_df=pd.read_parquet(DATAPATH/'pagerank_rev_edge.parquet')
print(len(P_df))

300008


In [70]:
# Histogram of the pagerank scores, restricted to rows with score >= 0.
fig = px.histogram(P_df[P_df['score']>=0], x='score')
fig.show()


In [68]:
# Histogram of the input node scores in n_df, for comparison with the pagerank scores.
fig = px.histogram(n_df, x='score')
fig.show()


In [12]:
# Histogram of the pagerank scores over all rows of P_df.
fig = px.histogram(P_df, x='score')
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [114]:
# The ten customers with the highest pagerank score.
P_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,cust_id,score
113256,CUST70977942,1.0
90586,CUST58785793,0.84137
23431,CUST22549711,0.836228
23101,CUST22362657,0.807643
153072,CUST92429485,0.760642
55793,CUST40191937,0.752005
56020,CUST40302821,0.745428
76531,CUST51289421,0.730183
118728,CUST73958429,0.724819
20445,CUST21010424,0.719274


In [119]:
# Look up the KYC record of a single customer id.
kdf = pd.read_parquet(DATAPATH/'kyc.parquet')
cust_id = 'CUST22549711'
kdf[kdf.cust_id == cust_id]

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,kyc_agg
89083,ELLEN HANSEN,female,Warehouse Worker,29.0,8.0,CUST22549711,0,0,0,1,1.0
