In [2]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_array
from pathlib import Path
import networkx as nx
import pandas as pd
import numpy as np
# from cdlib import algorithms

# TODO 
- May need to find a better way of incorporating # of transactions
- Method 2 (multiplicative)
- CHANGE EDGE DATASETS TO USE ACTUAL DATA
- In 1. Add starting suspicions
- In 3. Add reverse edges

# 1. Appending Node Scores
1. Calculate pagerank with edge score only
2. Scale pageranks by min and max pageranks
3. Add node scores to pageranks

In [4]:
DATAPATH = Path('../data/processed')

n_df = pd.read_parquet(DATAPATH / 'node_score.parquet')
e_df = pd.read_parquet(DATAPATH / 'temp_edge_score.parquet')

In [9]:
# Combine identical edges
e_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False)['score'].sum()

# Construct graph
G = nx.from_pandas_edgelist(e_df, 'cust_id_sender', 'cust_id_receiver', edge_attr='score', create_using=nx.DiGraph())

# Pagerank
pr = nx.pagerank(G, weight='score')

In [17]:
# Scaling
pr_df = pd.DataFrame.from_dict(pr, orient='index').reset_index()
pr_df = pr_df.rename(columns={'index': 'cust_id', 0:'score'})

max_score = pr_df['score'].max()
min_score = pr_df['score'].min()
pr_df['score'] = (pr_df['score'] - min_score)/(max_score - min_score)

# Adding node scores
# pr_df = pr_df.merge(n_df, on='cust_id', how='left')
# pr_df['score_y'] = pr_df['score_y'].fillna(0)
# pr_df['score'] = pr_df['score_x']+pr_df['score_y']
pr_df = pr_df[['cust_id', 'score']]

# Exporting
pr_df.to_parquet(DATAPATH / 'pagerank_1.parquet')
pr_df.sample(3)

Unnamed: 0,cust_id,score
226904,EXTERNAL388508,0.0
110410,CUST43942889,0.259206
119704,CUST48571588,0.081349


# 2. Cdlib

In [52]:
DATAPATH = Path('../data/processed')

n_df = pd.read_parquet(DATAPATH / 'node_score.parquet')
n_df.set_index('cust_id', inplace=True)
e_df = pd.read_parquet(DATAPATH / 'temp_edge_score.parquet')

In [54]:
e_df = e_df[(e_df['cust_id_receiver'].isin(n_df.index)) & (e_df['cust_id_sender'].isin(n_df.index)) ] # for some reason e_df had nodes which were not in n_df

In [58]:
G = nx.from_pandas_edgelist(e_df, 'cust_id_sender', 'cust_id_receiver', edge_attr='score', create_using=nx.Graph())

In [63]:
# ran this for an hour, it did not finish
# other option from cdlib is algorithms.ilouvain, but when I tried running that, it said it would need 40 gb memory lol

# communities = algorithms.eva(G, n_df.to_dict(orient='index'))

# 3. Zhang et al. Method
This method uses the pagerank technique outlined in [Zhang *et al.,* 2022](https://arxiv.org/abs/2104.02764). 

Mainly: 

Let $M$ be a transition matrix with: 
$$m_{ij} = \begin{cases}
    \theta w_{ji}/s_j^{out} + (1-\theta)a_{ji}/d_j^{out} && \texttt{if}\; d_j^{out}\neq 0\\
    \beta_{i}/\sum_{i\in V}\beta_i && \texttt{if}\; d_j^{out}=0
\end{cases}$$
Where: 
- $\theta$ is a tunable parameter that represents how important weights should be in the pagerank alg
- $w_{ji}$ is the weight of an edge from node $j \rightarrow i$
- $s_j^{out} = \sum_{v \in V|j \rightarrow v}w_{jv}$ is the "strength" of outgoing edges from node $j$
- $a_{ji}$ = $1 \:\texttt{if}\: j \rightarrow i \:\texttt{else}\: 0$
- $d_j^{out} = \sum_{v \in V|j \rightarrow v}a_{jv}$
- $\beta_i$ is the node importance score
  


Then the pagerank can be calculated with power iterations on
$$P=\gamma MP + (1-\gamma)\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$$

Where:
- $1-\gamma$ is a tunable parameter representing the probability of restarting a random walk (typically 0.8-0.9)

---
*note the other formulation in the paper results in a dense transition matrix that is nxn.... too big*

*note that for $m_{ij}$, $i$ is the target and $j$ is the source*

In [3]:
THETA = 0
GAMMA = 0.5

DATAPATH = Path('../data/processed')

n_df = pd.read_parquet(DATAPATH / 'node_score.parquet')
e_df = pd.read_parquet(DATAPATH / 'temp_edge_score.parquet')
e_df = e_df[['cust_id_sender', 'cust_id_receiver', 'score']].copy()

## Constructing $\boldsymbol{\beta}/||\boldsymbol{\beta}||_1$

In [4]:
# Get all nodes in the graph
node_list = []
node_list.extend(e_df.cust_id_sender.tolist() + e_df.cust_id_receiver.tolist())
node_list = list(set(node_list)) #hack to remove duplicates

In [5]:
# Construct B
b_df = pd.DataFrame(data={'cust_id':node_list})
b_df = b_df.merge(n_df, on='cust_id', how='left')
b_df = b_df.rename(columns={'score':'b_i'})
b_df = b_df.fillna(0)

one_norm = abs(b_df['b_i']).sum()

b_df['b_i'] = b_df['b_i'] / one_norm

b_df.sort_values('b_i', ascending=False).head(3)

Unnamed: 0,cust_id,b_i
229569,CUST20015461,9.3e-05
143147,CUST55502501,9.3e-05
187219,CUST46707759,9.3e-05


## Node Encoding
This section encodes the cust_ids ordinally.

In [6]:
le = LabelEncoder()
le.fit(node_list)

e_df['cust_id_sender'] = le.transform(e_df['cust_id_sender'])
e_df['cust_id_receiver'] = le.transform(e_df['cust_id_receiver'])

b_df['cust_id'] = le.transform(b_df['cust_id'])

e_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,score
428821,258084,38623,0.0
489061,41523,35674,0.0
303,263741,113920,0.0


## Constructing $M$
*note that for $m_{ij}$, $i$ is the target node and $j$ is the source node*

In [7]:
#Calculate w_ji
w_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False)['score'].sum()
w_df = w_df.rename(columns={'score':'w_ji'})
e_df = e_df.merge(w_df, on=['cust_id_sender', 'cust_id_receiver'], how='left')

#Calculate a_ji
a_df = e_df.groupby(['cust_id_sender', 'cust_id_receiver'], as_index=False)['score'].count()
a_df = a_df.rename(columns={'score':'a_ji'})
e_df = e_df.merge(a_df, on=['cust_id_sender', 'cust_id_receiver'], how='left')

#Calculate s_j
s_df = e_df.groupby(['cust_id_sender'], as_index=False)['score'].sum()
s_df = s_df.rename(columns={'score':'s_j'})
e_df = e_df.merge(s_df, on='cust_id_sender', how='left')

#Calculate d_j
d_df = e_df.groupby(['cust_id_sender'], as_index=False)['score'].count()
d_df = d_df.rename(columns={'score':'d_j'})
e_df = e_df.merge(d_df, on='cust_id_sender', how='left')

#Merge b_i
e_df = e_df.merge(b_df, how='left', left_on='cust_id_receiver', right_on='cust_id')
e_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,cust_id,b_i
20456,171591,78444,0.0,0.0,1,0.0,2,78444,0.0
210359,245590,4923,0.0,0.0,1,1.5,6,4923,5e-05
473256,136945,118838,0.0,0.0,1,0.0,6,118838,0.0


In [8]:
def calc_m(r): 
    if r.d_j == 0: 
        m = b_i
    else:
        m = THETA*r.w_ji/r.s_j + (1-THETA)*r.a_ji/r.d_j
    return m

#calculate m_ij for sender j, receiver i
e_df['m'] = e_df.apply(lambda r: calc_m(r), axis=1)
e_df.sample(3)

  m = THETA*r.w_ji/r.s_j + (1-THETA)*r.a_ji/r.d_j


Unnamed: 0,cust_id_sender,cust_id_receiver,score,w_ji,a_ji,s_j,d_j,cust_id,b_i,m
255255,166440,163229,0.0,0.0,1,0.0,2,163229,1.5e-05,
525393,74704,263212,0.0,0.0,1,0.0,2,263212,0.0,
357938,21109,60786,0.0,0.0,1,0.0,2,60786,0.0,


## Pagerank

In [9]:
i = e_df['cust_id_receiver'].values
j = e_df['cust_id_sender'].values
m = e_df['m'].values

N = b_df.shape[0]
B = b_df.sort_values('cust_id')['b_i'].values

# Construct sparse matrix
M = coo_array((m,(i,j)), shape=(N,N))

# Verify that data is stored correctly
test_i = 35453
test_j = 238577
print(f'M at i={test_i} and j={test_j} is {M.tocsr()[test_i,test_j]}')

# Pagerank
P = np.full(N, 1/N)
max_diff = 1 #TODO implement
iter = 0 #TODO implement
for i in range(100):
    iter += 1
    P_int = GAMMA*M.dot(P) + (1-GAMMA)*B
    max_diff = np.absolute(P-P_int).max()
    P = P_int

# Scaling
P = (P - P.min())/(P.max() - P.min())

M at i=35453 and j=238577 is 0.0


In [10]:
# Exporting
P_df = pd.DataFrame(data=P, columns=['score'])
P_df['cust_id'] = P_df.index
P_df['cust_id'] = le.inverse_transform(P_df['cust_id']) #transform back to actual IDS
P_df = P_df[['cust_id','score']]
P_df.to_parquet(DATAPATH / 'pagerank_3.parquet')

# Vis

In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import kaleido 
print('kaleido version:', kaleido.__version__)

#Inline figures
pio.renderers.default = 'iframe'

#Themeing
# mcolors = px.colors.qualitative.Dark24

pio.templates['custom'] = go.layout.Template(
    layout=dict(
        xaxis=dict(ticks='outside', tickcolor='lightgray', showgrid=False, showline=True),
        yaxis=dict(ticks='outside', tickcolor='lightgray', showgrid=False, showline=True, mirror=True),
        yaxis2=dict(ticks='outside', tickcolor='lightgray', showgrid=False,),
        # colorway=mcolors,
    )

)

pio.templates.default = 'plotly_white+custom'

kaleido version: 0.2.1


In [111]:
fig = px.histogram(P_df[P_df['score']>=0.5], x='score')
fig.show()


In [112]:
fig = px.histogram(n_df, x='score')
fig.show()


In [113]:
fig = px.histogram(P_df, x='score')
fig.show()

In [114]:
P_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,cust_id,score
113256,CUST70977942,1.0
90586,CUST58785793,0.84137
23431,CUST22549711,0.836228
23101,CUST22362657,0.807643
153072,CUST92429485,0.760642
55793,CUST40191937,0.752005
56020,CUST40302821,0.745428
76531,CUST51289421,0.730183
118728,CUST73958429,0.724819
20445,CUST21010424,0.719274


In [119]:
kdf = pd.read_parquet(DATAPATH/'kyc.parquet')
cust_id = 'CUST22549711'
kdf[kdf.cust_id == cust_id]

Unnamed: 0,name,gender,occupation,age,tenure,cust_id,occ_wealth,occ_animal,occ_int,label,kyc_agg
89083,ELLEN HANSEN,female,Warehouse Worker,29.0,8.0,CUST22549711,0,0,0,1,1.0
