In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

def create_transition_matrix(adjacency_matrix, num_nodes, damping_factor=0.85):
    """Build a damped, row-normalised transition matrix from an adjacency matrix.

    Every row with a non-zero sum is divided by that sum and multiplied by
    ``damping_factor``, so it ends up summing to ``damping_factor``. Rows whose
    sum is zero (dangling nodes) are left as they are: treating them as linking
    to every node, and adding the ``(1 - damping_factor) / num_nodes`` teleport
    term, would require a dense matrix of constants. ``compute_pagerank`` adds
    both contributions as scalars on every iteration instead.

    Parameters
    ----------
    adjacency_matrix : numpy.ndarray
        2-D array in which entry ``[i, j]`` is non-zero when node ``i`` links
        to node ``j``. The input is copied and never modified.
    num_nodes : int
        Number of rows to process.
    damping_factor : float, default 0.85
        Scale applied to every normalised row.

    Returns
    -------
    numpy.ndarray
        A new array. Floating-point inputs keep their dtype; integer and
        boolean inputs are promoted to ``float64``.
    """
    transition = adjacency_matrix.copy()

    # In-place true division cannot store fractional results in an integer or
    # boolean array (NumPy raises a casting error), so promote those first.
    if transition.dtype.kind in "biu":
        transition = transition.astype(np.float64)

    row_sums = transition.sum(axis=1)

    for i in tqdm(range(num_nodes), desc = "creating transition matrix", total = num_nodes):
        if row_sums[i] == 0:
            # Dangling node: its contribution is added inside compute_pagerank.
            continue
        # Dividing once by (row_sum / damping_factor) normalises and damps the
        # row in a single pass instead of a division followed by a multiply.
        transition[i] /= (row_sums[i] / damping_factor)

    return transition

def batch_indices(start, stop, batch_size):
    """Yield inclusive ``(first, last)`` index pairs that tile ``[start, stop]``.

    Consecutive pairs cover ``batch_size`` indices each; the final pair is
    clipped to ``stop``. Nothing is yielded when ``start > stop``.

    Parameters
    ----------
    start : int
        First index of the range (inclusive).
    stop : int
        Last index of the range (inclusive).
    batch_size : int
        Number of indices per batch; must be positive.

    Raises
    ------
    ValueError
        If ``batch_size`` is zero or negative. Because this is a generator,
        the error surfaces when iteration starts.
    """
    # A non-positive step would never move `current` past `stop`, so the loop
    # below would run forever.
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {!r}".format(batch_size))

    current = start
    while current <= stop:
        yield (current, min(current + batch_size - 1, stop))
        current += batch_size

def create_transition_matrix_batchwise(adjacency_matrix, num_nodes, damping_factor = 0.85, batch_size = 10):
    """Row-normalise and damp ``adjacency_matrix`` in place, a batch of rows at a time.

    Row sums are computed per batch rather than for the whole matrix at once,
    so only ``batch_size`` rows are reduced at a time. This suits arrays that
    live on disk, such as the ``np.memmap`` this notebook passes in. Every row
    with a non-zero sum is divided by ``row_sum / damping_factor`` and
    therefore ends up summing to ``damping_factor``. Rows that sum to zero
    (dangling nodes) are left untouched.

    Parameters
    ----------
    adjacency_matrix : numpy.ndarray or numpy.memmap
        2-D floating-point array. **Modified in place.**
    num_nodes : int
        Number of rows to process.
    damping_factor : float, default 0.85
        Scale applied to every normalised row.
    batch_size : int, default 10
        Number of rows whose sums are computed together; must be positive.

    Returns
    -------
    The same ``adjacency_matrix`` object that was passed in.

    Raises
    ------
    ValueError
        If ``batch_size`` is zero or negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {!r}".format(batch_size))

    # Integer ceiling division. tqdm displays `total` verbatim, so a float such
    # as the one np.ceil returns shows up in the bar as e.g. "0/399.0".
    num_batches = (num_nodes + batch_size - 1) // batch_size

    for start_idx, end_idx in tqdm(batch_indices(0, num_nodes-1, batch_size), desc = "computing transition matrix", total = num_batches):

        # Sums for the rows of this batch only (end_idx is inclusive).
        row_sums = adjacency_matrix[start_idx:end_idx + 1].sum(axis = 1)

        for offset, row_sum in enumerate(row_sums):
            if row_sum == 0:
                # Dangling node: leave the all-zero row as it is.
                continue
            # One division both normalises the row and applies the damping.
            adjacency_matrix[start_idx + offset] /= (row_sum / damping_factor)

    return adjacency_matrix

def compute_pagerank(transition_matrix, num_nodes, tol=1e-6, max_iter=100, damping_factor=None):
    """Run power iteration on a pre-damped transition matrix and return the scores.

    ``transition_matrix`` is expected to be row-normalised and already scaled
    by the damping factor (non-dangling rows sum to ``damping_factor``), with
    dangling rows left as all zeros. The dangling-node mass and the teleport
    term are added here as scalars on each iteration, so the dense
    ``num_nodes x num_nodes`` teleport matrix is never built.

    Parameters
    ----------
    transition_matrix : numpy.ndarray or numpy.memmap
        Square matrix whose entry ``[i, j]`` is the damped probability of
        moving from node ``i`` to node ``j``.
    num_nodes : int
        Number of nodes (length of the returned vector).
    tol : float, default 1e-6
        Iteration stops once the L1 distance between two consecutive iterates
        drops below this value.
    max_iter : int, default 100
        Maximum number of iterations.
    damping_factor : float, optional
        Must equal the value used to scale ``transition_matrix``. When omitted,
        a module/notebook-level variable named ``damping_factor`` is used if it
        exists (this function historically read that global implicitly);
        otherwise 0.85 is used.

    Returns
    -------
    numpy.ndarray
        Vector of ``num_nodes`` PageRank scores.
    """
    if damping_factor is None:
        # Backward compatibility: earlier versions silently depended on a
        # global `damping_factor`. Honour it when present so existing calls
        # keep producing the same numbers; prefer passing the value explicitly.
        damping_factor = globals().get("damping_factor", 0.85)

    # Start from the uniform distribution.
    pagerank = np.ones(num_nodes) / num_nodes
    teleport = (1 - damping_factor) / num_nodes

    # Transposed so that (t_m @ pagerank)[j] sums the score flowing into node j.
    t_m = transition_matrix.T

    # Dangling nodes are the rows of the transition matrix that sum to zero.
    row_sums = np.asarray(transition_matrix.sum(axis=1)).ravel()
    dangling_nodes = np.flatnonzero(row_sums == 0)

    for _ in tqdm(range(max_iter), desc = "computing pagerank", total = max_iter):

        # Score held by dangling nodes is spread evenly over all nodes.
        dangling_contrib = damping_factor * pagerank[dangling_nodes].sum() / num_nodes

        new_pagerank = (t_m @ pagerank) + dangling_contrib + teleport

        delta = np.linalg.norm(new_pagerank - pagerank, 1)
        # Keep the newest iterate even when stopping; returning the previous
        # one would throw away the last, most refined step.
        pagerank = new_pagerank
        if delta < tol:
            break

    return pagerank

# Main Code

In [2]:
# Input files: one row per node (with an 'Id' column) and one row per edge
# (with 'Source' and 'Target' columns holding node Ids).
nodes_file = '/content/sample_nodes_0.01_204_pageranked_gephi.csv'
edges_file = '/content/sample_edges_0.01_204.csv'

nodes_df = pd.read_csv(nodes_file)
edges_df = pd.read_csv(edges_file)

# Map every node Id to a dense row/column index of the adjacency matrix.
node_ids = nodes_df['Id'].values
id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
num_nodes = len(node_ids)

# Translate the edge endpoints to matrix indices once, up front. Plain dict
# lookups are used on purpose: an edge that references an unknown node Id
# raises KeyError instead of being silently dropped.
num_edges = len(edges_df)
src_indices = np.fromiter((id_to_index[node_id] for node_id in edges_df['Source']), dtype=np.intp, count=num_edges)
tgt_indices = np.fromiter((id_to_index[node_id] for node_id in edges_df['Target']), dtype=np.intp, count=num_edges)

# to_disk=True keeps the dense num_nodes x num_nodes matrix in a memory-mapped
# file instead of RAM.
to_disk = True
adj_mtx_path = './adjacency_matrix.dat'

if to_disk:
    # mode 'w+' creates (or overwrites) the backing file.
    adjacency_matrix = np.memmap(adj_mtx_path, dtype = 'float32', mode = 'w+', shape = (num_nodes, num_nodes))
else:
    adjacency_matrix = np.zeros((num_nodes, num_nodes), dtype = np.float32)

# Mark every (source, target) pair in a single vectorised assignment; repeated
# edges simply set the same cell to 1.0 again.
adjacency_matrix[src_indices, tgt_indices] = 1.0

if to_disk:
    # Flush pending writes to disk and drop the reference; the file is
    # re-opened from adj_mtx_path by the next cell.
    adjacency_matrix.flush()
    del adjacency_matrix

In [3]:
# Damping factor used to scale the transition matrix. compute_pagerank must
# work with the same value.
damping_factor = 0.85

if to_disk:
    # Re-open the adjacency matrix stored on disk; 'r+' allows the in-place
    # normalisation below to write back to the same file.
    adjacency_matrix = np.memmap(adj_mtx_path, dtype = 'float32', mode = 'r+', shape = (num_nodes, num_nodes))

    # Normalise batch_size rows at a time.
    batch_size = 45
    transition_matrix = create_transition_matrix_batchwise(adjacency_matrix, num_nodes, damping_factor, batch_size)

    # Push any pending writes of the normalised rows out to disk.
    transition_matrix.flush()
else:
    # In-memory path: works on a copy of the adjacency matrix.
    transition_matrix = create_transition_matrix(adjacency_matrix, num_nodes, damping_factor)

# Power iteration over the damped transition matrix.
pagerank = compute_pagerank(transition_matrix, num_nodes)
print("PageRank values:", pagerank)

computing transition matrix:   0%|          | 0/399.0 [00:00<?, ?it/s]

computing pagerank:   0%|          | 0/100 [00:00<?, ?it/s]

PageRank values: [1.03278661e-05 8.13587502e-04 1.22194227e-02 ... 8.87231332e-06
 8.87231332e-06 8.87231332e-06]


In [4]:
# Look up each node's matrix index and pull its score out of the PageRank
# vector, storing the result next to the node's other columns.
node_ids = nodes_df['Id'].values
nodes_df['my_pagerank'] = pagerank[[id_to_index[node_id] for node_id in node_ids]]

In [5]:
# Eyeball a random handful of nodes (no seed is set, so rows differ per run).
nodes_df.sample(n=30)

Unnamed: 0,Id,Label,names,pageranks,my_pagerank
1607,1714803,,Manual labour,4.4e-05,4.4e-05
1115,1167425,,2003 invasion of Iraq,0.000495,0.000495
1002,17547,,321st Air Expeditionary Group,9e-06,9e-06
4084,174754,,Cleopatra VII,0.00021,0.000211
15725,1484721,,Mpho Koaho,9e-06,9e-06
7310,1185903,,Jonny Greenwood,1.8e-05,1.8e-05
7706,483744,,Shara'b As Salam District,9e-06,9e-06
2740,1459593,,Soviet Top League,4.4e-05,4.4e-05
13710,1376343,,Dante Basco,9e-06,9e-06
11008,913122,,Rich Bickle,9e-06,9e-06


In [6]:
# Summary statistics of the 'pageranks' column shipped with the nodes file
# (presumably the Gephi result, judging by the file name) next to the scores
# computed in this notebook.
nodes_df.loc[:, ['pageranks', 'my_pagerank']].describe()

Unnamed: 0,pageranks,my_pagerank
count,17914.0,17914.0
mean,5.6e-05,5.6e-05
std,0.000214,0.000214
min,9e-06,9e-06
25%,9e-06,9e-06
50%,1e-05,1e-05
75%,2.5e-05,2.4e-05
max,0.012222,0.012219
