In [1]:
import pandas as pd
import numpy as np
import os
import scipy
from tqdm.notebook import tqdm

def create_transition_matrix_sparse(adjacency_matrix, num_nodes, damping_factor=0.85):
    """Scale the rows of a CSR adjacency matrix into damped transition rows, in place.

    Every stored entry of row ``i`` (for ``i < num_nodes``) is divided by
    ``row_sum[i] / damping_factor``, so each of those rows with a non-zero sum
    ends up summing to ``damping_factor``. Rows whose sum is zero are left
    untouched.

    Parameters
    ----------
    adjacency_matrix : CSR sparse matrix
        Modified in place through its ``data`` array (which therefore has to be
        a floating-point array); ``indptr`` is only read.
    num_nodes : int
        Number of leading rows to normalise.
    damping_factor : float, default 0.85
        Value each normalised row sums to.

    Returns
    -------
    The same ``adjacency_matrix`` object that was passed in (not a copy).
    """
    indptr = adjacency_matrix.indptr
    # Range of `data` covered by rows 0 .. num_nodes-1. Looked up first so that an
    # out-of-range num_nodes raises before anything has been modified.
    start, stop = indptr[0], indptr[num_nodes]

    row_sums = np.asarray(adjacency_matrix.sum(axis=1)).ravel()[:num_nodes]

    # Per-row divisor; zero-sum rows get a divisor of 1 so they stay unchanged.
    row_divisors = row_sums / damping_factor
    row_divisors[row_sums == 0] = 1

    # Expand the per-row divisor to one value per stored entry and divide all
    # rows in a single vectorised pass instead of a Python loop over every row.
    # (This allocates one temporary with as many elements as data[start:stop].)
    entries_per_row = np.diff(indptr[:num_nodes + 1])
    adjacency_matrix.data[start:stop] /= np.repeat(row_divisors, entries_per_row)

    return adjacency_matrix


def compute_pagerank(transition_matrix, num_nodes, tol=1e-10, max_iter=100, damping_factor=None):
    """Compute PageRank by power iteration on a damped sparse transition matrix.

    Parameters
    ----------
    transition_matrix : sparse matrix
        Entry ``[i, j]`` is the weight of the link ``i -> j``. It is used as-is
        (no further damping is applied to it), so it should already be scaled
        with the same damping factor that is passed here.
    num_nodes : int
        Number of nodes (length of the returned vector).
    tol : float, default 1e-10
        Iteration stops once the L1 norm of the change between two successive
        vectors drops below this value.
    max_iter : int, default 100
        Maximum number of iterations.
    damping_factor : float, optional
        Damping factor used for the teleport and dangling-node terms. When
        omitted, a module/notebook-level ``damping_factor`` is used if one is
        defined (the behaviour of earlier versions of this function), else 0.85.

    Returns
    -------
    numpy.ndarray
        The PageRank vector after convergence or ``max_iter`` iterations.
    """
    if damping_factor is None:
        # Backward compatibility: this function used to read the notebook-level
        # `damping_factor` directly; keep honouring it when nothing is passed.
        damping_factor = globals().get("damping_factor", 0.85)

    # Initialize the PageRank vector with equal probability for each node
    pagerank = np.ones(num_nodes) / num_nodes
    teleport = (1 - damping_factor) / num_nodes

    t_m = transition_matrix.transpose().tocsr() # transpose the matrix here

    # Columns of the transposed matrix that sum to zero are nodes without any
    # outgoing weight. np.asarray(...).ravel() is used rather than `.A1` so the
    # result of .sum() is accepted whether it is an np.matrix or an ndarray.
    out_weight = np.asarray(t_m.sum(axis=0)).ravel()
    dangling_nodes = np.flatnonzero(out_weight == 0)

    for _ in tqdm(range(max_iter), desc = "computing pagerank", total = max_iter):

        # Rank held by dangling nodes is spread evenly over all nodes
        dangling_contrib = damping_factor * pagerank[dangling_nodes].sum() / num_nodes

        new_pagerank = (t_m @ pagerank) + dangling_contrib + teleport

        delta = np.linalg.norm(new_pagerank - pagerank, 1)
        # Store the new iterate before testing convergence so the most recent
        # vector is the one returned.
        pagerank = new_pagerank
        if delta < tol:
            break

    return pagerank


In [2]:
# PageRank run configuration
damping_factor = 0.85
tol = 1e-10           # convergence tolerance
num_iterations = 200  # iteration count (the tolerance above is the alternative stopping rule)

In [3]:
nodes_df = pd.read_csv('wiki_articles_names.csv')
edges_df = pd.read_csv('wiki_articles_edges.csv')
# Edges without a target cannot be placed in the matrix
edges_df = edges_df.dropna(subset=['Target']).reset_index(drop=True)

num_nodes = len(nodes_df)

# Position of each article id within nodes_df; this position is the article's
# row/column index in the adjacency matrix.
node_id = {n: i for i, n in enumerate(nodes_df['id'])}

source_positions = edges_df['Source'].map(node_id)
target_positions = edges_df['Target'].map(node_id)

# An endpoint that does not appear in nodes_df maps to NaN, which is not a valid
# matrix index. Drop such edges, the same way edges with a null target are dropped.
resolvable = source_positions.notna() & target_positions.notna()
if not resolvable.all():
    print(f"dropping {int((~resolvable).sum())} edges whose endpoints are missing from nodes_df")
    edges_df = edges_df[resolvable].reset_index(drop=True)

# Integer arrays instead of Python lists: avoids building one Python object per edge
row_indices = source_positions[resolvable].to_numpy(dtype=np.int64)
col_indices = target_positions[resolvable].to_numpy(dtype=np.int64)

#create the sparse matrix and store it on disk
data = np.ones(len(row_indices), dtype = np.float32)

adjacency_matrix = scipy.sparse.csr_matrix((data, (row_indices, col_indices)), shape = (num_nodes, num_nodes), dtype = np.float32)

# Write each CSR component to its own file, flush it to disk, then release the mapping.
# The file names and dtypes here are the ones the cell that reopens the files expects.
for filename, dtype_name, component in (
    ('data.dat', 'float32', adjacency_matrix.data),
    ('indices.dat', 'int32', adjacency_matrix.indices),
    ('indptr.dat', 'int32', adjacency_matrix.indptr),
):
    component_memmap = np.memmap(filename, dtype=dtype_name, mode='w+', shape=component.shape)
    component_memmap[:] = component
    component_memmap.flush()
    del component_memmap

In [4]:
# Share of adjacency-matrix cells that correspond to an edge, as a percentage
edge_count = len(edges_df)
cell_count = len(nodes_df) ** 2
round(edge_count / cell_count, 7) * 100

0.00089

Only ***0.00089%*** of the entries in the Wikipedia dataset's adjacency matrix are 1s; the rest are all 0s. So it's pretty trivial to adapt the computation to a sparse framework.<br>

So we can store the adjacency matrix in a sparse format (CSR: Compressed Sparse Row), form the transition matrix in place, and then compute the PageRank.

In [5]:
# Shapes of the three CSR component arrays, read from the matrix that is
# currently bound to `adjacency_matrix` (before the name is rebound below).
data_shape = adjacency_matrix.data.shape
indices_shape = adjacency_matrix.indices.shape
indptr_shape = adjacency_matrix.indptr.shape

# Re-open the component files in read/write mode
data_memmap = np.memmap('data.dat', mode='r+', dtype='float32', shape=data_shape)
indices_memmap = np.memmap('indices.dat', mode='r+', dtype='int32', shape=indices_shape)
indptr_memmap = np.memmap('indptr.dat', mode='r+', dtype='int32', shape=indptr_shape)

# Rebuild the CSR matrix from the memory-mapped component arrays
adjacency_matrix = scipy.sparse.csr_matrix(
    (data_memmap, indices_memmap, indptr_memmap),
    shape=(num_nodes, num_nodes),
    dtype=np.float32,
)

In [6]:
transition_matrix = create_transition_matrix_sparse(adjacency_matrix, num_nodes, damping_factor)

# Copy the transition-matrix components into the memory-mapped arrays
data_memmap[:] = transition_matrix.data
indices_memmap[:] = transition_matrix.indices
indptr_memmap[:] = transition_matrix.indptr

# Flush to ensure data is written to disk
data_memmap.flush()
indices_memmap.flush()
indptr_memmap.flush()


# Compute PageRank with the tolerance and iteration budget from the configuration
# cell, instead of silently falling back to the function's defaults
pagerank = compute_pagerank(transition_matrix, num_nodes, tol=tol, max_iter=num_iterations)
print("PageRank values:", pagerank)

creating transition matrix:   0%|          | 0/1791489 [00:00<?, ?it/s]

computing pagerank:   0%|          | 0/100 [00:00<?, ?it/s]

PageRank values: [3.42206779e-07 3.71587972e-07 1.89012491e-07 ... 1.13629101e-07
 1.52287226e-07 9.21964909e-08]


In [7]:
# pagerank is attached positionally: entry i goes to row i of nodes_df
nodes_df = nodes_df.assign(my_pagerank=pagerank)

In [8]:
# Dtype, non-null count and memory usage of the computed column
nodes_df.loc[:, 'my_pagerank'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1791489 entries, 0 to 1791488
Series name: my_pagerank
Non-Null Count    Dtype  
--------------    -----  
1791489 non-null  float64
dtypes: float64(1)
memory usage: 13.7 MB


In [9]:
# Summary statistics of the computed PageRank values
pagerank_scores = nodes_df['my_pagerank']
pagerank_scores.describe()

Unnamed: 0,my_pagerank
count,1791489.0
mean,5.581949e-07
std,7.42235e-06
min,8.375226e-08
25%,1.032593e-07
50%,1.427953e-07
75%,2.699128e-07
max,0.005514321


In [10]:
# If ground-truth PageRank values are stored in nodes_df (column 'pageranks'), uncomment the line below to compare them with 'my_pagerank'
#nodes_df[['pageranks', 'my_pagerank']].describe()

In [11]:
# Articles ordered from highest to lowest computed PageRank; show the first 50
ranked_df = nodes_df.sort_values(by='my_pagerank', ascending=False)
ranked_df.iloc[:50]

Unnamed: 0,id,names,my_pagerank
279122,279122,United States,0.005514
987583,987583,France,0.002367
541013,541013,United Kingdom,0.002158
230038,230038,Canada,0.001721
896828,896828,Germany,0.00167
121347,121347,World War II,0.001501
1055792,1055792,English language,0.00137
610154,610154,Australia,0.001307
98332,98332,Italy,0.001294
1118496,1118496,India,0.001198
