# Initialization

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
from scipy.io import mmread
from scipy.sparse import coo_matrix

In [7]:
G = mmread("/content/drive/MyDrive/10 - UB /20 - Colab Notebooks/UB Courses/Numerical Linear Algebra/Project_3/p2p-Gnutella30.mtx")
print("G.shape:", G.shape)
print("number of stored values:", G.nnz, "out of a size of", G.shape[0]*G.shape[1])

row_idxs = G.row
col_idxs = G.col
values = G.data

G.shape: (36682, 36682)
number of stored values: 88328 out of a size of 1345569124


In [8]:
print(G)

  (1311, 0)	1.0
  (1629, 0)	1.0
  (5352, 0)	1.0
  (9391, 0)	1.0
  (0, 1)	1.0
  (3050, 1)	1.0
  (16254, 1)	1.0
  (21629, 1)	1.0
  (26775, 1)	1.0
  (28911, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (1331, 3)	1.0
  (3675, 3)	1.0
  (5014, 3)	1.0
  (11048, 3)	1.0
  (15239, 3)	1.0
  (18724, 3)	1.0
  (30694, 3)	1.0
  (31271, 3)	1.0
  (31803, 3)	1.0
  (34383, 3)	1.0
  (0, 4)	1.0
  (1163, 4)	1.0
  (3500, 4)	1.0
  :	:
  (36649, 36657)	1.0
  (36649, 36658)	1.0
  (36619, 36659)	1.0
  (36619, 36660)	1.0
  (36619, 36661)	1.0
  (36650, 36662)	1.0
  (36650, 36663)	1.0
  (36650, 36664)	1.0
  (36658, 36665)	1.0
  (36658, 36666)	1.0
  (14860, 36667)	1.0
  (36088, 36668)	1.0
  (36088, 36669)	1.0
  (36628, 36670)	1.0
  (36628, 36671)	1.0
  (36655, 36672)	1.0
  (36655, 36673)	1.0
  (36655, 36674)	1.0
  (36674, 36675)	1.0
  (36674, 36676)	1.0
  (36674, 36677)	1.0
  (36617, 36678)	1.0
  (36617, 36679)	1.0
  (36617, 36680)	1.0
  (36641, 36681)	1.0


In [9]:
n = coo_matrix(np.sum(G, axis=0))    # array containing the out_degree for each column j of G
print(n)

  (0, 0)	4.0
  (0, 1)	6.0
  (0, 2)	1.0
  (0, 3)	11.0
  (0, 4)	6.0
  (0, 5)	1.0
  (0, 6)	2.0
  (0, 7)	14.0
  (0, 8)	1.0
  (0, 9)	7.0
  (0, 10)	1.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 14)	6.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	13.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	1.0
  (0, 22)	3.0
  (0, 23)	3.0
  (0, 24)	2.0
  (0, 25)	1.0
  :	:
  (0, 36657)	1.0
  (0, 36658)	1.0
  (0, 36659)	1.0
  (0, 36660)	1.0
  (0, 36661)	1.0
  (0, 36662)	1.0
  (0, 36663)	1.0
  (0, 36664)	1.0
  (0, 36665)	1.0
  (0, 36666)	1.0
  (0, 36667)	1.0
  (0, 36668)	1.0
  (0, 36669)	1.0
  (0, 36670)	1.0
  (0, 36671)	1.0
  (0, 36672)	1.0
  (0, 36673)	1.0
  (0, 36674)	1.0
  (0, 36675)	1.0
  (0, 36676)	1.0
  (0, 36677)	1.0
  (0, 36678)	1.0
  (0, 36679)	1.0
  (0, 36680)	1.0
  (0, 36681)	1.0


In [10]:
G?

# Ex 1. Compute the PR vector of $M_{m}$ using the power method (adapted to PR computation). The algorithm reduces to iterate $x_{k+1} = (1 − m)GDx_{k} + ez^{t}x_{k}$ until $||x_{k+1} − x_{k}||_{∞} < tol$.

## Dense version

**Initially, make use of a test matrix in dense notation**

In [11]:
# link matrix G used for testing ---------------------------------------------------
# This matrix is not strictly needed, since all the testing could have been done 
# Directly on the given matrix G. It was necessary simply because when I started 
# working on this I had to run it locally and couldn't afford to store the matrix G
# as a dense matrix ----------------------------------------------------------------
G_test = np.array([[0, 1, 1, 1, 0, 1, 0], 
                  [1, 1, 0, 0, 1, 0, 1],
                  [0, 1, 1, 1, 1, 0, 1],
                  [1, 0, 0, 1, 1, 0, 1],
                  [0, 1, 1, 1, 0, 1, 0],
                  [1, 1, 1, 0, 1, 0, 0],
                  [1, 1, 0, 0, 0, 0, 1]])

print(G_test)

[[0 1 1 1 0 1 0]
 [1 1 0 0 1 0 1]
 [0 1 1 1 1 0 1]
 [1 0 0 1 1 0 1]
 [0 1 1 1 0 1 0]
 [1 1 1 0 1 0 0]
 [1 1 0 0 0 0 1]]


In [12]:
def problem_gen_dense(G, m=0.15):
    """ problem_gen
    Function to generate the necessary vectors and matrices associated to the link matrix G
    Returns M, mS, e, z, A, D, n
    """
    
    # Retrieve the dimension of the problem --------------------------------------------
    n_pages = G.shape[0]
    
    # Compute the vector n and the matrix D --------------------------------------------
    n = np.sum(G, axis=0)    # array containing the out_degree for each column j of G
    D = np.zeros([n_pages, n_pages])
    for j in range(n_pages):
        if n[j] !=0:
            D[j,j] = 1/n[j]
    
    # Compute the matrix A -------------------------------------------------------------
    A = np.dot(G, D)
    
    # Compute the product mS -----------------------------------------------------------
    e = np.ones([n_pages, 1])
    z = 1/n_pages*np.ones([n_pages, 1])
    for j in range(n_pages):
        if A[:,j].any != 0:
            z[j] = m/n_pages
    mS = np.dot(e, z.T)
    
    # Compute the matrix M -------------------------------------------------------------
    M = (1-m)*A + mS
    
    # Visualize the size of the generated matrices and vectors -------------------------
    # print("M.shape:", M.shape)
    # if n_pages<=10:
    #    print("M:")
    #    print(M)
    
    return M, mS, e, z, A, D, n

In [13]:
def PageRank_dense(G, m=0.15, tol=1e-5):
    """ PageRank
    Computes the page rank of page k in the direct graph defined by the link matrix G
    Returns the page rank vector PR
    """
    
    # Retrieve the dimension of the problem --------------------------------------------
    n_pages = G.shape[0]
    
    # Generate the necessary vectors and matrices associated to the link matrix G ------
    M, mS, e, z, A, D, n = problem_gen_dense(G, m)        # This is not optimal but since in Ex.2 we are optimizing the method in order to 
                                                    # handle larger dataset, this method is written this way for clarity 
    
    # Initialize xk and xk_1 -----------------------------------------------------------
    xk = np.random.rand(n_pages).reshape([n_pages, 1])
    xk_1 = xk+1     # Initialized in order to enter the while loop, updated as soon as it enters the loop
    
    # Power method ---------------------------------------------------------------------
    while (norm(xk_1-xk, ord=np.inf)>tol):
        xk = xk_1          # Update the value of xk
        xk_1 = np.dot(M, xk)
        
    return xk_1 / np.sum(xk_1)

In [14]:
PR = PageRank_dense(G_test)
print("Page Rank vector:")
print(PR)

Page Rank vector:
[[0.16494243]
 [0.1301901 ]
 [0.15910275]
 [0.1419003 ]
 [0.16494243]
 [0.14378229]
 [0.09513968]]


## Dense+Sparse version

**Let's now implement the same method using a coo sparse matrix format. In order to do this, let's use the proposed matrix G**

In [15]:
n_pages = G.shape[0]

# Compute the vector n and the matrix D --------------------------------------------
n = coo_matrix(np.sum(G, axis=0))    # array containing the out_degree for each column j of G
print(n_pages)
print(n.shape)
D_data = 1/n.data
print(D_data)
D = coo_matrix((D_data, (n.row, n.col)), shape=(n_pages, n_pages))
print(D.shape)

36682
(1, 36682)
[0.25       0.16666667 1.         ... 1.         1.         1.        ]
(36682, 36682)


In [16]:
A = coo_matrix(G.dot(D))
print(A.shape)

(36682, 36682)


In [17]:
print(A.col)

[36681 36680 36679 ...     2     1     0]


In [18]:
def problem_gen(G, m=0.15, mode='dense'):
    """ problem_gen
    Function to generate the necessary vectors and matrices associated to the link matrix G
    Returns M, mS, e, z, A, D, n
    """
    
    if mode == 'dense':
        # Retrieve the dimension of the problem --------------------------------------------
        n_pages = G.shape[0]

        # Compute the vector n and the matrix D --------------------------------------------
        n = np.sum(G, axis=0)    # array containing the out_degree for each column j of G
        D = np.zeros([n_pages, n_pages])
        for j in range(n_pages):
            if n[j] !=0:
                D[j,j] = 1/n[j]

        # Compute the matrix A -------------------------------------------------------------
        A = np.dot(G, D)

        # Compute the product mS -----------------------------------------------------------
        e = np.ones([n_pages, 1])
        z = 1/n_pages*np.ones([n_pages, 1])
        for j in range(n_pages):
            if A[:,j].any != 0:
                z[j] = m/n_pages
        mS = np.dot(e, z.T)

        # Compute the matrix M -------------------------------------------------------------
        M = (1-m)*A + mS
        
        return M, mS, e, z, A, D, n
        
    elif mode == 'coo_matrix':
        # Retrieve the dimension of the problem --------------------------------------------
        n_pages = G.shape[0]

        # Compute the vector n and the matrix D --------------------------------------------
        n = coo_matrix(np.sum(G, axis=0))    # array containing the out_degree for each column j of G
        D_data = 1/n.data
        D = coo_matrix((D_data, (n.row, n.col)), shape=(n_pages, n_pages))

        # Compute the matrix A -------------------------------------------------------------
        A = coo_matrix(G.dot(D))

        # Compute the product mS -----------------------------------------------------------
        e = np.ones([n_pages, 1])               # The product mS is the product of two dense vectors,
        z = 1/n_pages*np.ones([n_pages, 1])         # in this case the sparse notation is not called for
        for j in A.col:           
            z[j] = m/n_pages         # zj is m/n for the columns j where A contains non-zero values
        # mS = np.dot(e, z.T)        # In this case if we build mS we run into the same problems as storing the matrices as dense

        # Compute the matrix M -------------------------------------------------------------
        # M = (1-m)*A + mS           # Thus we cannot compute M explicitly, we will compute mS in the PageRank method to avoid computing the matrix
        
        return e, z, A, D, n

In [19]:
e, z, A, D, n = problem_gen(G, mode='coo_matrix')

In [20]:
def PageRank(G, m=0.15, tol=1e-5, mode='dense'):
    """ PageRank
    Computes the page rank of page k in the direct graph defined by the link matrix G
    Returns the page rank vector PR
    """
    
    # Retrieve the dimension of the problem --------------------------------------------
    n_pages = G.shape[0]
    
    # Initialize xk and xk_1 -----------------------------------------------------------
    xk = np.random.rand(n_pages).reshape([n_pages, 1])
    xk_1 = xk+1     # Initialized in order to enter the while loop, updated as soon as it enters the loop
    
    if mode == 'dense':
        # Generate the necessary vectors and matrices associated to the link matrix G --
        M, mS, e, z, A, D, n = problem_gen(G, m, mode)        # This is not optimal but since in Ex.2 we are optimizing the method in order to 
                                                              # handle larger matrices, this method is written this way for clarity 
        # Power method ---------------------------------------------------------------------
        while (norm(xk_1-xk, ord=np.inf)>tol):
            xk = xk_1          # Update the value of xk
            xk_1 = np.dot(M, xk)
        
    if mode == 'coo_matrix':
        # Generate the necessary vectors and matrices associated to the link matrix G --
        e, z, A, D, n = problem_gen(G, m, mode)        # This is not optimal but since in Ex.2 we are optimizing the method in order to 
                                                              # handle larger matrices, this method is written this way for clarity 
        # Power method ---------------------------------------------------------------------
        while (norm(xk_1-xk, ord=np.inf)>tol):
            xk = xk_1          # Update the value of xk
            xk_1 = np.dot((1-m)*A, xk) + e*(np.dot(z.T,xk))
        
    return xk_1 / np.sum(xk_1)

In [None]:
PR = PageRank(G, mode='coo_matrix')
print("Page Rank vector:")
print(PR)

# Ex 2. Compute the PR vector of $M_{m}$ using the power method without storing matrices.

In [None]:
def PageRank_opt(G, m=0.15, tol=1e-5, mode='dense'):
    """ PageRank_opt
    Computes the page rank of page k in the direct graph defined by the link matrix G
    Returns the page rank vector PR
    """
    
    if mode=='dense':
        # Retrieve the dimension of the problem --------------------------------------------
        n_pages = G.shape[0] 

        # Initialize xk and xk_1 -----------------------------------------------------------
        xk = np.random.rand(n_pages).reshape([n_pages, 1])
        xk_1 = xk+1     # Initialized in order to enter the while loop, updated as soon as it enters the loop
        L = []
        for j in range(n_pages):
            L.append(G[:,j])
        n = np.sum(G, axis=0)    # array containing the out_degree for each column j of G

        # Power method ---------------------------------------------------------------------
        while (norm(xk_1-xk)>tol):
            xk = xk_1          # Update the value of xk
            for j in range(n_pages):
                if n[j]==0:
                    xk_1 = xk + xk[j]/n
                else:
                    for i in L:
                        xk_1[i] = xk[i] + xk[j]/n[j]
                xk_1 = (1-m)*xk + m/n
    
    if mode=='coo_matrix':
        # Retrieve the dimension of the problem --------------------------------------------
        n_pages = G.shape[0] 

        # Initialize xk and xk_1 -----------------------------------------------------------
        x = np.zeros(n_pages)
        xc = np.ones(n_pages) / n_pages
        
        # Compute Lj and nj ----------------------------------------------------------------
        L = []
        for j in range(n_pages):     # Go thorugh the column index j
            L.append([])
            for i in range(G.col):      # Go thorugh the column indexes of G
                if G.col[i] = j:            # If there is any corresponfing to the column j considered
                    L[j].append(G.row[i])       # Add the row index to the list Lj
            
        n = np.sum(G, axis=0)    # array containing the out_degree for each column j of G

        # Power method ---------------------------------------------------------------------
        while (norm(x-xc, ord=np.inf)>tol):
            xc=x
            x=np.zeros(n)
            for j in range (0,n):
            if(n[j]==0):
            x=x+xc[j]/n
            else:
            for i in L[j]:
            x[i]=x[i]+xc[j]/n[j]
            x=(1-m)*x+m/n
        
    return xk_1 / np.sum(xk_1)

In [None]:
PR = PageRank(G_test)
print("Page Rank vector:")
print(PR)

Page Rank vector:
[[1.62784194]
 [1.28486063]
 [1.57020575]
 [1.40043027]
 [1.62784194]
 [1.41900331]
 [0.93894513]]


In [None]:
PR = PageRank_opt(G_test)
print("Page Rank vector:")
print(PR)

Page Rank vector:
[[       inf        inf        inf        inf        inf        inf
         inf]
 [       inf        inf        inf        inf        inf        inf
         inf]
 [0.25       0.16666667 0.25       0.25       0.25       0.5
  0.25      ]
 [0.25       0.16666667 0.25       0.25       0.25       0.5
  0.25      ]
 [0.25       0.16666667 0.25       0.25       0.25       0.5
  0.25      ]
 [0.25       0.16666667 0.25       0.25       0.25       0.5
  0.25      ]
 [0.25       0.16666667 0.25       0.25       0.25       0.5
  0.25      ]]


  xk_1[i] = xk[i] + xk[j]/n[j]
  while (norm(xk_1-xk)>tol):
