In [12]:
import numpy as np
from scipy import sparse

In [25]:
beta = 0.8
tolerance = 10**-10
topK = 10
bottomK = 10

In [14]:
def getGraphData(file):
    graphEdges = dict()
    with open(file) as fp:
        line = fp.readline()
        while line:
            lineL = [ int(node) for node in line.strip().split('\t')]
            if graphEdges.get(lineL[0]):
                graphEdges[lineL[0]].append(lineL[1])
            else:
                graphEdges[lineL[0]] = [lineL[1]]
            line = fp.readline()
    return graphEdges

graphEdges = getGraphData('web-Stanford.txt')


In [15]:
numberOfNodes = sorted(graphEdges.keys(), reverse=True)[:1][0]
print("Number of nodes using is", numberOfNodes)

Number of nodes using is 281903


In [16]:
def createMatrixFromGraph(graph, size):
    matrix = np.zeros((size, size))
    for node in graph.keys():
        nodeKey = node
        nodeValues = graph.get(node)
        nodesCount = len(nodeValues)
        eachNodeVal = 1 / nodesCount
        # for handling duplicates
        valDict = dict()
        for val in nodeValues:
            if valDict.get(val):
                valDict[val] = valDict.get(val) + 1
            else:
                valDict[val] = 1

        for nodeVal in list(set(nodeValues)):
            # To match node and matrix indices: node = matrix indices + 1
            count = valDict.get(nodeVal)
            matrix[nodeVal-1][node-1] = count * float(eachNodeVal)
    return matrix

# matrix = createMatrixFromGraph(graphEdges, numberOfNodes)

In [17]:
def createMatrixFromGraphWithCount(graph, size):
    matrix = np.zeros((size, size))
    for node in graph.keys():
        nodeKey = node
        nodeValues = graph.get(node)
        nodesCount = len(nodeValues)
        # for handling duplicates
        valDict = dict()
        for val in nodeValues:
            if valDict.get(val):
                valDict[val] = valDict.get(val) + 1
            else:
                valDict[val] = 1

        for nodeVal in list(set(nodeValues)):
            # To match node and matrix indices: node = matrix indices + 1
            count = valDict.get(nodeVal)
            matrix[nodeVal-1][node-1] = count
    return matrix

# matrixWithCount = createMatrixFromGraph(graphEdges, numberOfNodes)

In [18]:
def createMatrixFromGraphSparse(graph, size):
    row = []
    col = []
    data = []
    for node in graph.keys():
        nodeKey = node
        nodeValues = graph.get(node)
        nodesCount = len(nodeValues)
        eachNodeVal = 1 / nodesCount
        # for handling duplicates
        valDict = dict()
        for val in nodeValues:
            if valDict.get(val):
                valDict[val] = valDict.get(val) + 1
            else:
                valDict[val] = 1
        for nodeVal in list(set(nodeValues)):
            # To match node and matrix indices: node = matrix indices + 1
            count = valDict.get(nodeVal)
            row.append(nodeVal-1)
            col.append(node-1)
            data.append(count * float(eachNodeVal))
    matrix = sparse.csr_matrix((data, (row, col)), shape=(size, size))
    return matrix

sparseMatrix = createMatrixFromGraphSparse(graphEdges, numberOfNodes)


In [19]:
numberOfNonZeroEdges = sparse.csr_matrix.count_nonzero(sparseMatrix)
print('Number of non zero points is ',numberOfNonZeroEdges)

Number of non zero points is  2312497


In [20]:
# Given, 1 is one vector with nx1 entries of value 1
one = np.ones((numberOfNodes,1))

# initialising the r0
r = 1/ float(numberOfNodes) * one

In [21]:
count = 0
while True:
    count += 1
    rnew = ((1-beta)/numberOfNodes) * one + beta * sparseMatrix.dot(r)
    l1Norm = np.linalg.norm((rnew - r), ord=1)
    if l1Norm < tolerance:
        break
    r = rnew
print('Number of iterations it took is ', count)

Number of iterations it took is  87


In [30]:
def getValuesTuple(indices, r):
    return [(index+1, r[index][0]) for index in indices]
        

In [32]:
# Convert the nx1 pageRank vector to a list

# Top k page ranks with values
topKPagesId = r.T[0].argsort()[-topK:][::-1]
topKPages = getValuesTuple(topKPagesId,r)
print('************Top 5 node ids with scores************')
for pr in enumerate(topKPages):
    (index, (pRVal, nodeId)) = pr
    print('Page rank ', index+1, 'for page id ',nodeId+1,'with value is - ', pRVal)

print('\n\n')

# Bottom 5 page ranks with values
bottomKPagesId = r.T[0].argsort()[:bottomK]
bottomKPages = getValuesTuple(bottomKPagesId,r)
print('************Bottom 5 node ids with scores************')
for pr in enumerate(bottomKPages):
    (index, (pRVal, nodeId)) = pr
    print('Page rank ', index+1, 'for page id ',nodeId+1,'with value is - ', pRVal)


************Top 5 node ids with scores************
Page rank  1 for page id  1.0104746290075999 with value is -  89073
Page rank  2 for page id  1.0096102377782246 with value is -  226411
Page rank  3 for page id  1.0083796591805754 with value is -  241454
Page rank  4 for page id  1.003257857259282 with value is -  134832
Page rank  5 for page id  1.0027316738913734 with value is -  69358
Page rank  6 for page id  1.0027045445730853 with value is -  67756
Page rank  7 for page id  1.0026613086123561 with value is -  234704
Page rank  8 for page id  1.0025254553882856 with value is -  225872
Page rank  9 for page id  1.0024971047487976 with value is -  186750
Page rank  10 for page id  1.002486836912227 with value is -  262860



************Bottom 5 node ids with scores************
Page rank  1 for page id  1.0000007094638936 with value is -  1
Page rank  2 for page id  1.0000007094638936 with value is -  21386
Page rank  3 for page id  1.0000007094638936 with value is -  87191
Page r