In [1]:
import networkx as nx
import numpy as np

In [6]:
def graph(fileIn = None):
    """
    Read a directed graph from a whitespace-separated edge-list file.

    Each data line holds two integers, "from to". Blank lines and lines
    starting with '#' are ignored.

    input:
        fileIn: path of the edge-list file
    output:
        G: dict mapping every node id seen in the file (as a source or as a
           target) to the list of node ids it links to; a node without
           out-links maps to an empty list.
           None is returned (after printing a message) when the file cannot
           be opened or read.
    """
    try:
        with open(fileIn, 'r') as edge_file:
            datas = edge_file.readlines()
    except (OSError, TypeError, ValueError):
        # OSError: missing/unreadable file; TypeError: fileIn left as None;
        # ValueError: invalid path or undecodable content.
        print('file not exists!!')
        return None

    G = {}
    for line in datas:
        stripped = line.strip()
        # Skip blank lines and comment/header lines instead of crashing on them
        if not stripped or stripped.startswith('#'):
            continue
        pair = stripped.split()
        src, dst = int(pair[0]), int(pair[1])
        # Register the source first, then the target, so that pure sink
        # nodes are also present as keys (with an empty adjacency list)
        G.setdefault(src, []).append(dst)
        G.setdefault(dst, [])
    return G

In [7]:
# Read the graph data from file, then show how many nodes it contains
G = graph(fileIn='web-NotreDame.txt')
len(G.keys())

325729

In [13]:
def pageRank(G, beta = 0.85, iter = 100, teleport_list = None, eps = 1e-8):
    """
    Compute PageRank scores by power iteration.

    input:
        G: dict mapping each node id to the list of node ids it links to.
           Node ids are used directly as array indices, so they must be the
           integers 0..N-1 and every link target must itself be a key of G.
        beta: probability of following an out-link; with probability
              1 - beta the surfer teleports instead.
        iter: maximum number of power iterations.
        teleport_list: iterable of node ids that teleports land on
              (personalized PageRank). None or empty means every node.
        eps: per-node tolerance; iteration stops once the L1 change between
             two successive rank vectors is below eps * N.
    output:
        numpy array of length N whose i-th entry is the score of node i.
        The scores sum to 1 (up to floating-point rounding).
    """
    N = len(G)
    if N == 0:
        return np.array([])

    # An absent or empty teleport set means "teleport uniformly to all nodes"
    teleport_nodes = list(teleport_list) if teleport_list is not None else []
    if not teleport_nodes:
        teleport_nodes = list(G.keys())

    # Boolean mask so duplicated ids in teleport_list are only counted once
    teleport_mask = np.zeros(N, dtype=bool)
    teleport_mask[teleport_nodes] = True
    n_teleport = int(np.sum(teleport_mask))

    # Teleport mass (1 - beta) is shared among the teleport nodes only, so a
    # restricted teleport set really concentrates the random jumps on it
    const_rank_list = np.zeros(N)
    const_rank_list[teleport_mask] = (1.0 - beta) / n_teleport

    next_rank_list = np.full(N, 1 / N)

    # loop for algorithm
    for i in range(iter):
        curr_rank_list = next_rank_list
        next_rank_list = const_rank_list.copy()

        # Every node with out-links spreads beta * its rank evenly over them
        for node, out_edges in G.items():
            if out_edges:
                contribution = beta * curr_rank_list[node] / len(out_edges)
                for edge in out_edges:
                    next_rank_list[edge] += contribution

        # Rank that leaked out through dead ends is re-injected through the
        # teleport set, which keeps the vector summing to 1
        leakage = 1.0 - np.sum(next_rank_list)
        next_rank_list[teleport_mask] += leakage / n_teleport

        total_diff = np.sum(np.abs(curr_rank_list - next_rank_list))
        if total_diff < eps * N:
            print("iters = ", i)
            break

    return next_rank_list

In [14]:
#G = graph()
%time rank_list = pageRank(G, eps=1e-06)
print(np.sum(rank_list))
print(rank_list[0])

iters =  1
Wall time: 2.81 s
1.0000000000000002
0.005092846884345504


In [33]:
# Display the rank vector computed by pageRank (NumPy abbreviates long arrays)
rank_list

array([5.09284688e-03, 3.89555402e-04, 2.13606770e-04, ...,
       3.80891313e-06, 3.84082421e-06, 2.75926806e-06])

### Using NetworkX

In [10]:
# Rebuild the same adjacency dict as a NetworkX directed graph, edge by edge
diG = nx.DiGraph()
for source, targets in G.items():
    for target in targets:
        diG.add_edge(source, target)

In [11]:
# Node count of the NetworkX graph, to compare with len(G.keys()) shown earlier
diG.number_of_nodes()

325729

In [20]:
# Time NetworkX's pagerank on the same graph (called with its default parameters)
%time pagerank = nx.pagerank(diG)

Wall time: 20.5 s


In [21]:
# NetworkX score of node 0, to compare with rank_list[0] printed earlier
pagerank[0]

0.0050928468843440345

In [24]:
# Collect the NetworkX scores ordered by node id, so that position i holds the
# score of node i -- the same layout as rank_list, which is indexed by node id.
# Plain dict iteration order follows the order nodes were first added to diG,
# which is only guaranteed to match when the edge list introduces nodes in
# ascending id order.
pageRank_list = [pagerank[node] for node in sorted(pagerank)]

In [25]:
# Convert the list of NetworkX scores to a NumPy array for element-wise comparison
pageRank_list = np.array(pageRank_list)

In [26]:
# Display the NetworkX scores (NumPy abbreviates long arrays)
pageRank_list

array([5.09284688e-03, 3.89555402e-04, 2.13606770e-04, ...,
       3.80891313e-06, 3.84082421e-06, 2.75926806e-06])

### Root mean squared error

In [27]:
# Root mean squared difference between the NetworkX scores and our scores
np.sqrt(np.power(np.array(pageRank_list) - rank_list, 2).mean())

8.182332187228208e-18

### Mean absolute error

In [28]:
# Mean absolute difference between the NetworkX scores and our scores
np.abs(np.array(pageRank_list) - rank_list).mean()

9.444851794931788e-19