# PageRank

## 读取数据

In [1]:
import numpy as np

# Load the edge list
def read_data(path='Data.txt'):
    """Read a directed edge list from a whitespace-separated text file.

    Every non-blank line must start with two integer node ids,
    ``from_node to_node``; any further columns on the line are ignored.

    Args:
        path: Location of the edge-list file. Defaults to ``'Data.txt'`` so
            existing zero-argument calls keep working.

    Returns:
        A tuple ``(graph, node_num, node_set)`` where ``graph`` is the list of
        ``(from_node, to_node)`` tuples in file order, ``node_set`` is the set
        of all node ids seen, and ``node_num`` is ``len(node_set)``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If one of the first two columns is not an integer.
        IndexError: If a non-blank line has fewer than two columns.
    """
    graph = []
    node_set = set()
    # The context manager closes the file even if parsing raises.
    with open(path, 'r') as file:
        for line in file:
            data = line.split()
            if not data:  # tolerate blank lines, e.g. a trailing newline
                continue
            edge = (int(data[0]), int(data[1]))  # an edge is stored as a 2-tuple
            node_set.update(edge)
            graph.append(edge)

    node_num = len(node_set)  # number of distinct nodes
    return graph, node_num, node_set

In [21]:
G, node_num, node_set = read_data()
print('点的个数:',node_num)
# Verify that the node ids are exactly 1..node_num.
# The `else` branch of the loop runs only when no `break` happened, so the
# "contiguous" message is no longer printed after a gap has been reported.
for i in range(1,node_num+1):
    if i not in node_set:
        print('点的序号不连续')
        break
else:
    print('点的序号连续')

点的个数: 8297
点的序号连续


## 测试：用 NetworkX 计算参考结果

In [45]:
import networkx as nx

# Build a directed NetworkX graph holding the same edges as G, then report
# how many distinct nodes it ended up with.
testG = nx.DiGraph()
testG.add_edges_from((edge[0], edge[1]) for edge in G)
print(testG.number_of_nodes())


8297


In [52]:
# Reference PageRank computed by NetworkX with damping 0.85.
# NOTE(review): tol is 1e-6 divided by the node count, presumably because
# NetworkX scales its tolerance by the number of nodes, which would make it
# comparable to the 1e-6 total-error threshold used by the hand-written
# versions in this notebook. Confirm against the NetworkX documentation.
pr = nx.pagerank(testG, alpha=0.85, tol=1e-6/len(testG))
# Sort the (node, score) pairs by score, highest first
sorted_pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)
# Print the 100 highest-ranked nodes
for node, pr_value in sorted_pr[:100]:
    print(f"Node: {node}, PageRank: {pr_value}")
# Sum the PageRank of all nodes
pr_sum = sum(pr.values())
print(f"所有节点pagerank值之和: {pr_sum}")


Node: 2730, PageRank: 0.0008718545630996286
Node: 7102, PageRank: 0.0008545292545357016
Node: 1010, PageRank: 0.0008496113429084786
Node: 368, PageRank: 0.0008358983067864309
Node: 1907, PageRank: 0.000830589955903218
Node: 7453, PageRank: 0.0008206419715353892
Node: 4583, PageRank: 0.0008178783257292128
Node: 7420, PageRank: 0.0008103312135438388
Node: 1847, PageRank: 0.000809994456449296
Node: 5369, PageRank: 0.000805995061333756
Node: 3164, PageRank: 0.0008050887986405982
Node: 7446, PageRank: 0.0008031585963961542
Node: 3947, PageRank: 0.0008022195095139477
Node: 2794, PageRank: 0.0007923189800154398
Node: 3215, PageRank: 0.00078216836242779
Node: 5346, PageRank: 0.000781197483794825
Node: 7223, PageRank: 0.0007773650726513161
Node: 630, PageRank: 0.0007743753319023608
Node: 4417, PageRank: 0.0007688170169220024
Node: 4955, PageRank: 0.0007607858766730214
Node: 3208, PageRank: 0.0007590253502760098
Node: 2902, PageRank: 0.0007575383869467485
Node: 5671, PageRank: 0.0007558488555572

## 基础版本

In [56]:
# Build the dense column-stochastic transition matrix
def get_stochastic_matrix(G, node_num):
    """Return the dense column-stochastic transition matrix of a graph.

    Args:
        G: Iterable of ``(from_node, to_node)`` edges with 1-based node ids.
        node_num: Number of nodes; ids are expected to lie in ``1..node_num``.

    Returns:
        A ``(node_num, node_num)`` float array ``M`` where column ``j``
        describes node ``j+1``: ``M[i, j] = 1/d`` when there is an edge from
        node ``j+1`` to node ``i+1`` (``d`` being the number of distinct
        successors of node ``j+1``), and every entry of a dead-end column is
        ``1/node_num``. Repeated edges count once.
    """
    matrix = np.zeros((node_num, node_num))
    if node_num == 0:
        return matrix
    # Adjacency: column = source, row = destination.
    for edge in G:
        matrix[edge[1]-1, edge[0]-1] = 1
    # Column sums are the out-degrees (d).
    out_degree = matrix.sum(axis=0)
    dead_end = out_degree == 0
    # A dead end is turned into a uniform random jump.
    matrix[:, dead_end] = 1 / node_num
    # Divide dead-end columns by 1 so the uniform values above are kept.
    out_degree[dead_end] = 1
    # Broadcasting divides column j by out_degree[j], in place.
    matrix /= out_degree
    return matrix

# Power iteration on the dense matrix
def power_interation(matrix, beta, node_num, tol=1e-6, max_iter=None):
    """Run PageRank power iteration with a dense transition matrix.

    Each step computes ``beta * matrix @ scores + (1 - beta) / node_num``
    starting from the uniform vector, and stops once the L1 distance between
    two consecutive vectors is at most ``tol``.

    Args:
        matrix: ``(node_num, node_num)`` column-stochastic matrix.
        beta: Probability of following a link rather than jumping randomly.
        node_num: Number of nodes.
        tol: L1 convergence threshold. Defaults to the original ``1e-6``.
        max_iter: Optional cap on the number of iterations; ``None`` (the
            default) means iterate until convergence, as before.

    Returns:
        ``(scores, interation_num)``: the final rank vector and the number of
        iterations performed.
    """
    # Start from the uniform distribution 1/node_num.
    scores = np.ones(node_num) / node_num
    if node_num == 0:
        return scores, 0
    teleport = (1 - beta) / node_num  # random-jump share, same for every node
    interation_num = 0
    e = np.inf  # L1 difference between two consecutive iterations
    while e > tol and (max_iter is None or interation_num < max_iter):
        new_scores = beta * np.dot(matrix, scores) + teleport
        e = np.abs(new_scores - scores).sum()
        # new_scores is a fresh array every iteration, so no copy is needed.
        scores = new_scores
        interation_num += 1
    return scores, interation_num

In [57]:
# Run the dense-matrix version end to end and show the resulting rank vector.
beta = 0.85  # probability of following a link
matrix = get_stochastic_matrix(G, node_num)
scores, interation_num = power_interation(matrix, beta, node_num)
print('PageRank:', scores)
print('迭代次数:', interation_num)

PageRank: [5.39597744e-04 9.59161392e-05 1.12610782e-04 ... 9.50066677e-05
 8.35763251e-05 7.36008357e-05]
迭代次数: 53


In [58]:
# Print the total of all scores, then rank the nodes.
print(sum(scores))
# Keep only the 100 largest scores
sorted_indices = np.argsort(scores)[::-1][:100]
sorted_scores = scores[sorted_indices]
print('Top 100:', sorted_scores)
print('Top 100的点:', sorted_indices+1)  # node ids start at 1

0.9999999999999979
Top 100: [0.00087185 0.00085453 0.00084961 0.0008359  0.00083059 0.00082064
 0.00081788 0.00081033 0.00080999 0.000806   0.00080509 0.00080316
 0.00080222 0.00079232 0.00078217 0.0007812  0.00077737 0.00077438
 0.00076882 0.00076079 0.00075903 0.00075754 0.00075585 0.00075166
 0.00074796 0.0007474  0.00074577 0.00074494 0.00074458 0.00074116
 0.0007401  0.00073804 0.00073637 0.00073385 0.00073313 0.00073267
 0.00073254 0.00073134 0.00073048 0.00072626 0.00072583 0.00072582
 0.00072123 0.00071736 0.00071543 0.00071399 0.00071396 0.00071301
 0.00071236 0.00071202 0.00071025 0.00070964 0.00070934 0.00070916
 0.00070835 0.00070823 0.00070495 0.00070113 0.00069982 0.00069907
 0.00069859 0.00069789 0.00069758 0.0006968  0.00069651 0.00069614
 0.00069586 0.00069517 0.00069411 0.00069328 0.00069273 0.00069209
 0.00069156 0.00069146 0.00069116 0.00069044 0.00068669 0.00068636
 0.0006859  0.00068532 0.00068364 0.00068344 0.00068323 0.00068301
 0.00068281 0.00068277 0.00068233 

In [73]:
def write_result(sorted_indices, sorted_scores, filename):
    """Write one ``<node id> <score>`` line per ranked entry to ``filename``.

    Args:
        sorted_indices: 0-based node indices in ranking order; each is written
            as ``index + 1`` so the file holds 1-based node ids.
        sorted_scores: Scores aligned position by position with
            ``sorted_indices``.
        filename: Path of the text file to create or overwrite.
    """
    with open(filename, 'w') as file:
        file.writelines(
            str(index + 1) + ' ' + str(sorted_scores[position]) + '\n'
            for position, index in enumerate(sorted_indices)
        )

In [None]:
# Save the ranking currently held in sorted_indices / sorted_scores.
write_result(sorted_indices, sorted_scores, 'result.txt')

## 稀疏矩阵优化

In [60]:
# Build the adjacency-list ("sparse") representation
def get_sparse_matrix(G, node_num):
    """Return the out-neighbour lists of a graph.

    Args:
        G: Iterable of ``(from_node, to_node)`` edges with 1-based node ids.
        node_num: Number of nodes.

    Returns:
        A list of ``node_num`` lists; entry ``i`` holds the 0-based targets of
        every edge leaving node ``i+1``, in edge order (repeated edges are
        kept).
    """
    successors = []
    for _ in range(node_num):
        successors.append([])
    for edge in G:
        source, target = edge[0] - 1, edge[1] - 1  # shift to 0-based indices
        successors[source].append(target)
    return successors

def power_interation_sparse(sparse_matrix, beta, node_num, tol=1e-6, max_iter=None):
    """Run PageRank power iteration over an adjacency list.

    Args:
        sparse_matrix: List where entry ``i`` holds the 0-based targets of the
            edges leaving node ``i``; an empty entry marks a dead end.
        beta: Probability of following a link rather than jumping randomly.
        node_num: Number of nodes.
        tol: L1 convergence threshold. Defaults to the original ``1e-6``.
        max_iter: Optional cap on the number of iterations; ``None`` (the
            default) means iterate until convergence, as before.

    Returns:
        ``(scores, interation_num)``: the final rank vector and the number of
        iterations performed.
    """
    # Start from the uniform distribution 1/node_num.
    scores = np.ones(node_num) / node_num
    if node_num == 0:
        return scores, 0
    teleport = (1 - beta) / node_num
    interation_num = 0
    e = np.inf  # L1 difference between two consecutive iterations
    while e > tol and (max_iter is None or interation_num < max_iter):
        new_scores = np.full(node_num, teleport)
        # A dead end spreads its score evenly over all nodes. Accumulate that
        # mass as a scalar and add it once per sweep, instead of one full
        # vector addition per dead-end node.
        dead_end_mass = 0.0
        for i in range(node_num):
            targets = sparse_matrix[i]
            if len(targets) == 0:
                dead_end_mass += scores[i]
                continue
            share = beta * scores[i] / len(targets)  # same for every i->j
            for j in targets:
                new_scores[j] += share
        new_scores += beta * dead_end_mass / node_num
        e = np.abs(new_scores - scores).sum()
        # new_scores is a fresh array every iteration, so no copy is needed.
        scores = new_scores
        interation_num += 1
    return scores, interation_num

def power_interation_sparse_book(sparse_matrix, beta, node_num, tol=1e-6, max_iter=None):
    """Run PageRank power iteration, re-inserting the leaked rank each sweep.

    Only the link-following part ``beta * scores[i] / out_degree(i)`` is
    pushed along the edges. Whatever is missing from a total of 1 afterwards
    (the random-jump share plus the score held by dead ends) is then spread
    evenly over all nodes.

    Args:
        sparse_matrix: List where entry ``i`` holds the 0-based targets of the
            edges leaving node ``i``.
        beta: Probability of following a link rather than jumping randomly.
        node_num: Number of nodes.
        tol: L1 convergence threshold. Defaults to the original ``1e-6``.
        max_iter: Optional cap on the number of iterations; ``None`` (the
            default) means iterate until convergence, as before.

    Returns:
        ``(scores, interation_num)``: the final rank vector and the number of
        iterations performed.
    """
    # Start from the uniform distribution 1/node_num.
    scores = np.ones(node_num) / node_num
    if node_num == 0:
        return scores, 0
    interation_num = 0
    e = np.inf  # L1 difference between two consecutive iterations
    while e > tol and (max_iter is None or interation_num < max_iter):
        new_scores = np.zeros(node_num)
        for i in range(node_num):
            targets = sparse_matrix[i]
            if len(targets) == 0:
                continue
            share = beta * scores[i] / len(targets)  # same for every i->j
            for j in targets:
                new_scores[j] += share
        # Re-insert the leaked PageRank.
        new_scores += (1 - new_scores.sum()) / node_num
        e = np.abs(new_scores - scores).sum()
        # new_scores is a fresh array every iteration, so no copy is needed.
        scores = new_scores
        interation_num += 1
    return scores, interation_num

In [64]:
beta = 0.85
sparse_matrix = get_sparse_matrix(G, node_num)
# Reuse the adjacency list built above instead of rebuilding it for the call.
scores , interation_num = power_interation_sparse(sparse_matrix, beta, node_num)
# Alternative: the variant that re-inserts the leaked rank after each sweep.
#scores , interation_num = power_interation_sparse_book(sparse_matrix, beta, node_num)

print('迭代次数:', interation_num)

迭代次数: 53


In [65]:
# Print the total of all scores, then rank the nodes.
print(sum(scores))
# Keep only the 100 largest scores
sorted_indices = np.argsort(scores)[::-1][:100]
sorted_scores = scores[sorted_indices]
print('Top 100:', sorted_scores)
print('Top 100的点:', sorted_indices+1)  # node ids start at 1

0.9999999999999997
Top 100: [0.00087185 0.00085453 0.00084961 0.0008359  0.00083059 0.00082064
 0.00081788 0.00081033 0.00080999 0.000806   0.00080509 0.00080316
 0.00080222 0.00079232 0.00078217 0.0007812  0.00077737 0.00077438
 0.00076882 0.00076079 0.00075903 0.00075754 0.00075585 0.00075166
 0.00074796 0.0007474  0.00074577 0.00074494 0.00074458 0.00074116
 0.0007401  0.00073804 0.00073637 0.00073385 0.00073313 0.00073267
 0.00073254 0.00073134 0.00073048 0.00072626 0.00072583 0.00072582
 0.00072123 0.00071736 0.00071543 0.00071399 0.00071396 0.00071301
 0.00071236 0.00071202 0.00071025 0.00070964 0.00070934 0.00070916
 0.00070835 0.00070823 0.00070495 0.00070113 0.00069982 0.00069907
 0.00069859 0.00069789 0.00069758 0.0006968  0.00069651 0.00069614
 0.00069586 0.00069517 0.00069411 0.00069328 0.00069273 0.00069209
 0.00069156 0.00069146 0.00069116 0.00069044 0.00068669 0.00068636
 0.0006859  0.00068532 0.00068364 0.00068344 0.00068323 0.00068301
 0.00068281 0.00068277 0.00068233 

In [66]:
# Save the ranking currently held in sorted_indices / sorted_scores.
write_result(sorted_indices, sorted_scores, 'result_sparse.txt')

## 分块优化

In [69]:
# Module-level state read and rebound by power_interation_block via `global`.
scores = np.ones((node_num))/node_num  # rank vector of the previous sweep (stored on disk)
new_scores = np.zeros((node_num))  # rank vector being built (kept in RAM block by block)

def power_interation_block(sparse_matrix, beta, node_num, block_size):
    """Block-based PageRank power iteration over an adjacency list.

    The new rank vector is produced one block of ``block_size`` destination
    nodes at a time; for each block the whole adjacency list and the previous
    rank vector are scanned once. Iteration stops when the L1 distance between
    two consecutive rank vectors is at most ``1e-6``.

    The function works on the module-level arrays ``scores`` (read as the
    starting vector, rebound to the result) and ``new_scores`` (overwritten as
    scratch space). It returns ``None``.

    Args:
        sparse_matrix: List where entry ``j`` holds the 0-based targets of the
            edges leaving node ``j``; an empty entry marks a dead end.
        beta: Probability of following a link rather than jumping randomly.
        node_num: Number of nodes.
        block_size: Number of destination nodes per block; the last block may
            be shorter.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    global scores, new_scores
    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')
    teleport = (1 - beta) / node_num
    # The graph does not change, so the dead ends are located only once.
    dead_ends = [j for j in range(node_num) if len(sparse_matrix[j]) == 0]
    e = 1  # L1 difference between two consecutive iterations
    while e > 1e-6:
        e = 0
        # Every node receives the same share from the dead ends, so compute it
        # once per sweep instead of once per dead end and per block.
        dead_end_mass = float(sum(scores[j] for j in dead_ends))
        base = teleport + beta * dead_end_mass / node_num
        # A single loop covers the full blocks and the shorter final block.
        for start in range(0, node_num, block_size):
            stop = min(start + block_size, node_num)
            new_scores[start:stop] = base
            # Scan the adjacency list and the old ranks once for this block.
            for j in range(node_num):
                targets = sparse_matrix[j]
                if len(targets) == 0:
                    continue
                share = beta * scores[j] / len(targets)
                for m in targets:
                    if start <= m < stop:
                        new_scores[m] += share
            e += np.abs(new_scores[start:stop] - scores[start:stop]).sum()
        # new_scores is reused as scratch space, so scores needs its own copy.
        scores = np.copy(new_scores)

In [70]:
# Run the block-based version. It returns nothing; the result is left in the
# module-level `scores` array.
beta = 0.85
block_size = 2000
sparse_matrix = get_sparse_matrix(G, node_num)
power_interation_block(sparse_matrix, beta, node_num, block_size)

In [71]:
# Print the total of all scores, then rank the nodes.
print(sum(scores))
# Keep only the 100 largest scores
sorted_indices = np.argsort(scores)[::-1][:100]
sorted_scores = scores[sorted_indices]
print('Top 100:', sorted_scores)
print('Top 100的点:', sorted_indices+1)  # node ids start at 1

0.9999999999999997
Top 100: [0.00087185 0.00085453 0.00084961 0.0008359  0.00083059 0.00082064
 0.00081788 0.00081033 0.00080999 0.000806   0.00080509 0.00080316
 0.00080222 0.00079232 0.00078217 0.0007812  0.00077737 0.00077438
 0.00076882 0.00076079 0.00075903 0.00075754 0.00075585 0.00075166
 0.00074796 0.0007474  0.00074577 0.00074494 0.00074458 0.00074116
 0.0007401  0.00073804 0.00073637 0.00073385 0.00073313 0.00073267
 0.00073254 0.00073134 0.00073048 0.00072626 0.00072583 0.00072582
 0.00072123 0.00071736 0.00071543 0.00071399 0.00071396 0.00071301
 0.00071236 0.00071202 0.00071025 0.00070964 0.00070934 0.00070916
 0.00070835 0.00070823 0.00070495 0.00070113 0.00069982 0.00069907
 0.00069859 0.00069789 0.00069758 0.0006968  0.00069651 0.00069614
 0.00069586 0.00069517 0.00069411 0.00069328 0.00069273 0.00069209
 0.00069156 0.00069146 0.00069116 0.00069044 0.00068669 0.00068636
 0.0006859  0.00068532 0.00068364 0.00068344 0.00068323 0.00068301
 0.00068281 0.00068277 0.00068233 

In [74]:
# Save the ranking currently held in sorted_indices / sorted_scores.
write_result(sorted_indices, sorted_scores, 'result_block.txt')

## Block-Stripe优化

In [95]:
def get_stripes(G, node_num, block_size):
    """Split the edges of a graph into stripes keyed by destination block.

    Args:
        G: Iterable of ``(from_node, to_node)`` edges with 1-based node ids.
        node_num: Number of nodes.
        block_size: Number of destination nodes per block.

    Returns:
        ``(stripes, length)``. ``stripes`` has one dict per block (a shorter
        final block is included); dict ``b`` maps a 0-based source node to the
        list of its 0-based targets falling in block ``b``. ``length[i]`` is
        the total number of edges leaving node ``i``.
    """
    block_num, remain = divmod(node_num, block_size)
    if remain != 0:  # a partial final block needs its own stripe
        block_num += 1

    stripes = [{} for _ in range(block_num)]  # one dict per block
    length = [0] * node_num

    for edge in G:
        from_node = edge[0] - 1
        to_node = edge[1] - 1
        # The stripe is chosen by the block containing the destination.
        stripe = stripes[to_node // block_size]
        stripe.setdefault(from_node, []).append(to_node)
        length[from_node] += 1  # out-degree of the source node
    return stripes, length

def deal_dead_end(stripes, length, node_num, block_size):
    """Give every dead-end node an explicit edge to every node.

    A node whose ``length`` entry is 0 gets its out-degree set to ``node_num``
    and, in each stripe, an entry listing all destination indices of that
    block. Both ``stripes`` and ``length`` are modified in place and also
    returned.

    Args:
        stripes: Per-block dicts mapping source node to targets in that block.
        length: Out-degree of each node.
        node_num: Number of nodes.
        block_size: Number of destination nodes per block.

    Returns:
        The same ``(stripes, length)`` objects that were passed in.
    """
    block_num, remain = divmod(node_num, block_size)
    if remain != 0:
        block_num += 1
    # Destination index range of each block; the last one may be shorter.
    block_bounds = [
        (j * block_size, min((j + 1) * block_size, node_num))
        for j in range(block_num)
    ]
    for node in range(node_num):
        if length[node] != 0:
            continue
        # No outgoing edge: this node is a dead end.
        length[node] = node_num
        for j, (first, last) in enumerate(block_bounds):
            stripes[j][node] = [*range(first, last)]
    return stripes, length


In [96]:
# Smoke test of get_stripes / deal_dead_end on a 5-node toy graph.
# Note: this rebinds the notebook-level names block_size, stripes and length.
tempG = [(0,0), (0,1), (0,3), (0,4), (1,0), (1,4), (2,1), (2,3), (2,4)]
tempG = [(i+1,j+1) for i,j in tempG]  # shift to the 1-based ids the helpers expect
temp_node_num = 5
block_size = 2
stripes, length = get_stripes(tempG, temp_node_num, block_size)
print(stripes)
print(length)
stripes, length = deal_dead_end(stripes, length, temp_node_num, block_size)
print(stripes)
print(length)

[{0: [0, 1], 1: [0], 2: [1]}, {0: [3], 2: [3]}, {0: [4], 1: [4], 2: [4]}]
[4, 2, 3, 0, 0]
[{0: [0, 1], 1: [0], 2: [1], 3: [0, 1], 4: [0, 1]}, {0: [3], 2: [3], 3: [2, 3], 4: [2, 3]}, {0: [4], 1: [4], 2: [4], 3: [4], 4: [4]}]
[4, 2, 3, 5, 5]


In [99]:
# Module-level state read and rebound by power_interation_block_stripe via `global`.
scores = np.ones((node_num))/node_num  # rank vector of the previous sweep (stored on disk)
new_scores = np.zeros((node_num))  # rank vector being built (kept in RAM block by block)

def power_interation_block_stripe(stripes, length, node_num, block_size, beta):
    """Block-stripe PageRank power iteration.

    For each block of ``block_size`` destination nodes, only the matching
    stripe (the edges whose target lies in that block) is read. Iteration
    stops when the L1 distance between two consecutive rank vectors is at most
    ``1e-6``; one progress line is printed per iteration.

    The function works on the module-level arrays ``scores`` (read as the
    starting vector, rebound to the result) and ``new_scores`` (overwritten as
    scratch space). It returns ``None``.

    Every block is processed by the same loop, including the last one. The
    earlier version handled the last block separately and only when
    ``node_num % block_size != 0``, so with an exact multiple the final block
    was never updated.

    Args:
        stripes: One dict per block mapping a 0-based source node to its
            0-based targets inside that block.
        length: Out-degree of each node, used as the divisor of its score.
        node_num: Number of nodes.
        block_size: Number of destination nodes per block; the last block may
            be shorter.
        beta: Probability of following a link rather than jumping randomly.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    global scores, new_scores
    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')
    teleport = (1 - beta) / node_num

    e = 1  # L1 difference between two consecutive iterations
    interation_num = 0
    while e > 1e-6:
        e = 0
        # One pass per block; range() also yields the shorter final block.
        for block_index, start in enumerate(range(0, node_num, block_size)):
            stop = min(start + block_size, node_num)
            new_scores[start:stop] = teleport
            # Walk every source node present in this block's stripe.
            for from_node, to_nodes in stripes[block_index].items():
                share = beta * scores[from_node] / length[from_node]
                for to_node in to_nodes:
                    new_scores[to_node] += share
            e += np.abs(new_scores[start:stop] - scores[start:stop]).sum()
        # new_scores is reused as scratch space, so scores needs its own copy.
        scores = np.copy(new_scores)
        interation_num += 1
        print('interation_num:', interation_num, ' e:', e)

In [100]:
# Run the block-stripe version on the full graph. The iteration function
# returns nothing; the result is left in the module-level `scores` array.
block_size = 2000
beta = 0.85
stripes, length = get_stripes(G, node_num, block_size)
# Turn dead ends into nodes that link to every node before iterating.
stripes, length = deal_dead_end(stripes, length, node_num, block_size)
print('finish deal dead end')
power_interation_block_stripe(stripes, length, node_num, block_size, beta)

finish deal dead end
interation_num: 1  e: 0.16516579676568333
interation_num: 2  e: 0.07766412986971014
interation_num: 3  e: 0.06169222893625692
interation_num: 4  e: 0.04926045708557971
interation_num: 5  e: 0.0393357822055534
interation_num: 6  e: 0.03141064081512026
interation_num: 7  e: 0.02508227253017668
interation_num: 8  e: 0.020028895333592485
interation_num: 9  e: 0.015993633870237663
interation_num: 10  e: 0.012771364572275706
interation_num: 11  e: 0.010198292305589085
interation_num: 12  e: 0.008143622033207665
interation_num: 13  e: 0.0065029102749777894
interation_num: 14  e: 0.005192755983911705
interation_num: 15  e: 0.004146561088528543
interation_num: 16  e: 0.003311145163410164
interation_num: 17  e: 0.002644042149409765
interation_num: 18  e: 0.002111341708936572
interation_num: 19  e: 0.0016859654876882778
interation_num: 20  e: 0.0013462906613582624
interation_num: 21  e: 0.0010750507991398347
interation_num: 22  e: 0.0008584581724443042
interation_num: 23  e: 

In [101]:
# Print the total of all scores, then rank the nodes.
print(sum(scores))
# Keep only the 100 largest scores
sorted_indices = np.argsort(scores)[::-1][:100]
sorted_scores = scores[sorted_indices]
print('Top 100:', sorted_scores)
print('Top 100的点:', sorted_indices+1)  # node ids start at 1

0.9999999999999988
Top 100: [0.00087185 0.00085453 0.00084961 0.0008359  0.00083059 0.00082064
 0.00081788 0.00081033 0.00080999 0.000806   0.00080509 0.00080316
 0.00080222 0.00079232 0.00078217 0.0007812  0.00077737 0.00077438
 0.00076882 0.00076079 0.00075903 0.00075754 0.00075585 0.00075166
 0.00074796 0.0007474  0.00074577 0.00074494 0.00074458 0.00074116
 0.0007401  0.00073804 0.00073637 0.00073385 0.00073313 0.00073267
 0.00073254 0.00073134 0.00073048 0.00072626 0.00072583 0.00072582
 0.00072123 0.00071736 0.00071543 0.00071399 0.00071396 0.00071301
 0.00071236 0.00071202 0.00071025 0.00070964 0.00070934 0.00070916
 0.00070835 0.00070823 0.00070495 0.00070113 0.00069982 0.00069907
 0.00069859 0.00069789 0.00069758 0.0006968  0.00069651 0.00069614
 0.00069586 0.00069517 0.00069411 0.00069328 0.00069273 0.00069209
 0.00069156 0.00069146 0.00069116 0.00069044 0.00068669 0.00068636
 0.0006859  0.00068532 0.00068364 0.00068344 0.00068323 0.00068301
 0.00068281 0.00068277 0.00068233 

In [102]:
# Save the ranking currently held in sorted_indices / sorted_scores.
write_result(sorted_indices, sorted_scores, 'result_block_stripe.txt')

In [1]:
# Scratch check: `+ 1` on a NumPy array adds 1 to every element and leaves the
# original array untouched, whereas a plain Python list raises TypeError.
# The variable is deliberately not called `list`: rebinding that name would
# shadow the builtin for every cell run afterwards in the same kernel.
demo_indices = np.array([0, 1, 2, 3])
print(demo_indices + 1)
print(demo_indices)

TypeError: can only concatenate list (not "int") to list