## PageRank & HITS(Hyperlink-Induced Topic Search)

In [25]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# create the Spark Session
spark = SparkSession.builder.appName("spark").getOrCreate()

# create the Spark Context
sc = spark.sparkContext



In [None]:
temp1 = sc.parallelize([(1,2),(4,2)])
temp2 = sc.parallelize([(1,2)])

In [249]:
edges = sc.textFile('./graph-full.txt').map(lambda line: line.split())\
                                            .map(lambda pair: (int(pair[0]), int(pair[1])))
print(edges.take(1))
print(edges.count())
print()

edges = edges.distinct()
print(edges.take(1))
print(edges.count())
for i in edges.collect():
    if i[0] == 816:
        print(i)
print()

out_count = edges.map(lambda x: (x[0], 1))
out_count = out_count.reduceByKey(lambda v1, v2: v1 + v2)
print(out_count.take(3))
print(out_count.count())
for i in out_count.collect():
    if i[0] == 774:
        print(i)
print()

edges = edges.join(out_count)
print(edges.take(1))
print(edges.count())
for i in edges.collect():
    if i[0]==816:
        print(i)
print()


[(1, 2)]
8192

[(994, 918)]
8161
(816, 774)
(816, 208)
(816, 894)
(816, 762)
(816, 817)
(816, 999)
(816, 849)

[(994, 8), (816, 7), (322, 9)]
1000
(774, 7)

[(816, (774, 7))]
8161
(816, (774, 7))
(816, (208, 7))
(816, (894, 7))
(816, (762, 7))
(816, (817, 7))
(816, (999, 7))
(816, (849, 7))



In [250]:
import numpy as np
r = [(i, 1/1000) for i in range(1, 1000 + 1)]
print(len(r))
print(len(r[0]))
r = sc.parallelize(r)
print(r.take(1))
print(r.count())
m = edges.join(r)  # (dest, beta * source_weight / source_out_degree)
print(m.take(1))
print(m.count())


1000
2
[(1, 0.001)]
1000
[(864, ((394, 7), 0.001))]
8161


In [229]:
data = sc.textFile("graph-full.txt")
print(data.take(3))
print(data.count())
check = data.map(lambda line: (line.split("\t")[0], 1)).reduceByKey(lambda x, y: x+y)
num_node = check.count()
print("# node:", num_node)

['1\t2', '2\t3', '3\t4']
8192
# node: 1000


In [68]:
graph = data.map(lambda line: (int(line.split("\t")[0]), [int(line.split("\t")[1])]) )\
            .reduceByKey(lambda x, y: x+y)\
                .sortByKey()
print(graph.take(3))
print(graph.count())

[(1, [2, 586, 904, 502, 531, 689]), (2, [3, 505, 799, 781, 415, 713, 690, 433, 632, 440, 498]), (3, [4, 190, 545, 562, 796, 679, 981, 455, 619])]
1000


In [223]:
import numpy as np

# M = np.matrix([ np.repeat(0,num_node) for _ in range(num_node) ])
M = np.matrix(np.zeros((num_node, num_node), dtype = np.float))

for i,(_,edges) in enumerate(graph.collect()):
    n = len(edges)
    for edge in edges:
        M[edge-1,i] = 1/n
        print(edge-1,i)
        print(M[edge-1,i])
    break

# print(M)
print("="*100)
print(M[1,0])
print(M[585,0])
print(M[903,0])
print(M.shape)
print(M.max())

1 0
0.16666666666666666
585 0
0.16666666666666666
903 0
0.16666666666666666
501 0
0.16666666666666666
530 0
0.16666666666666666
688 0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
(1000, 1000)
0.16666666666666666


In [248]:
import numpy as np
r = np.repeat(1/num_node, num_node).reshape(1,-1).T
print(r.shape)
print(r[0])
beta = 0.8
iteration = 40

(1000, 1)
[0.001]


In [225]:
for _ in range(iteration):
    r = ((beta)*M@r) + ((1-beta)/num_node) 

In [226]:
r1 = np.array(r)

for _ in range(5):
    # print(r.min())
    print(r.argmin()+1,"",r1.min())
    r[r.argmin()] = 1.


1  0.00019999999999999996
3  0.00019999999999999996
4  0.00019999999999999996
5  0.00019999999999999996
6  0.00019999999999999996


In [227]:
for _ in range(5):
    # print(r.max())
    print(r1.argmax()+1,"",r1.max())
    r1[r1.argmax()] = 0.



2  0.00022666666666666663
502  0.00022666666666666663
531  0.00022666666666666663
586  0.00022666666666666663
689  0.00022666666666666663


In [22]:
x = np.array([[1,2],[3,4]])
y = np.array([1,1])
print(x@y)


1024
