In [1]:
import numpy as np
from pyspark import SparkConf, SparkContext

In [2]:
# Reuse the already-running SparkContext when this cell is executed again in
# the same kernel; constructing a second context directly raises an error
# because only one SparkContext may be active per process.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)



In [6]:
def page_rank(n_node=1000, beta=0.8, n_iter=40, path='./graph-full.txt'):
    """Run iterative PageRank over an edge-list file and print the extremes.

    Each non-blank line of ``path`` is split on whitespace; the first two
    fields are read as integer ``source`` and ``dest`` node ids. Duplicate
    edges are counted once. Node ids are assumed to be ``1..n_node`` (the
    initial rank vector is built over exactly that range).

    Update rule applied on every iteration::

        r[dest] = sum(beta * r[source] / out_degree[source]) + (1 - beta) / n_node

    Nodes that receive no incoming rank still get the teleport term
    ``(1 - beta) / n_node`` instead of vanishing from the rank vector.
    Rank held by nodes without out-links is not redistributed.

    Args:
        n_node: Number of nodes in the graph.
        beta: Probability of following an out-link (1 - teleport probability).
        n_iter: Number of update iterations to run.
        path: Location of the edge-list text file.

    Returns:
        None. Prints the five highest-ranked ``(node, rank)`` pairs, then the
        five lowest-ranked ones.
    """
    teleport = (1 - beta) / n_node

    pairs = (
        sc.textFile(path)
        .filter(lambda line: line.strip())  # a blank line would break pair[0] / pair[1]
        .map(lambda line: line.split())
        .map(lambda pair: (int(pair[0]), int(pair[1])))
        .distinct()
    )
    out_count = pairs.map(lambda x: (x[0], 1)).reduceByKey(lambda v1, v2: v1 + v2)
    # (source, (dest, source_out_degree)); loop-invariant, so keep it in memory
    # rather than re-reading and re-joining the file on every iteration.
    edges = pairs.join(out_count).cache()

    r = [(node, 1 / n_node) for node in range(1, n_node + 1)]

    for _ in range(n_iter):
        # (source, ((dest, source_out_degree), source_rank))
        m = edges.join(sc.parallelize(r))
        # (dest, beta * source_rank / source_out_degree)
        m = m.map(lambda x: (x[1][0][0], beta * x[1][1] / x[1][0][1]))
        incoming = m.reduceByKey(lambda v1, v2: v1 + v2).collectAsMap()
        # Rebuild the vector over every node so that nodes without in-links
        # keep their teleport share and keep feeding their out-links.
        r = [(node, incoming.get(node, 0.0) + teleport)
             for node in range(1, n_node + 1)]

    edges.unpersist()

    ranks = sc.parallelize(r)
    print(ranks.top(5, lambda x: x[1]))
    print(ranks.top(5, lambda x: -x[1]))

In [7]:
def hits(n_node=1000, lmb=1, mu=1, n_iter=40, path='./graph-full.txt'):
    """Run iterative HITS over an edge-list file and print the extremes.

    Each non-blank line of ``path`` is split on whitespace; the first two
    fields are read as integer ``source`` and ``dest`` node ids. Duplicate
    edges are counted once. Hub scores start at 1 for node ids ``1..n_node``.

    Every iteration performs two steps:

    1. authority[dest] = mu * sum(hub[source]) over edges source -> dest,
       then divided by the largest authority value.
    2. hub[source] = lmb * sum(authority[dest]) over edges source -> dest,
       then divided by the largest hub value.

    Nodes that receive no contribution in a step are absent from that score
    list (they are not reported with a score of 0).

    Args:
        n_node: Number of nodes used to seed the initial hub vector. The
            default matches the node count ``page_rank`` uses for the same
            graph file, so that every node starts with a hub score.
        lmb: Scale factor applied when computing hub scores.
        mu: Scale factor applied when computing authority scores.
        n_iter: Number of iterations to run; must be at least 1.
        path: Location of the edge-list text file.

    Returns:
        None. Prints, in order: top-5 hubs, top-5 authorities, bottom-5 hubs,
        bottom-5 authorities, each as a list of ``(node, score)`` pairs.

    Raises:
        ValueError: If ``n_iter`` is less than 1 (no authority scores would
            exist to report).
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    # Both edge orientations are reused on every iteration, so cache them
    # instead of re-reading the text file each time.
    edges = (
        sc.textFile(path)
        .filter(lambda line: line.strip())  # a blank line would break pair[0] / pair[1]
        .map(lambda line: line.split())
        .map(lambda pair: (int(pair[0]), int(pair[1])))
        .distinct()
        .cache()
    )
    reversed_edges = edges.map(lambda x: (x[1], x[0])).cache()

    h = [(node, 1) for node in range(1, n_node + 1)]
    a = []
    for _ in range(n_iter):
        # edges.join(h): (source, (dest, h_source)) -> (dest, mu * h_source)
        auth_sums = (
            edges.join(sc.parallelize(h))
            .map(lambda x: (x[1][0], mu * x[1][1]))
            .reduceByKey(lambda v1, v2: v1 + v2)
            .collect()
        )
        # Normalise on the driver so each Spark stage is evaluated only once.
        max_a = max(score for _, score in auth_sums)
        a = [(node, score / max_a) for node, score in auth_sums]

        # reversed_edges.join(a): (dest, (source, a_dest)) -> (source, lmb * a_dest)
        hub_sums = (
            reversed_edges.join(sc.parallelize(a))
            .map(lambda x: (x[1][0], lmb * x[1][1]))
            .reduceByKey(lambda v1, v2: v1 + v2)
            .collect()
        )
        max_h = max(score for _, score in hub_sums)
        h = [(node, score / max_h) for node, score in hub_sums]

    reversed_edges.unpersist()
    edges.unpersist()

    hubs = sc.parallelize(h)
    authorities = sc.parallelize(a)
    print(hubs.top(5, lambda x: x[1]))
    print(authorities.top(5, lambda x: x[1]))
    print(hubs.top(5, lambda x: -x[1]))
    print(authorities.top(5, lambda x: -x[1]))

In [8]:
# Run PageRank; prints the five highest-ranked (node, rank) pairs, then the five lowest.
page_rank()

[(263, 0.002020291181518219), (537, 0.0019433415714531501), (965, 0.001925447807166263), (243, 0.0018526340162417314), (285, 0.0018273721700645142)]
[(558, 0.0003286018525215297), (93, 0.0003513568937516577), (62, 0.00035314810510596274), (424, 0.00035481538649301454), (408, 0.00038779848719291705)]


In [9]:
# Run HITS; prints top-5 hubs, top-5 authorities, bottom-5 hubs, bottom-5 authorities.
hits()

[(840, 1.0), (155, 0.9499618624906542), (234, 0.8986645288972261), (389, 0.8634171101843793), (472, 0.8632841092495218)]
[(893, 1.0), (16, 0.96355728496344), (799, 0.9510158161074022), (146, 0.9246703586198447), (473, 0.899866197360405)]
[(23, 0.04206685489093654), (835, 0.057790593544330165), (141, 0.06453117646225179), (539, 0.06602659373418493), (889, 0.07678413939216455)]
[(19, 0.0560831637760762), (135, 0.06653910487622795), (462, 0.075442286246419), (24, 0.08171239406816948), (910, 0.08571673456144879)]
