Jonathan Dyer  
jbdyer

In [1]:
import random
from collections import Counter
# Seed the module-level RNG so the simulated walks give the same results on every run.
random.seed(1)

In [2]:
def read_graph(fname):
    '''
    Read an adjacency-list description of a graph from a text file.

    Every non-blank line holds whitespace-separated tokens: the first token
    is a node and the remaining tokens are that node's neighbors. A line
    with a single token yields a node with an empty neighbor list. Blank or
    whitespace-only lines are ignored. If a node appears as the first token
    on several lines, the last such line wins.

    fname: path of the file to read.

    Returns a dictionary mapping each node (str) to its list of neighbors
    (list of str).
    '''
    graph = {}
    with open(fname) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing empty line): unpacking it
                # into "node, *neighbors" would raise ValueError.
                continue
            node, *neighbors = tokens
            graph[node] = neighbors

    return graph

In [3]:
def random_walk(graph, walk_len=1000, beta=0.85):
    '''
    Perform one "random surfer" walk over the graph and return the node the
    walk ends on.

    1) Pick a start node uniformly at random from the keys of graph.
    2) With probability beta follow a random out-link of the current node;
       otherwise jump (teleport) to a random key of graph.
    3) Repeat step 2 walk_len times.

    A node with no out-links (an empty neighbor list, or a node that only
    appears as somebody's neighbor and is not a key of graph) cannot be
    left by following a link, so the walk always teleports from it.

    graph:    dict mapping each node to a list of its neighbors.
    walk_len: number of steps to take; 0 returns the start node.
    beta:     probability of following a link instead of teleporting.

    Raises IndexError (from random.choice) if graph has no keys.
    '''
    nodes = list(graph)
    current = random.choice(nodes)  # get starting node

    for _ in range(walk_len):
        # .get() tolerates nodes that are only listed as neighbors.
        neighbors = graph.get(current)
        # random.random() is in [0, 1), so "< beta" follows a link with
        # probability exactly beta (never when beta == 0).
        if neighbors and random.random() < beta:
            current = random.choice(neighbors)
        else:
            # Teleport: either the coin flip said so or this is a dead end.
            current = random.choice(nodes)

    return current

In [4]:
def simulate_pagerank(fname, walk_len=1000, N=1000, beta=0.85):
    '''
    This is the driver function. It reads the graph stored in fname,
    performs N random walks of walk_len steps each (with link-following
    probability beta), and prints the estimated PageRank of every node.

    The PageRank estimate of a node is the fraction of the N walks that
    ended on it. One line "node rank" is printed per node, in sorted node
    order.

    fname:    path of the graph file, passed to read_graph.
    walk_len: number of steps per walk, passed to random_walk.
    N:        number of independent walks to run.
    beta:     link-following probability, passed to random_walk.
    '''
    # first read the graph
    graph = read_graph(fname)

    # now perform the random walks, counting where each one ends
    node_count = Counter()
    for _ in range(N):
        end_node = random_walk(graph, walk_len=walk_len, beta=beta)
        # Increment by key: Counter.update(str) would count the individual
        # characters of the label, which is wrong for multi-character names.
        node_count[end_node] += 1

    # finally calculate the PageRank for each
    for node in sorted(graph):
        rank = node_count[node] / N
        print(node, rank)

In [5]:
# Estimate PageRank for graph-1.txt using the default arguments (N=1000, walk_len=1000, beta=0.85).
simulate_pagerank("graph-1.txt")

A 0.379
B 0.206
C 0.37
D 0.045


In [7]:
# Estimate PageRank for graph-2.txt from 5000 walks; the other arguments keep their defaults.
simulate_pagerank("graph-2.txt", N=5000)

A 0.3648
B 0.1778
C 0.2678
D 0.078
E 0.1116


In [9]:
# Estimate PageRank for wikipedia-example.txt from 5000 walks; the other arguments keep their defaults.
simulate_pagerank("wikipedia-example.txt", N=5000)

A 0.032
B 0.3818
C 0.3504
D 0.0386
E 0.0822
F 0.037
G 0.0148
H 0.0174
I 0.0166
J 0.0126
K 0.0166
