In [2]:
import urllib.request
import io
import gzip
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix 

In [4]:
# Download the gzip-compressed edge list and store it decompressed on disk.
file = 'web-NotreDame.txt'

# Context managers guarantee the HTTP response is closed even if the read fails.
with urllib.request.urlopen("https://snap.stanford.edu/data/web-NotreDame.txt.gz") as source:
    compressedFile = io.BytesIO(source.read())

# GzipFile and the output file are both closed on exit from the `with` block,
# so no explicit close() call is needed.
with gzip.GzipFile(fileobj=compressedFile) as decompressedFile, open(file, 'wb') as outfile:
    outfile.write(decompressedFile.read())

In [3]:
# Read the whitespace-separated edge list; the first 4 lines of the file are
# skipped and the two columns are named explicitly.
# `sep=r'\s+'` is the supported spelling of the deprecated `delim_whitespace=True`.
graph_df = pd.read_csv(
    'web-NotreDame.txt',
    sep=r'\s+',
    skiprows=4,
    names=['from_node', 'to_node'],
)

In [4]:
# Keep only edges whose two endpoints both have an id below this bound.
MAX_NODE_ID = 10000

within_bound = (graph_df['from_node'] < MAX_NODE_ID) & (graph_df['to_node'] < MAX_NODE_ID)

# .copy() makes the filtered frame independent of the full edge list, so adding
# a column below does not trigger pandas' SettingWithCopyWarning.
graph_df = graph_df[within_bound].copy()

# Every edge starts with unit weight (float, so it can be normalised later).
graph_df['weight'] = 1.0

In [74]:
# Sorted array of every node id that appears as an edge endpoint.
from_ids = graph_df['from_node'].values
to_ids = graph_df['to_node'].values
nodes = np.union1d(from_ids, to_ids)

# Map node ids to compact positions 0..len(nodes)-1 so the matrix indices always
# fit the (len(nodes), len(nodes)) shape, even when the ids have gaps.
# When the ids are already 0..len(nodes)-1 this mapping is the identity.
row_idx = np.searchsorted(nodes, from_ids)
col_idx = np.searchsorted(nodes, to_ids)

# Turn the edge list into a sparse adjacency matrix: M[from, to] = weight.
M = coo_matrix(
    (graph_df['weight'].values.astype(float), (row_idx, col_idx)),
    shape=(len(nodes), len(nodes)),
)

# Number of outgoing edges for each node (row sums of the adjacency matrix).
n_edges = np.asarray(M.sum(axis=1))[:, 0]

# Row/column position of each stored entry, aligned one-to-one with M.data.
i, j = M.row, M.col

# Split each node's weight evenly across its outgoing edges. Only stored entries
# are divided, so nodes without outgoing edges never cause a division by zero.
M.data /= n_edges[i]

# Dense array, transposed so that m[to, from] is the share of `from`'s weight
# passed to `to`. Note this materialises a full len(nodes) x len(nodes) array.
m = M.toarray().T

In [75]:
def pagerank(M, alpha, num_iter):
    """Run a fixed number of PageRank power iterations.

    Each iteration applies ``pr = alpha * (M @ pr) + (1 - alpha)``, starting
    from a score of 1 for every node.

    Parameters
    ----------
    M : 2-D numpy array or scipy sparse matrix, shape (n, n)
        Transition matrix; ``M[i, j]`` is the share of node ``j``'s score
        passed to node ``i``.
    alpha : float
        Damping factor applied to the propagated score.
    num_iter : int
        Number of iterations to run; 0 returns the initial all-ones vector.

    Returns
    -------
    numpy.ndarray, shape (n, 1)
        Score of each node after ``num_iter`` iterations.
    """
    pr = np.ones((M.shape[0], 1))
    beta = 1 - alpha
    for _ in range(num_iter):
        # `@` dispatches correctly for both dense arrays and scipy sparse
        # matrices (np.dot does not handle sparse input). A new array is
        # produced each step, so no defensive copy of `pr` is needed.
        pr = alpha * (M @ pr) + beta
    return pr

## Step 3.3

In [76]:
# Sanity check: show the 20x20 window (rows and columns 10..29) of the dense matrix.
m[10:30, 10:30]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  

In [84]:
# Damping factor 0.85, 15 power iterations.
pr = pagerank(m, 0.85, 15)

# One row per node: 'id' is the node's row position in `m`, 'pagerank' its score.
# Sorting returns a new frame instead of mutating in place, so re-running the
# cell always starts from the same state.
pr_df = (
    pd.DataFrame(pr, columns=['pagerank'])
    .rename_axis('id')
    .reset_index()
    .sort_values(by='pagerank', ascending=False)
)

# Ten highest-ranked nodes, renumbered 0..9 for display.
pr_df.head(10).reset_index(drop=True)

Unnamed: 0,id,pagerank
0,0,224.702638
1,1973,189.250314
2,1790,53.438593
3,1828,50.954873
4,1,27.975911
5,238,26.779136
6,140,23.520898
7,14,22.232264
8,16,21.591054
9,162,18.283386
