In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn.preprocessing import normalize

## Question1

In [2]:
def pagerank(G, d = 0.85, max_iter = 100, personalized = None):
    # Parameters:
    #     G is the input graph 
    #     d is the dampling factor
    #     max_iter is the maximum iterations 
    #     personalized is the personalized dictionary
    # Return:
    #     nodes' pageranks.
    
    # if not directed, transform it to directed graph
    G = G.to_directed()
    N = G.number_of_nodes()
    
    # the order of nodes
    node_list = sorted(G.nodes)
    
    # normalize graph matrix by columns
    A = nx.to_numpy_matrix(G, nodelist=node_list)
    A = np.array(A)
    A_normalized = normalize(A, axis=0, norm='l1')
    
    # calculate transition matrix and initial vector
    v = np.ones(N) / N
    v_0 = v
    P = ((1 - d)/ N + d * A_normalized)

    # iterate 
    for i in range(max_iter):
        v_curr = v
        v = ((1 - d) * v_0 + d * P @ v_curr)
        # check convergence
        err = sum(abs(v[n] - v_curr[n]) for n in range(v.shape[0]))
        if err < N * 1.0e-6:
            return v
        
    raise Exception("PageRank didn't converge")

# Question2

In [3]:
f_hollins = open("hollins/hollins.dat", "r")
data_hollins = f_hollins.readlines()
edgelist_hollins = pd.DataFrame(columns=['source', 'target'])
link_hollins = data_hollins[1:6013]

for edge in data_hollins[6013:]:
    edge = edge.split(' ')
    edge[0] = int(edge[0])
    edge[1] = int(edge[1])
    edgelist_hollins.loc[len(edgelist_hollins.index)] = [edge[0], edge[1]]

In [4]:
G_hollin = nx.from_pandas_edgelist(edgelist_hollins,
                                   source='source',
                                   target='target', 
                                   edge_attr=None, 
                                   create_using=nx.DiGraph())

In [5]:
pagerank_hollins = pagerank(G_hollin)
pr = nx.pagerank(G_hollin, alpha=0.85)
print(max(pagerank_hollins))
print(max(pr.values()))

0.011258540172189512
0.020209640931177636


In [6]:
key_hollions = list(range(1, G_hollin.number_of_nodes() + 1))
pagedict_hollins = dict(zip(key_hollions, pagerank_hollins))

In [7]:
with open('p1.txt', 'w') as file:
    for n,value in enumerate(pagerank_hollins, 1):
        file.write(str(n))
        file.write('\t')
        file.write(str(value))
        file.write('\n')

In [8]:
index_5high = sorted(pagedict_hollins, key=pagedict_hollins.get, reverse=True)[:5]
index_5low = sorted(pagedict_hollins, key=pagedict_hollins.get, reverse=False)[:5]

print("pages with the five highest Pagerank valuse are\n")
for n in index_5high:
    print(link_hollins[n])

print("pages with the five lowest Pagerank valuse are\n")
for n in index_5low:
    print(link_hollins[n])

pages with the five highest Pagerank valuse are

622 http://www.hollins.edu/campuslife/clubs/elective.htm 

2 http://www.hollins.edu/ 

2995 http://www1.hollins.edu/registrar/How%20to%20Calculate%20your%20GPA.doc 

1824 http://www1.hollins.edu/registrar/schedule_of_classes.htm 

431 http://www.hollins.edu/cgi-bin/sugform1.cgi 

pages with the five lowest Pagerank valuse are

4 http://www1.hollins.edu/Docs/Forms/GetForms.htm 

68 http://www.hollins.edu/academics/library/services/acq.htm 

74 http://www.hollins.edu/admissions/ugradadm/facts/facts.htm 

92 http://www1.hollins.edu/registrar/registrar.htm 

108 http://www1.hollins.edu/Registrar/Final%202004-05%20calendar.pdf 



# Question3

In [9]:
f_blogs = open("blogs/blogs.dat", "r")
data_blogs = f_blogs.readlines()
rowlabels_blogs = data_blogs[4:1494]
collabels_blogs = data_blogs[1495:2985]
edgelist_blogs = data_blogs[2986:]

dataframe_blogs = pd.DataFrame(columns=['source', 'target'])

for edge in edgelist_blogs:
    edge = edge.strip()
    edge = edge.split(' ')
    if edge[0] == '!':
        break
    edge[0] = int(edge[0])
    edge[1] = int(edge[1])
    edge[2] = int(edge[2])
    if edge[2] == 1:
        dataframe_blogs.loc[len(dataframe_blogs.index)] = [edge[0], edge[1]]

In [10]:
G_blog = nx.from_pandas_edgelist(dataframe_blogs,
                                   source='source',
                                   target='target', 
                                   edge_attr=None, 
                                   create_using=nx.DiGraph())

In [11]:
pagerank_blog = pagerank(G_blog)
pr_blog = nx.pagerank(G_blog, alpha=0.85)
print(max(pagerank_blog))
print(max(pr_blog.values()))

0.022456987785429552
0.01869156756601579


In [12]:
pagelist_blog = list(zip(sorted(G_blog.nodes), pagerank_hollins))
pagedict_blog = dict(zip(sorted(G_blog.nodes), pagerank_hollins))

In [13]:
with open('p2.txt', 'w') as file:
    for n,value in pagelist_blog:
        file.write(str(n))
        file.write('\t')
        file.write(str(value))
        file.write('\n')

In [14]:
index_5high = sorted(pagedict_blog, key=pagedict_blog.get, reverse=True)[:5]
index_5low = sorted(pagedict_blog, key=pagedict_blog.get, reverse=False)[:5]

print("pages with the five highest Pagerank valuse are\n")
for n in index_5high:
    print(rowlabels_blogs[n])

print("pages with the five lowest Pagerank valuse are\n")
for n in index_5low:
    print(rowlabels_blogs[n])

pages with the five highest Pagerank valuse are

"andrightly"

"12thharmon"

"realitybas"

"incite1bl"

"americand2"

pages with the five lowest Pagerank valuse are

"95thesesb"

"blogveggi"

"blogfreeor"

"bradfriedm"

"charlinean"



# Question 4