In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn.preprocessing import normalize
from collections import Counter

## Question1

In [2]:
def pagerank(G, d = 0.85, max_iter = 100):
    # Parameters:
    #     G is the input graph 
    #     d is the dampling factor
    #     max_iter is the maximum iterations 
    # Return:
    #     nodes' pageranks.
    
    # if not directed, transform it to directed graph
    G = G.to_directed()
    N = G.number_of_nodes()
    
    # the order of nodes
    node_list = sorted(G.nodes)
    
    # normalize graph matrix by columns
    A = nx.to_numpy_matrix(G, nodelist=node_list)
    A = np.array(A)
    A_normalized = normalize(A, axis=0, norm='l1')
    
    # calculate transition matrix and initial vector
    v = np.ones(N) / N
    v_0 = v

    # iterate 
    for i in range(max_iter):
        v_curr = v
        v = ((1 - d) * v_0 + d * A_normalized @ v_curr)
        # check convergence
        err = sum(abs(v[n] - v_curr[n]) for n in range(v.shape[0]))
        # check if converge
        if err < N * 1.0e-6:
            return dict(zip(node_list, v))
        
    raise Exception("PageRank didn't converge")

# Question2

In [3]:
# open file and get edgelist
f_hollins = open("hollins/hollins.dat", "r")
data_hollins = f_hollins.readlines()
edgelist_hollins = pd.DataFrame(columns=['source', 'target'])
link_hollins = data_hollins[1:6013]

for edge in data_hollins[6013:]:
    edge = edge.split(' ')
    edge[0] = int(edge[0])
    edge[1] = int(edge[1])
    edgelist_hollins.loc[len(edgelist_hollins.index)] = [edge[0], edge[1]]

In [4]:
# generate graph
G_hollin = nx.from_pandas_edgelist(edgelist_hollins,
                                   source='source',
                                   target='target', 
                                   edge_attr=None, 
                                   create_using=nx.DiGraph())

In [5]:
# run pagerank function and compare to built-in function in networkx
pagerank_hollins = pagerank(G_hollin)
pr = nx.pagerank(G_hollin, alpha=0.85)
print(max(pagerank_hollins.values()))
print(max(pr.values()))

0.01658081465244013
0.020209640931177636


In [6]:
# write to file
with open('p1.txt', 'w') as file:
    for key in pagerank_hollins:
        file.write(str(key))
        file.write('\t')
        file.write(str(pagerank_hollins[key]))
        file.write('\n')

In [7]:
# get 5 highest and five lowest values and their indices
index_5high = sorted(pagerank_hollins, key=pagerank_hollins.get, reverse=True)[:5]
index_5low = sorted(pagerank_hollins, key=pagerank_hollins.get, reverse=False)[:5]

print("pages with the five highest Pagerank valuse are\n")
for n in index_5high:
    print(link_hollins[n])

print("pages with the five lowest Pagerank valuse are\n")
for n in index_5low:
    print(link_hollins[n])

pages with the five highest Pagerank valuse are

622 http://www.hollins.edu/campuslife/clubs/elective.htm 

2 http://www.hollins.edu/ 

1824 http://www1.hollins.edu/registrar/schedule_of_classes.htm 

2995 http://www1.hollins.edu/registrar/How%20to%20Calculate%20your%20GPA.doc 

431 http://www.hollins.edu/cgi-bin/sugform1.cgi 

pages with the five lowest Pagerank valuse are

4 http://www1.hollins.edu/Docs/Forms/GetForms.htm 

68 http://www.hollins.edu/academics/library/services/acq.htm 

74 http://www.hollins.edu/admissions/ugradadm/facts/facts.htm 

92 http://www1.hollins.edu/registrar/registrar.htm 

108 http://www1.hollins.edu/Registrar/Final%202004-05%20calendar.pdf 



# Question3

In [8]:
f_blogs = open("blogs/blogs.dat", "r")
data_blogs = f_blogs.readlines()
rowlabels_blogs = data_blogs[4:1494]
collabels_blogs = data_blogs[1495:2985]
edgelist_blogs = data_blogs[2986:]

In [9]:
G_blog = nx.Graph()
for n, value in enumerate(rowlabels_blogs, 1):
    G_blog.add_node(n, name = value)

for edge in edgelist_blogs:
    edge = edge.strip()
    edge = edge.split(' ')
    if edge[0] == '!':
        break
    edge[0] = int(edge[0])
    edge[1] = int(edge[1])
    edge[2] = int(edge[2])
    if edge[2] == 1:
        G_blog.add_edge(edge[0], edge[1])

In [10]:
pagerank_blog = pagerank(G_blog)
pr_blog = nx.pagerank(G_blog, alpha=0.85)
print(max(pagerank_blog.values()))
print(max(pr_blog.values()))

0.010203989160468711
0.012034313739825829


In [11]:
with open('p2.txt', 'w') as file:
    for key in pagerank_blog:
        file.write(str(key))
        file.write('\t')
        file.write(str(pagerank_blog[key]))
        file.write('\n')

In [12]:
index_5high = sorted(pagerank_blog, key=pagerank_blog.get, reverse=True)[:5]
index_5low = sorted(pagerank_blog, key=pagerank_blog.get, reverse=False)[:5]

print("pages with the five highest Pagerank valuse are\n")
for n in index_5high:
    print(G_blog.nodes[n]['name'])

print("pages with the five lowest Pagerank valuse are\n")
for n in index_5low:
    print(G_blog.nodes[n]['name'])

pages with the five highest Pagerank valuse are

"blogsforbu"

"dailykosc"

"drudgerepo"

"instapundi"

"talkingpoi"

pages with the five lowest Pagerank valuse are

"40ozblogb"

"4linatblo"

"americandr"

"asiannati"

"asiegeofhe"



# Question 4

In [13]:
def personalized_pagerank(G, d = 0.85, max_iter = 100, personalized = None):
    # Parameters:
    #     G is the input graph 
    #     d is the dampling factor
    #     max_iter is the maximum iterations 
    #     personalized is the index of source
    # Return:
    #     nodes' pageranks.
    
    # if not directed, transform it to directed graph
    G = G.to_directed()
    N = G.number_of_nodes()
    
    # the order of nodes
    node_list = sorted(G.nodes)
    
    # normalize graph matrix by columns
    A = nx.to_numpy_matrix(G, nodelist=node_list)
    A = np.array(A)
    A_normalized = normalize(A, axis=0, norm='l1')
    
    # calculate transition matrix and initial vector
    if personalized == None:
        v = np.ones(N) / N
    else:
        v = np.zeros(N)/ N
        v[personalized] = 1

    v_0 = v

    # iterate 
    for i in range(max_iter):
        v_curr = v
        v = ((1 - d) * v_0 + d * A_normalized @ v_curr)
        # check convergence
        err = sum(abs(v[n] - v_curr[n]) for n in range(v.shape[0]))
        # check if converge
        if err < N * 1.0e-6:
            return dict(zip(node_list, v))
        
#     raise Exception("PageRank didn't converge")
    return dict(zip(node_list, v))

In [14]:
bloglist = ['"dailykosc"', 
            '"atriosblo"', 
            '"wonkettec"', 
            '"talkleftc"', 
            '"juancolec"', 
            '"powerlineb"',
            '"realclearp"',
            '"blogsforbu"',
            '"instapundi"',
            '"michellema"']
dataframe_blog = pd.DataFrame(columns=bloglist)

In [15]:
for blog in bloglist:
    index = rowlabels_blogs.index(blog+'\n')
    pagerank_blog = personalized_pagerank(G_blog, d = 0.9, max_iter=20, personalized=index)
    dataframe_blog[blog] = list(pagerank_blog.values())

In [16]:
dataframe_blog.to_csv('p3.txt')

# Question 5

In [27]:
dict_liberal = dict(list(pagerank_blog.items())[0:757])
dict_conservative = dict(list(pagerank_blog.items())[757:])

In [29]:
print(sum(dict_liberal.values()))
print(sum(dict_conservative.values()))

0.24710074531422646
0.7528992546857731
