In [None]:
import sknetwork
from IPython.display import SVG
from sknetwork.visualization import svg_graph, svg_bigraph
import networkx as nx

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import string
import random

from sknetwork.utils import get_neighbors
from sknetwork.ranking import PageRank, top_k, HITS
from sknetwork.clustering import Louvain, get_modularity, PropagationClustering

In [None]:
# Load the "wikivitals" dataset through scikit-network's netset loader.
wikivitals = sknetwork.data.load_netset("wikivitals") # load dataset

In [None]:
# List the fields bundled with the dataset object.
print(wikivitals.keys())

In [None]:
# Pull the individual pieces of the dataset into notebook-level names.
# `adjacency` feeds PageRank/Louvain/Spectral; `biadjacency` feeds the HITS scores.
adjacency, biadjacency = wikivitals.adjacency, wikivitals.biadjacency
names, labels, names_labels = wikivitals.names, wikivitals.labels, wikivitals.names_labels
print(wikivitals.keys())

# Reverse lookup: label name -> position of that name in `names_labels`.
label_to_id = dict(zip(names_labels, range(len(names_labels))))

# Spot-check a single article title.
names[777]

In [None]:
# Inspect the container type of the adjacency matrix.
type(adjacency)

In [None]:
# Convert to a plain list: the search/navigation helpers further down rely on
# `names.index(...)` and `... in names` lookups.
names = list(names)
type(names)

In [None]:
# Global PageRank over the link graph; show the 20 best-ranked article titles.
pagerank = PageRank()
scores = pagerank.fit_predict(adjacency)
top_ranked = [names[node] for node in top_k(scores, 20)]
print("\n".join(top_ranked))

In [None]:
# HITS scores; these end up in the "Hubs" column of the result tables.
# NOTE(review): HITS is fitted on `biadjacency` while PageRank above uses
# `adjacency` -- confirm that scoring on the biadjacency matrix is intended.
hits = HITS()
hubs = hits.fit_predict(biadjacency)

In [None]:
# Put the PageRank and HITS outputs side by side: one row per article,
# column 0 = PageRank, column 1 = hubs.
# The guard makes the cell safe to re-run: once `scores` is already 2-D it is
# left alone instead of being wrapped a second time (which would break every
# later `scores[i][0]` / `scores[i][1]` lookup).
if np.ndim(scores) == 1:
    scores = np.column_stack((scores, hubs))

In [None]:
import requests # make wikipedia api request
import tabulate # display output in pretty table
from IPython.core.display import display, HTML # for converting html table to pretty table
from bs4 import BeautifulSoup


def format_link(title):
    """Build the English-Wikipedia URL for an article title.

    Spaces become underscores, and characters that carry special meaning in a
    URL (such as ``?``, ``#`` or ``%``) are percent-encoded so the link still
    points at the article instead of being cut short.

    Example: "Hipparchus" -> https://en.wikipedia.org/wiki/Hipparchus

    Parameters
    ----------
    title : str
        Article title as it appears in ``names``.

    Returns
    -------
    str
        Absolute URL of the article.
    """
    from urllib.parse import quote  # local import keeps this helper self-contained

    slug = title.replace(" ", "_")
    # Keep the punctuation Wikipedia itself leaves readable in its URLs.
    return 'https://en.wikipedia.org/wiki/' + quote(slug, safe="/:(),")

def get_table(candidate_scores): # generate the html table that is displayed
    """Render article metrics as an HTML table.

    Parameters
    ----------
    candidate_scores : dict
        ``{title: (pagerank, hubs, incoming, outgoing)}``; rows are emitted in
        the dict's iteration order.

    Returns
    -------
    tuple
        ``(html_table, index_to_name)`` where ``index_to_name`` maps each row
        index to its title, plus the sentinel entry ``-1 -> -1`` used by the
        prompts for "enter another query".
    """
    headers = ["Index", "Title", "Link", "Incoming Links", "Outgoing Links","PageRank", "Hubs"]
    index_to_name = {-1: -1}
    rows = []

    for position, (title, metrics) in enumerate(candidate_scores.items()):
        # Column order differs from the tuple order: link counts come before the scores.
        rows.append([
            position,
            title,
            format_link(title),
            metrics[2],
            metrics[3],
            metrics[0],
            metrics[1],
        ])
        index_to_name[position] = title

    html = tabulate.tabulate(rows, tablefmt='html', headers=headers , showindex=False)
    return html, index_to_name
        

def search(query, scores, adjacency):
    """Search Wikipedia for ``query`` and rank the hits that exist in the dataset.

    Parameters
    ----------
    query : str
        Free-text search string sent to the MediaWiki search API.
    scores : array-like
        Per-article scores; ``scores[i][0]`` is PageRank, ``scores[i][1]`` is hubs.
    adjacency : sparse matrix
        Link graph; row ``i`` holds outgoing links, column ``i`` incoming links.

    Returns
    -------
    tuple
        ``(html_table, index_to_name)`` as produced by ``get_table``, with the
        matching articles sorted by descending PageRank.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    RuntimeError
        When the API answers with an error payload instead of search results.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'utf8': 1,
        'srsearch': query,
        'srlimit': 500,
        'srqiprofile': 'empty',  # dont want assistance from wikipedia search engine
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    if 'error' in data:
        raise RuntimeError("Wikipedia search failed: {}".format(data['error']))
    hits = data.get('query', {}).get('search', [])

    # One pass over `names` instead of a linear scan per search hit.
    # setdefault keeps the first position, matching what list.index would return.
    name_to_id = {}
    for position, article_name in enumerate(names):
        name_to_id.setdefault(article_name, position)

    # row outgoing, col incoming
    candidate_metrics = {}  # {name: (pagerank, hubs, incoming, outgoing)}
    for article in hits:
        title = article['title']
        i = name_to_id.get(title)
        if i is None:
            continue  # article is not part of the dataset
        outgoing = np.sum(adjacency[i])      # row
        incoming = np.sum(adjacency[:, i])   # col
        candidate_metrics[title] = (scores[i][0], scores[i][1], incoming, outgoing)

    # Put scores in descending PageRank order
    candidate_metrics = dict(
        sorted(candidate_metrics.items(), key=lambda item: item[1][0], reverse=True)
    )

    return get_table(candidate_metrics)  # generate a table with the metrics and labels

def query(): # function to accept user input and call the search function
    """Ask for a search query, show the results, and let the user pick one.

    Returns
    -------
    The title of the chosen article, or ``-1`` when the user asks to enter
    another query.
    """
    search_text = input("Enter your Search Query:")
    table, index_to_name = search(search_text, scores, adjacency)
    display(table)

    prompt = "Enter Index to get more information about an article or '-1' to enter another query:"
    while True:
        try:
            option = int(input(prompt))
        except ValueError:
            # Non-numeric input is handled like any other invalid option
            # instead of crashing the session.
            option = None
        if option in index_to_name:
            return index_to_name[option]
        prompt = "Invalid Option: Enter Index to get more information about an article or '-1' to enter another query:"
        

def showArticle(name): # function to pull article abstract with wikipedia API
    """Print the plain-text introduction of a Wikipedia article.

    Parameters
    ----------
    name : str
        Article title to look up.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
            'action': 'query',
            'format': 'json',
            'titles': name,
            'prop': 'extracts',
            'exintro': True,
            'explaintext': True,
        }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    # The response may lack pages or the page may lack an extract (e.g. the
    # title does not exist); report that rather than raising KeyError.
    pages = data.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})
    extract = page.get('extract')
    if extract:
        print(extract)
    else:
        print("No abstract available for:", name)

def menu(name): # Show menu when selecting an article
    """Print the action menu for the selected article and read the user's choice.

    Re-prompts until the user enters a whole number between 1 and 6, so a typo
    no longer aborts the session with a ValueError.

    Parameters
    ----------
    name : str
        Title of the currently selected article (only echoed back).

    Returns
    -------
    int
        The chosen menu entry, 1 through 6.
    """
    print("You have selected:", name)
    print("What would you like to do?")
    print("1. Read Article Abstract")
    print("2. See Recommended Articles")
    print("3. See All Incoming Articles")
    print("4. See All Outgoing Articles")
    print("5. Enter Another Query")
    print("6. Exit Program")

    while True:
        try:
            option = int(input("Choose Here"))
        except ValueError:
            option = None  # not a number at all
        if option is not None and 1 <= option <= 6:
            return option
        print("Invalid Option: please enter a number from 1 to 6")

# outgoing = np.sum(adjacency[i]) # row
# incoming = np.sum(adjacency[:,i]) # col

def _collect_metrics(article_ids, sort_by_pagerank=True):
    """Build ``{title: (pagerank, hubs, incoming, outgoing)}`` for the given node ids."""
    metrics = {}
    for i in article_ids:
        incoming = np.sum(adjacency[:, i])  # column i: links pointing at the article
        outgoing = np.sum(adjacency[i])     # row i: links leaving the article
        metrics[names[i]] = (scores[i][0], scores[i][1], incoming, outgoing)

    if sort_by_pagerank:
        # Put scores in descending order
        metrics = dict(sorted(metrics.items(), key=lambda item: item[1][0], reverse=True))
    return metrics


def _choose_article(metrics):
    """Display the metrics table and prompt until the user enters a listed index (or -1)."""
    table, index_to_name = get_table(metrics)
    display(table)

    prompt = "Enter Index to get more information about an article or '-1' to enter another query:"
    while True:
        try:
            option = int(input(prompt))
        except ValueError:
            # Non-numeric input is handled like any other invalid option
            # instead of crashing the session.
            option = None
        if option in index_to_name:
            return index_to_name[option]
        prompt = "Invalid Option: Enter Index to get more information about an article or '-1' to enter another query:"


def show_incoming(name): # showing incoming articles
    """List the articles that link to ``name`` and let the user pick one.

    Returns the chosen article title, or ``-1`` to enter another query.
    """
    article_id = names.index(name)
    # Column of the adjacency matrix: every non-zero row is an article linking here.
    incoming_ids = np.flatnonzero(adjacency[:, article_id].toarray())
    return _choose_article(_collect_metrics(incoming_ids))


def show_outgoing(name): # show outgoing articles
    """List the articles that ``name`` links to and let the user pick one.

    Returns the chosen article title, or ``-1`` to enter another query.
    """
    article_id = names.index(name)
    # Row of the adjacency matrix: every non-zero column is an article linked from here.
    outgoing_ids = np.flatnonzero(adjacency[article_id].toarray())
    return _choose_article(_collect_metrics(outgoing_ids))


def get_recommendations(name, n_recommendations=20, n_candidates=100, seed_weight=3): # recommendation algorithm
    """Recommend articles that share ``name``'s label and let the user pick one.

    A PageRank is run with every article of the same label weighted equally
    and the selected article weighted by ``seed_weight``. Candidates are the
    ``n_candidates`` articles whose score rose the most compared with the
    global PageRank; the selected article itself and articles carrying another
    label are dropped, and the first ``n_recommendations`` survivors are shown.

    Parameters
    ----------
    name : str
        Title of the article to recommend from.
    n_recommendations : int, default 20
        Maximum number of rows in the displayed table.
    n_candidates : int, default 100
        Size of the candidate pool taken before filtering.
    seed_weight : float, default 3
        Weight given to the selected article.

    Returns
    -------
    The chosen article title, or ``-1`` to enter another query.
    """
    index = names.index(name)
    label = labels[index]

    in_cluster = labels == label
    cluster_indices = np.flatnonzero(in_cluster)

    weight = 1 / np.sum(in_cluster)  # weight for articles in cluster
    weights = {i: weight for i in cluster_indices}
    weights[index] = seed_weight  # weight for article to be recommended

    scores_clust = PageRank().fit_transform(adjacency, weights)  # pagerank scores biased towards the cluster

    # difference between the biased score and the original pagerank score
    rec = top_k(scores_clust - scores[:, 0], n_candidates)
    rec = rec[rec != index]                   # remove the thing that is getting recommended
    rec = rec[np.isin(rec, cluster_indices)]  # remove recommendations not in cluster

    # Keep the recommendation order; no re-sorting by global PageRank here.
    metrics = _collect_metrics(rec[:n_recommendations], sort_by_pagerank=False)
    return _choose_article(metrics)
    
def main(): # driver function
    """Run the interactive search / browse loop until the user chooses "Exit Program".

    ``name`` holds the currently selected article title, or ``-1`` when the
    user asked to enter another query. That sentinel is checked before every
    menu display, so a ``-1`` returned by the recommendation or incoming/outgoing
    views leads back to the query prompt instead of being treated as an article.
    """
    name = query()
    while True:
        if name == -1:
            name = query()
            continue

        menu_choice = menu(name)
        if menu_choice == 1:
            showArticle(name)

        elif menu_choice == 2:
            name = get_recommendations(name)

        elif menu_choice == 3:
            name = show_incoming(name)

        elif menu_choice == 4:
            name = show_outgoing(name)

        elif menu_choice == 5:
            name = query()

        elif menu_choice == 6:
            return
            
    
# Start the interactive session; this blocks on input() until the user picks "Exit Program".
main()

In [None]:
# Hyperparameter tuning for Louvain
resolutions = np.linspace(0.1, 3, 30)
modularities = ['Dugue', 'Newman', 'Potts']

# Best configuration seen so far; a run only replaces it with a strictly higher score.
best = {"mod": None, "res": None, "score":0}

# One score curve per modularity variant, filled in the order of `resolutions`.
dugue, newman, potts = [], [], []
curves = {'Dugue': dugue, 'Newman': newman, 'Potts': potts}

for mod in modularities:
    for res in resolutions:
        lv = Louvain(resolution=res, modularity=mod)
        lv.fit(adjacency)
        lv_labels = np.array(lv.labels_)
        lv_score = get_modularity(adjacency, lv_labels)

        curves[mod].append(lv_score)
        if lv_score > best["score"]:
            best.update(mod=mod, res=res, score=lv_score)

best

In [None]:
# Refit Louvain with the best (modularity, resolution) pair recorded in `best`,
# then list the distinct community labels it produced.
best_louv = Louvain(resolution=best['res'], modularity=best["mod"])
best_louv.fit(adjacency)
np.unique(best_louv.labels_)

In [None]:
# Hyperparameter tuning plot for Louvain
import matplotlib.pyplot as plt

# Modularity score as a function of resolution, one line per modularity variant.
# `best` is deliberately not reassigned here: it keeps the values computed by
# the tuning cell rather than being overwritten with numbers from an old run.
fig, ax = plt.subplots()
for curve, curve_label in ((dugue, 'Dugue'), (potts, 'Potts'), (newman, 'Newman')):
    ax.plot(resolutions, curve, label=curve_label)

ax.legend()
ax.set_xlabel('Resolution')
ax.set_ylabel('Modularity Score')
ax.set_title('Parameter Tuning of Louvain Clustering')

fig.savefig('louvain.png')
plt.show()

In [None]:
from sknetwork.embedding import Spectral 

# Spectral embedding of the link graph, configured with `num_components`.
num_components = 25
spectral = Spectral(num_components)
embedding = spectral.fit_transform(adjacency)

# Spot-check the embedding vector of one arbitrary node.
print(embedding[7777])

In [None]:
# Sanity check for Louvain
# NOTE: this rebinds the notebook-level `labels` (previously the dataset labels)
# to the Louvain assignment; `get_recommendations` reads that same global.
labels = best_louv.labels_

# One dict per community: {article index: (pagerank, hubs)}
clusters = [
    {i: (scores[i][0], scores[i][1]) for i in np.where(labels == label)[0]}
    for label in np.unique(labels)
]

# Print the 30 highest-PageRank members of every community.
for cluster in clusters:
    ranked = sorted(cluster, key=lambda i: cluster[i][0], reverse=True)
    for i in ranked[:30]:
        print(names[i] , scores[i])
    print("------------")
        

In [None]:
from scipy.cluster.hierarchy import linkage # this cell generates a dendrogram, not used in report
from sknetwork.visualization import svg_dendrogram

labels = best_louv.labels_

# selection : [[top k articles cluster 0 by index], [top k articles cluster 1 by index], etc ]
# Built here so the cell runs on a fresh kernel; "top" means highest PageRank.
n_top_articles = 30
selection = []
for label in np.unique(labels):
    cluster_indices = np.where(labels == label)[0]
    by_pagerank = np.argsort(-scores[cluster_indices, 0], kind="stable")
    selection.append(cluster_indices[by_pagerank[:n_top_articles]])

# Community to draw; must be smaller than the number of communities found.
label = 4
index = selection[label]
print(index)
print(len(index))

# Ward linkage on the spectral embeddings of the selected articles.
dendrogram_articles = linkage(embedding[index], method='ward')
# `names` is a plain list, so convert it to an array before indexing with `index`.
image = svg_dendrogram(dendrogram_articles, names=np.array(names)[index], rotate=True, width=200, scale=2, n_clusters=4)
SVG(image)



