In [1]:
from hw05_FUNCTIONS import *

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import collections
from os.path import isfile
from statistics import median

In [3]:
#this is the folder path where all the data are stored
dataFolder = "./data/"
#base filename for all wiki data
basefilename = "wiki-topcats-"
#file extension of wiki data
ext = ".txt"
#name of all wiki data files
CATEGORIES = "categories"
REDUCED_GRAPH = "reduced"
PAGE_NAMES = "page-names"

In [4]:
#let's open the wiki_categories.txt file and delete all categories
#with less than 3500 articles
#
#read the category file
categories = pd.read_csv(dataFolder+basefilename+CATEGORIES+ext, sep = ";", header = None , names = ["Category","List_of_articles"])
#clean the category column
#Before: "Category: category_name"
#After: "category_name"
categories["Category"] = categories["Category"].apply(lambda x:x.split(":")[1])
#defining a function that delete all categories with less than 3500 articles
ARTICLES_THRESHOLD = 3500
#Scanning each value of list_of_articles, if the number of articles goes beyond the threshold then retain it
#otherwise return an empty string
categories["List_of_articles"] = categories["List_of_articles"].apply(lambda x: x if len(x.strip().split(" ")) >= ARTICLES_THRESHOLD else "")
#delete all rows with empty string as "list_of_articles"
categories = categories[categories["List_of_articles"] != ""]

In [5]:
#Now it is needed to build the final graph, but it must be checked that the nodes into the reduced graph and 
#the nodes into the categories files are the same.
#The set of the nodes into the categories must be built.
#The set of nodes into the reduced graph must be built.
#the intersection between these two sets must be computed.
#only the edges involving nodes of the intersection set must be added to the final graph

#let's open the reduced graph file and create the set of nodes 
reduced_graph = pd.read_csv(dataFolder+basefilename+REDUCED_GRAPH+ext, sep = "\t", header = None, names = ["Node_1","Node_2"])
#create the set of the first column
set_node_1 = set(reduced_graph["Node_1"].values.tolist())
#create the set of the second column
set_node_2 = set(reduced_graph["Node_2"].values.tolist())
#create the set of the nodes into the reduced graph through the union operation of the two previous created sets
set_reduced_graph_nodes = set.union(set_node_1,set_node_2)
#print the size for debugging
print("size of set_node_1:= "+str(len(set_node_1)))
print("size of set_node_2:= "+str(len(set_node_2)))
print("size of set_reduced_graph_nodes:= "+str(len(set_reduced_graph_nodes)))
#The two previous sets are not useful anymore therefore they are deleted from the main memory
del set_node_1
del set_node_2

#Let's create the set of nodes into the categories
#create initially an empty set
set_categories_nodes = set()
#in order to perform the intersection function the nodes must be represented in the same format
#since the nodes into the @reduced_graph dataframe are integer then the "int" type is chosen
categories["List_of_articles"].apply(lambda x: set_categories_nodes.update(set(map(int, x.strip().split(" ")))))

#compute the final set of nodes
set_of_nodes = set.intersection(set_categories_nodes, set_reduced_graph_nodes)
#print the size for debugging
print("size of set_categories_nodes:= "+str(len(set_categories_nodes)))
print("size of set_reduced_graph_nodes:= "+str(len(set_reduced_graph_nodes)))
print("size of set_of_nodes:= "+str(len(set_of_nodes)))
#once the final set is computed the other sets can be deleted to free the memory
del set_categories_nodes
del set_reduced_graph_nodes

size of set_node_1:= 428957
size of set_node_2:= 352518
size of set_reduced_graph_nodes:= 461193
size of set_categories_nodes:= 546237
size of set_reduced_graph_nodes:= 461193
size of set_of_nodes:= 461193


In [6]:
#now it is possible to read line by line all the edges of the reduced graph
#and adding it to the final graph only if they belong to the @set_of_nodes computed
final_graph = nx.DiGraph()

print(".")
#build the graph
reduced_graph.apply(lambda edge: final_graph.add_edge(edge[0],edge[1]) if filterEdges(edge,set_of_nodes) else "" ,axis = 1)

.


0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
2645217    None
2645218    None
2645219    None
2645220    None
2645221    None
2645222    None
2645223    None
2645224    None
2645225    None
2645226    None
2645227    None
2645228    None
2645229    None
2645230    None
2645231    None
2645232    None
2645233    None
2645234    None
2645235    None
2645236    None
2645237    None
2645238    None
2645239    None
2645240    None
2645241    None
2645242    None
2645243    None
2645244    None
2645245    None
2645246    None
Length: 2645247, dtype: 

In [7]:
#the category dataframe must involve only "good nodes" too
categories["List_of_articles"] = categories["List_of_articles"].apply(lambda x: filter_nodes_in_categories(x,set_of_nodes))

In [8]:
category_distances = retrieveCategoryDistances(filename = "category_distances.npy", final_graph = final_graph ,categories=categories)

In [9]:
#compute a dictionary that has as keys the categories and as values the list of articles
#that belongs to that category
category_dictionary = {}
#for each category
for i in range(categories.shape[0]):
    #assign to that category the list of articles
    category_dictionary[i] = list(map(int, categories.iloc[i]['List_of_articles'].split(" ")))

In [10]:
blockRanking = computeBlockRanking(0,category_distances)

In [11]:
compute_nodes_ranking(allGraph = final_graph, block_ranking = blockRanking, categories = category_dictionary)

The top-10 ranked articles are the following: 
81941 82322 82082 82089 82091 81871 81878 82346 82084 81267
The entire ranking is saved on the filesystem
