In [None]:
import pandas as pd
import numpy as np
import re
import wikipedia
import datetime
import requests
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import json
import urllib
from urllib.parse import urlencode
from urllib.request import urlopen
from collections import deque
# documentation: https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.wikilink
import mwparserfromhell
import networkx as nx
import random

In [None]:
%matplotlib inline

In [13]:
def getUserActivity(article, granularity, start, end, project ="en.wikipedia.org",
                    access="all-access", agent="user",dateformat="iso"):
    """
    Fetch the page-view counts of a Wikipedia article over a period of time.

    The data comes from the Wikimedia REST endpoint
    ``/metrics/pageviews/per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}``.

    article: title of the Wikipedia article (spaces or underscores are both accepted)
    granularity: time granularity of activity, either 'monthly' or 'daily'
    start: start date of the research as datetime.datetime object
    end: end date of the research as datetime.datetime object
    project: If you want to filter by project, use the domain of any Wikimedia project (by default en.wikipedia.org)
    access: If you want to filter by access method, use one of desktop, mobile-app or mobile-web (by default all-access)
    agent: If you want to filter by agent type, use one of user, bot or spider (by default user).
    dateformat: the date format used in the result, one of 'iso', 'ordinal', 'datetime'
                (any value other than 'iso' or 'ordinal' leaves the label as a datetime object).
    return:
        a list of pairs [[views_1, date_1], [views_2, date_2], ...], one pair per item
        returned by the API, or None (after printing the response) if the request failed.
    """

    # Be careful (original author's note): for daily granularity the left bound date is
    # included, for monthly granularity the left bound date is excluded.

    # The endpoint expects underscores instead of spaces and a URI-encoded title, so that
    # titles containing '/', '?' or '%' (e.g. "AC/DC") remain a single path segment.
    encoded_article = urllib.parse.quote(article.replace(" ", "_"), safe="")

    dstart = start.strftime("%Y%m%d")
    dend = end.strftime("%Y%m%d")
    path = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"+project
            +"/"+access+"/"+agent+"/"+encoded_article+"/"+granularity+"/"+dstart+"/"+dend)
    r = requests.get(path)
    if not r.ok:
        print('Request Error: ', r)
        return None

    # Decode the JSON body once instead of on every loop iteration.
    items = r.json()['items']
    res = []
    for i, item in enumerate(items):
        # Labels are rebuilt from `start` plus the item's position; this assumes the API
        # returned one item per day/month without gaps.
        if granularity == 'daily':
            time_label = start + datetime.timedelta(days=i)
        else:
            time_label = start + relativedelta(months=+i)
        if dateformat == 'iso':
            time_label = time_label.isoformat()
        elif dateformat == 'ordinal':
            time_label = time_label.toordinal()

        res.append([item['views'], time_label])
    return res

In [4]:
API_URL = "https://en.wikipedia.org/w/api.php"
# TODO: make it deal with server side error
def parse_with_date(title, date=None):
    '''
    Download the wikitext of an article revision and parse it.

    :param title: title of the wikipedia article
    :param date: timestamp string (e.g. '2018-11-13T00:00:00Z') sent as the `rvstart`
                 parameter, to get the page as it was on that date. If None, scrapes the page as it is now.
    :return: the page parsed by mwparserfromhell, or None when the API response holds no
             revision content for `title` (the title is printed in that case)
    '''
    data = {"action": "query", "prop": "revisions", "rvlimit": 1,
            "rvprop": "content", "format": "json", "titles": title}
    if date is not None:
        data["rvstart"] = date
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urlopen(API_URL, urlencode(data).encode()) as response:
        raw = response.read()
    res = json.loads(raw)
    try:
        # "pages" is keyed by page id; only one title was requested, so take the first entry.
        text = list(res["query"]["pages"].values())[0]["revisions"][0]["*"]
    except KeyError as err:
        # The original format string had no placeholder, so the missing key was never shown.
        print("Key error: {}".format(err))
        print(title)
        return None
    return mwparserfromhell.parse(text)

In [5]:
# Title of the article used as the starting point for the examples below.
seed = 'Stan Lee'

In [6]:
# Fetch and parse the seed article using the timestamp 2018-11-13T00:00:00Z,
# then extract every wikilink node from the parsed wikitext.
stan_lee_12_nov = parse_with_date(seed, date= '2018-11-13T00:00:00Z')
stan_lee_12_nov_l = stan_lee_12_nov.filter_wikilinks()

In [7]:
# 1st hop links: how many wikilinks the seed page contains, and the first ten of them.
len(stan_lee_12_nov_l), stan_lee_12_nov_l[:10]

(512,
 ['[[New York City]]',
  '[[Los Angeles]]',
  '[[California]]',
  '[[Jack Kirby]]',
  '[[Steve Ditko]]',
  '[[John Romita Sr.]]',
  '[[Don Heck]]',
  '[[Bill Everett]]',
  '[[Joe Maneely]]',
  '[[The Will Eisner Award Hall of Fame]]'])

In [None]:
def preprocess_links(links):
    """
    Turn parsed wikilinks into a de-duplicated list of article titles.

    Titles starting with 'Category:', 'File:' or 'wikt:' and titles containing a '#'
    (section anchors) are dropped.

    links: iterable of objects exposing a `title` attribute (e.g. the result of filter_wikilinks())
    return: list of title strings, in order of first appearance on the page
    """
    non_link = re.compile('Category:|File:|wikt:|.*#.*')
    titles = (str(link.title) for link in links)
    kept = [title for title in titles if non_link.match(title) is None]
    # De-duplicate while preserving page order: sample_links weights links by their
    # position, and a set() would both scramble that order and vary between runs.
    return list(dict.fromkeys(kept))

In [None]:
# Clean the seed page's wikilinks into plain titles and preview the first ten.
links = preprocess_links(stan_lee_12_nov_l)
len(links), links[:10]

In [None]:
# Inspect the number of cleaned links and the type of a raw wikilink's `title` attribute.
len(links), type(stan_lee_12_nov_l[0].title)

### Sampling
A heuristic we could use for sampling is to give more probability to the links at the front of the page (assuming they might be more important and more related to the subject) than to the other links.

In [None]:
def sample_links(n_links, n_links_sample, percentage_l, percentage_l_subsample):
    '''
    Choose which link positions of a page to follow, favouring the links at the top.

    Example : make the first 20% of the links account for 50% of the subsample
    :param n_links: number of links a page has
    :param n_links_sample: number of links you want to sample
    :param percentage_l: fraction (0..1) of the links, counted from the top, forming the first group
    :param percentage_l_subsample: fraction (0..1) of the sample that should come from the first group
    :return: numpy array of the chosen link indices (positions in the page's link list)
    NOTE: if percentage_l = percentage_l_subsample => approx. uniform sampling
    '''
    # no sampling needed here
    if n_links_sample >= n_links:
        return np.arange(n_links)

    # how many links from the first group should be subsampled
    n_links_first_group = int(n_links_sample * percentage_l_subsample)
    # how many links the first group contains
    n_links_first_group_pop = int(n_links * percentage_l)
    # how many links the second group contains
    n_links_second_group_pop = n_links - n_links_first_group_pop

    # The first group is smaller than its quota: take all of it and fill up from the rest.
    if n_links_first_group_pop < n_links_first_group:
        links_chosen_group_1 = np.arange(n_links_first_group_pop)
        remaining = n_links_sample - n_links_first_group_pop
        # The second group starts right after the first one (index n_links_first_group_pop);
        # starting one later would make that link impossible to pick.
        links_chosen_group_2 = np.random.choice(np.arange(n_links_first_group_pop, n_links),
                                                size=remaining, replace=False)
        return np.append(links_chosen_group_1, links_chosen_group_2)

    # One of the groups is empty: there is nothing to weight, so sample uniformly
    # (this also avoids a division by zero below).
    if n_links_first_group_pop == 0 or n_links_second_group_pop == 0:
        return np.random.choice(n_links, size=n_links_sample, replace=False)

    # We have to sample from both groups: spread each group's share evenly over its links.
    perc_1 = percentage_l_subsample / n_links_first_group_pop
    perc_2 = (1 - percentage_l_subsample) / n_links_second_group_pop
    p = [perc_1] * n_links_first_group_pop + [perc_2] * n_links_second_group_pop
    chosen_links = np.random.choice(n_links, size=n_links_sample, p=p, replace=False)
    return chosen_links
    
    

### test

In [None]:
# 100 links, sample 10: the top 10% of the links should provide 100% of the sample.
sample_links(100, 10, 0.1, 1)

In [None]:
# Here the first group (top 9% of the links) is smaller than its share of the sample,
# which exercises the branch that takes the whole first group and fills up from the rest.
sample_links(100, 10, 0.09, 1)

In [None]:
# The first 10 links (top 10% of 100) should account for about 50% of the 10 sampled links;
# the draw is random, so the exact split varies from run to run.
sample_links(100, 10, 0.1, 0.5)

### Crawling algorithm (BFS-like)

In [None]:
# Crawl configuration, option 1: seed article and snapshot timestamp.
# NOTE: the next cell rebinds both names; when the notebook is run top to bottom
# these values are overridden.
seed = 'Stan Lee'
date = '2018-11-13T00:00:00Z'

In [None]:
# Crawl configuration, option 2: seed article and snapshot timestamp.
# NOTE: this rebinds `seed` and `date` set in the previous cell.
seed = 'Rosetta (spacecraft)'
date = '2014-11-12T00:00:00Z'

In [None]:
def wiki_crawl(seed, date, max_nodes=500, n_links_hop1=175, n_links_indirect_min=10,
               n_links_indirect_max=25, min_popularity=30):
    """
    Breadth-first-like crawl of Wikipedia starting from `seed`.

    Pages are fetched with parse_with_date, their links cleaned with preprocess_links,
    and a sample of those links (chosen by sample_links) is queued for later crawling.

    seed: title of the article the crawl starts from
    date: timestamp string passed to parse_with_date for every page
    max_nodes: stop once this many articles have been accepted
    n_links_hop1: number of links sampled from the seed page (direct neighbours)
    n_links_indirect_min, n_links_indirect_max: inclusive bounds of the random number of
        links sampled from every other page
    min_popularity: a page is accepted only if it has more than this many links
        (to avoid stubs or really small articles)
    return: dict mapping each accepted article title to the list of ALL its cleaned links
        (not only the sampled ones)
    """
    article_links = {}
    queue = deque([seed])
    # Every title that was ever enqueued. Gives O(1) duplicate checks and, unlike testing
    # against `article_links` and `queue` only, also remembers pages that were rejected,
    # so they are not downloaded again each time another page links to them.
    seen = {seed}

    while len(article_links) < max_nodes:
        if not queue:
            print('no more links to dequeue')
            break
        article = queue.popleft()
        article_content = parse_with_date(title=article, date=date)
        # parse failed: skip this page
        if article_content is None:
            continue
        links = preprocess_links(article_content.filter_wikilinks())
        # too few links: skip this page
        if len(links) <= min_popularity:
            continue
        article_links[article] = links

        if article == seed:
            idx_chosen_links = sample_links(n_links=len(links), n_links_sample=n_links_hop1,
                                            percentage_l=0.1, percentage_l_subsample=0.5)
        else:
            n_links_s = random.randint(n_links_indirect_min, n_links_indirect_max)
            idx_chosen_links = sample_links(n_links=len(links), n_links_sample=n_links_s,
                                            percentage_l=0.4, percentage_l_subsample=0.5)
        for idx in idx_chosen_links:
            link = links[idx]
            if link not in seen:
                seen.add(link)
                queue.append(link)
    return article_links

# Run the crawl (one HTTP request per visited page).
# NOTE: this rebinds `links`, which held a list of titles in earlier cells, to the
# dict {article title: list of its links} returned by wiki_crawl.
links = wiki_crawl(seed, date)

In [None]:
def construct_graph(links):
    """
    Build a directed graph from a crawl result.

    links: dict mapping an article title to the list of titles it links to
    return: networkx DiGraph with one node per key of `links` and an edge
            article -> target for every target that is itself a key of `links`;
            self-loops are left out
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(links.keys())
    for source in graph.nodes():
        for target in links[source]:
            # ignore self-loops and links pointing outside the crawled set
            if target == source or target not in links:
                continue
            graph.add_edge(source, target)
    return graph

In [None]:
# Build the directed graph from the crawl result and show (node count, edge count).
G = construct_graph(links)
G.number_of_nodes(), G.number_of_edges()

In [None]:
# save the graph (it takes around half an hour to build)
#nx.write_gpickle(G, "graph2.gpickle")

In [None]:
# Load the cached graph from disk.
# NOTE: gpickle files are pickles; only load files you created yourself.
try:
    G2 = nx.read_gpickle("graph2.gpickle")
except FileNotFoundError:
    # No cache yet (the save in the previous cell is commented out): store the graph
    # that was just built, then read it back so the round trip is still checked.
    nx.write_gpickle(G, "graph2.gpickle")
    G2 = nx.read_gpickle("graph2.gpickle")

In [None]:
# Show (edge count, node count) of the graph loaded from disk.
G2.number_of_edges(), G2.number_of_nodes()

In [None]:
#G2.nodes() == G.nodes()

## Adding the delta page_view signal

In [None]:
# example
#start = datetime.datetime(year=2018,month=11,day=11, hour=23, minute = 59, second = 59)
#end =  datetime.datetime(year=2018,month=11,day=12, hour=23, minute = 59, second = 59)
#daily_view = getUserActivity(article="Stan Lee",granularity="daily",start=start,end=end,dateformat="iso")

# Two-day window around the event. `start` must not be later than `end`
# (the two bounds were previously swapped).
# NOTE(review): the Pageviews API reportedly only has data from July 2015 onward;
# confirm that this 2014 window returns anything.
start = datetime.datetime(year=2014,month=11,day=11, hour=23, minute = 59, second = 59)
end =  datetime.datetime(year=2014,month=11,day=12, hour=23, minute = 59, second = 59)
daily_view = getUserActivity(article='Rosetta (spacecraft)',granularity="daily",start=start,end=end,dateformat="iso")
daily_view

In [None]:
# page views are crawled at the end of the day !
# Collect the daily page views of every node between `start` and `end`.
views = {}
nodes_not_taken = []
for node in G2.nodes():
    try:
        node_views = getUserActivity(article=node, granularity="daily",start=start,end=end,dateformat="iso")
    except KeyError:
        node_views = None
    # getUserActivity returns None on a failed request (it does not raise), and may
    # return an empty list; neither is usable, so record the node as not taken.
    if node_views:
        views[node] = node_views
    else:
        nodes_not_taken.append(node)
        print(node)

In [None]:
# Per-node "delta" signal: views of the second day minus views of the first day.
views_graph = {}
for node in G2.nodes():
    node_views = views.get(node)
    if not node_views:
        # no data at all (node missing, failed request stored as None, or empty list)
        views_graph[node] = 0
    elif len(node_views) > 1:
        views_graph[node] = node_views[1][0] - node_views[0][0]
    else:
        # only one data point available: use it as the signal
        views_graph[node] = node_views[0][0]

In [None]:
# Preview the first ten (node, delta) pairs.
[(n, views_graph[n]) for n in list(views_graph.keys())[:10]]

In [None]:
# Sanity check that views_graph is a dict before it is attached to the graph.
isinstance(views_graph,dict)

In [None]:
# Attach each node's value from views_graph to the graph as the node attribute 'delta'.
nx.set_node_attributes(G2, values= views_graph, name = 'delta')

In [None]:
# check: show the attributes stored on one node.
# `G.nodes[...]` replaces the `G.node[...]` accessor, which newer networkx versions removed.
# NOTE(review): 'Stan Lee' is only a node if graph2.gpickle was crawled from that seed; confirm.
G2.nodes['Stan Lee']

In [None]:
# Save the graph again, now carrying the 'delta' node attribute, under a new file name.
nx.write_gpickle(G2, "graph_with_delta_signal.gpickle")