In [1]:
# This session uses code from https://github.com/Lodifice/mfnf-pdf-export which is licensed under Apache License 2.0

import re
from datetime import date, datetime

In [17]:
import hashlib
import collections
import shelve

def sha256(text):
    """Return the hexadecimal SHA-256 digest of `text` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf8"))
    return hasher.hexdigest()

def stablehash(obj):
    """Return a deterministic SHA-256 based hash string for `obj`.

    Supported inputs, checked in this order:

    * objects with a callable `_stablehash` attribute -- its result is returned,
    * strings -- hashed directly,
    * sequences -- the hashes of the elements are joined with ";" and hashed,
    * mappings -- every item is turned into "<hash(key);hash(value)>" and the
      resulting list is hashed like a sequence.

    Mappings are hashed in iteration order, so two mappings holding the same
    items in a different order produce different hashes.

    Raises:
    NotImplementedError -- if `obj` is of any other type (e.g. an `int`).
    """
    if callable(getattr(obj, "_stablehash", None)):
        return obj._stablehash()
    elif isinstance(obj, str):
        return sha256(obj)
    elif isinstance(obj, collections.abc.Sequence):
        return sha256(";".join([stablehash(x) for x in obj]))
    elif isinstance(obj, collections.abc.Mapping):
        return stablehash(["<" + stablehash(k) + ";" + stablehash(v) + ">" for k, v in obj.items()])
    else:
        # Report the offending object in the exception itself instead of
        # printing it to stdout before raising a message-less error.
        raise NotImplementedError(
            "stablehash() does not support objects of type %s: %r" % (type(obj).__name__, obj)
        )
        
class CachedFunction:
    """Decorator that memoizes the results of a function in a mapping.

    The cache key is `stablehash([func.__name__, args, kwargs])`, so every
    argument must be supported by `stablehash`. Only the function *name* is
    part of the key: two different functions with the same name and the same
    arguments share one cache entry.
    """

    def __init__(self, db):
        """Initializes the decorator.
        Arguments:
        db -- mapping-like object used as cache (must support `in`, item
              access and item assignment)
        """
        self.db = db

    def __call__(self, func):
        """Returns a wrapper around `func` that looks up results in `self.db`
        before calling `func`."""
        import functools  # local import: keeps this block self-contained

        # functools.wraps preserves the name and docstring of `func` on the
        # wrapper (otherwise every cached function is called "new_func").
        @functools.wraps(func)
        def new_func(*args, **kwargs):
            key = stablehash([func.__name__, args, kwargs])

            if key in self.db:
                return self.db[key]

            result = func(*args, **kwargs)
            self.db[key] = result

            return result

        return new_func

import atexit

# Persistent cache for API results, stored on disk next to the notebook
# ("c" creates the file if it does not exist yet).
DB = shelve.open("cache.db", "c", writeback=True)
# A shelf should always be closed explicitly; with writeback=True accessed
# entries are written back on sync()/close(). Closing twice is harmless.
atexit.register(DB.close)
cached_function = CachedFunction(DB)

In [18]:
def remove_prefix(text, prefix):
    """Returns `text` without its leading `prefix`; `text` is returned
    unchanged when it does not start with `prefix`."""
    if not text.startswith(prefix):
        return text

    return text[len(prefix):]

def remove_suffix(text, suffix):
    """Returns `text` without its trailing `suffix`; `text` is returned
    unchanged when it does not end with `suffix`."""
    if text.endswith(suffix):
        keep = len(text) - len(suffix)
        return text[:keep]

    return text

In [19]:
import requests

from functools import reduce
from urllib.parse import quote

def select_singleton(x):
    """Returns the first value of the mapping `x` (its only value when `x`
    has exactly one entry). Raises `StopIteration` when `x` is empty."""
    values = iter(x.values())
    first = next(values)
    return first

def merge(obj1, obj2):
    """Merges two objects depending on the type of the first argument.

    `None` is replaced by `obj2`, lists are concatenated and dictionaries are
    combined into a new dictionary in which entries of `obj2` take precedence.
    Neither argument is modified. Any other type of `obj1` raises
    `NotImplementedError`.

    >>> merge(None, 42)
    42
    >>> merge(None, None) is None
    True
    >>> merge([1, 2], [6, 7])
    [1, 2, 6, 7]
    >>> d = merge({ "a": 1, "b": 2}, {"b": 3, "c": 4})
    >>> d == {"a": 1, "b": 3, "c": 4}
    True
    """
    if obj1 is None:
        return obj2
    elif isinstance(obj1, list):
        return obj1 + obj2
    elif isinstance(obj1, dict):
        # Dispatch on obj1 (not obj2): obj1 is the object that gets copied.
        result = obj1.copy()
        result.update(obj2)
        return result
    else:
        raise NotImplementedError()

def query_path(obj, path):
    """Walks `path` starting at `obj` and returns the value reached.

    Each step of `path` is either a callable, which is applied to the current
    value, or a key/index, which is looked up in the current value.
    """
    current = obj

    for step in path:
        current = step(current) if callable(step) else current[step]

    return current

class MediaWikiAPI:
    """Implements an API for content stored on a MediaWiki."""

    # Wiki link of the form [[Title]]; the title is captured.
    _RE_LINK = "\\[\\[([^\\]]+)\\]\\]"
    # German revision comments that name an old and a new page title
    # (presumably the comments MediaWiki generates when a page is moved).
    # Compiled once here instead of on every call of `all_titles`.
    _MOVE_COMMENT_PATTERNS = (
        re.compile(".*verschob die Seite %s nach %s.*" % (_RE_LINK, _RE_LINK)),
        re.compile(".*hat „%s“ nach „%s“ verschoben.*" % (_RE_LINK, _RE_LINK)),
    )

    def __init__(self, domain="de.wikibooks.org", req=None):
        """Initializes the object.
        Arguments:
        domain -- domain of the MediaWiki, e.g. `"de.wikibooks.org"`
        req    -- a session object of the `requests` framework; a new
                  `requests.Session` is created when omitted
        """
        self.domain = domain
        # Create the session here rather than as a default argument value: a
        # default would be evaluated once and shared by all instances.
        self.req = requests.Session() if req is None else req

    def _stablehash(self):
        """Returns the hash used by `stablehash` (and thereby by the cache) for
        this object; it depends on the class name and the domain only."""
        return stablehash((self.__class__.__name__, self.domain))

    @property
    def _index_url(self):
        """Returns the URL to the server's `index.php` file."""
        return "https://" + self.domain + "/w/index.php"

    @property
    def _api_url(self):
        """Returns the URL to the server's `api.php` file."""
        return "https://" + self.domain + "/w/api.php"

    @property
    def _rest_api_url(self):
        """Returns the URL to the server's REST API endpoints."""
        return "https://" + self.domain + "/api/rest_v1"

    def _index_call(self, params):
        """Make an HTTP request to the server's `index.php` file.

        Returns the body of the response as text and raises an HTTP error of
        the `requests` framework for unsuccessful status codes.
        """
        req = self.req.get(self._index_url, params=params)

        req.raise_for_status()

        return req.text

    def _api_call(self, endpoint, data=None, domain=None):
        """Call a REST API endpoint.
        Arguments:
        endpoint -- sequence of path segments appended to the REST base URL
        data     -- passed as `data` to the GET request (default: empty)
        domain   -- domain to use instead of `self.domain`

        Returns the response object; the status code is NOT checked.
        """
        if data is None:
            data = {}

        if domain is None:
            api_url = self._rest_api_url
        else:
            api_url = "https://" + domain + "/api/rest_v1"

        endpoint_url = "/".join([api_url] + list(endpoint))

        # NOTE(review): `data` ends up in the body of the GET request; query
        # parameters would need `params=` -- confirm what is intended.
        return self.req.get(endpoint_url, data=data)

    def query(self, params, path_to_result):
        """Runs an `action=query` request against `api.php`.

        Continuation requests are issued as long as the response contains a
        `"continue"` entry; the partial results are combined with `merge`.

        Arguments:
        params         -- request parameters (not modified)
        path_to_result -- path below the `"query"` entry of the response, in
                          the format understood by `query_path`

        Raises:
        ConnectionError -- if the response contains an `"error"` entry
        """
        # Work on a copy: the continuation tokens added below must not leak
        # into the dictionary of the caller.
        params = dict(params, format="json", action="query")
        path_to_result = ["query"] + list(path_to_result)
        result = None

        while True:
            api_result = self.req.get(self._api_url, params=params).json()

            if "error" in api_result:
                message = "Error while making API call."
                error = api_result["error"]
                # The description of the error is stored inside the "error"
                # object; the top-level lookup is kept as a fallback.
                info = error.get("info") if isinstance(error, dict) else None

                raise ConnectionError(info or api_result.get("info", message))

            result = merge(result, query_path(api_result, path_to_result))

            if "continue" in api_result:
                params.update(api_result["continue"])
            else:
                return result

    @cached_function
    def get_revisions(self, title):
        """Returns the list of revisions of the page `title` as delivered by
        the API (cached).

        An empty list is returned for titles starting with `"c:"` (presumably
        files on Wikimedia Commons -- confirm) and for pages whose API result
        has no `"revisions"` entry.
        """
        if title.startswith("c:"):
            return []

        params = {"prop": "revisions", "rvprop": "size|user|timestamp|userid|ids|comment", "titles": title,
                  "rvlimit": "max"}

        try:
            return self.query(params, ["pages", select_singleton, "revisions"])
        except KeyError as e:
            if str(e) == "'revisions'":
                return []
            else:
                print(title)
                raise

    def revisions(self, title):
        """Returns the revisions of `title` with additional entries.

        Each revision is a copy of the dictionary from `get_revisions` plus:
        date     -- day of the revision as `datetime.date`
        diffsize -- size of the revision minus the size of the revision that
                    follows it in the list (0 is used for the last one)
        weight   -- `diffsize`, but at least 100

        Returns an empty list when `title` is `None`.
        """
        if title is None:
            return []

        # Annotate shallow copies so that the derived entries do not end up in
        # the cached result of `get_revisions`.
        result = [dict(rev) for rev in self.get_revisions(title)]

        oldsize = 0
        # The list is walked backwards; presumably the API lists the newest
        # revision first, which makes this the chronological order -- confirm.
        for rev in reversed(result):
            rev["date"] = datetime.strptime(rev["timestamp"], "%Y-%m-%dT%H:%M:%SZ").date()
            rev["diffsize"] = rev["size"] - oldsize
            oldsize = rev["size"]
            rev["weight"] = max(100, rev["diffsize"])

        return result

    @cached_function
    def get_all_pageviews(self, title):
        """Returns the daily page view items of `title` (cached).

        The request goes to the REST API on `wikimedia.org` with the path
        segments `all-access`, `user` and `daily` and the fixed period
        20150701 to 20180622. An empty list is returned for every status code
        other than OK.
        """
        endpoint = ["metrics", "pageviews", "per-article", self.domain, "all-access", "user",
                    quote(title, safe=""), "daily", "20150701", "20180622"]

        req = self._api_call(endpoint, domain="wikimedia.org")

        # NOTE(review): failed requests are cached as "no page views", too.
        if req.status_code == requests.codes.ok:
            return req.json()["items"]
        else:
            return []

    def get_content(self, title):
        """Returns the raw content of the page `title`."""
        return self._index_call({"action": "raw", "title": title})

    def pageviews_of(self, title, start, end):
        """Returns the sum of the page views of exactly the title `title`
        between `start` and `end` (both inclusive).

        `start` and `end` are strings that are compared with the timestamps of
        the page view items as strings. An `end` of 8 characters is extended
        by `"00"` (presumably because the timestamps have the format
        YYYYMMDDHH -- confirm), so that the last day is included.
        """
        if len(end) == 8:
            end += "00"

        return sum(x["views"] for x in self.get_all_pageviews(title) if start <= x["timestamp"] <= end)

    def pageviews(self, title, start, end):
        """Returns the sum of the page views of `title` and of all titles the
        article had in the past between `start` and `end`."""
        return sum(self.pageviews_of(x, start, end) for x in self.all_titles(title))

    def all_titles(self, title):
        """Returns a set of all titles the article `title` had in the past."""
        result = {title}

        for rev in self.get_revisions(title):
            # Revisions without a "comment" entry cannot name another title.
            comment = rev.get("comment", "")

            for reg in self._MOVE_COMMENT_PATTERNS:
                m = reg.match(comment)

                if m:
                    result.add(m.group(1))
                    result.add(m.group(2))

        return result
    
# Shared API client used by the following cells (default domain: de.wikibooks.org).
wb = MediaWikiAPI()

In [20]:
from sitemap import parse_sitemap

# Fetch the raw sitemap page and parse it. The result is used as a nested
# dictionary with "title" and "children" entries in the cells that follow.
mfnf = parse_sitemap(wb.get_content("Mathe für Nicht-Freaks: Sitemap"))
# Set the title of the root node to the name of the project.
mfnf["title"] = "Mathe für Nicht-Freaks"
# Subtrees of single books, selected by their position in the sitemap
# (presumably basics, analysis 1 and linear algebra 1 -- verify against the
# current order of the sitemap).
grund = mfnf["children"][1]
ana1 = mfnf["children"][2]
la1 = mfnf["children"][3]

# Seitenaufrufe

In den folgenden Statistiken sind Seitenaufrufe durch Bots und Spiders herausgerechnet.

In [21]:
def pageviews(node, start, end):
    """Returns the page views of `node` and all of its descendants between
    `start` and `end`. Nodes without a title contribute no views themselves."""
    total = 0

    if node["title"]:
        total = wb.pageviews(node["title"], start, end)

    for child in node["children"]:
        total += pageviews(child, start, end)

    return total

## Seitenaufrufe im Wintersemester 2017/18

In [22]:
def ws16(what, node):
    """Prints the label `what` followed by the page views of `node` from
    2017-10-01 to 2018-03-31."""
    views = pageviews(node, "20171001", "20180331")
    print(what, views)

# Print the page views of the whole sitemap and of the three selected subtrees.
ws16("Gesamt:", mfnf)
ws16("Grundlagenbuch:", grund)
ws16("Analysis 1:", ana1)
ws16("Lineare Algebra 1:", la1)

Gesamt: 1725099
Grundlagenbuch: 587720
Analysis 1: 920518
Lineare Algebra 1: 166852


## Seitenaufrufe im letzten Jahr (21.06.17 – 20.06.18)

In [26]:
def ws16(what, node):
    """Prints the label `what` followed by the page views of `node` from
    2017-06-21 to 2018-06-20."""
    views = pageviews(node, "20170621", "20180620")
    print(what, views)

# Print the page views of the whole sitemap and of the three selected subtrees.
ws16("Gesamt:", mfnf)
ws16("Grundlagenbuch:", grund)
ws16("Analysis 1:", ana1)
ws16("Lineare Algebra 1:", la1)

Gesamt: 2559575
Grundlagenbuch: 856488
Analysis 1: 1362961
Lineare Algebra 1: 260576


## Wachstum der Seitenaufrufe im WiSe 17/18 im Vergleich zum Vorjahr

In [27]:
def w(what, node):
    """Prints the label `what` followed by the growth of the page views of
    `node` in percent, comparing 2017-10-01..2018-03-31 with
    2016-10-01..2017-03-31.

    Prints "n/a" instead of a percentage when there were no page views in the
    earlier period, because the growth rate is undefined in that case.
    """
    current = pageviews(node, "20171001", "20180331")
    previous = pageviews(node, "20161001", "20170331")

    if previous == 0:
        # Avoid a ZeroDivisionError for nodes without views in the base period.
        print(what, "n/a")
        return

    p = (current / previous - 1) * 100
    print(what, ("%.1f" % p) + "%")

# Print the growth for the whole sitemap and for the three selected subtrees.
w("Gesamt:", mfnf)
w("Grundlagenbuch:", grund)
w("Analysis 1:", ana1)
w("Lineare Algebra 1:", la1)

Gesamt: 43.2%
Grundlagenbuch: 12.1%
Analysis 1: 61.0%
Lineare Algebra 1: 148.3%
