# requirements
+ mwparserfromhell (0.6.4)

In [55]:
# format cells using black
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import pickle
import os
import re
import requests
import mwparserfromhell

from time import sleep

# Notebook-wide pandas settings: cap how much of a frame is rendered and
# switch off the chained-assignment warning.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 50)
pd.set_option("mode.chained_assignment", None)

<IPython.core.display.Javascript object>

# Wikipedia API

In [3]:
# Endpoint of the German-language Wikipedia API.
api_url = "https://de.wikipedia.org/w/api.php"

# Template for a revision query; "titles" is replaced per article later on.
# Spaces in the title are turned into underscores right away.
params = {
    "action": "query",
    "titles": "Albert Einstein".replace(" ", "_"),
    "prop": "revisions",
    "rvprop": "ids|timestamp|comment|content|flags|user|userid",
    "rvlimit": "max",
    "continue": "",
    "format": "json",
}

<IPython.core.display.Javascript object>

In [54]:
def get_revisions(request_params):
    """Yield the revisions of a single Wikipedia page, one dict per revision.

    The page is requested from the module-level ``api_url`` and the API's
    ``rvcontinue`` token is followed until the response carries no
    ``"continue"`` entry any more. Revisions are yielded in the order the API
    returns them.

    Args:
        request_params: Query parameters for the API. Must contain either
            ``"pageids"`` or ``"titles"`` naming exactly one page. An optional
            ``"rvcontinue"`` entry is used as the starting continuation token.
            The mapping is copied and never modified.

    Yields:
        The entries of the page's ``"revisions"`` list from each response.

    Raises:
        ValueError: If no page, or more than one page, is specified.
        RuntimeError: If the API response contains an ``"error"`` entry or
            reports the page under the key ``"-1"`` (page does not exist).
    """
    pageids = request_params.get("pageids")
    titles = request_params.get("titles")
    if pageids:
        requested = str(pageids).split("|")
        if len(requested) > 1:
            raise ValueError("Please provide only 1 page id in request_params.")
    elif titles:
        requested = str(titles).split("|")
        if len(requested) > 1:
            raise ValueError("Please provide only 1 page title in request_params.")
    else:
        raise ValueError(
            'Please provide "pageids" or "titles" parameter in request_params.'
        )
    # Identifier (title or id) used in the "does not exist" message; it is
    # bound on both branches, so the message can always be built.
    page_ref = requested[0]

    # Work on a copy so the continuation token never leaks into the caller's
    # dict (and from there into the request for the next article).
    query_params = dict(request_params)
    rvcontinue = query_params.get("rvcontinue")

    # The context manager closes the session once the generator is exhausted,
    # closed, or aborted by an exception.
    with requests.Session() as session:
        while True:
            # keep requesting batches until the API stops sending a
            # continuation token
            if rvcontinue is not None:
                query_params["rvcontinue"] = rvcontinue
            result = session.get(url=api_url, params=query_params, timeout=30).json()
            if "error" in result:
                raise RuntimeError(
                    "Wikipedia API returned the following error:" + str(result["error"])
                )
            pages = result["query"]["pages"]
            if "-1" in pages:
                raise RuntimeError(
                    "The article ({}) you are trying to request does not exist!".format(
                        page_ref
                    )
                )
            # exactly one page was requested, so there is one entry to take
            _, page = pages.popitem()
            for rev_data in page.get("revisions", []):
                yield rev_data
            if "continue" not in result:
                break
            rvcontinue = result["continue"]["rvcontinue"]
            # short pause between consecutive API requests
            sleep(0.01)

<IPython.core.display.Javascript object>

## load wikipedia article titles

In [5]:
# Read the pickled NDB/Wikipedia mapping from data/df/ (relative to the
# working directory) and pull out the two columns needed for the download:
# the Wikipedia article title and the GND identifier of every biography.
path = os.path.abspath("")
data_df_rel = "data/df/"
data_df_ndb_abs = os.path.join(path, data_df_rel, "df_ndb_wikipedia.pkl")
corpus = pd.read_pickle(data_df_ndb_abs)
titles, gnds = (
    corpus[column].tolist() for column in ("wikipedia_title", "gnd")
)
number_biographies = len(gnds)
assert len(titles) == number_biographies
print(number_biographies)

20484


<IPython.core.display.Javascript object>

## store additional data for each article
+ number of revisions within each month
+ creator and other contributors of each article 

In [6]:
def get_months(
    min_year: int = 2001,
    min_month: int = 6,
    max_year: int = 2022,
    max_month: int = 5,
) -> dict:
    """Return a zero-initialised revision counter for every month in a window.

    Args:
        min_year: First year of the window.
        min_month: First month (1-12) counted in ``min_year``.
        max_year: Last year of the window.
        max_month: Last month (1-12) counted in ``max_year``.

    Returns:
        A dict whose keys are ``"YYYY-MM"`` strings in chronological order,
        covering ``min_year``-``min_month`` through ``max_year``-``max_month``
        inclusive, each mapped to ``0`` (number of revisions for an article in
        that month). With the defaults this is 2001-06 through 2022-05.

    Raises:
        ValueError: If ``min_month`` or ``max_month`` is not in 1..12.
    """
    for name, value in (("min_month", min_month), ("max_month", max_month)):
        if not 1 <= value <= 12:
            raise ValueError(f"{name} must be between 1 and 12, got {value}")

    months = {}
    for year in range(min_year, max_year + 1):
        # only the first and the last year of the window are partial
        first_month = min_month if year == min_year else 1
        last_month = max_month if year == max_year else 12
        for month in range(first_month, last_month + 1):
            months[f"{year}-{month:02d}"] = 0
    return months

<IPython.core.display.Javascript object>

## get and store revisions (monthly newest revision) 

In [56]:
# Download the full revision history of every biography, strip the wikitext to
# plain text, keep the newest revision of each month and store
#   data/revisions/<gnd>|<wikititle>.pkl   (DataFrame of the kept revisions)
#   data/revisions_meta/<gnd>.pkl          (per-article statistics, see `meta`)
# Biographies whose revision pickle already exists are skipped, so the cell can
# simply be rerun until the dataset is complete.
path = os.path.abspath("")
data_revisions_rel = "data/revisions/"
data_revisions_meta_rel = "data/revisions_meta/"
# make sure both output directories exist before the first pickle is written
os.makedirs(os.path.join(path, data_revisions_rel), exist_ok=True)
os.makedirs(os.path.join(path, data_revisions_meta_rel), exist_ok=True)

j = 0  # biographies visited so far
k = 0  # biographies with revisions (pickles stored in this run)
parser_errors = 0
connection_errors = 0
missing_revision = {}  # gnd -> parsed revisions of every skipped biography
missing_userid_key = 0
missing_first_revision = 0  # parentid != 0 (first revision got deleted?)
missing_content = 0  # no revision of the article carried any text

# matches the leading "YYYY-MM" of a revision timestamp
pattern_year_month = re.compile(r"^\d{4}-\d{2}")

for gnd, title in zip(gnds, titles):
    print(
        f"processed biographies: {j}/{number_biographies} (pkl stored: {k})| skipped: {len(missing_revision)}",
        end="\r",
    )
    sleep(0.002)
    j += 1
    # NOTE(review): the title goes into the file name unchanged; a title
    # containing a path separator would point into a sub-directory.
    data_revisions_abs = os.path.join(
        path, data_revisions_rel, gnd + "|" + title + ".pkl"
    )
    data_revisions_meta_abs = os.path.join(path, data_revisions_meta_rel, gnd + ".pkl")

    # relevant for reruns: already stored biographies are not downloaded again
    if os.path.exists(data_revisions_abs):
        continue

    # Per-article copy of the request template, so neither the title nor a
    # continuation token of one article can leak into the next request or
    # into the global `params`.
    params_updated = dict(params)
    params_updated["titles"] = title.strip().replace(" ", "_")
    params_updated.pop("rvcontinue", None)
    try:
        revisions = list(get_revisions(params_updated))
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Skip the biography entirely: a partially downloaded history must not
        # be evaluated. Nothing is stored, so a rerun of the cell retries it.
        connection_errors += 1
        continue

    if len(revisions) == 0:
        continue

    # parse content: wikitext -> single-line plain text
    revisions_parsed = []
    for r in revisions:
        if "*" not in r:
            # revision came without text; it is left out of all statistics
            continue
        r["text"] = r.pop("*")
        r.pop("anon", None)
        r.pop("minor", None)
        try:
            plain_text = mwparserfromhell.parse(r["text"]).strip_code(
                normalize=True, collapse=True
            )
            plain_text = (
                plain_text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", " ")
            )
            # remove multiple whitespaces
            r["text"] = re.sub(" {2,}", " ", plain_text)
        except mwparserfromhell.parser.ParserError:
            # keep the revision with its raw wikitext
            parser_errors += 1
        revisions_parsed.append(r)

    if not revisions_parsed:
        missing_content += 1
        missing_revision[gnd] = revisions_parsed
        continue

    # The list is expected to be ordered newest first (order returned by the
    # API), so the last element is the oldest revision that was delivered.
    first_revision = revisions_parsed[-1]
    # assure first revision -> article created with that version
    if first_revision["parentid"] != 0:
        missing_first_revision += 1
        missing_revision[gnd] = revisions_parsed
        continue

    meta = {
        "months": get_months(),
        "creator_user_id": None,
        "contributors": {},
        "created_month": None,
        "last_change_month": None,
    }
    # contributors :: {user_id: {changes: <number>, user_names: [<user_names>]}}

    # keep only the newest revision of every month
    revisions_latest_by_month = []
    occured_year_months = set()
    try:
        meta["creator_user_id"] = first_revision["userid"]
        meta["created_month"] = pattern_year_month.search(
            first_revision["timestamp"]
        ).group(0)
        meta["last_change_month"] = pattern_year_month.search(
            revisions_parsed[0]["timestamp"]
        ).group(0)

        for rev in revisions_parsed:
            year_month = pattern_year_month.search(rev["timestamp"]).group(0)

            # NOTE(review): a month outside the window of get_months() also
            # raises KeyError here and is counted as a missing userid below.
            meta["months"][year_month] += 1

            contributor = meta["contributors"].setdefault(
                rev["userid"], {"changes": 0, "user_names": []}
            )
            contributor["changes"] += 1
            if rev["user"] not in contributor["user_names"]:
                contributor["user_names"].append(rev["user"])

            # first revision seen for a month is the newest one of that month
            if year_month not in occured_year_months:
                occured_year_months.add(year_month)
                revisions_latest_by_month.append(rev)
    except KeyError:
        missing_revision[gnd] = revisions_parsed
        missing_userid_key += 1
        continue

    # every parsed revision must be counted exactly once in both statistics
    assert len(revisions_parsed) == sum(meta["months"].values())
    assert len(revisions_parsed) == sum(
        contributor["changes"] for contributor in meta["contributors"].values()
    )

    # store df: <gnd>|<wikititle>.pkl (oldest month first)
    revisions_latest_by_month.reverse()
    df_biographie_revisions = pd.DataFrame(revisions_latest_by_month, dtype="string")
    df_biographie_revisions.to_pickle(data_revisions_abs)
    with open(data_revisions_meta_abs, "wb") as f:
        pickle.dump(meta, f, protocol=pickle.HIGHEST_PROTOCOL)

    k += 1
    sleep(0.01)

print("\n")
print(f"{parser_errors=}")
print(f"{connection_errors=}")
print(f"{missing_first_revision=}")
print(f"{missing_userid_key=}")
print(f"{missing_content=}")

processed biographies: 20483/20484 (pkl stored: 0)| skipped: 27

parser_errors=0
connection_errors=0
missing_first_revision=22
missing_userid_key=5


<IPython.core.display.Javascript object>

In [50]:
# 20.484 (biographies that could be mapped)
# 20.457 (revisions+meta-data)

# 27 missing (22 (missing first revision) + 5 (missing userid attribute))

<IPython.core.display.Javascript object>

## evaluate

In [36]:
# Spot check: load the stored revision frame of one biography and display it.
# The file name follows the <gnd>|<wikititle>.pkl scheme used when storing.
# NOTE(review): this rebinds `data_df_rel`, `data_df_ndb_abs` and `corpus`,
# which the "load wikipedia article titles" cell also assigns; rerun that cell
# before using these names for the NDB corpus again.
path = os.path.abspath("")
data_df_rel = "data/revisions/"
data_df_ndb_abs = os.path.join(path, data_df_rel, "189548401|Ernst Bücken.pkl")
print(data_df_ndb_abs)
corpus = pd.read_pickle(data_df_ndb_abs)
# bare last expression -> rich display of the frame
corpus

/home/andreas/thesis/ndb-wikipedia-reuse/data/revisions/189548401|Ernst Bücken.pkl


Unnamed: 0,revid,parentid,user,userid,timestamp,contentformat,contentmodel,comment,text
0,105037428,105034856,Widerborst,1181049,2012-06-30T20:25:02Z,text/x-wiki,wikitext,/* Leben */,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
1,106351470,106351331,Gudrun Meyer,298151,2012-08-03T21:17:15Z,text/x-wiki,wikitext,/* Leben */ fehlendes Wort ergänzt,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
2,110007150,106351470,Adippold,1014143,2012-11-01T23:44:57Z,text/x-wiki,wikitext,/* Leben */ typo,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
3,113244073,110007150,Fomafix,154724,2013-01-21T15:47:21Z,text/x-wiki,wikitext,[[Halbgeviertstrich]] ([[–]]) statt [[Minuszei...,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
4,122909321,113244073,Nicht aus dem Sinn,1308505,2013-09-26T20:33:21Z,text/x-wiki,wikitext,,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
5,126344918,122909321,Majo statt Senf,1722994,2014-01-11T05:41:30Z,text/x-wiki,wikitext,[[WP:HC|HC]]: +[[Kategorie:Hochschullehrer (Hf...,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
6,151198992,126344918,Jaellee,214011,2016-02-07T16:55:02Z,text/x-wiki,wikitext,Typographische Anführungszeichen korrigiert | ...,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
7,153151500,151198992,JohnSpecialK,1211895,2016-04-04T08:17:42Z,text/x-wiki,wikitext,/* Leben */,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
8,161343017,153151500,TaxonBot,1824919,2017-01-06T03:39:57Z,text/x-wiki,wikitext,Bot: Korrektur Halbgeviertstrich,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...
9,167739363,161343017,Reiner Stoppok,230318,2017-07-31T03:03:12Z,text/x-wiki,wikitext,/* Werke (Auswahl) */,Ernst Bücken (* 2. Juni 1884 in Aachen; † 28. ...


<IPython.core.display.Javascript object>