# Downloading Text for WP10 Quality Predictions

To train the quality model, I'll use the labeled revisions from the `articlequality` dataset: https://github.com/wikimedia/articlequality/blob/master/datasets/enwiki.labeling_revisions.nettrom_30k.json

In [62]:
import mwapi
import json
import itertools
import pandas as pd
import gzip
import os.path

In [23]:
# Path to the labeling file; it is read one JSON object per line when the
# revisions mapping is built.
input_file = "../rawdata/enwiki.labeling_revisions.nettrom_30k.json"

In [24]:
# Contact address handed to mwapi.Session as its user_agent.
user_agent = "<jeffrey.arnold@gmail.com>"
# API session bound to English Wikipedia; used for the revision queries.
session = mwapi.Session('https://en.wikipedia.org', user_agent=user_agent)

In [25]:
# Build the mapping {rev_id: wp10 label} from the newline-delimited JSON file.
# The file is decoded explicitly as UTF-8 rather than with the platform's
# locale encoding, and blank lines are skipped so a stray empty line does not
# abort the load with a JSONDecodeError.
with open(input_file, "r", encoding="utf-8") as f:
    # Generator: records are parsed lazily, no intermediate list is built.
    records = (json.loads(line) for line in f if line.strip())
    revisions = {x['rev_id']: x['wp10'] for x in records}

In [26]:
def split_seq(iterable, size):
    """Yield successive lists of at most ``size`` items taken from ``iterable``.

    The final list may be shorter than ``size``. Nothing is yielded for an
    empty iterable (or when ``size`` is 0).
    """
    source = iter(iterable)

    def next_chunk():
        # Pull up to `size` more items; an empty list means `source` is exhausted.
        return list(itertools.islice(source, size))

    # Two-argument iter(): keep calling next_chunk() until it returns [].
    for chunk in iter(next_chunk, []):
        yield chunk
    

The `rvprop` parameter of the MediaWiki revisions API accepts the following values:

- ids: Get the revid and, from 1.16 onward, the parentid. 1.11+
- roles: List content slot roles that exist in the revision. 1.32+
- flags: Whether the revision was a minor edit. 1.11+
- timestamp: The date and time the revision was made, in ISO 8601 combined date and time format.
- user: The user who made the revision, and if applicable, the flags: userhidden if revision deleted and/or anon if unregistered.
- userid: User id of revision creator, as well as userhidden and anon flags. 1.17+
- size: The size of the revision text in bytes. 1.11+
- sha1: SHA-1 (base 16) of the revision. 1.19+
- contentmodel: Content model id of the revision. 1.21+
- comment: The edit comment.
- parsedcomment: The edit/log comment in HTML format, with wikilinks and section references expanded into hyperlinks. 1.16+
- content: The revision content. If set, the maximum limit will be 10 times as low. (Note: If you want HTML rather than wikitext, use action=parse instead.)
- tags: Any tags for this revision, such as those added by AbuseFilter. 1.16+


In [64]:
# Download the wikitext and metadata of every labeled revision and store the
# result as gzip-compressed, newline-delimited JSON (one revision per line).

# Not used by this cell; kept in case another cell still references it.
newdata = []
# Number of revision ids sent per API request.
chunksize = 50
rvprop='content|comment|sha1|size|userid|user|timestamp|flags|ids'

output_file = "../data/enwiki.labeling_revisions.w_text.nettrom_30k.ndjson.gz"
# Refuse to overwrite a previous download.
if os.path.exists(output_file):
    raise FileExistsError(f"{output_file} exists")

# Write to a temporary sibling file and rename it only after every chunk has
# been processed. If the download fails midway, no truncated file is left at
# `output_file`, so the existence check above does not block the next attempt.
partial_file = output_file + ".part"
with gzip.open(partial_file, "wt", encoding="utf-8") as f:
    for i, chunk_ids in enumerate(split_seq(revisions, chunksize)):
        print(f"downloading chunk {i}")
        revids = '|'.join(str(x) for x in chunk_ids)
        r = session.get(action="query", revids=revids, prop='revisions', rvprop=rvprop, rvslots='main')
        query = r.get('query', {})
        # Ids the API could not resolve are presumably listed under
        # 'badrevids' (TODO confirm); report them instead of dropping silently.
        badrevids = query.get('badrevids')
        if badrevids:
            print(f"chunk {i}: revisions not found: {list(badrevids)}")
        # 'pages' (and a page's 'revisions') may be missing when nothing in
        # the chunk could be resolved; treat that as "no revisions" rather
        # than crashing with a KeyError.
        for page in query.get('pages', {}).values():
            for revision in page.get('revisions', []):
                # A few of these revisions have had their content removed
                try:
                    revision['wikitext'] = revision['slots']['main']['*']
                except KeyError:
                    print(revision)
                    continue
                del revision['slots']
                # Copy the page-level identifiers onto each revision record.
                for k in ('pageid', 'ns', 'title'):
                    revision[k] = page[k]
                f.write(json.dumps(revision) + "\n")

# The download completed: publish the finished file under its final name.
os.replace(partial_file, output_file)


NameError: name 'stop' is not defined