In [1]:
import mwapi
# API session against English Wikipedia; the user agent identifies the
# requester on every call made through this session.
session = mwapi.Session(
    'https://en.wikipedia.org',
    user_agent="<jeffrey.arnold@gmail.com>",
)

In [41]:
# coding: utf-8
"""Download metadata and texts for WP10 Quality sample."""
import gzip
import json
import os.path

import mwapi


def flatten_page(page):
    """Flatten a page/revision response into a single revision record.

    Takes the first entry of ``page['revisions']``, lifts the main slot's
    text (``slots -> main -> *``) to a top-level ``content`` key, drops the
    nested ``slots`` entry, and copies ``pageid``, ``ns`` and ``title`` from
    the page onto the record.

    Args:
        page: Mapping with ``pageid``, ``ns``, ``title`` and a non-empty
            ``revisions`` list of revision dicts.

    Returns:
        A new dict holding the revision's fields plus ``pageid``, ``ns`` and
        ``title``. ``content`` is absent when the revision carries no
        main-slot text. The input ``page`` is not modified.

    Raises:
        KeyError: If ``page`` lacks ``revisions``, ``pageid``, ``ns`` or
            ``title``.
        IndexError: If ``page['revisions']`` is empty.
    """
    # Shallow copy so the caller's response structure is left untouched.
    revision = dict(page['revisions'][0])
    # pop() with a default instead of `del`: a revision without a `slots`
    # entry is tolerated the same way as one without main-slot text.
    slots = revision.pop('slots', {})
    # A few of these revisions have had their content removed
    try:
        revision['content'] = slots['main']['*']
    except KeyError:
        pass
    for k in ('pageid', 'ns', 'title'):
        revision[k] = page[k]
    return revision


def iter_revisions(response):
    """Yield one flattened revision record per page in a query response.

    Reads the pages mapping at ``response['query']['pages']`` and passes
    each of its values through ``flatten_page``.
    """
    pages = response['query']['pages']
    yield from (flatten_page(page) for page in pages.values())
    

user_agent = "<jeffrey.arnold@gmail.com>"

# Possible rvprop
# - ids: Get the revid and, from 1.16 onward, the parentid. 1.11+
# - roles: List content slot roles that exist in the revision. 1.32+
# - flags: Whether the revision was a minor edit. 1.11+
# - timestamp: The date and time the revision was made, in ISO 8601 combined date and time format.
# - user: The user who made the revision, and if applicable, the flags: userhidden if revision deleted and/or anon if unregistered.
# - userid: User id of revision creator, as well as userhidden and anon flags. 1.17+
# - size: The size of the revision text in bytes. 1.11+
# - sha1: SHA-1 (base 16) of the revision. 1.19+
# - contentmodel: Content model id of the revision. 1.21+
# - comment: The edit comment.
# - parsedcomment: The edit/log comment in HTML format with wikilinks and section references expanded into hyperlinks 1.16+
# - content: The revision content. If set, the maximum limit will be 10 times as low. (Note: If you want HTML rather than wikitext, use action=parse instead.)
# - tags: Any tags for this revision, such as those added by AbuseFilter. 1.16+
# NOTE: the request below passes its own rvprop string; this variable is not
# used by it.
rvprop = 'ids|timestamp|content|sha1'

session = mwapi.Session('https://en.wikipedia.org', user_agent=user_agent)


# Request the main-slot revision of every page that embeds Template:POV.
# With continuation=True the result is iterated below as a sequence of
# responses, one per continuation batch.
req = session.get(action='query',
                  prop='revisions',
                  generator='embeddedin',
                  geititle='Template:POV',
                  rvslots='main',
                  rvprop='timestamp|sha1|content',
                  continuation=True)

# islice was previously used without being imported, so this cell raised
# NameError on a fresh kernel.
from itertools import islice
from tqdm import tqdm

# Maximum number of responses to consume; None means read until the
# continuation is exhausted.
n = None
output_file = "../data/npov.ndjson.gz"
# Write one JSON document per line (newline-delimited JSON), gzip-compressed.
with gzip.open(output_file, "wt") as f:
    for resp in tqdm(islice(req, n)):
        for rev in iter_revisions(resp):
            f.write(json.dumps(rev) + "\n")




0it [00:00, ?it/s][A[A

1it [00:00,  1.71it/s][A[A

2it [00:00,  1.97it/s][A[A

3it [00:01,  2.33it/s][A[A

4it [00:01,  2.44it/s][A[A

5it [00:01,  2.68it/s][A[A

6it [00:02,  2.88it/s][A[A

7it [00:02,  2.71it/s][A[A

8it [00:02,  2.94it/s][A[A

9it [00:03,  3.21it/s][A[A

10it [00:03,  3.51it/s][A[A

11it [00:03,  3.35it/s][A[A

12it [00:03,  3.23it/s][A[A

13it [00:04,  3.49it/s][A[A

14it [00:04,  3.50it/s][A[A

15it [00:04,  3.91it/s][A[A

16it [00:05,  2.79it/s][A[A

17it [00:05,  3.13it/s][A[A

18it [00:05,  2.68it/s][A[A

19it [00:06,  2.79it/s][A[A

20it [00:06,  3.09it/s][A[A

21it [00:06,  3.54it/s][A[A

22it [00:07,  2.90it/s][A[A

23it [00:07,  3.26it/s][A[A

24it [00:07,  3.27it/s][A[A

25it [00:07,  3.55it/s][A[A

26it [00:08,  3.67it/s][A[A

27it [00:08,  3.53it/s][A[A

28it [00:08,  3.80it/s][A[A

29it [00:08,  3.96it/s][A[A

30it [00:09,  4.06it/s][A[A

31it [00:09,  4.05it/s][A[A

32it [00:09,  3.89it

In [42]:
# Read the newline-delimited JSON dump back in: one decoded record per line.
with gzip.open(output_file, 'rt') as f:
    data = list(map(json.loads, f))