In [None]:
import mwxml
import bz2
import gzip
import itertools
import json
from mwxml import Dump
import csv
import os
import os.path
from toolz import itertoolz
from functools import partial
import pandas as pd
from multiprocessing import Pool, cpu_count

def iter_revisions(dump, max_rev=None):
    """Yield the revisions of every page in *dump*, in dump order.

    Args:
        dump: Object whose ``pages`` attribute is an iterable of pages, each
            page itself being an iterable of revisions.
        max_rev: Upper bound on the number of revisions to yield. Any falsy
            value (``None`` or ``0``) disables the limit.

    Yields:
        Each revision in turn, page by page.
    """
    every_revision = itertools.chain.from_iterable(dump.pages)
    for seen, revision in enumerate(every_revision):
        # The limit is checked only after the next revision has been pulled,
        # so one revision beyond the limit is read (and discarded) on stop.
        if max_rev and seen >= max_rev:
            return
        yield revision

def process_chunk(outdir, i, revisions):
    """Flatten one chunk of revisions and write it to ``<outdir>/<i>.csv.gz``.

    Args:
        outdir: Directory the gzip-compressed CSV file is written into.
        i: Chunk index; used as the base name of the output file.
        revisions: Iterable of revision objects. ``None`` entries are
            skipped, so a chunk padded with ``None`` is handled.

    Returns:
        The path of the file that was written.
    """
    fields = [
        'id',
        'timestamp',
        'minor',
        'comment',
        'sha1',
        'parent_id',
        'model',
        'format',
        'page_id',
        'page_title',
        'page_namespace',
        'page_redirect',
        'page_restrictions',
        'user_id',
        'user_text',
        'deleted_text',
        'deleted_comment',
        'deleted_user',
    ]
    filename = os.path.join(outdir, f"{i}.csv.gz")
    print(f"Writing to {filename}")
    # process_revision does not use its parser argument, so no parser is
    # built here. The previous code referenced an `mwparser` module that is
    # never imported, which raised NameError on every call.
    records = (process_revision(rev, None) for rev in revisions if rev is not None)
    df = pd.DataFrame.from_records(records)
    # reindex (rather than df[fields]) keeps the column set and order fixed
    # for every chunk: a field absent from all records of this chunk becomes
    # an empty column instead of raising KeyError.
    df = df.reindex(columns=fields)
    df.to_csv(filename, index=False, compression="gzip")
    return filename
            
def process_revision(rev, parser):
    """Flatten a revision into a single-level dict suitable for a CSV row.

    The dict returned by ``rev.to_json()`` is modified in place: the nested
    ``page``, ``deleted`` and ``user`` mappings are removed and each of their
    entries is re-added under a ``page_``/``deleted_``/``user_`` prefixed
    key. The page's ``restrictions`` sequence is stored as one
    space-separated string under ``page_restrictions``.

    A nested mapping that is missing or ``None`` contributes no keys instead
    of raising (presumably the case for e.g. a suppressed contributor --
    TODO confirm against the dump format).

    Args:
        rev: Object exposing ``to_json()`` that returns a dict.
        parser: Unused; kept so existing callers keep working.

    Returns:
        The flattened dict.
    """
    record = rev.to_json()
    for section in ("page", "deleted", "user"):
        nested = record.pop(section, None) or {}
        for key, value in nested.items():
            if section == "page" and key == "restrictions":
                # A list cannot live in a single CSV cell; join it. A
                # missing (None) restrictions value becomes an empty string.
                value = " ".join(str(item) for item in (value or ()))
            record[f"{section}_{key}"] = value
    return record

# Run configuration.
input_file = "../data/dumps.wikipedia.org/enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2"
output_dir = "../data/enwiki-20180901-pages-articles-csv/"
max_rev = None  # falsy => no limit on the number of revisions read
partition_size = 20000  # revisions per output CSV file
workers = 4  # size of the process pool

# Bound at module level (outside the __main__ guard) so worker processes that
# re-import this module can still resolve the callables sent to them.
_process_chunk = partial(process_chunk, output_dir)


def _process_indexed_chunk(indexed_chunk):
    """Unpack one ``(index, revisions)`` pair and pass it to process_chunk."""
    return _process_chunk(*indexed_chunk)


# Guarded so that worker processes started with the spawn/forkserver start
# methods, which import this module, do not re-run the whole pipeline.
if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)
    with bz2.open(input_file, "rt") as fin:
        dump = Dump.from_file(fin)
        revisions = iter_revisions(dump, max_rev=max_rev)
        # The last chunk is padded with None up to partition_size.
        chunks = enumerate(itertoolz.partition(partition_size, revisions, None))
        # To debug without multiprocessing, call
        # _process_indexed_chunk(chunk) for each chunk instead of using the pool.
        with Pool(workers) as p:
            # imap_unordered pulls chunks from the iterator lazily, whereas
            # starmap first converts the entire iterator (i.e. the whole
            # dump) into a list. Output files are named by chunk index, so
            # completion order does not matter. The loop must drain the
            # results: leaving the `with` block terminates the pool.
            for _ in p.imap_unordered(_process_indexed_chunk, chunks):
                pass