## IMPORTS

In [None]:
import cfg

import gzip
import os
from progress_bar import ProgressBar
from progress_bar.utils import iter_progress
from progress_bar.labeling import get_custom_labeling_fun

## CHECK IF ALL THE RAW FILES EXIST

In [None]:
# Fail early, naming the offending path, if any required raw input is missing.
# NOTE(review): the URLs below point to the upstream sources / format docs; the
# Wikipedia dump is published as .bz2 there, while the files checked here all
# carry a .gz suffix (presumably re-compressed beforehand — confirm).
for raw_file in (
    # https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
    "enwiki-latest-pages-articles.xml.gz",
    # https://www.mediawiki.org/wiki/Wikibase/DataModel/JSON#Claims_and_Statements
    "wikidata-all.json.gz",
    # http://www.openoffice.org/lingucomponent/thesaurus.html
    "thesaurus_en_openoffice_v1.txt.gz",
):
    raw_path = cfg.raw_dir + raw_file
    assert os.path.isfile(raw_path), "Missing raw input file: " + raw_path

## CREATE THE OUTPUT FILES

In [None]:
# Directory that receives the gzip-compressed parts of the Wikipedia dump.
wiki_parts_path = cfg.processed_dir + "enwiki-latest-pages-articles_parts/"
# makedirs (instead of mkdir) also creates cfg.processed_dir when it does not
# exist yet; checking isdir (instead of exists) makes a stray regular file with
# the same name fail here rather than later when the parts are opened.
if not os.path.isdir(wiki_parts_path):
    os.makedirs(wiki_parts_path)

In [None]:
# Open one gzip output per part. File names follow "part_<k>_<total>.xml.gz"
# with k running from 1 to cfg.wiki_preprocessing_split_into. Mode "w" is
# gzip's binary write mode, so the handles expect byte strings.
outfiles = []
for part_no in range(1, cfg.wiki_preprocessing_split_into + 1):
    part_path = "{}part_{}_{}.xml.gz".format(wiki_parts_path, part_no, cfg.wiki_preprocessing_split_into)
    outfiles.append(gzip.open(part_path, "w"))

## FILL THE OUTPUT FILES

In [None]:
# Re-check the Wikipedia dump right before it is read, reporting the path on failure.
# Upstream source (published as .bz2; a .gz copy is expected here):
# https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
assert os.path.isfile(cfg.raw_dir + "enwiki-latest-pages-articles.xml.gz"), \
    "Missing raw input file: " + cfg.raw_dir + "enwiki-latest-pages-articles.xml.gz"

In [None]:
# Split the Wikipedia dump into the already-opened `outfiles`.
# Lines are accumulated in a buffer; once the buffer holds at least
# `buffer_max_chars` bytes AND a new "  <page>" line is reached, the buffer is
# written to the current part and the next part (round-robin) becomes current.
# Flushing only at a page start means a page is never split across two parts.
index = 0  # position in `outfiles` of the part that receives the next flush

buffer_rows = []
buffer_chars = 0  # size of the buffered lines (bytes, since the input is read in binary mode)
buffer_num_pages = 0  # pages started inside the current buffer
buffer_max_chars = 8 * 1024 * 1024

pb = ProgressBar(labeling_fun=get_custom_labeling_fun(prefix="Pages", show_remaining_time=False)).display()
with gzip.open(cfg.raw_dir + "enwiki-latest-pages-articles.xml.gz", "r") as infile:
    for line in iter_progress(infile, labeling_fun=get_custom_labeling_fun(prefix="Rows", show_remaining_time=False)):
        # gzip's "r"/"w" modes are binary, so `line` is a byte string. Bytes
        # literals keep the comparison and the join correct on Python 3 and are
        # equivalent to the plain str literals on Python 2.
        if line == b"  <page>\n":
            if buffer_chars >= buffer_max_chars:
                outfiles[index].write(b"".join(buffer_rows))
                index = (index + 1) % cfg.wiki_preprocessing_split_into
                pb.increase_many(buffer_num_pages)

                buffer_rows = []
                buffer_chars = 0
                buffer_num_pages = 0

            # Count the page after the flush so that every progress update
            # reports exactly the pages contained in the buffer just written.
            buffer_num_pages += 1

        buffer_rows.append(line)
        buffer_chars += len(line)

    # write the remaining part
    if buffer_chars > 0:
        outfiles[index].write(b"".join(buffer_rows))
        pb.increase_many(buffer_num_pages)
    pb.stop(True)

# Summary: 17.152.607 Pages - 938.636.136 Rows (8115s - 2h 15m 15s)
# The full run takes about 2h 15m.

## CLOSE THE OUTPUT FILES

In [None]:
# Close every part file, then rebind `outfiles` to an empty list so that no
# closed handle can be reused by accident.
for part_file in outfiles:
    part_file.close()
outfiles = []