In [9]:
import json
import os
import re
from threading import Thread

from html2text import html2text as htt
import wikitextparser as wtp


def dewiki(text):
    """Reduce raw wiki markup to plain text with normalised whitespace.

    Args:
        text: Wiki markup of one article, as a string.

    Returns:
        The text after wikitextparser's plain-text conversion and an
        html2text pass, with literal backslash-n sequences replaced by a
        space and every run of whitespace collapsed to a single space.
    """
    text = wtp.parse(text).plain_text()  # wiki markup -> plain text
    text = htt(text)  # run leftover HTML through html2text
    # This matches the two-character sequence backslash + "n", not a real
    # newline; real newlines are folded by the whitespace regex below.
    text = text.replace('\\n', ' ')
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    text = re.sub(r'\s+', ' ', text)
    return text


def analyze_chunk(text):
    """Pull the title, id and plain-text body out of one page chunk.

    Args:
        text: The XML of a single page, concatenated into one string.

    Returns:
        A dict with the keys 'title', 'text' and 'id' (each a stripped
        string), or None when the chunk contains a redirect marker, contains
        '(disambiguation)', has a ':' in its title, or raises any exception
        while being parsed (the exception is printed).
    """
    def first_between(opening, closing):
        # Segment that follows the first `opening` marker, cut at the first
        # `closing` marker found inside that segment.
        return text.split(opening)[1].split(closing)[0]

    try:
        # Redirect stubs and chunks mentioning a disambiguation page are skipped.
        skip_markers = ('<redirect title="', '(disambiguation)')
        if any(marker in text for marker in skip_markers):
            return None

        title = htt(first_between('<title>', '</title>'))
        if ':' in title:  # titles containing ':' are mostly not articles we want
            return None

        serial = first_between('<id>', '</id>')
        # Cut at the first closing text tag, then drop the opening tag
        # together with its attributes.
        tagged_body = text.split('</text')[0].split('<text')[1]
        content = dewiki(tagged_body.split('>', maxsplit=1)[1])
        return {'title': title.strip(), 'text': content.strip(), 'id': serial.strip()}
    except Exception as oops:
        print(oops)
        return None


def save_article(article, savedir):
    """Parse one raw page chunk and hand back the resulting record.

    Args:
        article: Raw XML text of a single page.
        savedir: Currently unused; nothing is written to disk here.

    Returns:
        The value produced by analyze_chunk when it is truthy, otherwise None.
    """
    return analyze_chunk(article) or None

import pandas as pd
def _write_batch(records, savedir, batch_number):
    """Write one list of article dicts to `<savedir>/<batch_number>.parquet`."""
    target = os.path.join(savedir, f"{batch_number}.parquet")
    pd.DataFrame(records).to_parquet(target)


def process_file_text(filename, savedir, batch_size=10000):
    """Stream a wiki XML dump and save the parsed articles as parquet batches.

    The file is read line by line. Lines between a line containing '<page>'
    and the next line containing '</page>' are joined into one chunk and
    passed to save_article; every non-None result is collected. Once
    `batch_size` results have accumulated they are written to
    `<savedir>/<n>.parquet` (n counts up from 1). Whatever is left when the
    file ends is written as a final, smaller batch.

    Args:
        filename: Path to the UTF-8 encoded XML dump.
        savedir: Directory for the parquet files; created if it is missing.
        batch_size: Number of articles stored per parquet file.

    Returns:
        None.
    """
    if savedir:
        os.makedirs(savedir, exist_ok=True)

    article_lines = []  # lines of the page currently being read
    article_batch = []  # parsed articles waiting to be written
    batch_count = 1
    total_saved = 0

    with open(filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            if '<page>' in line:
                # Start of a new page: drop anything gathered so far.
                article_lines = []
            elif '</page>' in line:  # end of article
                doc = save_article(''.join(article_lines), savedir)
                if doc is not None:
                    article_batch.append(doc)
                # '>=' keeps each file at exactly batch_size rows; the
                # emptiness check avoids writing an empty file.
                if article_batch and len(article_batch) >= batch_size:
                    total_saved += len(article_batch)
                    print(f"save {total_saved}")
                    _write_batch(article_batch, savedir, batch_count)
                    batch_count += 1
                    article_batch = []
            else:
                # Collect lines in a list and join once per page instead of
                # repeatedly concatenating strings.
                article_lines.append(line)

    # Flush the trailing partial batch so the last articles are not lost.
    if article_batch:
        total_saved += len(article_batch)
        print(f"save {total_saved}")
        _write_batch(article_batch, savedir, batch_count)

In [10]:

# Alternative input path, currently disabled:
#wiki_xml_file = 'F:/simplewiki-20210401/simplewiki-20210401.xml'  # update this
# Path of the XML dump to parse; edit it to match the local file.
wiki_xml_file = 'enwiki-20230901-pages-articles-multistream.xml'  # update this
# Output directory passed to process_file_text as `savedir`. Despite the
# "json" in the name, process_file_text writes parquet files there.
json_save_dir = 'parse_parquet/'

# Guarded so the parse only starts when this code runs as the main module.
if __name__ == '__main__':
    process_file_text(wiki_xml_file, json_save_dir)

save 10
save 20
save 30
save 40
save 50
save 60
save 70
save 80


KeyboardInterrupt: 