cf. 
- https://stackoverflow.com/questions/56888333/how-can-i-parse-a-wikipedia-xml-dump-with-python
- https://github.com/mediawiki-utilities/python-mwxml
- https://github.com/mediawiki-utilities/python-mwxml/blob/master/ipython/labs_example.ipynb

Wikipedia dump version in use:

https://dumps.wikimedia.org/jawiki/20250401/

In [1]:
import mwxml
import re
from tqdm import tqdm

In [2]:
# This is from ChatGPT :) 
def clean_wiki_text(text: str) -> str:
    """
    Strip MediaWiki markup from an article and return the remaining plain text.

    The substitutions are order-sensitive: HTML comments and ``<ref>``
    citations are removed *before* the generic tag stripper, otherwise the
    stripper would delete only the ``<ref>``/``</ref>`` tags themselves and
    leave the citation body in the output.

    Known limitation: links and templates are matched with lazy, single-line
    patterns, so nested or multi-line ``[[...]]`` / ``{{...}}`` constructs are
    only partially removed.

    Args:
        text (str): Raw wikitext of one article revision.

    Returns:
        str: The text with comments, references, links, templates, HTML tags,
            bold/italic quotes, headings and bullet-list lines removed, runs
            of newlines collapsed to one, and surrounding whitespace stripped.
    """
    # Comments first: they may span lines and may contain '>' characters that
    # would otherwise confuse the generic tag pattern below.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # References: self-closing form (<ref name="x" />) first, then paired
    # <ref ...>...</ref> including its body. `\b` keeps <references /> out.
    text = re.sub(r'<ref\b[^>]*/>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<ref\b[^>]*>.*?</ref>', '', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Links, together with their label. This single pattern also covers
    # File:/ファイル: and Category:/カテゴリ: links, so they need no extra pass.
    text = re.sub(r'\[\[.*?\]\]', '', text)
    # Templates.
    text = re.sub(r'\{\{.*?\}\}', '', text)
    # Any remaining HTML-like tags (the tag only, not the enclosed text).
    text = re.sub(r'<.*?>', '', text)
    # Bold and italic quote markers: keep the enclosed text.
    text = re.sub(r"'''(.*?)'''", r"\1", text)
    text = re.sub(r"''(.*?)''", r"\1", text)
    # Section headings of any level; `={2,}` consumes the whole run of '=' so
    # that '=== x ===' does not leave a stray '=' behind.
    text = re.sub(r'={2,}.*?={2,}', '', text)
    # Bullet-list lines.
    text = re.sub(r'^\*.*$', '', text, flags=re.MULTILINE)
    # Collapse blank lines and trim the ends.
    text = re.sub(r'\n+', '\n', text).strip()
    return text


In [3]:
# Open the dump explicitly as UTF-8: without `encoding`, Python falls back to
# the platform's locale encoding, which may not be UTF-8 and would then fail
# on (or garble) the Japanese text.
# The file handle is intentionally left open here because `dump` is iterated
# later, after this statement has finished.
dump = mwxml.Dump.from_file(
    open('data/wiki_ja.xml', 'r', encoding='utf-8')
)

In [4]:
PAGES_PER_FILE = 100000  # one output file is written per this many pages
MAX_PAGES = 1000000      # stop after this many pages of the dump


def _write_chunk(first_idx, last_idx, titles, texts):
    """Write the buffered articles of pages first_idx..last_idx to one file."""
    out_path = 'data/text/article_{}-{}.txt'.format(first_idx, last_idx)
    with open(out_path, 'w', encoding='utf-8') as f_handle:
        for title, text in zip(titles, texts):
            # NOTE(review): title and body are written back-to-back with no
            # separator between them -- confirm this is the intended format.
            f_handle.write(title)
            f_handle.write(text)
            f_handle.write('\n')


page_text_list = []
page_title_list = []
# Index of the first page that belongs to the next output file.
last_visit_page_idx = 1
page_idx = 0  # stays 0 if the dump yields no pages
for page_idx, page in tqdm(enumerate(dump, start=1)):
    for revision in page:
        # The downloaded dump contains each page WITHOUT revision history,
        # so there should be only one "revision" object in the "page" object.
        # cf. the raw XML for the file structure.
        # We dump each article separately for the convenience of future processing.
        if revision.text is not None:
            page_title_list.append(page.title)
            page_text_list.append(clean_wiki_text(revision.text))
    # Flush once per page (not once per revision) so the check is neither
    # skipped for a page without revisions nor repeated for a page with several.
    if page_idx % PAGES_PER_FILE == 0:
        _write_chunk(last_visit_page_idx, page_idx, page_title_list, page_text_list)
        # The next file starts at the following page, so ranges do not overlap.
        last_visit_page_idx = page_idx + 1
        # Reset BOTH buffers; leaving the titles in place would pair the
        # titles of an earlier chunk with the texts of the next one.
        page_title_list = []
        page_text_list = []
    if page_idx >= MAX_PAGES:
        break

# `dump` is an iterator, so its length is not known in advance: write whatever
# is still buffered when it ends on a page count that is not a multiple of
# PAGES_PER_FILE.
if page_text_list:
    _write_chunk(last_visit_page_idx, page_idx, page_title_list, page_text_list)

1000000it [06:06, 2728.02it/s]
