In [1]:
import json
import pandas as pd
from pathlib import Path

# Train a german ULMFIT language model from scratch

Inspired by: https://github.com/insightfactory-app/ULMFIT-Persian/blob/master/Fast_ai_build_persian_pretrained_model.ipynb

## Prepare Wikipedia Dump

Find a mirror for the Wikipedia XML-dump in the prefered language here https://wikimedia.mirror.us.dev/backup-index.html (scroll down for the different languages) and download the dump with `wget`.

In [3]:
!git clone https://github.com/attardi/wikiextractor
! cd wikiextractor;git checkout e4abb4c;cd ..

dump = 'https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2'
!cd data;wget $dump; cd ..

fatal: destination path 'wikiextractor' already exists and is not an empty directory.
HEAD is now at e4abb4c Fix typo
--2021-01-25 23:24:09--  https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.155.106, 2620:0:861:4:208:80:155:106
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.155.106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 648485003 (618M) [application/octet-stream]
Saving to: ‘idwiki-latest-pages-articles.xml.bz2.1’

-latest-pages-artic   0%[                    ]   1.54M   420KB/s    eta 25m 16s^C


In [10]:
from fastai.basics import *
import re


def get_wiki(path,lang):
    name = f'{lang}wiki'
    if (path/name).exists():
        print(f"{path/name} already exists; not downloading")
        return

    xml_fn = f"{lang}wiki-latest-pages-articles.xml"
    zip_fn = f"{xml_fn}.bz2"

    if not (path/xml_fn).exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
        print("unzipping...")
        bunzip(path/zip_fn)

    with working_directory(path):
        if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/attardi/wikiextractor.git')
        print("extracting...")
        os.system("python wikiextractor.WikiExtractor.py --processes 4 --no_templates " +
            f"--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {xml_fn}")
    shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
    shutil.rmtree(path/'text')


def split_wiki(path,lang):
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    lines = (path/name).open()
    f=None

    for i,l in enumerate(lines):
        if i%100000 == 0: print(i)
        if l.startswith('<doc id="'):
            title = title_re.findall(l)[0].replace('/','_')
            if len(title)>150: continue
            if f: f.close()
            f = (dest/f'{title}.txt').open('w')
        else: f.write(l)
    f.close()
    return dest

Inspired by https://github.com/fastai/course-nlp/blob/master/nlputils.py

In [1]:
from fastai.basics import *
import re

In [3]:
path = Path('data')
zip_fn = 'idwiki-latest-pages-articles.xml.bz2'

In [4]:
xml_fn = 'idwiki-latest-pages-articles.xml'
xml_fn

'idwiki-latest-pages-articles.xml'

In [5]:
name = 'idwiki-latest-pages-articles'
name

'idwiki-latest-pages-articles'

In [6]:
download_url(dump, path/zip_fn)

In [7]:
bunzip(path/zip_fn)

AssertionError: data/idwiki-latest-pages-articles.xml already exists

In [12]:
# run in python 3.6 env to avoid regex warnings
!python -m wikiextractor.WikiExtractor --processes 4 --no_templates \
--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q --out data/wiki_ext data/$xml_fn

  re.S | re.U)
  re.X | re.S | re.U)


In [13]:
shutil.move(str(path/'wiki_ext/AA/wiki_00'), str(path/name))
shutil.rmtree(path/'wiki_ext/')

In [14]:
def split_wiki(path,name,lang):
    dest = path/'docs'
#     if dest.exists():
#         print(f"{dest} already exists; not splitting")
#         return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    lines = (path/name).open()
    f=None

    for i,l in enumerate(lines):
        if i%100000 == 0: print(i)
        if l.startswith('<doc id="'):
            title = title_re.findall(l)[0].replace('/','_')
            if len(title)>150: continue
            if f: f.close()
            f = (dest/f'{title}.txt').open('w')
        elif l.startswith('</doc>'): pass
        else: f.write(l)
    f.close()
    return dest

In [15]:
split_wiki(path,name,lang='id')

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


Path('data/docs')