# Download wikipedia and extract one text file per article

In [None]:
# Fastai : Jeremy Howard / Sylvain Gugger
# https://github.com/fastai/fastai/blob/master/fastai/core.py
# https://github.com/fastai/course-nlp/blob/master/nlputils.py

# wikiextractor : Giuseppe Attardi
# https://github.com/attardi/wikiextractor

# NLP & fastai | French Language Model : Pierre Guillou
# https://medium.com/@pierre_guillou/nlp-fastai-french-language-model-d0e2a9e12cab
# https://github.com/piegu/language-models/blob/master/nlputils2.py

import bz2
from contextlib import contextmanager
import os
from pathlib import Path
Path.ls = lambda x: list(x.iterdir())
import re
import requests
import shutil
import urllib.request

def get_wiki_download(path,lang):
    "Download the latest `{lang}wiki` pages-articles dump (.xml.bz2) into `path` unless the archive is already there."
    wiki = f'{lang}wiki'
    archive_name = f"{lang}wiki-latest-pages-articles.xml.bz2"
    archive_path = path/archive_name

    # Nothing on disk yet: fetch the archive from the Wikimedia dumps server.
    if not archive_path.exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{wiki}/latest/{archive_name}', archive_path)
        return

    print(f"{archive_path} already exists; not downloading")

def get_wiki_unzip(path,lang):
    "Decompress the downloaded dump archive found in `path` unless the XML file is already there."
    xml_path = path/f"{lang}wiki-latest-pages-articles.xml"

    # The archive sits next to the XML file and carries an extra `.bz2` suffix.
    if not xml_path.exists():
        print("unzipping...")
        bunzip(path/f"{xml_path.name}.bz2")
        return

    print(f"{xml_path} already exists; not unzip")
    
def get_wiki_extract(path,lang):
    """Extract plain text from the decompressed dump in `path` into one `{lang}wiki` file.

    Steps, all executed with `path` as working directory:
    1. clone the wikiextractor repository if `path/wikiextractor` is missing;
    2. replace `wikiextractor/WikiExtractor.py` with the platform-independent
       version hosted in piegu/fastai-projects;
    3. run the extractor on `{lang}wiki-latest-pages-articles.xml`;
    4. move the produced `text/AA/wiki_00` to `path/{lang}wiki` and delete `text`.

    `path` is a directory (str or Path), `lang` a Wikipedia language code.
    Returns None.
    """
    name = f'{lang}wiki'
    xml_fn = f"{lang}wiki-latest-pages-articles.xml"

    # Anchor the path before changing directory: a relative `path` would
    # otherwise be interpreted a second time against the new working directory.
    path = Path(path).absolute()

    with working_directory(path):

        # clone wikiextractor from attardi
        # (https://github.com/albertvillanova/wikiextractor.git is an alternative fork)
        if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/attardi/wikiextractor.git')

        # swap in the platform-independent WikiExtractor.py
        file_path = path/'wikiextractor'/'WikiExtractor.py'
        # The script may be absent from the clone (e.g. a different repository
        # layout); an unconditional unlink would raise FileNotFoundError.
        if file_path.exists(): os.unlink(file_path)
        url = 'https://raw.githubusercontent.com/piegu/fastai-projects/master/WikiExtractor.py' # updated file url
        urllib.request.urlretrieve(url, file_path) # get updated file

        if file_path.exists():
            print("extracting...")
            os.system("python wikiextractor/WikiExtractor.py --processes 4 --no_templates " +
                f"--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {xml_fn}")
            # `-b 100G` makes the extractor write everything into a single file
            shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
            shutil.rmtree(path/'text')
        else:
            print(f"the file {file_path} does not exist")

def split_wiki2(path,lang):
    """Split the extracted `path/{lang}wiki` file into one text file per article.

    Every `<doc id=... title="...">` line starts a new article; the following
    lines (up to the next `<doc` line) are copied to `path/docs/<title>.txt`,
    where every non-alphanumeric character of the title is replaced by `_`.
    Articles whose sanitized title is longer than 150 characters or equals
    "Com8" are skipped entirely.

    Returns the `docs` folder. If that folder already exists nothing is done.
    Raises IndexError when a `<doc` line does not match the expected header.
    """
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    re_punc = re.compile("([^a-zA-Z0-9])") # every non-alphanumeric character of the title

    f = None # output file of the current article; None while no article is being copied
    try:
        with (path/name).open(encoding="utf8") as lines: # utf8 keeps this platform independent
            for i,l in enumerate(lines):
                if i%100000 == 0: print(f"{i} lines", end='\r', flush=True)
                if l.startswith('<doc id="'):
                    # Close the previous article first, so that the body of a
                    # skipped article is never appended to the preceding file.
                    if f: f.close()
                    f = None
                    title = title_re.findall(l)[0]
                    title = re_punc.sub(r"_", title)
                    if len(title)>150: continue
                    # NOTE(review): presumably excluded because COM8 is a reserved
                    # device name on Windows; confirm.
                    if title == "Com8": continue
                    f = (dest/f'{title}.txt').open('w', encoding="utf8")
                elif f: f.write(l)
    finally:
        if f: f.close()
    return dest


def clean_files(path,folder):
    """Clean every article file of `path/folder` in place.

    For each regular file: the first line (the article title) is dropped,
    everything from the last `</doc>` tag onwards is removed, and leading and
    trailing whitespace is stripped. A file without a `</doc>` tag is left
    untouched, so running the function again does not fail on already
    cleaned files.

    Returns None.
    """
    dest = path/folder
    closing_tag = '</doc>'

    for l in dest.iterdir():
        if not l.is_file(): continue
        with l.open('r+', encoding="utf-8") as f:
            # the first line holds the title: read it and throw it away
            f.readline()
            text = f.read()
            # keep what precedes the last closing tag
            body, tag, _ = text.rpartition(closing_tag)
            if not tag: continue # no closing tag: nothing to clean
            # replace the file content by the cleaned text
            f.seek(0)
            f.truncate()
            f.write(body.strip())

def get_num_tokens(dest):
    """Count the files of folder `dest` and their whitespace-separated tokens.

    Every entry of `dest` is opened as a UTF-8 text file.
    Returns the tuple `(num_files, num_tokens)`; an empty folder gives `(0, 0)`.
    """
    files = list(dest.iterdir())
    num_tokens = 0

    for l in files:
        with l.open('r', encoding="utf-8") as f:
            # counting line by line avoids loading a whole article in memory
            num_tokens += sum(len(line.split()) for line in f)

    return len(files), num_tokens

# Create a corpus of about obj_token words in a corpus_'obj_token' folder
def get_corpus(dest, path, num_tokens, obj_tokens=int(1e8)):
    """Build a corpus of about `obj_tokens` tokens from the article files in `dest`.

    dest       -- folder holding one text file per article
    path       -- folder in which the `corpus_<obj_tokens>` folder is created
    num_tokens -- total number of tokens of the files in `dest`
    obj_tokens -- target number of tokens of the corpus

    The token count of each file is estimated from its size on disk
    (`size * num_tokens / total size`). Files are visited from the smallest to
    the largest; those with an estimated count of at least 100 tokens are
    copied until the running total reaches `obj_tokens`.

    Returns the corpus folder, or `dest` itself when `num_tokens` is lower
    than `obj_tokens` (nothing is copied in that case).
    """
    import numpy as np # local import: numpy is not imported by the cell defining this function

    num_tokens_article_min = 100

    if num_tokens < obj_tokens:
        # same wording as before for the default target ("100 000 000")
        target = f"{int(obj_tokens):,}".replace(",", " ")
        print(f'As there are less than {target} tokens in the initial corpus, we use it.')
        return dest

    # estimated number of tokens of each file
    files = list(dest.iterdir())
    sizes = np.array([os.path.getsize(f) for f in files], dtype=np.int64)
    total_size = sizes.sum()
    tokens_by_file = sizes*(num_tokens/total_size)

    # walk the files by increasing size and keep the ones that are long enough
    num = 0
    articles_idxs = []
    for idx in np.argsort(tokens_by_file):
        if tokens_by_file[idx] >= num_tokens_article_min:
            num += tokens_by_file[idx]
            articles_idxs.append(idx)
        if num >= obj_tokens: break

    # creation of the corpus folder
    folder = 'corpus_'+str(int(obj_tokens))
    path_corpus = path/folder
    path_corpus.mkdir(exist_ok=True, parents=True)

    # copy text files to corpus folder
    for idx in articles_idxs:
        shutil.copy(str(files[idx]), str(path_corpus))

    print(f'files copied to the new corpus folder: {path/folder}')

    return path_corpus
    
def download_url(url:str, dest:str, overwrite:bool=False, show_progress=True, 
                 chunk_size=1024*1024, timeout=4, retries=5)->None:
    """Download `url` to `dest` unless it exists and not `overwrite`.

    url           -- address of the file to fetch
    dest          -- target file (str or Path)
    overwrite     -- download again even if `dest` already exists
    show_progress -- print the amount of data received so far
    chunk_size    -- number of bytes requested per read
    timeout       -- timeout in seconds handed to `requests`
    retries       -- connection retries, for both http and https urls

    Raises `requests.HTTPError` when the server answers with an error status,
    so that an error page is never stored as if it were the requested file.
    Exits the interpreter with status 1 when the connection is lost during the
    transfer. In every failure case during the transfer the partially written
    `dest` is removed; otherwise the next call would take it for a complete
    download and return early.
    """
    if os.path.exists(dest) and not overwrite: return

    with requests.Session() as s:
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        # retries must be registered for https too, otherwise they never apply to https urls
        s.mount('http://', adapter)
        s.mount('https://', adapter)
        # additional line to identify as a firefox browser, see #2438
        s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})

        with s.get(url, stream=True, timeout=timeout) as u:
            u.raise_for_status()
            # as before, progress is only shown when the server announces a valid size
            try: int(u.headers["Content-Length"])
            except (KeyError, ValueError): show_progress = False

            try:
                with open(dest, 'wb') as f:
                    received = 0
                    for chunk in u.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        received += len(chunk)
                        if show_progress: print(f"{received // (1024*1024)} MB", end='\r', flush=True)
            except requests.exceptions.ConnectionError:
                if os.path.exists(dest): os.remove(dest)
                print(f'\n Download of {url} has failed after {retries} retries')
                import sys;sys.exit(1)
            except BaseException:
                # any other interruption (Ctrl-C included) must not leave a truncated file behind
                if os.path.exists(dest): os.remove(dest)
                raise
            
def bunzip(fn):
    "Decompress the bz2 archive `fn` into a sibling file without the last suffix; asserts that `fn` exists and that the output does not"
    archive = Path(fn)
    assert archive.exists(), f"{archive} doesn't exist"
    target = archive.with_suffix('')
    assert not target.exists(), f"{target} already exists"
    with bz2.BZ2File(archive, 'rb') as src, target.open('wb') as dst:
        # stream in 1 MiB pieces so that the whole dump never sits in memory
        shutil.copyfileobj(src, dst, 1024*1024)
        
@contextmanager
def working_directory(path):
    "Run the `with` body inside directory `path`, then go back to the directory that was current before."
    original_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # restore the previous directory even when the body raised
        os.chdir(original_dir)

Choose the root path to download the Wikipedia data

In [50]:
from pathlib import Path

# Root folder that receives all Wikipedia data (hard-coded Windows path;
# `\\?\` is the Windows extended-length path prefix).
rootdir = Path(r"\\?\C:\tmp\wikipedia")

lang = 'fr'  # Wikipedia language code, used to build file names and urls
name = f'{lang}wiki'
path = rootdir/name  # working folder handed to the functions called in the next cells
path.mkdir(exist_ok=True, parents=True)  # created on the spot, no error if it already exists

1. Download "frwiki-latest-pages-articles.xml.bz2" (4.5 GB)

In [None]:
# Download the compressed dump into `path`; `%time` is the IPython magic that reports the elapsed time.
%time get_wiki_download(path,lang)

2. Decompress the archive to "frwiki-latest-pages-articles.xml"

In [None]:
# Decompress the downloaded archive in `path` (skipped when the XML file already exists).
%time get_wiki_unzip(path,lang)

3. Extract plain text from the Wikipedia dump in "frwiki"

In [None]:
# Run wikiextractor on the XML dump; the result is the single text file `path/{lang}wiki`.
%time get_wiki_extract(path,lang)

4. Split plain text Wikipedia contents in individual text files in the "./doc" directory (one file per article)

In [None]:
# Write one text file per article; `dest` is the `docs` folder returned by split_wiki2.
%time dest = split_wiki2(path,lang)

5. Clean the extracted text files (in place)

In [None]:
# Name of the sub-folder of `path` whose files are cleaned in place.
folder = "docs"
%time clean_files(path,folder)

# Package wikipedia text files in one nlptextdoc DataFrame

In [65]:
import pandas as pd
import numpy as np

class WikipediaDocsReader:
    """Read output files of a wikipedia extraction in one pandas DataFrame.

    The text files are read from `path/docs`. Every file produces the rows
    Document/Start, Document/Uri (file name), Document/Title (file stem with
    `_` replaced by spaces), one TextBlock/Text row per non-empty line and
    Document/End. The rows are buffered in parallel lists (one per column)
    and turned into a DataFrame by `write_dataframe`.
    """
    def __init__(self, path):
        self.path = path
        self.docsdir = path / "docs"
        self._reset_buffers()

    def _reset_buffers(self):
        "Start a new, empty batch of rows; the document counter restarts at 0 too."
        self.documentCount = 0
        self.nestingLevel = 1
        self.listDocId = []
        self.listType = []
        self.listCmd = []
        self.listLevel = []
        self.listText = []

    def load_dataframe(self):
        """Return the DataFrame of all articles found in `path/docs`.

        A feather file is written every 100000 articles and once more at the
        end; each write empties the buffers, so the returned DataFrame holds
        the rows of the last batch written.
        """
        # NOTE(review): the cache is looked up in `docs/` without prefix, whereas
        # write_dataframe stores `<count>.nlptextdocs.dataframe.feather` in `path`;
        # confirm where the cached file is expected to live.
        textdffile = self.docsdir / "nlptextdocs.dataframe.feather"
        if textdffile.exists():
            return pd.read_feather(textdffile)

        textdf = None
        i = 0
        for i, textfile in enumerate(self.docsdir.glob("*.txt"), start=1):
            with textfile.open(mode="r", encoding="utf-8-sig") as f:
                self.textfile = textfile
                self.documentCount = self.documentCount+1
                self.onDocumentStart(str(self.documentCount))
                self.onDocumentUri(textfile.name)
                self.onDocumentTitle(textfile.stem.replace("_"," "))
                for lineidx,line in enumerate(f):
                    line = line.strip()
                    if not line: continue
                    self.lineidx = lineidx
                    self.readline(line)
                self.onDocumentEnd(str(self.documentCount))
            if i%1000 == 0:
                print(f"{i} articles", end='\r', flush=True)
            if i%100000 == 0:
                textdf = self.write_dataframe(str(i)+".")
        # When the last article triggered a checkpoint the buffers are empty and
        # the file name would be the same: writing again would replace that
        # checkpoint by an empty DataFrame.
        if self.listDocId or textdf is None:
            textdf = self.write_dataframe(str(i)+".")
        return textdf

    def write_dataframe(self,prefix=""):
        "Save the buffered rows to `path/<prefix>nlptextdocs.dataframe.feather`, empty the buffers and return the DataFrame."
        textdf = pd.DataFrame({"DocId": self.listDocId, "DocEltType": self.listType, "DocEltCmd" : self.listCmd, "NestingLevel": self.listLevel, "Text":self.listText})
        textdf = textdf.astype({"DocEltType": "category", "DocEltCmd": "category", "NestingLevel": np.uint8},copy=False)
        textdffile = self.path / (prefix + "nlptextdocs.dataframe.feather")
        textdf.to_feather(textdffile)
        self._reset_buffers()
        return textdf

    def readline(self,line):
        "Handle one non-empty line of an article: it becomes a text block."
        self.onTextBlock(line)

    def onDocumentStart(self,docId):
        self.appendrow("Document","Start",docId)

    def onDocumentTitle(self,title):
        self.appendrow("Document","Title",title)

    def onDocumentUri(self,uri):
        self.appendrow("Document","Uri",uri)

    def onDocumentEnd(self,docId):
        self.appendrow("Document","End",docId)

    def onTextBlock(self,text):
        self.appendrow("TextBlock","Text",text)

    def appendrow(self,docEltType,docEltCmd,text=None):
        "Append one row to the buffers, tagged with the current document counter and nesting level."
        self.listDocId.append(self.documentCount)
        self.listType.append(docEltType)
        self.listCmd.append(docEltCmd)
        self.listLevel.append(self.nestingLevel)
        if text is not None:
            # a literal backslash followed by `n` in the text becomes a real newline
            text = text.replace("\\n","\n")
        self.listText.append(text)

In [66]:
# Read the article files of `path/docs` into one DataFrame (feather files are written to `path` on the way).
wikireader = WikipediaDocsReader(path)
textdf = wikireader.load_dataframe()

22000 articles

In [67]:
# Display the first 50 rows as a quick check of the result.
textdf.head(50)

Unnamed: 0,DocId,DocEltType,DocEltCmd,NestingLevel,Text
0,1,Document,Start,1,1
1,1,Document,Uri,1,1000_bornes.txt
2,1,Document,Title,1,1000 bornes
3,1,TextBlock,Text,1,Les 1000 bornes est un jeu de société utilisan...
4,1,TextBlock,Text,1,"Sur les premières boîtes du jeu, il était sous..."
5,1,TextBlock,Text,1,"Depuis 2009, la fabrication se fait à l'usine ..."
6,1,TextBlock,Text,1,Le jeu comprend 106 cartes. Les deux joueurs d...
7,1,TextBlock,Text,1,La liberté du joueur est surtout dans la décis...
8,1,TextBlock,Text,1,Les images de cartes suivantes sont celles de ...
9,1,TextBlock,Text,1,Les distances parcourues sont associées à des ...


In [None]:
import os
from pathlib import Path

# Folder holding one sub-folder per extracted website (hard-coded Windows path).
rootdir = Path(r"\\?\D:\Laurent\nlptextdoc-data-201909")

# Prefixes of the two lines of interest in `_nlptextdoc/config.txt`.
SCOPE_KEY = "scope="
URL_KEY="rootUrl="

# Filled with one (scope, root url, folder) tuple per "wikipedia-*" folder
# whose config file contains a root url line.
websites = []

for entry in os.scandir(rootdir):
    # only the "wikipedia-*" sub-folders are of interest
    if not entry.is_dir() or not entry.name.startswith("wikipedia-"):
        continue
    websitedir = Path(entry)
    configfile = websitedir / "_nlptextdoc" / "config.txt"
    if not configfile.exists():
        continue
    scope = ""
    with configfile.open(mode="r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if line.startswith(SCOPE_KEY):
                scope = line[len(SCOPE_KEY):]
            if line.startswith(URL_KEY):
                # the root url line closes the search; the scope is the last one
                # seen above it, or "" when there was none
                url = line[len(URL_KEY):]
                websites.append((scope,url,websitedir))
                break

websites

In [None]:
import pandas as pd
import numpy as np

# Third element of each `websites` tuple is the website folder; take the first website found.
websitedir = websites[0][2]
textdffile = websitedir / "_nlptextdoc" / "nlptextdocs.dataframe.feather"
textdf = pd.read_feather(textdffile)
textdf.head()

In [None]:
def gen_urls_dataframe(websitedir):
    """Return the DataFrame (DocId, DocUrl) of the articles of `websitedir`.

    The result is read from `_nlptextdoc/urls.dataframe.feather` when that
    file exists. Otherwise it is built from the "Uri" rows of
    `_nlptextdoc/nlptextdocs.dataframe.feather`: each url is the French
    Wikipedia address followed by the stem of the file name, DocId numbers
    the urls from 1 in row order; the result is saved before being returned.
    """
    nlpdir = websitedir / "_nlptextdoc"
    urlsdffile = nlpdir / "urls.dataframe.feather"
    if urlsdffile.exists():
        return pd.read_feather(urlsdffile)

    textdf = pd.read_feather(nlpdir / "nlptextdocs.dataframe.feather")
    # file names of the articles, one "Uri" row per document
    uri_texts = textdf.loc[textdf["DocEltCmd"] == "Uri", "Text"]
    listDocUrls = ["https://fr.wikipedia.org/wiki/"+Path(filename).stem for filename in uri_texts]

    urlsdf = pd.DataFrame({"DocId" : range(1,len(listDocUrls)+1), "DocUrl" : listDocUrls})
    urlsdf = urlsdf.astype({"DocId" : np.uint32},copy=False)
    urlsdf.to_feather(urlsdffile)
    return urlsdf

In [None]:
# Build (or reload) the url table of the sixth website found; index 2 of the tuple is its folder.
urlsdf = gen_urls_dataframe(websites[5][2])

In [None]:
# Display the first rows as a quick check of the result.
urlsdf.head()