In [None]:
! mkdir ~/data/workshop_data
! wget -c --retry-connrefused --tries=0 https://dumps.wikimedia.org/enwiki/20190120/enwiki-20190120-pages-articles-multistream.xml.bz2 -O ~/data/workshop_data/enwiki-20190120-pages-articles-multistream.xml.bz2

In [None]:
from gensim.utils import smart_open
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from tqdm import tqdm
from multiprocessing import Pool
import re
from multiprocessing import Lock
from os.path import expanduser
import shutil

In [None]:
import os
# Shared by every worker that appends to the output file (see filter_article).
lock = Lock()

# Sentence splitter used by filter_article; rule-based mode is requested explicitly.
sentence_splitter = SpacySentenceSplitter(rule_based=True)

# Input dump and output CSV both live under ~/data/workshop_data.
home = expanduser("~")
in_file = f"{home}/data/workshop_data/enwiki-20190120-pages-articles-multistream.xml.bz2"
out_file = f"{home}/data/workshop_data/wiki_out.csv"


In [None]:
def prepare_text(text: str) -> str:
    """
    Normalise raw text before it is split into sentences.

    The following substitutions are applied, in this order:

    1. any spelling of "email" / "e-mail" (case-insensitive) becomes "Email"
    2. tokens starting with "http" or "www." become "THISISAURL "
    3. tokens containing "@" become "THISISANEMAIL "
    4. runs of periods collapse to a single period
    5. a period followed by a space, caret or period is rewritten as ". "
       (also swallowing one whitespace character in front of the period)
    6. runs of dashes become a single space
    7. a colon directly followed by a non-space character gets a space after it

    The "QUOTE" marker is deliberately left in place: stripping it was tried and
    disabled because it left some texts empty.

    This is meant as the first pre-processing step before learning a word2vec
    model; sentence tokenization and prepare_sentence() follow it.

    :param text: text to be prepared
    :return: prepared text
    """

    # (pattern, replacement) pairs; order matters, e.g. "E-Mail" must be rewritten
    # before dashes are turned into spaces.
    # NOTE(review): the class `[ ^\.]` matches a space, a caret or a period; a
    # negated class may have been intended -- confirm before changing.
    substitutions = (
        (re.compile(r"e-?mail", flags=re.IGNORECASE), "Email"),
        (re.compile(r"http\S+(\s|$)"), "THISISAURL "),
        (re.compile(r"www\.\S+(\s|$)"), "THISISAURL "),
        (re.compile(r"\b\S+@\S+\b"), "THISISANEMAIL "),
        (re.compile(r"\.+"), "."),
        (re.compile(r"\s\.[ ^\.]"), ". "),
        (re.compile(r"(?<!\d)\.[ ^\.]"), ". "),
        (re.compile(r"\-+"), " "),
        (re.compile(r":(?=\S)"), ": "),
    )

    prepared = text
    for pattern, replacement in substitutions:
        prepared = pattern.sub(replacement, prepared)
    return prepared


def prepare_sentence(sentence: str) -> str:
    """
    Strip punctuation and normalise white space in a single sentence.

    * underscores, pipes and equal signs are replaced by a space
    * quotes, braces, brackets, asterisks, semicolons, commas, periods,
      exclamation marks, question marks and null bytes are removed
    * any run of white space is collapsed to a single space
    * leading and trailing white space is removed

    This method is useful as a second pre-processing before learning a word2vec model.
    This step should be preceded by prepare_text() and sentence_tokenize().

    :param sentence: sentence to be prepared
    :return: prepared sentence
    """

    # Separator-like characters become a space so the words around them stay apart.
    sent_mod = re.sub(r"[_|=]+", " ", sentence)

    # Plain punctuation is dropped. The period is matched literally inside the
    # character class; an unescaped `.+` pattern would match (and delete) the
    # entire sentence.
    sent_mod = re.sub(r"""['"{}\[\]*;,.!?\x00]+""", "", sent_mod)

    # Collapse white space last, so gaps left behind by removed punctuation
    # (e.g. "a , b") do not survive as double spaces.
    sent_mod = re.sub(r"\s+", " ", sent_mod)

    return sent_mod.strip()


def filter_article(x):
    """
    Clean one article and append its sentences to the output file.

    The article text is passed through filter_wiki() and prepare_text(), split
    into sentences, and every sentence is cleaned with prepare_sentence().
    Each sentence becomes one output line in which every kept word is followed
    by ';'. Words starting with '{', '[' or 'File:' are dropped. A sentence with
    no remaining words still produces an (empty) line.

    The write happens under the module-level `lock` so that concurrent callers
    do not interleave their output in `out_file`.

    :param x: tuple unpacked as (title, text, page_id); only the text is used
    :return: None
    """
    _title, text, _page_id = x
    skip_prefixes = ('{', '[', 'File:')

    lines = []
    for raw_sentence in sentence_splitter.split_sentences(prepare_text(filter_wiki(text))):
        words = [
            word
            for word in prepare_sentence(raw_sentence).split()
            if not word.startswith(skip_prefixes)
        ]
        lines.append(''.join(word + ';' for word in words) + '\n')
    out_str = ''.join(lines)

    with lock:
        # Explicit UTF-8: article text is not limited to the platform's default
        # encoding, which would otherwise raise on the first unsupported character.
        with open(out_file, 'a', encoding='utf-8') as f:
            f.write(out_str)
            
            

In [None]:
# Start from a fresh output file that contains only the CSV header
# (mode 'w' truncates any file left over from a previous run).
with open(out_file, 'w') as f:
    f.write('sentence\n')

# Process the articles in parallel. The imap iterator is drained so that the
# cell only finishes once every article has been written, worker exceptions are
# re-raised here instead of being lost, and the progress bar counts completed
# articles. Leaving the `with` block shuts the worker processes down.
# NOTE(review): the workers use the module-level `lock` without it being passed
# to them, which presumably relies on a fork-based start method -- confirm on
# platforms that spawn.
with Pool() as pool:
    for _ in tqdm(pool.imap(filter_article, extract_pages(smart_open(in_file)))):
        pass

In [None]:
# Single-process alternative to the Pool cell. filter_article() appends to
# out_file and returns nothing, so run either this cell or the parallel one --
# running both writes every article twice.
for page in tqdm(extract_pages(smart_open(in_file))):
    filter_article(page)