In [None]:
import re
from multiprocessing import Lock
from multiprocessing import Pool

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from gensim.utils import smart_open
from tqdm import tqdm

In [None]:
# Compressed Wikipedia XML dump that is streamed through smart_open/extract_pages.
in_file = '/home/jendrik/data/enwiki-20190120-pages-articles-multistream.xml.bz2'
# Destination file that receives the extracted sentences, one per line.
out_file = '/home/jendrik/data/wiki_out.csv'
# Shared sentence splitter used by every cell below.
# NOTE(review): rule_based=True presumably selects spaCy's rule-based
# sentence boundary detection instead of the parser -- confirm in AllenNLP docs.
sentence_splitter = SpacySentenceSplitter(rule_based=True)


# Held by filter_article while it writes to out_file.
lock = Lock()


In [None]:
# Ordered (compiled pattern, replacement) rules applied by prepare_text.
# The order matters: URLs and email addresses are masked before periods,
# dashes and colons inside them could be rewritten by the later rules.
_PREPARE_TEXT_RULES = (
    # "E-Mail" / "e-mail" / "email" -> "Email" (dashes become spaces later on)
    (re.compile(r"e-?mail", flags=re.IGNORECASE), "Email"),
    # mask URLs, consuming the whitespace that follows them
    (re.compile(r"http\S+(\s|$)"), "THISISAURL "),
    (re.compile(r"www\.\S+(\s|$)"), "THISISAURL "),
    # mask email addresses
    (re.compile(r"\b\S+@\S+\b"), "THISISANEMAIL "),
    # collapse runs of periods ("...") into a single one
    (re.compile(r"\.+"), "."),
    # fix misplaced periods, e.g. "word . next" -> "word. next"
    # NOTE(review): the class [ ^\.] matches a space, a literal "^" or a
    # period -- it is not a negated class. Kept as-is to preserve behaviour;
    # confirm whether "[^ .]" was intended.
    (re.compile(r"\s\.[ ^\.]"), ". "),
    (re.compile(r"(?<!\d)\.[ ^\.]"), ". "),
    # replace every run of dashes with one space
    (re.compile(r"\-+"), " "),
    # add a space after a colon that is directly followed by text
    (re.compile(r":(?=\S)"), ": "),
)


def prepare_text(text: str) -> str:
    """
    Normalise raw text before sentence tokenization.

    Applied in this order:

    * replace all spellings of "E-Mail" with "Email" (case-insensitive),
    * replace URLs with "THISISAURL" and email addresses with "THISISANEMAIL",
    * collapse repeated periods and fix misplaced periods,
    * replace dashes by spaces,
    * insert a space after a colon that is directly followed by text.

    Removing the "QUOTE" marker was tried and disabled because it left some
    texts empty; this function therefore keeps "QUOTE" untouched.

    This method is useful as a first pre-processing before learning a word2vec model.
    Follow this step by sentence tokenization and preparing sentences.

    :param text: text to be prepared
    :return: prepared text
    """
    for pattern, replacement in _PREPARE_TEXT_RULES:
        text = pattern.sub(replacement, text)
    return text

def filter_article(x):
    """
    Clean one Wikipedia page, split it into sentences and append them to ``out_file``.

    The wiki markup is stripped with ``filter_wiki``, the result is normalised
    with ``prepare_text`` and then split by the module-level ``sentence_splitter``.
    Each sentence is written on its own line.

    :param x: ``(title, text, page_id)`` tuple as yielded by ``extract_pages``;
        only ``text`` is used
    :return: None
    """
    _title, text, _page_id = x
    sentences = sentence_splitter.split_sentences(prepare_text(filter_wiki(text)))
    if not sentences:
        # Nothing to write; do not contend for the lock.
        return
    out_str = ''.join(sentence + '\n' for sentence in sentences)
    with lock:
        # Append mode: 'w+' truncated the file on every call, so each article
        # wiped out everything written before it.
        with open(out_file, 'a', encoding='utf-8') as f:
            f.write(out_str)
            
            

In [None]:
# Truncate any output left over from a previous run before the workers start writing.
open(out_file, 'w').close()

# The context managers make sure the worker pool is shut down and the dump
# file is closed once every page has been processed.
with Pool() as pool, smart_open(in_file) as dump:
    # filter_article returns nothing; the result iterator is drained only so
    # that the cell blocks until all pages are done, worker exceptions are
    # re-raised here, and tqdm counts completed pages rather than dispatched ones.
    for _ in tqdm(pool.imap_unordered(filter_article, extract_pages(dump))):
        pass

In [None]:
def _write_sentences(handle, texts):
    """Split every text in ``texts`` into sentences and write them to ``handle``, one per line."""
    for article in sentence_splitter.batch_split_sentences(texts):
        handle.writelines(sentence + '\n' for sentence in article)


# Number of articles handed to the sentence splitter in one batch.
BATCH_SIZE = 1024

# Single-process variant: stream the dump, strip the wiki markup and split
# the articles into sentences batch by batch.
# NOTE(review): unlike filter_article this path does not call prepare_text,
# and sentences are written unescaped below a 'sentence' header -- confirm
# both are intended.
buffer = []
with smart_open(in_file) as dump, open(out_file, 'w', encoding='utf-8') as f:
    f.write('sentence\n')
    for title, text, page_id in tqdm(extract_pages(dump)):
        buffer.append(filter_wiki(text))
        if len(buffer) >= BATCH_SIZE:
            _write_sentences(f, buffer)
            buffer = []
    # Flush the final partial batch; without this the last (up to 1023)
    # articles of the dump were silently dropped.
    if buffer:
        _write_sentences(f, buffer)
        buffer = []
    