# Collecting Data from Wikipedia

This notebook covers the basics of extracting text data from Wikipedia as a basis for NLP tasks.

Based on this [KDNuggets Tutorial Post](https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html)

Other Resources:
- Wikipedia [dumpfile collection](https://dumps.wikimedia.org/enwiki/latest/)
- Documentation for gensim's [`WikiCorpus`](https://radimrehurek.com/gensim/corpora/wikicorpus.html)

For requirements, run: `pip3 install -r requirements.txt`


In [None]:
# the dependencies we need for processing wikipedia data
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec

# Path of the Wikipedia dump to read (a .bz2 file, relative to the working directory).
wiki_infile = 'data/enwiki-latest-pages-articles1.xml-p10p30302.bz2'
# Path of the plain-text corpus file to produce from that dump.
wiki_outfile = 'data/wiki.en.txt'

def store_wiki_dump(infile, outfile):
    '''
    Convert a Wikipedia dump into a plain-text corpus, one article per line.

    Parameters:
        infile: path of the Wikipedia dump file, handed to WikiCorpus.
        outfile: path of the text file to write. It is opened in 'w' mode,
            so an existing file at that path is overwritten.

    Every article yielded by WikiCorpus.get_texts() is written as its tokens
    joined by single spaces, followed by a newline. A progress message is
    printed after every 2000 articles and a final message when done.
    '''

    # Using gensim.corpora.wikicorpus, we can process a wiki dump file.
    # NOTE(review): the 'lemmatize' keyword is only accepted by some gensim
    # releases -- confirm it matches the version pinned in requirements.txt.
    # Passing dictionary={} presumably avoids building a vocabulary up
    # front -- TODO confirm against the WikiCorpus documentation.
    wiki = WikiCorpus(infile, lemmatize=False, dictionary={})

    # Save the processed XML wiki dump as raw text in file.
    # Depending on the size of the dump (usually quite large), this could take hours.
    # The file is written as UTF-8 explicitly so the result does not depend on
    # the platform's default encoding.
    with open(outfile, 'w', encoding='utf-8') as corpus_out:
        for count, tokens in enumerate(wiki.get_texts(), start=1):
            corpus_out.write(' '.join(tokens) + '\n')
            if count % 2000 == 0:
                print('Processed ' + str(count) + ' articles')
        print('Done processing wiki data.')
        
# Run the conversion with the module-level input and output paths.
store_wiki_dump(wiki_infile, wiki_outfile)

In [None]:
import sys
import time

def check_corpus(input_file):
    """
    Interactively page through a corpus, 50 lines at a time.

    Parameters:
        input_file: an open text file object; lines are consumed from its
            current position with readline().

    After each page the user is asked whether to continue. Paging goes on
    only when the answer is 'y' or 'yes' (any case); every other answer,
    including just pressing Enter, stops, matching the '[y|N]' prompt where
    N is the default. Paging also stops, without prompting, as soon as the
    end of the file is reached.
    """

    while True:
        for _ in range(50):
            line = input_file.readline()
            if not line:
                # readline() returns '' only at end of file; without this
                # check the loop would print empty lines indefinitely.
                return
            print(line)
        user_input = input('>>> Continue to next line of text? [y|N] <<< ')
        if user_input.strip().lower() not in ('y', 'yes'):
            break
            
def load_corpus(input_file):
    """
    Read the rest of an open text file and report how long the read took.

    Parameters:
        input_file: an open text file object; read() is called on it once,
            so reading starts at its current position.

    Returns the text that was read. A start message and the elapsed time in
    seconds are printed.
    """

    print('Loading corpus...')
    started = time.time()
    text = input_file.read()
    elapsed = time.time() - started
    print('It took %0.3f seconds to load corpus' % elapsed)
    return text

def run():
    """
    Command-line entry point: preview a corpus file, then time loading it.

    Expects exactly one command-line argument, the path of the corpus file.
    With any other argument count a usage message is printed and the process
    exits with status 1.
    """
    if len(sys.argv) != 2:
        print('Usage: python check_wiki_corpus.py <corpus_file>')
        sys.exit(1)

    # The context manager closes the file even if previewing or loading raises.
    with open(sys.argv[1], 'r') as corpus_file:
        check_corpus(corpus_file)
        # The preview advanced the read position; rewind so the whole corpus
        # is loaded and timed rather than only the part not yet previewed.
        corpus_file.seek(0)
        corpus = load_corpus(corpus_file)