In this script I am going to process the raw downloaded data with the following transformations:
    1. split text into lines
    2. split lines into sentences
    3. keep only those sentences which contain the word 'mouse' or 'mice' (filtering) and replace 'mice' with 'mouse' (normalization)

In [14]:
# set up environment
import codecs
import os
import re

import wikipedia
from nltk.tokenize import sent_tokenize


# Directory layout, resolved relative to the parent of the current working
# directory: <project root>/data/{raw,interim}
PROJECT_ROOT = os.path.abspath(os.pardir)
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
RAW_DATA_PATH, INTERIM_DATA_PATH = (
    os.path.join(DATA_PATH, stage) for stage in ('raw', 'interim')
)

# Encoding used for every text file read or written below
ENCODING = 'utf-8'

In [11]:
# define processing pipeline

def process_text(text):
    ''' processes raw text downloaded from wikipedia

    The text is split into lines, each line is split into sentences with
    NLTK's ``sent_tokenize``, the plural 'mice' is normalized to 'mouse',
    and only the sentences that mention 'mouse' are kept.

    Matching is done on whole words and ignores case, so that
        * unrelated words that merely contain the letters ('pumice',
          'micelle') are neither rewritten nor counted as a mention;
        * sentences starting with 'Mice' / 'Mouse' are not lost.
    The capitalisation of a replaced word is preserved
    ('Mice' -> 'Mouse', 'MICE' -> 'MOUSE').

    Input:
        content of the wikipedia page or any other string

    Output:
        list of sentences containing word mouse
    '''
    mice_word = re.compile(r'\bmice\b', re.IGNORECASE)
    mouse_word = re.compile(r'\bmouse\b', re.IGNORECASE)

    def split_into_lines(text):
        return text.splitlines()

    def split_into_sentences(lines):
        sentences = []
        for line in lines:
            sentences.extend(sent_tokenize(line))
        return sentences

    def singular_like(match):
        # return 'mouse' written with the same capitalisation as the plural
        plural = match.group(0)
        if plural.isupper():
            return 'MOUSE'
        if plural[0].isupper():
            return 'Mouse'
        return 'mouse'

    def filter_normalize_sentences(sentences):
        # normalize first, so that a sentence mentioning only 'mice'
        # passes the 'mouse' filter afterwards
        normalized = (mice_word.sub(singular_like, sentence)
                      for sentence in sentences)
        return [sentence for sentence in normalized
                if mouse_word.search(sentence)]

    lines = split_into_lines(text)
    sentences = split_into_sentences(lines)
    valid_sentences = filter_normalize_sentences(sentences)
    return valid_sentences

# quick sanity check of process_text on a tiny hand-made text:
# three lines mention mouse/mice, the last one must be filtered out
test_sentences = 'mice and mouse.\nmouse is a small rodent.\ncomputer mouse.\nthere is no valid word here.'
expected_output = ['mouse and mouse.', 'mouse is a small rodent.', 'computer mouse.']
actual_output = process_text(test_sentences)
assert expected_output == actual_output, 'ERROR: expected = {}, Actual = {}'.format(
    expected_output, actual_output)

In [21]:
# For both 'device' and 'animal' build a file containing only valid sentences:
# every *.txt page under data/raw/<context> is run through process_text and
# the surviving sentences are written, one per line, to
# data/interim/<context>.txt

# the output directory may not exist yet (e.g. on a fresh checkout)
if not os.path.isdir(INTERIM_DATA_PATH):
    os.makedirs(INTERIM_DATA_PATH)

for context in ['animal', 'device']:
    read_dir = os.path.join(RAW_DATA_PATH, context)
    # NOTE: despite its name this is the path of the output *file*
    save_dir = os.path.join(INTERIM_DATA_PATH, '{}.txt'.format(context))
    # os.listdir returns entries in arbitrary order; sort them so that the
    # generated file is identical from run to run
    filenames = sorted(filename for filename in os.listdir(read_dir)
                       if filename.endswith('.txt'))
    print(filenames)

    with codecs.open(save_dir, 'w', ENCODING) as of:
        for filename in filenames:
            read_path = os.path.join(read_dir, filename)
            with codecs.open(read_path, 'r', ENCODING) as rf:
                text = rf.read()
            # the input file is already closed here; only the output
            # handle stays open while sentences are written
            for sentence in process_text(text):
                of.write(sentence)
                of.write('\n')

['hopping mouse.txt', 'kangaroo mouse.txt', 'mouse.txt']
['computer mouse.txt', 'optical mouse.txt']
