In [76]:
import pandas as pd
import numpy as np
import nltk

from collections import Counter

## Initial Configuration Format of Text
The initial format, at least for workability will need to be a list of words and punctuation. Getting into this format will be the first step of configuration that the text will undergo as it is simplest when trying to create a dictionary.

#### Example Text (Emma):
Using the `nltk` library (and for simplicity's sake) we will read in Jane Austen's *Emma*, which can conveniently be loaded in as a list of words.

In [17]:
print(nltk.corpus.gutenberg.fileids())
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [39]:
# print the first 20 words/punctuations of emma
emma[0: 20]

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

### Preprocessing
Once we have the corpus in a list, we need to provide some preprocessing options that can prepare the data for being loaded up into a neural net. Again, these are just options and could potentially improve performance. 

In [54]:
def preprocess(corpus_list, lowercase=False, stopwords_list=None):
    """Preprocess the Corpus List.
    
    Provides some simple preprocessing steps that could be beneficial for training purposes. This includes 
    an option to lowercase all words in the corpus list and to remove stopwords.
    
    Parameters
    ----------
    corpus_list: list
        List of words in the corpus.
        
    lowercase: bool
        Option to lowercase all words int he corpus_list.
        
    stopwords_list: list
        List of stopwords to remove from the corpus_list.
        
    Returns
    -------
    corpus_list: list
        The preprocessed corpus_list.
    """
    
    # lowercases words in corpus
    if lowercase:
        corpus_list = [w.lower() for w in corpus_list]
    
    # removes stopwords
    if stopwords_list is not None:
        corpus_list = [w for w in corpus_list if w not in stopwords_list]
        
    return corpus_list
        

In [58]:
preprocess(emma[0:20], lowercase=True, stopwords_list= ['emma', 'volume'])

['[',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'i',
 'chapter',
 'i',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

### Vocabulary Dictionary
Neural nets don't take text inputs so we need to convert the words into integers that act as key references back to the words. `create_dictionary` is this step in the process and provides a dictionary where the word acts as the key and the integer is the value.

In [71]:
def create_dictionary(corpus_list, lowercase=False, stopwords_list=None):
    """Create a Vocabulary Dictionary.
    
    Create a dictionary of the vocab from a list of words in a corpus. This function
    all so provides the option to preprocess on the fly.
    
    Parameters
    ----------
    corpus_list: list
        List of words in the corpus.
        
    lowercase: bool
        Option to lowercase all words int he corpus_list.
        
    stopwords_list: list
        List of stopwords to remove from the corpus_list.
        
    Returns
    -------
    vocab_dict: tuple
        A vocabulary dictionary => {word: int} and a reverse dictionary => {int: word}.
    """
    
    corpus_list = preprocess(corpus_list, lowercase=lowercase, stopwords_list=stopwords_list)
    
    uniq_words = list(set(corpus_list))
    word_indexes = list(range(0, len(uniq_words)))
    
    vocab_dict = dict(zip(uniq_words, word_indexes))
    reverse_dict = dict(zip(word_indexes, uniq_words))
    
    return vocab_dict, reverse_dict
    

In [75]:
create_dictionary(emma[0:20], lowercase=True, stopwords_list=[','])

{'1816': 6,
 '[': 9,
 ']': 10,
 'and': 3,
 'austen': 2,
 'by': 7,
 'chapter': 0,
 'clever': 12,
 'emma': 11,
 'handsome': 13,
 'i': 1,
 'jane': 14,
 'rich': 5,
 'volume': 8,
 'woodhouse': 4}

In [30]:
config_lists = []
for i in range(0, len(emma) - 3):
    config_lists.append(list(emma[i: i + 4]))
    

In [31]:
len(emma)

192427

In [34]:
np.array(config_lists)

array([['[', 'Emma', 'by', 'Jane'],
       ['Emma', 'by', 'Jane', 'Austen'],
       ['by', 'Jane', 'Austen', '1816'],
       ...,
       ['happiness', 'of', 'the', 'union'],
       ['of', 'the', 'union', '.'],
       ['the', 'union', '.', 'FINIS']], dtype='<U17')