In [2]:
import pandas as pd
import numpy as np
import nltk

## Initial Configuration Format of Text
The initial format, at least for workability will need to be a list of words and punctuation. Getting into this format will be the first step of configuration that the text will undergo as it is simplest when trying to create a dictionary.

#### Example Text (Emma):
Using the `nltk` library (and for simplicity's sake) we will read in Jane Austen's *Emma*, which can conveniently be loaded in as a list of words.

In [17]:
print(nltk.corpus.gutenberg.fileids())
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [39]:
# print the first 20 words/punctuations of emma
emma[0: 20]

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

### Preprocessing
Once we have the corpus in a list, we need to provide some preprocessing options that can prepare the data for being loaded up into a neural net. Again, these are just options and could potentially improve performance. 

In [54]:
def preprocess(corpus_list, lowercase=False, stopwords_list=None):
    """Preprocess the Corpus List.
    
    Provides some simple preprocessing steps that could be beneficial for training purposes. This includes 
    an option to lowercase all words in the corpus list and to remove stopwords.
    
    Parameters
    ----------
    corpus_list: list
        List of words in the corpus.
        
    lowercase: bool
        Option to lowercase all words int he corpus_list.
        
    stopwords_list: list
        List of stopwords to remove from the corpus_list.
    """
    
    # lowercases words in corpus
    if lowercase:
        corpus_list = [w.lower() for w in corpus_list]
    
    # removes stopwords
    if stopwords_list is not None:
        corpus_list = [w for w in corpus_list if w not in stopwords_list]
        
    return corpus_list
        

In [58]:
preprocess(emma[0:20], lowercase=True, stopwords_list= ['emma', 'volume'])

['[',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'i',
 'chapter',
 'i',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

In [64]:
tmp = ['james', 'chester', 'bain']
tmp2 = list(range(0,3))
dict(zip(tmp, tmp2))

{'bain': 2, 'chester': 1, 'james': 0}

In [71]:
def create_dictionary(corpus_list, lowercase=False, stopwords_list=None):
    
    corpus_list = preprocess(corpus_list, lowercase=lowercase, stopwords_list=stopwords_list)
    
    uniq_words = list(set(corpus_list))
    word_indexes = list(range(0, len(uniq_words)))
    
    vocab_dict = dict(zip(uniq_words, word_indexes))
    
    return vocab_dict
    

In [73]:
create_dictionary(emma, lowercase=True, stopwords_list=['heartedness'])

{'served': 0,
 'give': 1,
 'throughout': 2,
 'charitable': 3,
 'slumbering': 4,
 'pervert': 5,
 'scissors': 6,
 'apprehend': 7,
 'successively': 8,
 'industriously': 9,
 'accent': 10,
 'attentive': 11,
 'swell': 12,
 '_was_': 13,
 'destination': 14,
 'measured': 15,
 'prevalent': 16,
 'systems': 17,
 ";'": 18,
 'airing': 19,
 'gallantry': 20,
 'sofa': 21,
 'slice': 22,
 'sentences': 23,
 'honour': 24,
 'qualify': 25,
 'ceased': 26,
 'reign': 27,
 'deplore': 28,
 'bid': 29,
 'articles': 30,
 'witty': 31,
 'shade': 32,
 'baking': 33,
 'lashes': 34,
 '_first_': 35,
 'glance': 36,
 'wore': 37,
 'indistinct': 38,
 'liquor': 39,
 'wondering': 40,
 'cloud': 41,
 'harry': 42,
 'are': 43,
 'unsettled': 44,
 'classed': 45,
 'accord': 46,
 'gates': 47,
 'consisting': 48,
 'merely': 49,
 'gipsy': 50,
 'heroism': 51,
 'daughters': 52,
 'managed': 53,
 'cases': 54,
 'unlikely': 55,
 'raising': 56,
 'inch': 57,
 'urbanity': 58,
 'emma': 59,
 'freshness': 60,
 'talker': 61,
 'advisable': 62,
 'insinua

In [30]:
config_lists = []
for i in range(0, len(emma) - 3):
    config_lists.append(list(emma[i: i + 4]))
    

In [31]:
len(emma)

192427

In [34]:
np.array(config_lists)

array([['[', 'Emma', 'by', 'Jane'],
       ['Emma', 'by', 'Jane', 'Austen'],
       ['by', 'Jane', 'Austen', '1816'],
       ...,
       ['happiness', 'of', 'the', 'union'],
       ['of', 'the', 'union', '.'],
       ['the', 'union', '.', 'FINIS']], dtype='<U17')

'.'