In [76]:
import numpy as np
import nltk

from collections import Counter

## Initial Configuration Format of Text
The initial format, at least for workability will need to be a list of words and punctuation. Getting into this format will be the first step of configuration that the text will undergo as it is simplest when trying to create a dictionary.

#### Example Text (Emma):
Using the `nltk` library (and for simplicity's sake) we will read in Jane Austen's *Emma*, which can conveniently be loaded in as a list of words.

In [17]:
print(nltk.corpus.gutenberg.fileids())
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [39]:
# print the first 20 words/punctuations of emma
emma[0: 20]

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

### Preprocessing
Once we have the corpus in a list, we need to provide some preprocessing options that can prepare the data for being loaded up into a neural net. Again, these are just options and could potentially improve performance. 

In [54]:
def preprocess(corpus_list, lowercase=False, stopwords_list=None):
    """Preprocess the Corpus List.
    
    Provides some simple preprocessing steps that could be beneficial for training purposes. This includes 
    an option to lowercase all words in the corpus list and to remove stopwords.
    
    Parameters
    ----------
    corpus_list: list
        List of words in the corpus.
        
    lowercase: bool
        Option to lowercase all words int he corpus_list.
        
    stopwords_list: list
        List of stopwords to remove from the corpus_list.
        
    Returns
    -------
    corpus_list: list
        The preprocessed corpus_list.
    """
    
    # lowercases words in corpus
    if lowercase:
        corpus_list = [w.lower() for w in corpus_list]
    
    # removes stopwords
    if stopwords_list is not None:
        corpus_list = [w for w in corpus_list if w not in stopwords_list]
        
    return corpus_list
        

In [87]:
preped = preprocess(emma[0:20], lowercase=True, stopwords_list= ['emma', 'volume'])

### Vocabulary Dictionary
Neural nets don't take text inputs so we need to convert the words into integers that act as key references back to the words. `create_dictionary` is this step in the process and provides a dictionary where the word acts as the key and the integer is the value.

In [79]:
def create_dictionary(preprocessed_list):
    """Create a Vocabulary Dictionary.
    
    Create a dictionary of the vocab from a list of words in a corpus. This function
    all so provides the option to preprocess on the fly.
    
    Parameters
    ----------
    preprocessed_list: list
        List of preprocessed words in the corpus.
        
    Returns
    -------
    vocab_dict: tuple
        A vocabulary dictionary => {word: int} and a reverse dictionary => {int: word}.
    """
 
    uniq_words = list(set(preprocessed_list))
    word_indexes = list(range(0, len(uniq_words)))
    
    vocab_dict = dict(zip(uniq_words, word_indexes))
    reverse_dict = dict(zip(word_indexes, uniq_words))
    
    return vocab_dict, reverse_dict
    

In [88]:
vocabs = create_dictionary(preped)

### Encoding

Since the neural net can't take text, it is necessary to map the text to a vector of word integers corresponding to the `vocab_dict`.

In [91]:
def encode_list(preprocessed_list, vocab_dict):
    """Encode Vocabulary List.
    
    Encodes the preprocessed text using the vocabulary dict.
    
    Parameters
    ----------
    preprocessed_list: list
        The preprocessed words int he corpus.
        
    vocab_dict: dict
        The vocabulary dict => {word: int}.
        
    Returns
    -------
    encoded_list: list
        The encoded version of the text list.
    """
    encoded_list = [vocab_dict[w] for w in preprocessed_list]
    
    return encoded_list

In [94]:
encoded = encode_list(preped, vocabs[0])

### Configure Arrays
The next step is to configure the encoded list into a 2-dimensional array in which the number of inputs can be specified. Since the purpose of this particular application is to create text, each row represents a sequence of `n` words where the `n + 1` word is the target. The next row is then just shifted by one.

In [93]:
def configure_arrays(encoded_list, num_inputs):
    """Configure the Encoded Data into a 2-dimensional Array

    Creates a 2-dimensional array from the encoded data of an arbitrary number of inputs. Each row 
    contains `num_inputs` + 1 values where the last value in each row represents the target value and
    the ones previous the inputs. These are just shifting rows where the first `num_inputs` are the first 
    inputs in row 1 and then for row 2 the row shifts over by 1.
    
    Parameters
    ----------
    encoded_list: list
        A list of encoded words.
        
    num_inputs: int
        The number of input values per row.
        
    Returns
    -------
    numpy.array
        A 2-dimensional array.
    """
    config_lists = []
    for i in range(0, len(encoded_list) - num_inputs):
        config_lists.append(list(encoded_list[i: i + (num_inputs + 1)]))
        
    return np.array(config_lists)

In [98]:
configure_arrays(encoded, 3)

array([[ 8,  7, 13,  2],
       [ 7, 13,  2,  6],
       [13,  2,  6,  9],
       [ 2,  6,  9,  1],
       [ 6,  9,  1,  0],
       [ 9,  1,  0,  1],
       [ 1,  0,  1,  4],
       [ 0,  1,  4, 12],
       [ 1,  4, 12, 11],
       [ 4, 12, 11, 12],
       [12, 11, 12, 10],
       [11, 12, 10, 12],
       [12, 10, 12,  3],
       [10, 12,  3,  5]])

In [156]:
class Encodings(object):
    def __init__(self, corpus_list, num_inputs, lowercase=False, stopwords_list=None):
        self.corpus_list = corpus_list
        self.num_inputs = num_inputs
        self.preprocessed = self._preprocess(self.corpus_list, lowercase=lowercase, stopwords_list=None)
        self.vocab_dict, self.reverse_dict = self._create_dictionary(self.preprocessed)
        self.encoded_list = self._encode_list(self.preprocessed, self.vocab_dict)
        self.encodings = self._configure_arrays(self.encoded_list, self.num_inputs)
        
    def _preprocess(self, corpus_list, lowercase=False, stopwords_list=None):
        """Preprocess the Corpus List.
    
        Provides some simple preprocessing steps that could be beneficial for training purposes. This includes 
        an option to lowercase all words in the corpus list and to remove stopwords.
        
        Parameters
        ----------
        corpus_list: list
            List of words in the corpus.
            
        lowercase: bool
            Option to lowercase all words int he corpus_list.
            
        stopwords_list: list
            List of stopwords to remove from the corpus_list.
            
        Returns
        -------
        corpus_list: list
            The preprocessed corpus_list.
        """
    
        # lowercases words in corpus
        if lowercase:
            corpus_list = [w.lower() for w in corpus_list]
        
        # removes stopwords
        if stopwords_list is not None:
            corpus_list = [w for w in corpus_list if w not in stopwords_list]
            
        return corpus_list
    
    def _create_dictionary(self ,preprocessed_list):
        """Create a Vocabulary Dictionary.
        
        Create a dictionary of the vocab from a list of words in a corpus. This function
        all so provides the option to preprocess on the fly.
        
        Parameters
        ----------
        preprocessed_list: list
            List of preprocessed words in the corpus.
            
        Returns
        -------
        vocab_dict: tuple
            A vocabulary dictionary => {word: int} and a reverse dictionary => {int: word}.
        """
     
        uniq_words = list(set(preprocessed_list))
        word_indexes = list(range(0, len(uniq_words)))
        
        vocab_dict = dict(zip(uniq_words, word_indexes))
        reverse_dict = dict(zip(word_indexes, uniq_words))
        
        return vocab_dict, reverse_dict
    
    def _encode_list(self, preprocessed_list, vocab_dict):
        """Encode Vocabulary List.
        
        Encodes the preprocessed text using the vocabulary dict.
        
        Parameters
        ----------
        preprocessed_list: list
            The preprocessed words int he corpus.
            
        vocab_dict: dict
            The vocabulary dict => {word: int}.
            
        Returns
        -------
        encoded_list: list
            The encoded version of the text list.
        """
        encoded_list = [vocab_dict[w] for w in preprocessed_list]
        
        return encoded_list
    
    def _configure_arrays(self, encoded_list, num_inputs):
        """Configure the Encoded Data into a 2-dimensional Array
    
        Creates a 2-dimensional array from the encoded data of an arbitrary number of inputs. Each row 
        contains `num_inputs` + 1 values where the last value in each row represents the target value and
        the ones previous the inputs. These are just shifting rows where the first `num_inputs` are the first 
        inputs in row 1 and then for row 2 the row shifts over by 1.
        
        Parameters
        ----------
        encoded_list: list
            A list of encoded words.
            
        num_inputs: int
            The number of input values per row.
            
        Returns
        -------
        numpy.array
            A 2-dimensional array.
        """
        config_lists = []
        for i in range(0, len(encoded_list) - num_inputs):
            config_lists.append(list(encoded_list[i: i + (num_inputs + 1)]))
            
        return np.array(config_lists)
            

In [159]:
e = Encodings(emma[0:20], num_inputs=2, lowercase=True, stopwords_list=['chapter'])
e.encodings

array([[ 9, 11,  7],
       [11,  7, 15],
       [ 7, 15,  2],
       [15,  2,  6],
       [ 2,  6, 10],
       [ 6, 10,  8],
       [10,  8,  1],
       [ 8,  1,  0],
       [ 1,  0,  1],
       [ 0,  1, 11],
       [ 1, 11,  4],
       [11,  4, 14],
       [ 4, 14, 13],
       [14, 13, 14],
       [13, 14, 12],
       [14, 12, 14],
       [12, 14,  3],
       [14,  3,  5]])

The next bit of information is simply just a print message to keep track of epochs.

In [129]:
print("       ________\n{\\__/}||E: 10  |\n(• ,•)||_______|\n/ > />||")

       ________
{\__/}||E: 10  |
(• ,•)||_______|
/ > />||
