In [189]:
import glob
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import keras.utils as ku
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [163]:
def get_sentences(s):
    sentences = re.split("\.|\;|\!|\?|\:", s)
    sentences = [s.strip() for s in sentences if s != '']
    return(sentences)
    """
    Input:
        string s
    Output:
        list of strings
    """
    


def tokenize(s):
    # turns string into list of strings separated by spaces
    return s.split()
    """
    Input:
        string s
    Output:
        list of strings
    """
    
def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Input:
        string s
        boolean lowercase
        boolean strip_punctuation
    Output:
        list of strings
    """
    punctuation='.,?<>:;"\'!%'
    if isinstance(s, str):
        # if s is an instance of the string class
        s = tokenize(s)
    if lowercase:
        # if the lowercase option is true, go through each element in the list and turn the characters lowercase
        s = [t.lower() for t in s]
    if strip_punctuation:
        # get rid of the punction for each of the elements in the list
        # need to do this after splitting each word up because strip only looks at the beginning and end of a string
        s = [t.strip(punctuation) for t in s]
    return s
    
def token_frequency(tokens, tf=None, relative=False):
    # We want to have raw frequency as default because if we want to look at multiple texts, then we can add the raw frequencies together
    """
    Input: 
        tokens = list of strings or None
        tf = dict or None
        realtive = boolean
    Return:
        dictionary of tokens and frequency {t:f}
    """
    token_frequency = {} # dictionary to hold token counts
    
    # adding previous token frequency dictionary
    token_frequency={} if tf==None else tf
    if len(token_frequency) != 0 and relative==True:
        if isinstance(list(token_frequency.items())[0][1], float):
            print('warning, adding raw counts to relative frequency')
            return tf
    
    # counting up the words
    for token in tokens:
        if token in token_frequency:
            # if the token is already a key in the dictionary
            token_frequency[token] += 1
        else:
            # token is not a key in the dictionary
            token_frequency[token] = 1
    
    # changing to relative or keeping as raw frequency
    if relative:
        # relative frequency is wanted
        total_words = sum([v for k, v in token_frequency.items()]) # total number of words in the string that was input and the new tf dictionary
        return {k:v/total_words for k,v in token_frequency.items()} # go through each key, value pair in the dictionary and divide the value by the total number of words 
        # note: dictionary.items() turns the dictionary into a list of tuples ie. [(key1, value1), (key2, value2)]
    else:
        # want raw frequency
        return token_frequency

In [171]:
files = glob.glob("./shakespeare/*.txt")

# total token frequency
total_frequency = {}

# list of sentences
sentences = []

for file in files:
    # tokenize text
    text = open(file, "r").read()
    tokens = preprocess(text)
    total_frequency = token_frequency(tokens, tf=total_frequency) # update running total token frequency
    sentences += get_sentences(text)
    
# sentences is list of strings. Each entry is one sentence

sentences = [preprocess(s) for s in sentences]

# sentences is list of list of strings. Each row (first dim) is a sentences. Each col is a word in the sentence

28882


In [172]:
print(sentences[0])

['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care', 'find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant', 'and', 'breathe', 'short-winded', 'accents', 'of', 'new', 'broils', 'to', 'be', 'commenced', 'in', 'strands', 'afar', 'remote']


In [173]:
# convert words to numbers by going through each word in order and assigning it a number in a lookup table
word_lookup = {}

for i1, s in enumerate(sentences):
    for i2, w in enumerate(s):
        if w in word_lookup:
            # word seen before
            sentences[i1][i2] = word_lookup[w]
        else:
            sentences[i1][i2] = len(word_lookup)
            word_lookup[w] = len(word_lookup)

In [174]:
print(len(sentences))
sentences = [s for s in sentences if s!=[]]
print(len(sentences))
print(sentences[0])

82097
82066
[0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 14, 23, 24, 25, 26, 27, 28]


In [175]:
# find max length of sentence
max_len = max([len(s) for s in sentences])
print(max_len)

# find total number of words
total_words = len(word_lookup)
print(total_words)

160
28665


In [176]:
# pad sentences to all have same length
input_sequences = np.array(pad_sequences(sentences, maxlen=max_len, padding='pre'))

predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=total_words) # one hot encoding

In [185]:
print(input_sequences)
print('\n')
print(predictors)
print(label[0][28])

[[    0     0     0 ...    26    27    28]
 [    0     0     0 ...    40    41    42]
 [    0     0     0 ...    20    52    53]
 ...
 [    0     0     0 ...    14   389  1078]
 [    0     0     0 ...     3   111 28664]
 [    0     0     0 ... 19489  1075   986]]


[[    0     0     0 ...    25    26    27]
 [    0     0     0 ...    38    40    41]
 [    0     0     0 ...    51    20    52]
 ...
 [    0     0     0 ... 18003    14   389]
 [    0     0     0 ...  1369     3   111]
 [    0     0     0 ...     0 19489  1075]]
1.0


#### Creating Model

In [195]:
# Add Input Embedding Layer
model = Sequential()

model.add(Embedding(total_words, 10, input_length=max_len-1))

# Add Hidden Layer 1 - LSTM Layer
model.add(LSTM(100))
model.add(Dropout(0.1))

# Add Output Layer
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 159, 10)           286650    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 28665)             2895165   
Total params: 3,226,215
Trainable params: 3,226,215
Non-trainable params: 0
_________________________________________________________________
None


In [197]:
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25ee2f59488>

In [198]:
model.save("shakespeare.h5")

In [None]:
def predict(seed):
    """
    input: 
    """