In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
### YOUR CODE HERE
from tensorflow.keras.regularizers import L1, L2, L1L2
###
import tensorflow.keras.utils as ku 
import numpy as np 

# Data Processing
## Import Data

In [41]:
!wget https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /home/jupyter/+data/sonnets.txt

--2021-06-15 04:07:34--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.192.128, 209.85.146.128, 142.250.125.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.192.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘/home/jupyter/+data/sonnets.txt’


2021-06-15 04:07:34 (93.3 MB/s) - ‘/home/jupyter/+data/sonnets.txt’ saved [93578/93578]



In [42]:
# Read the whole corpus inside a context manager so the file handle is closed
# deterministically instead of being left open for the garbage collector.
# The encoding is spelled out so the result does not depend on the kernel's locale.
with open("/home/jupyter/+data/sonnets.txt", encoding="utf-8") as sonnets_file:
    data = sonnets_file.read()

# Lower-case the text and keep one corpus entry per line of the file.
corpus = data.lower().split("\n")

## Tokenize, n-gram and pad

In [43]:
# Learn the word -> integer id vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot: Keras word ids start at 1 and id 0 is
# reserved (it is the value pad_sequences fills with).
total_words = 1 + len(tokenizer.word_index)

In [44]:
# Encode each corpus line as a list of word ids.
sequences = tokenizer.texts_to_sequences(corpus)

# Expand every line into all of its prefixes of length >= 2, so each prefix
# supplies at least one context token plus the token to predict.
n_gram = [
    line_ids[:end]
    for line_ids in sequences
    for end in range(2, len(line_ids) + 1)
]

# Bring every prefix to the length of the longest one. pad_sequences pads at
# the front by default, which keeps the token to predict in the last column.
max_sequence_len = max(map(len, n_gram))
padded = pad_sequences(n_gram, maxlen=max_sequence_len)

In [45]:
# Inputs: every column except the last one of each padded n-gram.
x = padded[:, :-1]
# Targets: the last column (the word to predict), one-hot encoded over the vocabulary.
y = ku.to_categorical(padded[:, -1], num_classes=total_words)

# Model

In [46]:
# Output dimension passed to the Embedding layer of the model below.
embedding_dim = 100

In [47]:
# Next-word predictor, assembled layer by layer:
# embedding -> bidirectional LSTM -> dropout -> LSTM -> dense (ReLU) -> softmax over the vocabulary.
model = Sequential()

# Each input row has max_sequence_len-1 tokens because the final token of
# every padded n-gram was split off as the label.
model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len-1))

# return_sequences=True keeps the per-timestep outputs so the next LSTM receives a sequence.
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))

# Hidden layer half the vocabulary wide; only the l1 penalty of L1L2 is set here.
model.add(Dense(total_words//2, activation="relu", kernel_regularizer=L1L2(l1=0.01)))

# One output probability per vocabulary id.
model.add(Dense(total_words, activation="softmax"))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 100)           321100    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 10, 300)           301200    
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 300)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               160400    
_________________________________________________________________
dense_8 (Dense)              (None, 1605)              162105    
_________________________________________________________________
dense_9 (Dense)              (None, 3211)              5156866   
Total params: 6,101,671
Trainable params: 6,101,671
Non-trainable params: 0
____________________________________________

In [18]:
# Categorical cross-entropy matches the one-hot targets produced by to_categorical.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["acc"],
)

In [19]:
# Train on the full n-gram set; the returned History object is kept in `history`.
history = model.fit(
    x,
    y,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [39]:
# Generate text by repeatedly predicting the most likely next word and
# appending it to the seed.
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 50

# Inverse of tokenizer.word_index: predicted class id -> word.
reverse_index = {v: k for k, v in tokenizer.word_index.items()}

for _ in range(next_words):
    tokens = tokenizer.texts_to_sequences([seed_text])[0]
    # Use a dedicated name so the training matrix `padded` built earlier is not
    # overwritten (re-running the x/y cell afterwards would otherwise see a
    # single-row array). pad_sequences truncates from the front by default, so
    # only the most recent max_sequence_len-1 tokens are fed to the model.
    seed_padded = pad_sequences([tokens], maxlen=max_sequence_len-1)
    # Sequential.predict_classes was removed from TensorFlow; take the argmax of
    # the softmax output instead. verbose=0 avoids one progress bar per word.
    probabilities = model.predict(seed_padded, verbose=0)
    predicted_id = int(np.argmax(probabilities, axis=-1)[0])
    # Id 0 is the padding slot and has no entry in reverse_index; stop instead
    # of raising KeyError, since the input would not change on the next pass.
    word = reverse_index.get(predicted_id)
    if word is None:
        break
    seed_text += " " + word
print(seed_text)

Help me Obi Wan Kenobi, you're my only hope the sweet love all the i love doth in thou wilt thou i i than then i am thou eye in the time to in thou i i love i in thy i love shall see thou than thou wilt thou wilt thou i and thou wilt all the i
