In [1]:
import tensorflow as tf
# Switch on TensorFlow's JIT option of the graph optimizer right after importing
# tf, i.e. before any model in this notebook is built.
tf.config.optimizer.set_jit(True)
from tensorflow import keras
from keras.layers import *
import keras_nlp
from keras import backend

import re
import requests
import numpy as np
import random
import math

In [2]:
# Load the corpus and hold out the first 20% of it as validation data.
# Both slices are taken from the untouched full corpus: slicing `text` first
# and then taking `text[:split]` would make the validation set a prefix of the
# training text instead of a disjoint hold-out.
with open("Shakespeare.txt", "r", encoding="utf-8") as datafile:
    corpus = datafile.read()

split = int(len(corpus) * 0.2)
validation_text = corpus[:split]  # first 20% of the file, never trained on
text = corpus[split:]             # remaining 80%, used as training text

In [3]:
# Character-level vocabulary: every distinct character of the training text,
# in sorted order so that the character -> id mapping is deterministic.
vocab = sorted(set(text))

# Number of bits needed to index the vocabulary plus one extra id.
embed_dim = math.ceil(math.log2(1 + len(vocab)))

In [4]:
# Id 0 is reserved for padding; real characters are numbered from 1.
mask = 0
stoi = {ch: idx for idx, ch in enumerate(vocab, start=1)}
itos = {idx: ch for ch, idx in stoi.items()}


def tokenize(s):
    """Map a string to the list of its character ids (KeyError on unknown characters)."""
    return [stoi[c] for c in s]


def detokenize(l):
    """Map an iterable of character ids back to the string they encode."""
    return "".join(itos[i] for i in l)


tokenized_text = tokenize(text)
text_size = len(tokenized_text)
vocab_size = len(vocab)
print(tokenized_text[:20])

[19, 48, 57, 58, 59, 2, 16, 48, 59, 48, 65, 44, 53, 11, 1, 15, 44, 45, 54, 57]


In [5]:
def get_train_sample(input_size):
    """Draw one random (context, next-token) training pair from the corpus.

    A target position is chosen uniformly over ``tokenized_text``. The context
    is made of the (at most ``input_size``) tokens directly preceding that
    position and is left-padded with ``mask`` when fewer tokens are available.

    Args:
        input_size: length of the context window.

    Returns:
        A tuple ``(context, target)`` where ``context`` is a list of token ids
        and ``target`` is the token id found at the chosen position.
    """
    target_pos = random.randint(0, text_size - 1)  # randint is inclusive on both ends
    context = tokenized_text[max(target_pos - input_size, 0):target_pos]

    missing = input_size - len(context)
    if missing > 0:
        # Too close to the start of the corpus: pad on the left.
        context = [mask] * missing + context

    return context, tokenized_text[target_pos]


def get_train_epoch(epoch_size, input_size):
    """Sample a batch of training pairs and pack them into tensors.

    Args:
        epoch_size: number of (context, target) pairs to draw.
        input_size: context length passed on to ``get_train_sample``.

    Returns:
        A tuple ``(X, Y)``: ``X`` is reshaped to ``(epoch_size, input_size)``
        and holds the contexts, ``Y`` is reshaped to ``(epoch_size,)`` and
        holds the matching target ids.
    """
    pairs = [get_train_sample(input_size) for _ in range(epoch_size)]
    contexts = [context for context, _ in pairs]
    targets = [target for _, target in pairs]

    X = tf.reshape(tf.constant(contexts), (len(contexts), input_size))
    Y = tf.reshape(tf.constant(targets), (len(targets), ))
    return X, Y

In [6]:
# Smoke test of the sampler: two samples with a context length of five tokens.
get_train_epoch(epoch_size=2, input_size=5)

(<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
 array([[51, 48, 40, 53, 59],
        [ 2, 47, 44, 57, 44]])>,
 <tf.Tensor: shape=(2,), dtype=int32, numpy=array([ 2, 13])>)

In [11]:
def build_model(input_size=128, embed_dim=8):
    """Build and compile the next-character prediction network.

    Architecture: token embedding -> three stacked 128-unit LSTM layers (the
    first two return full sequences, the last one only its final output) ->
    128-unit dense layer -> softmax over ``vocab_size + 1`` ids.

    Args:
        input_size: number of token ids in each input sequence.
        embed_dim: width of the embedding vectors.

    Returns:
        A ``keras.Model`` compiled with sparse categorical cross-entropy,
        Adam (learning rate 0.001) and an accuracy metric.
    """
    # One class per vocabulary id plus id 0, which the tokenizer reserves as `mask`.
    num_classes = vocab_size + 1

    token_ids = Input(shape=(input_size,))
    hidden = Embedding(num_classes, embed_dim)(token_ids)

    # Only the last recurrent layer collapses the time dimension.
    for keep_sequence in (True, True, False):
        hidden = LSTM(128, activation="leaky_relu", return_sequences=keep_sequence)(hidden)

    hidden = Dense(128, activation="leaky_relu")(hidden)
    probabilities = Dense(num_classes, activation="softmax")(hidden)

    model = keras.Model(inputs=token_ids, outputs=probabilities)
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
    )
    return model

# Context length (in tokens) used for model construction, training and generation.
input_size = 128

# The embedding width is passed as the literal 8; the module-level `embed_dim`
# derived from the vocabulary size is not used here.
model = build_model(embed_dim=8, input_size=input_size)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 128, 8)            528       
                                                                 
 lstm_6 (LSTM)               (None, 128, 128)          70144     
                                                                 
 lstm_7 (LSTM)               (None, 128, 128)          131584    
                                                                 
 lstm_8 (LSTM)               (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dense_5 (Dense)             (None, 66)                8514

In [12]:
# Train for 100 rounds. Every round draws a fresh random set of 4096*8 samples
# from the corpus and runs a single Keras epoch over it.
samples_per_round = 4096 * 8
for i in range(100):
    X, Y = get_train_epoch(samples_per_round, input_size)
    model.fit(X, Y, batch_size=256, epochs=1, shuffle=True)

 10/128 [=>............................] - ETA: 4:53 - loss: 3.3008 - accuracy: 0.1625

KeyboardInterrupt: 

In [16]:
# Generate up to 256 characters that continue a fixed prompt, sampling each
# next character from the model's 5 most probable candidates.
#
# The prompt lives in its own variable: rebinding the global `text` (the
# training corpus that the vocabulary/tokenizer cells read) would silently
# corrupt those cells if they were re-run after this one.
prompt = """First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their"""

num_new_chars = 256
num_candidates = 5

generated_text = prompt
# Tokenize the prompt once and extend the id list as characters are sampled,
# instead of re-tokenizing the whole growing string on every step.
context = tokenize(prompt)

print(prompt, end="")
for _ in range(num_new_chars):
    # Keep only the last `input_size` ids and left-pad with `mask` if shorter,
    # mirroring how training samples are built.
    context = context[max(0, len(context) - input_size):]
    padded = [mask] * (input_size - len(context)) + context

    batch = tf.reshape(tf.constant(padded), (1, input_size))
    probabilities = model(batch, training=False)[0]

    top_k = tf.math.top_k(probabilities, num_candidates)
    candidate_ids = top_k.indices.numpy()
    # Renormalise in float64 so the weights handed to NumPy sum to 1 as
    # accurately as possible.
    weights = top_k.values.numpy().astype(np.float64)
    next_id = int(np.random.choice(candidate_ids, p=weights / weights.sum()))

    # Stop when the model emits the padding id or an id outside the vocabulary.
    if next_id == mask or next_id > vocab_size:
        print("\nSTOPED\n")
        break

    next_char = detokenize([next_id])
    print(next_char, end="")
    generated_text = generated_text + next_char
    context.append(next_id)

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their ooh    e ho

KeyboardInterrupt: 