In [159]:
# Fetch all the packages
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
from typing import List

In [160]:
# Fetch all the layer types and Keras helpers that we are going to need
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [161]:
training_text_filename = "paul_graham.txt"
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text is in memory (the bare open(...).read() never closed it).
with open(training_text_filename, 'r', encoding='utf-8') as training_text_file:
    # Normalise the text: lower-case it, turn carriage returns, newlines and
    # tabs into spaces, and drop quotes and parentheses.
    training_raw_text = training_text_file\
        .read()\
        .lower()\
        .replace("\r"," ")\
        .replace("\n"," ")\
        .replace("\t"," ")\
        .replace("\"", "")\
        .replace("\'", "")\
        .replace("(", "")\
        .replace(")", "")

In [162]:
# Check that the data was actually read correctly: show only the first 200
# characters of the cleaned text rather than dumping the whole corpus.
training_raw_text[:200]

In [163]:
# Build the vocabulary: every distinct space-separated token of the corpus,
# sorted so that a token's id is simply its position in `tokens`, plus the
# reverse lookup from token to id.
tokens = sorted(set(training_raw_text.split(' ')))
token_to_id = {token: token_id for token_id, token in enumerate(tokens)}

In [164]:
def tokenize_string(a_string: str) -> List[str]:
    """Lower-case ``a_string``, normalise it and split it on single spaces.

    Carriage returns, newlines and tabs each become one space; double quotes,
    single quotes and parentheses are removed. The split uses the literal
    separator ``' '``, so consecutive spaces yield empty-string tokens.

    Args:
        a_string: The text to tokenise.

    Returns:
        The list of tokens, in order of appearance.
    """
    # One translation table covers every single-character substitution.
    character_map = {ord(whitespace): " " for whitespace in "\r\n\t"}
    character_map.update({ord(dropped): None for dropped in "\"'()"})
    normalised = a_string.lower().translate(character_map)
    return normalised.split(' ')

# Tokenise the corpus, translate every token into its vocabulary id, and
# report how many tokens the corpus contains.
training_tokens = tokenize_string(training_raw_text)
training_token_ids = list(map(token_to_id.__getitem__, training_tokens))
print(len(training_token_ids))

23541


In [165]:
# Corpus size in tokens and vocabulary size (number of distinct tokens).
# Note: the second label says "characters", but the value counts word tokens.
training_length, unique_tokens = len(training_token_ids), len(tokens)
print("training_length:", training_length)
print("training_unique_characters_count:", unique_tokens)

training_length: 23541
training_unique_characters_count: 3977


In [168]:
# Number of preceding words the model receives as input for each prediction
input_sequence_length = 20

In [169]:
# Build the training patterns with a sliding window over the token ids.
# Example with an input sequence length of 3 and the text a b c d e f g:
#   [a b c] => d, [b c d] => e, [c d e] => f, and so on ...
#   inputs = [abc, bcd, cde]; targets = [d, e, f]
# Each (input window, target) pair is called a pattern.
# The windows are sliced straight from `training_token_ids`, which already
# holds the id of every token, instead of looking each token up in
# `token_to_id` again for every window it appears in.
window_starts = range(training_length - input_sequence_length)
inputs = [
    training_token_ids[start:start + input_sequence_length]
    for start in window_starts
]
targets = [
    training_token_ids[start + input_sequence_length]
    for start in window_starts
]
pattern_count = len(inputs)
pattern_count

23521

In [170]:
# Show the first training pattern: its input token ids and its target token id
print("EXAMPLE")
print("in:", inputs[0], "out", targets[0])

EXAMPLE
in: [3496, 288, 3646, 1026, 3817, 3565, 387, 2611, 2210, 2401, 2749, 242, 509, 137, 1822, 2211, 288, 3616, 931, 600] out 113


In [26]:
# --------------------------------
# FUN TIME - FORMAT DATA FOR MODEL
# --------------------------------

In [171]:
# Shape the patterns as [samples, timesteps, features] for the LSTM. The
# trailing 1 is the feature axis: every timestep carries a single number, the
# token id. Dividing by the vocabulary size scales the ids into [0, 1).
ready_input_data = (
    np.asarray(inputs).reshape(pattern_count, input_sequence_length, 1)
    / len(tokens)
)

In [172]:
# Example of a prepared input: the first 10 timesteps of the first pattern
ready_input_data[0, :10]

array([[0.87905456],
       [0.07241639],
       [0.91677144],
       [0.2579834 ],
       [0.95976867],
       [0.89640432],
       [0.09730953],
       [0.65652502],
       [0.55569525],
       [0.6037214 ]])

In [173]:
# One-hot encode the targets. The class count is pinned to the vocabulary size
# so the matrix always has one column per token id; without num_classes,
# to_categorical sizes the matrix from max(targets) + 1, which is smaller than
# the vocabulary whenever the highest ids never occur as a target.
ready_target_data = to_categorical(targets, num_classes=len(tokens))

In [174]:
# Example of the prepared data: the shape of the input tensor, followed by the
# one-hot target matrix (each row is all zeros except for a single 1 in the
# column of the target token id)
print(ready_input_data.shape)
ready_target_data

(23521, 20, 1)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [31]:
# ----------------------------------------------
# REAL FUN TIME - DEFINE MODEL AAAAND SEEEND IT
# ----------------------------------------------

In [194]:
# Next-token classifier: an LSTM reads the window of scaled token ids, then a
# small dense head produces a softmax over the one-hot target classes.
# The hidden Dense layers use ReLU: with 'linear' activations the stacked Dense
# layers collapse into a single linear map and add no modelling capacity.
text_generation_model = Sequential([
    LSTM(64, input_shape=ready_input_data.shape[1:]),  # (timesteps, features)
    Dropout(0.20),
    Dense(256, activation='relu'),
    Dense(32, activation='relu'),
    Dense(ready_target_data.shape[1], activation='softmax')
])

In [195]:
# Configure training: Adam optimiser with categorical cross-entropy, which
# pairs with the one-hot targets and the softmax output layer.
text_generation_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)
# Optional warm start (currently disabled): reload saved weights if they exist
# if os.path.isfile('text-gen-words-weights.h5'):
#     text_generation_model.load_weights('text-gen-words-weights.h5')

In [196]:
# Train the model. The call sits at top level: it used to be indented beneath
# a commented-out `if not os.path.isfile(...)` guard, which raises an
# IndentationError when the cell is run on a fresh kernel.
text_generation_model.fit(
    ready_input_data,
    ready_target_data,
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0c098c1ef0>

In [178]:
# Persist the trained network twice: once as weights only, once as a full model
text_generation_model.save_weights('text-gen-words-weights.h5')  # load weights with model.load_weights(filename)
text_generation_model.save('text-gen-words-model.h5')

In [20]:
# ----------------------------------------------
# EVEN MORE REAL FUN TIME - GEN TEXT!!!
# ----------------------------------------------

In [197]:
# Generate text greedily: start from a random training window, repeatedly ask
# the model for the most probable next token, and slide the window forward.

# `high` is exclusive in np.random.randint, so len(inputs) (not len(inputs) - 1)
# is needed for the last pattern to be selectable as a seed.
input_id = np.random.randint(0, len(inputs))
seed = inputs[input_id]
# Work on a copy: the window is updated below, and aliasing `seed` would append
# to the list stored inside `inputs` and corrupt the training data.
input_token_ids = list(seed)
complete_string = ' '.join([tokens[val] for val in seed])
print("SEED:", complete_string)
for i in range(100):
    # Same preprocessing as the training inputs: [1, timesteps, 1], scaled by
    # the vocabulary size.
    input_sequence = np.reshape(
        input_token_ids,
        (1, input_sequence_length, 1)
    ) / len(tokens)
    output_vector = text_generation_model.predict(input_sequence)
    # Greedy decoding: take the class with the highest predicted probability.
    next_token_id = int(np.argmax(output_vector))
    next_token = tokens[next_token_id]
    # Slide the window: drop the oldest id and add the newly predicted one.
    input_token_ids = input_token_ids[1:] + [next_token_id]
    complete_string = complete_string + " " + next_token
print("GENERATION:", complete_string)

SEED: get the first commitment.  the biggest factor in most investors opinions of you is the opinion of other investors.
GENERATION: get the first commitment.  the biggest factor in most investors opinions of you is the opinion of other investors. to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to
