In [1]:
# Fetch all the packages
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
from typing import List

In [2]:
# Import the Keras model, layer, callback and utility classes that we are going to need
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [3]:
# Load the training corpus and normalise it: lower-case everything, turn
# carriage returns, newlines and tabs into spaces, and strip double quotes,
# single quotes and parentheses.
training_text_filename = "paul_graham.txt"
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until the garbage collector gets to it.
with open(training_text_filename, 'r', encoding='utf-8') as training_text_file:
    training_raw_text = (
        training_text_file.read()
        .lower()
        .replace("\r", " ")
        .replace("\n", " ")
        .replace("\t", " ")
        .replace("\"", "")
        .replace("\'", "")
        .replace("(", "")
        .replace(")", "")
    )

In [24]:
# Check that the data was actually read correctly: report how many characters
# were loaded and display a short preview of the normalised text.
# (The previous self-assignment was a no-op and verified nothing.)
print("characters read:", len(training_raw_text))
training_raw_text[:300]

In [27]:
# Build the vocabulary: every distinct token of the training text in sorted
# order, plus the token -> token_id lookup. The token_id -> token direction
# is plain indexing into `tokens`.
# NOTE: tokenize_string is defined in a later cell of this notebook; that
# cell has to be executed before this one on a fresh kernel.
tokens = sorted(set(tokenize_string(training_raw_text)))
token_to_id = {token: token_id for token_id, token in enumerate(tokens)}

In [30]:
def tokenize_string(a_string: str, stop_words_path: str = "stop_words.txt") -> List[str]:
    """Split text into tokens, gluing stop words onto the word that follows them.

    The text is lower-cased, carriage returns/newlines/tabs become spaces,
    and double quotes, single quotes and parentheses are removed. It is then
    split on whitespace. Consecutive stop words are accumulated and emitted
    together with the next non-stop word as one space-separated token.

    Args:
        a_string: The text to tokenize.
        stop_words_path: Path of a text file holding one stop word per line.
            Defaults to "stop_words.txt" in the working directory.

    Returns:
        The tokens in their original order. If the text ends with one or
        more stop words, they form the final token instead of being dropped.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # Close the file deterministically; a set makes each membership test
    # constant-time instead of a scan over the whole stop-word list.
    with open(stop_words_path, "r", encoding="utf-8") as stop_words_file:
        stop_words = set(stop_words_file.read().split('\n'))

    words = (
        a_string
        .lower()
        .replace("\r", " ")
        .replace("\n", " ")
        .replace("\t", " ")
        .replace("\"", "")
        .replace("\'", "")
        .replace("(", "")
        .replace(")", "")
        .split()
    )

    ret_list = []
    pending_words = []  # stop words waiting for the next non-stop word
    for word in words:
        pending_words.append(word)
        if word not in stop_words:
            ret_list.append(" ".join(pending_words))
            pending_words = []
    # Flush stop words left over at the very end of the text so that no
    # input is silently discarded.
    if pending_words:
        ret_list.append(" ".join(pending_words))
    return ret_list

# Tokenise the whole training text, then translate every token into its
# vocabulary id via the token_to_id lookup.
training_tokens = tokenize_string(training_raw_text)
training_token_ids = list(map(token_to_id.__getitem__, training_tokens))
# Number of tokens in the training sequence
print(len(training_token_ids))

17868


In [31]:
# Summarise the corpus: total number of tokens and size of the vocabulary.
# Note that the second printed label says "characters", but the value it
# reports is the number of distinct tokens.
training_length, unique_tokens = len(training_token_ids), len(tokens)
print("training_length:", training_length)
print("training_unique_characters_count:", unique_tokens)

training_length: 17868
training_unique_characters_count: 5745


In [32]:
# Number of preceding tokens the model receives as input for each prediction
input_sequence_length = 20

In [33]:
# Build the training patterns with a sliding window over the token sequence.
# Example with a window of 3 over the text  a b c d e f g:
#   [a b c] -> d, [b c d] -> e, [c d e] -> f, and so on
# giving inputs = [abc, bcd, cde, ...] and targets = [d, e, f, ...].
# Each (input window, target) pair is called a pattern.
window_starts = range(training_length - input_sequence_length)
inputs = [
    [token_to_id[token] for token in training_tokens[start:start + input_sequence_length]]
    for start in window_starts
]
targets = [
    token_to_id[training_tokens[start + input_sequence_length]]
    for start in window_starts
]
pattern_count = len(inputs)
pattern_count

17848

In [34]:
# Show the first pattern: its input window of token ids and its target id
first_input, first_target = inputs[0], targets[0]
print("EXAMPLE")
print("in:", first_input, "out", first_target)

EXAMPLE
in: [4836, 640, 5349, 1403, 5522, 4943, 3606, 2758, 3296, 603, 934, 2424, 2759, 640, 5332, 1330, 1077, 239, 2715, 870] out 3103


In [35]:
# --------------------------------
# FUN TIME - FORMAT DATA FOR MODEL
# --------------------------------

In [36]:
# Shape the patterns as [samples, timesteps, features]. The trailing 1 is the
# feature count: every timestep carries a single number, the token id.
# Dividing by the vocabulary size (len(tokens)) scales the ids into [0, 1).
ready_input_data = np.array(inputs).reshape(pattern_count, input_sequence_length, 1) / len(tokens)

In [37]:
# Example of a prepared input: the first 10 timesteps of the first pattern
ready_input_data[0, :10]

array([[0.84177546],
       [0.11140122],
       [0.9310705 ],
       [0.24421236],
       [0.96118364],
       [0.86040035],
       [0.62767624],
       [0.48006963],
       [0.57371628],
       [0.10496084]])

In [38]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the encoding width always equals len(tokens); when left to infer it,
# to_categorical uses max(targets) + 1, which is narrower whenever the
# highest token id never occurs as a target.
ready_target_data = to_categorical(targets, num_classes=len(tokens))

In [39]:
# Example of the prepared data: print the shape of the input array, then
# display the one-hot encoded targets, where each row is all zeros except
# for a single 1 at the index of the target token id.
print(ready_input_data.shape)
ready_target_data

(17848, 20, 1)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [40]:
# ----------------------------------------------
# REAL FUN TIME - DEFINE MODEL AAAAND SEEEND IT
# ----------------------------------------------

In [41]:
# Network: an LSTM layer reads the (timesteps, features) sequence, dropout is
# applied to its output, two Dense layers follow, and a softmax layer emits
# one probability per target class.
# NOTE(review): both hidden Dense layers use a linear activation, so together
# they amount to a single linear map - confirm that this is intended.
timesteps, features = ready_input_data.shape[1:]
text_generation_model = Sequential()
text_generation_model.add(LSTM(64, input_shape=(timesteps, features)))
text_generation_model.add(Dropout(0.20))
text_generation_model.add(Dense(256, activation='linear'))
text_generation_model.add(Dense(32, activation='linear'))
text_generation_model.add(Dense(ready_target_data.shape[1], activation='softmax'))

In [42]:
# prepare for training
text_generation_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam')
# load weights if they exist
# if os.path.isfile('text-gen-words-weights.h5'):
#     text_generation_model.load_weights('text-gen-words-weights.h5')

In [44]:
# Train the model. The guard that used to wrap this call is disabled, so the
# call has to sit at top level: leaving it indented beneath a commented-out
# `if` raises "IndentationError: unexpected indent".
# To skip training when weights already exist, restore the guard:
#     if not os.path.isfile('text-gen-words-weights.h5'):
text_generation_model.fit(
    ready_input_data,
    ready_target_data,
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,  # hold out 10% of the patterns for validation
    verbose=1
)

In [45]:
# Persist the training result twice: the weights on their own, and the
# complete model in a single file.
# NOTE(review): newer Keras releases may require the weights filename to end
# in ".weights.h5" - confirm against the installed version.
text_generation_model.save_weights('text-gen-words-weights.h5')  # load weights with model.load_weights(filename)
text_generation_model.save('text-gen-words-model.h5')

In [46]:
# ----------------------------------------------
# EVEN MORE REAL FUN TIME - GEN TEXT!!!
# ----------------------------------------------

In [47]:
# Generate text: pick a random training window as the seed, then repeatedly
# predict the most likely next token and slide the window forward by one.
generated_token_count = 100  # number of tokens to generate after the seed

# randint's upper bound is exclusive, so passing len(inputs) makes every
# pattern, including the last one, eligible as the seed.
input_id = np.random.randint(0, len(inputs))
seed = inputs[input_id]
# Work on a copy: aliasing the seed would let the window update append to the
# list stored in `inputs` and corrupt that training pattern.
input_token_ids = list(seed)
generated_tokens = [tokens[val] for val in seed]
print("SEED:", ' '.join(generated_tokens))
for _ in range(generated_token_count):
    # Same preparation as the training inputs: one sample shaped
    # (1, timesteps, 1) and scaled by the vocabulary size.
    input_sequence = np.reshape(
        input_token_ids,
        (1, input_sequence_length, 1)
    ) / len(tokens)
    # verbose=0 keeps per-call progress output from flooding the cell
    output_vector = text_generation_model.predict(input_sequence, verbose=0)
    # Greedy decoding: always take the highest-probability token
    next_token_id = int(np.argmax(output_vector))
    generated_tokens.append(tokens[next_token_id])
    # Slide the window: drop the oldest id, add the new one (builds a new list)
    input_token_ids = input_token_ids[1:] + [next_token_id]
# Join once at the end instead of concatenating strings inside the loop
complete_string = ' '.join(generated_tokens)
print("GENERATION:", complete_string)

SEED: powerful than other peoples skepticism: your own skepticism. you too will judge your early work too harshly. how do you
GENERATION: powerful than other peoples skepticism: your own skepticism. you too will judge your early work too harshly. how do you maintenance. maintenance. maintenance. abilities. abilities. abilities. abilities. abilities. heller, abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. abilities. ab