In [1]:
# Fetch all the packages
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [2]:
# Fetch all the layer types that we gonna need
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [3]:
# Load the training corpus, lower-case it and flatten it onto a single line
# (carriage returns and newlines become spaces).
# The context manager closes the file handle as soon as the text has been
# read instead of leaving it open until garbage collection.
training_text_filename = "paul_graham.txt"
with open(training_text_filename, 'r', encoding='utf-8') as training_text_file:
    training_raw_text = (
        training_text_file.read()
        .lower()
        .replace("\r", " ")
        .replace("\n", " ")
    )

In [4]:
# Sanity check: display the first 300 characters of the cleaned corpus to
# confirm the file was read and normalised (lower-cased, no line breaks).
training_raw_text[:300]

'there are two distinct ways to be politically moderate: on purpose and by accident. intentional moderates are trimmers, deliberately choosing a position mid-way between the extremes of right and left. accidental moderates end up in the middle, on average, because they make up their own minds about e'

In [5]:
# Vocabulary: every distinct character of the corpus in sorted order, plus a
# lookup table from each character to its position in that ordering.
chars = sorted(set(training_raw_text))
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

In [6]:
# Record and report the corpus length (in characters) and the vocabulary size.
training_raw_text_length, training_unique_characters_count = (
    len(training_raw_text),
    len(chars),
)
print("training_raw_text_length:", training_raw_text_length)
print("training_unique_characters_count:", training_unique_characters_count)

training_raw_text_length: 132013
training_unique_characters_count: 61


In [23]:
# Length (in characters) of each input window fed to the model.
# Example: with a window of 3 and the input "hel", a good guess for the next char is "l".
input_sequence_length = 100

In [24]:
# Turn the corpus into supervised (window -> next character) patterns.
# With a window of 3 and the text "a b c d e f g":
#   [abc] -> d, [bcd] -> e, [cde] -> f, ...
# which gives inputs = [abc, bcd, cde] and targets = [d, e, f].
# The corpus is encoded to integer ids once; windows are then sliced from it.
encoded_text = [char_to_int[symbol] for symbol in training_raw_text]
window_starts = range(training_raw_text_length - input_sequence_length)
inputs = [
    encoded_text[start:start + input_sequence_length]
    for start in window_starts
]
targets = [
    encoded_text[start + input_sequence_length]
    for start in window_starts
]
pattern_count = len(inputs)

In [25]:
# Show the first encoded input window next to the id of the character it
# should predict.
first_input, first_target = inputs[0], targets[0]
print("EXAMPLE")
print("in:", first_input, "out", first_target)

EXAMPLE
in: [51, 39, 36, 49, 36, 0, 32, 49, 36, 0, 51, 54, 46, 0, 35, 40, 50, 51, 40, 45, 34, 51, 0, 54, 32, 56, 50, 0, 51, 46, 0, 33, 36, 0, 47, 46, 43, 40, 51, 40, 34, 32, 43, 43, 56, 0, 44, 46, 35, 36, 49, 32, 51, 36, 23, 0, 46, 45, 0, 47, 52, 49, 47, 46, 50, 36, 0, 32, 45, 35, 0, 33, 56, 0, 32, 34, 34, 40, 35, 36, 45, 51, 11, 0, 40, 45, 51, 36, 45, 51, 40, 46, 45, 32, 43, 0, 44, 46, 35, 36] out 49


In [26]:
# --------------------------------
# FUN TIME - FORMAT DATA FOR MODEL
# --------------------------------

In [27]:
# Reshape to [samples, timesteps, features], the layout the LSTM layer takes.
# Each timestep carries exactly one value (the character's integer id), hence
# the trailing dimension of 1.
# Dividing by training_unique_characters_count scales the ids into the 0-1 range.
ready_input_data = (
    np.asarray(inputs).reshape((pattern_count, input_sequence_length, 1))
    / training_unique_characters_count
)

In [28]:
# Preview the first prepared sample; only its first 10 timesteps are shown.
ready_input_data[0, :10]

array([[0.83606557],
       [0.63934426],
       [0.59016393],
       [0.80327869],
       [0.59016393],
       [0.        ],
       [0.52459016],
       [0.80327869],
       [0.59016393],
       [0.        ]])

In [29]:
# One-hot encode the target character ids.
# num_classes is pinned to the vocabulary size: left unset, to_categorical
# infers the width from max(targets) + 1, which would be too narrow whenever
# the highest-indexed character never occurs as a target.
ready_target_data = to_categorical(
    targets,
    num_classes=training_unique_characters_count
)

In [30]:
# Example of the prepared target data. Targets are one-hot encoded: in each
# row the column of the target character is 1 and every other column is 0.
# The shape of the prepared input data is printed first for reference.
print(ready_input_data.shape)
ready_target_data

(131913, 100, 1)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [31]:
# ----------------------------------------------
# REAL FUN TIME - DEFINE MODEL AAAAND SEEEND IT
# ----------------------------------------------

In [32]:
# Network: a 256-unit LSTM over the character window, light dropout, a small
# ReLU layer, then a softmax with one output per one-hot target column.
window_shape = (ready_input_data.shape[1], ready_input_data.shape[2])
class_count = ready_target_data.shape[1]

text_generation_model = Sequential()
text_generation_model.add(LSTM(256, input_shape=window_shape))
text_generation_model.add(Dropout(0.05))
text_generation_model.add(Dense(32, activation='relu'))
text_generation_model.add(Dense(class_count, activation='softmax'))

In [33]:
# Configure the model for training: Adam optimizer, categorical
# cross-entropy against the one-hot targets.
text_generation_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy')
# Resume from previously saved weights when the file is present on disk.
weights_file_exists = os.path.isfile('text-gen-weights.h5')
if weights_file_exists:
    text_generation_model.load_weights('text-gen-weights.h5')

In [34]:
# Train the model. If weights were loaded from 'text-gen-weights.h5' in the
# previous cell, training continues from them.
# The call sits at top level: it used to be indented under an `if` guard that
# had been commented out, which made the cell fail with an IndentationError.
# validation_split=0.1 holds out 10% of the patterns for validation.
text_generation_model.fit(
    ready_input_data,
    ready_target_data,
    epochs=2,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f048e349320>

In [19]:
# Persist the training result in two files: the weights alone (picked up by
# the os.path.isfile / load_weights check earlier in the notebook) and the
# complete model.
text_generation_model.save_weights('text-gen-weights.h5')  # restore with model.load_weights(filename)
text_generation_model.save('text-gen-model.h5')

In [20]:
# ----------------------------------------------
# EVEN MORE REAL FUN TIME - GEN TEXT!!!
# ----------------------------------------------

In [22]:
# Generate text: start from a randomly chosen training window, then
# repeatedly predict the next character and slide the window forward by one.

# randint's upper bound is exclusive, so len(inputs) makes every window
# (including the last one) eligible as a seed.
input_id = np.random.randint(0, len(inputs))
seed = inputs[input_id]
# Work on a copy: appending to the seed list itself would grow
# inputs[input_id] in place and leave the training data ragged.
input_char_indices = list(seed)
complete_string = ''.join([chars[val] for val in seed])

print(complete_string)
for i in range(200):
    # Same layout and scaling as the training inputs: one sample of
    # input_sequence_length timesteps with ids scaled into the 0-1 range.
    input_sequence = np.reshape(
        input_char_indices,
        (1, input_sequence_length, 1)
    ) / training_unique_characters_count
    # verbose=0 keeps Keras from printing a progress bar for every character.
    output_vector = text_generation_model.predict(input_sequence, verbose=0)
    # Greedy decoding: always take the most probable character.
    next_char_index = np.argmax(output_vector)
    next_char = chars[next_char_index]
    print("next_char_index", next_char_index, "next_char", next_char)
    # Slide the window: drop the oldest character, add the predicted one.
    input_char_indices = input_char_indices[1:] + [next_char_index]
    complete_string = complete_string + next_char
print(complete_string)

y? will you be able 
next_char_index 51 next_char t
next_char_index 39 next_char h
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 39 next_char h
next_char_index 36 next_char e
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 36 next_char e
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 39 next_char h
next_char_index 36 next_char e
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 36 next_char e
next_char_index 0 next_char  
next_char_index 51 next_char t
next_char_index 46 next_char o
next_char_index 0 next_char  
next_char_index 51 next_cha