In [27]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Activation
import datetime

In [28]:
# Log file that receives the per-epoch sample names; it is closed by the
# results.close() cell at the end of the notebook.
# UTF-8 is requested explicitly because the vocabulary contains 'é', which a
# non-UTF-8 platform default encoding may be unable to write.
results = open('results.txt', 'w', encoding='utf-8')

# Prepare data

In [29]:
def remove_non_alpha(text):
    """Return `text` with every character removed except letters and spaces.

    "Letter" means whatever `str.isalpha()` accepts, so accented characters
    such as 'é' are kept while digits, punctuation and symbols are dropped.
    """
    kept = []
    for ch in text:
        if ch.isalpha() or ch == ' ':
            kept.append(ch)
    return ''.join(kept)

In [30]:
# Load the names, keep only letters/spaces, and lowercase them.
pokemon_df = pd.read_csv('data/pokemon.csv')
cleaned_names = pokemon_df['name'].apply(remove_non_alpha)
pokemon_names = cleaned_names.str.lower().tolist()

# The training corpus is every name concatenated back to back (no separator).
data = ''.join(pokemon_names)

# Vocabulary = the distinct characters that occur in the corpus.
chars = list(set(data))
data_size = len(data)
vocab_size = len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 5968 total characters and 28 unique characters in your data.


In [31]:
# Sort the vocabulary once so every character gets a stable index, then build
# the index->char table and invert it for char->index lookups.
vocab_sorted = sorted(chars)
ix_to_char = dict(enumerate(vocab_sorted))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}
print(ix_to_char)

{0: ' ', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 27: 'é'}


# Keras Format
Expected format: (batch_size, timesteps, input_dim)
Source: https://keras.io/layers/recurrent/

In [32]:
# The training window length is the length of the longest cleaned name.
longest_name = max(pokemon_names, key=len)
SEQ_LENGTH = len(longest_name)
print(longest_name, SEQ_LENGTH)

crabominable 12


In [33]:
# Cut the corpus into consecutive, non-overlapping windows of SEQ_LENGTH
# characters. X[i] is the one-hot encoding of window i; y[i] is the same window
# shifted one character to the right, i.e. the next-character target for every
# timestep. Note: despite its name, `batch_size` is the number of windows.
batch_size = len(data) // SEQ_LENGTH
X = np.zeros((batch_size, SEQ_LENGTH, vocab_size))
y = np.zeros((batch_size, SEQ_LENGTH, vocab_size))

for i in range(batch_size):
    start = i * SEQ_LENGTH
    stop = start + SEQ_LENGTH

    # One-hot encode the input window directly into X[i].
    for step, ch in enumerate(data[start:stop]):
        X[i, step, char_to_ix[ch]] = 1.

    # The shifted window can come up one character short at the very end of
    # the corpus; any timestep without a target stays all-zero.
    for step, ch in enumerate(data[start + 1:stop + 1]):
        y[i, step, char_to_ix[ch]] = 1.

In [34]:
# Sanity check: both arrays are allocated as (number of windows, SEQ_LENGTH, vocab_size).
print(X.shape, y.shape)

(497, 12, 28) (497, 12, 28)


# Train

In [35]:
# Three stacked LSTM layers, each vocab_size units wide and each returning the
# full sequence, followed by a per-timestep Dense + softmax over the vocabulary.
lstm_options = dict(dropout=0.3, recurrent_dropout=0.3, return_sequences=True)

model = Sequential()
# input_shape=(None, vocab_size) leaves the timestep dimension unspecified.
model.add(LSTM(vocab_size, input_shape=(None, vocab_size), **lstm_options))
for _layer in range(2):
    model.add(LSTM(vocab_size, **lstm_options))
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [36]:
# Optional warm start: uncomment to resume from weights saved by an earlier run.
# model.load_weights('<some-prev-day>.hdf5')

In [37]:
def generate_text(model, max_length, log_file=None):
    """Greedily generate a name of exactly `max_length` characters.

    The first character is drawn uniformly at random from every vocabulary
    entry except index 0 (the space character in the sorted vocabulary). Each
    following character is the argmax of the model's prediction for the last
    timestep of the sequence generated so far.

    Every generated character is also written, without a trailing newline, to
    `log_file`.

    Parameters
    ----------
    model : object with a Keras-style ``predict(batch)`` method that returns an
        array-like of shape (1, timesteps, vocab_size).
    max_length : int
        Number of characters to generate.
    log_file : writable text stream, optional
        Where to echo the characters. Defaults to the notebook-level
        `results` file.

    Returns
    -------
    str
        The generated text; identical to what was written to `log_file`.
        (Previously the return value carried one extra, never-logged character,
        making it `max_length + 1` long.)
    """
    if log_file is None:
        log_file = results

    # starting with random character
    next_ix = np.random.randint(1, vocab_size)  # don't start off with space
    generated = []
    one_hot = np.zeros((1, max_length, vocab_size))
    for i in range(max_length):
        # appending the last predicted character to sequence
        one_hot[0, i, next_ix] = 1
        current_char = ix_to_char[next_ix]
        generated.append(current_char)
        print(current_char, end="", file=log_file)
        # Only ask the model for another character if one is still needed;
        # this avoids a wasted predict() call after the final character.
        if i + 1 < max_length:
            step_predictions = np.argmax(model.predict(one_hot[:, :i+1, :])[0], 1)
            next_ix = int(step_predictions[-1])
    return ''.join(generated)

In [38]:
# Number of single-epoch fit() calls made by the training loop below.
NUM_EPOCHS = 1000

In [39]:
# Train one epoch per fit() call so that sample names can be logged to
# `results` after every pass over the data.
for epoch in range(NUM_EPOCHS):
    print('\n', file=results)
    # verbose=0 keeps Keras from printing a progress bar for each epoch
    model.fit(X, y, epochs=1, batch_size=50, verbose=0)
    print(epoch, file=results)
    # log 5 generated names per epoch, each with a random length in [3, SEQ_LENGTH)
    for _sample in range(5):
        name_length = np.random.randint(3, SEQ_LENGTH)
        generate_text(model, name_length)
        print('\n', file=results)

In [40]:
# Save the trained weights to a file named after today's date (YYYY-MM-DD.hdf5).
today = datetime.datetime.today()
now = today.strftime('%Y-%m-%d')
weights_path = '{save_name}.hdf5'.format(save_name=now)
model.save_weights(weights_path)

In [41]:
# Close the log file opened at the top of the notebook; closing flushes any buffered output.
results.close()