In [6]:
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import itertools
import random
import string
import io

In [45]:
# Vocabulary: the 26 lowercase letters, the punctuation allowed in names
# ('.', '-', ' '), and '+', the marker make_subsequences appends to each name.
character_list = [*string.ascii_lowercase, ".", "-", " ", "+"]

# Character -> integer id, numbered from 0 in vocabulary order.
character_lookup = {symbol: position for position, symbol in enumerate(character_list)}

# Number of preceding characters the model sees when predicting the next one.
max_length = 10

# Width of the one-hot encoding (one class per vocabulary entry).
num_characters = len(character_lookup)

In [37]:
# CSV header -> short column name. Renaming by header (instead of assigning a
# positional list to .columns) stays correct whatever the column order in the
# file: read_csv ignores the order in which `usecols` is given.
column_names = {
    "Animal's Name": "name",
    'Species': "species",
    'Primary Breed': "primary_breed",
    'Secondary Breed': "secondary_breed",
}

pet_data = pd.read_csv(
    "seattle_pet_licenses.csv",
    usecols=list(column_names),
    dtype=dict.fromkeys(column_names, str),
    keep_default_na=False,  # blanks stay '' rather than NaN, so every cell is a str
)
pet_data = pet_data.rename(columns=column_names)[list(column_names.values())]

# Lower-case every column so names fit the lowercase-only vocabulary.
pet_data = pet_data.apply(lambda column: column.str.lower())

# Keep rows that have a species and whose name uses only spaces, dots, hyphens
# and a-z. The '+' quantifier already rejects empty names.
has_species = pet_data["species"] != ''
has_valid_name = pet_data["name"].str.match("^[ \\.a-z-]+$")
pet_data = pet_data[has_species & has_valid_name]

In [39]:
def make_subsequences(name, **kwargs):
    """Expand a name into all of its prefixes, shortest first.

    A '+' marker is appended to ``name`` before expanding, so the last prefix
    is the whole name followed by '+'.

    Args:
        name: The string to expand.
        **kwargs: Extra fields copied into every returned record.

    Returns:
        A list of ``len(name) + 1`` dicts ordered by increasing prefix length.
        Each dict contains the keyword arguments plus a ``'subsequence'`` key
        holding the prefix as a list of single characters. A ``subsequence``
        keyword argument, if given, is overridden by the prefix.
    """
    characters = name + '+'
    return [
        {**kwargs, 'subsequence': list(characters[:end])}
        for end in range(1, len(characters) + 1)
    ]

# Expand every pet name into its prefix records and flatten them into a single
# list. Iterating the two columns together avoids building a row Series with
# .iloc twice per pet, and yields the same records in the same order.
subsequences = list(
    itertools.chain.from_iterable(
        make_subsequences(name, species=species)
        for name, species in zip(pet_data["name"], pet_data["species"])
    )
)

# Shuffle in place so prefixes of the same name are not adjacent.
random.shuffle(subsequences)

In [61]:
def characters_to_matrix(character_data):
    """One-hot encode character sequences into a fixed-width 3-D array.

    Each sequence is mapped through ``character_lookup``, left-padded (or
    truncated from the front) to ``max_length + 1`` positions, and one-hot
    encoded over ``num_characters`` classes.

    Padding positions are returned as all-zero rows. ``pad_sequences`` fills
    with 0, which is also the id ``character_lookup`` gives to 'a'; encoding
    the padded ids directly would therefore make padding indistinguishable
    from 'a' (for example ['b'] and ['a', 'b'] would produce the same matrix).
    To avoid that, ids are shifted up by one before padding and the column
    reserved for padding is dropped afterwards. Columns for real characters
    are unchanged.

    Any code that feeds a model trained on this output must encode its input
    the same way.

    Args:
        character_data: Iterable of character sequences; every character must
            be a key of ``character_lookup``.

    Returns:
        Array of shape ``(len(character_data), max_length + 1, num_characters)``.

    Raises:
        KeyError: If a character is not present in ``character_lookup``.
    """
    # Shift ids to 1..num_characters so that 0 is free to mean "padding".
    shifted_ids = [
        [character_lookup[character] + 1 for character in sequence]
        for sequence in character_data
    ]
    padded_ids = keras.utils.pad_sequences(shifted_ids, maxlen=max_length + 1)
    one_hot = keras.utils.to_categorical(padded_ids, num_classes=num_characters + 1)
    # Drop the padding class: padded positions become all-zero rows and the
    # remaining columns line up with the original 0-based character ids.
    return one_hot[:, :, 1:]

# One-hot encode the character list of every shuffled record.
character_data = [record['subsequence'] for record in subsequences]
text_matrix = characters_to_matrix(character_data)

# The first max_length positions are the model input and the final position is
# the character to predict. Plain slices return views of text_matrix, whereas
# indexing with range(max_length) would copy the whole array.
x_name = text_matrix[:, :max_length, :]
y_name = text_matrix[:, max_length, :]

In [58]:
# Next-character model: two stacked 32-unit LSTMs over the one-hot window,
# dropout, then a softmax over the vocabulary.
model = keras.Sequential()
model.add(keras.Input(shape=(max_length, num_characters)))
# The first LSTM returns its full output sequence so the second can consume it.
model.add(layers.LSTM(32, return_sequences=True))
model.add(layers.LSTM(32))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(num_characters, activation="softmax"))

optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")

In [62]:
# Train on the encoded windows. The call is left as the cell's last expression
# so the returned History object is displayed.
model.fit(
    x=x_name,
    y=y_name,
    epochs=25,
    batch_size=64,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x2b732a340d0>

In [60]:
# Write the model to model.h5 in the working directory.
# NOTE(review): the recorded execution counts show this cell (In [60]) was last
# run before the training cell (In [62]); re-run it after training so the file
# on disk holds the fitted weights.
model.save(filepath="model.h5")