In [361]:
import json
import os
import re

import keras
import numpy as np
import tensorflow as tf
from keras import Model
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.layers import GRU
from keras.layers import Input
from keras.utils import Sequence

In [4]:
# The name list contains non-ASCII characters (the vocabulary printed further
# down includes 'é', '♀' and '♂'), so decode explicitly instead of relying on
# the platform's default encoding.
# NOTE(review): assumes pokemonNames.txt is stored as UTF-8 - confirm.
with open('pokemonNames.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [363]:
# Serialize the mapping to a JSON string.
# NOTE(review): `tokenized_data` is not assigned in any visible cell; the
# output of the next cell shows a char -> index dict, so this presumably held
# the vocabulary mapping rather than the tokenized names - confirm.
xd = json.dumps(tokenized_data)

In [371]:
# Round-trip check: decode the JSON string back into a Python object.
json.loads(xd)

{'d': 0,
 'n': 1,
 'c': 2,
 'o': 3,
 'y': 4,
 "'": 5,
 'é': 6,
 'x': 7,
 '.': 8,
 'v': 9,
 'k': 10,
 '-': 11,
 'u': 12,
 'z': 13,
 ':': 14,
 'f': 15,
 'j': 16,
 'g': 17,
 '2': 18,
 'e': 19,
 'b': 20,
 '♀': 21,
 'w': 22,
 's': 23,
 'q': 24,
 '♂': 25,
 'p': 26,
 't': 27,
 'r': 28,
 'h': 29,
 'l': 30,
 'i': 31,
 'a': 32,
 'm': 33}

In [5]:
class MyBatchGenerator(Sequence):
    """Keras ``Sequence`` that serves one (input, labels) name pair per batch.

    Args:
        tokenized_data: list of preprocessed name strings.
        char_to_index: dict mapping each character to its integer id.
        shuffle: when True, the sample order is re-drawn at the end of every
            epoch (and once at construction time).

    Each item is a tuple ``(sentence, labels)`` where ``sentence`` has shape
    ``(1, timesteps, vocab)`` and ``labels`` has shape ``(1, timesteps, 1)``.
    """
    def __init__(self, tokenized_data, char_to_index, shuffle = True):
        super().__init__()
        self.x, self.y = create_x_y(tokenized_data, char_to_index)
        # One name per batch: names differ in length, so larger batches would
        # need padding and masking.
        self.batch_size = 1
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch."""
        return len(self.y) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` of the current epoch.

        The batch position is translated through ``self.indices`` so that the
        permutation drawn in ``on_epoch_end`` actually decides which sample is
        served; without this lookup the shuffle has no effect.
        """
        return self.__data_generation(self.indices[index])

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it if shuffling is enabled."""
        self.indices = np.arange(len(self.y))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, index):
        """Build the batch for sample ``index`` by adding a leading batch axis."""
        sample = self.x[index]
        target = self.y[index]
        sentence = sample.reshape(1, *sample.shape)
        # Trailing axis of size 1: one integer class id per timestep.
        labels = target.reshape(1, target.shape[0], 1)
        return sentence, labels

In [227]:
# The pipeline loads pokemonNames.txt on its own, so no argument is needed.
# NOTE(review): this cell sits above the cell that defines
# data_preprocess_pipeline; run that definition first on a fresh kernel.
tok_data = data_preprocess_pipeline()

In [235]:
def data_preprocess_pipeline(data=None):
    """Turn the raw name list into a list of cleaned, lower-cased names.

    Args:
        data: optional raw text with one name per line. When omitted, the text
            is read from ``pokemonNames.txt`` in the working directory.

    Returns:
        A list of names, lower-cased and with all whitespace removed. Lines
        that are empty once whitespace is removed are dropped.
    """
    if data is None:
        # Names contain non-ASCII characters, so decode explicitly.
        # NOTE(review): assumes the file is stored as UTF-8 - confirm.
        with open('pokemonNames.txt', 'r', encoding='utf-8') as f:
            data = f.read()
    # one candidate name per line, with every whitespace character removed
    stripped = (re.sub(r'\s+', '', line) for line in data.lower().split('\n'))
    # filter AFTER stripping so whitespace-only lines do not survive as ""
    return [name for name in stripped if name]

def create_poke_maps(text):
    """Build the character vocabulary lookup tables for ``text``.

    Characters are numbered in sorted order so the mapping is identical on
    every run. Iterating a bare ``set`` of strings can yield a different order
    in each interpreter session, which would make previously saved weights
    disagree with a freshly built mapping.

    Args:
        text: string containing every character the vocabulary must cover.

    Returns:
        Tuple ``(char_to_index, index_to_char)`` of mutually inverse dicts.
    """
    unique_chars = sorted(set(text))
    char_to_index = {c: i for i, c in enumerate(unique_chars)}
    index_to_char = {i: c for c, i in char_to_index.items()}
    return char_to_index, index_to_char

In [236]:
def char_to_one_hot_vector(char, char_to_index):
    """Return the one-hot encoding of ``char`` as a ``(1, vocab)`` float32 row.

    Args:
        char: the character to encode; must be a key of ``char_to_index``.
        char_to_index: dict mapping characters to column positions.
    """
    vocab_size = len(char_to_index)
    hot_column = char_to_index[char]
    one_hot = np.zeros((1, vocab_size), dtype = "float32")
    # the array has a single row, so setting row 0 marks the whole column
    one_hot[0, hot_column] = 1
    return one_hot

In [237]:
def word_to_OHvectors(word, char_to_index):
    """Encode ``word`` as a ``(len(word), vocab)`` matrix of one-hot rows.

    Row ``t`` of the result is the one-hot vector of the ``t``-th character.
    """
    n_steps = len(word)
    vocab_size = len(char_to_index)
    # every row is overwritten below, so the buffer needs no initialisation
    encoded = np.empty((n_steps, vocab_size))
    for step, symbol in enumerate(word):
        # the helper returns a (1, vocab) array; keep its only row
        encoded[step, :] = char_to_one_hot_vector(symbol, char_to_index)[0]
    return encoded

In [238]:
def create_x_y(preprocessed_list, char_to_index):
    """Build language-model training pairs from a list of names.

    For every name the input is the one-hot encoding of all characters but the
    last, and the label at timestep ``t`` is the id of the character at
    timestep ``t + 1``.

    Args:
        preprocessed_list: list of cleaned name strings.
        char_to_index: dict mapping each character to its integer id.

    Returns:
        Tuple ``(x, y)``: ``x`` is a 1-D object array whose elements are the
        per-name ``(timesteps, vocab)`` matrices, ``y`` is a list of 1-D
        integer label arrays, aligned with ``x``.
    """
    inputs = []
    y = []
    for name in preprocessed_list:
        # a name needs at least two characters to yield one (input, label)
        # step; shorter names would produce zero-length sequences
        if len(name) < 2:
            continue
        inputs.append(word_to_OHvectors(name[:-1], char_to_index))
        y.append(label_to_int(name[1:], char_to_index))
    # Names differ in length, so the matrices are ragged. np.array() on a
    # ragged list raises ValueError on recent NumPy versions; fill an object
    # array element by element instead.
    x = np.empty(len(inputs), dtype=object)
    for position, matrix in enumerate(inputs):
        x[position] = matrix
    return x, y

In [239]:
def label_to_int(word, char_to_index):
    """Map every character of ``word`` to its integer id.

    Returns a 1-D NumPy array with one id per character, in order.
    """
    return np.array([char_to_index[symbol] for symbol in word])

In [241]:
def make_name(model, index_to_char, char_to_index, min_seq_len = 3, max_seq_len = 15):
    """Sample a name, one character at a time, from a character-level model.

    Assumes ``model`` maps a ``(1, timesteps, vocab)`` one-hot input to
    ``(1, timesteps, vocab)`` logits, as the GRU model built by
    ``languageModel.create_char_gru_model`` does.

    Args:
        model: object with a Keras-style ``predict`` method.
        index_to_char: dict mapping integer ids to characters.
        char_to_index: dict mapping characters to integer ids.
        min_seq_len: shortest name length that may be drawn (inclusive).
        max_seq_len: longest name length that may be drawn (inclusive).

    Returns:
        The generated name as a string.
    """
    vocab_size = len(index_to_char)
    # Draw the target length uniformly from [min_seq_len, max_seq_len].
    # randint's upper bound is exclusive, hence the +1 (this also keeps
    # min_seq_len == max_seq_len from raising).
    curr_seq_len = np.random.randint(min_seq_len, max_seq_len + 1)
    # The very first prediction is seeded with a single all-zero timestep.
    seed = np.zeros((1, 1, vocab_size), dtype="float32")
    generated_name = []
    one_hot_steps = []
    for _ in range(curr_seq_len):
        # Feed the whole prefix generated so far. A non-stateful recurrent
        # layer restarts from a zero state on every predict() call, so feeding
        # only the last character would discard all earlier context.
        if one_hot_steps:
            model_input = np.concatenate(one_hot_steps, axis=0)[np.newaxis, ...]
        else:
            model_input = seed
        # keep only the last timestep, as a (1, vocab) batch of logits
        raw_logits = model.predict(model_input, verbose=0)[0, -1:, :]
        # sample the next character id from the predicted distribution
        predicted_index = int(tf.random.categorical(raw_logits, num_samples=1).numpy()[0][0])
        gen_char = index_to_char[predicted_index]
        generated_name.append(gen_char)
        # remember the (1, vocab) one-hot row for the next, longer prefix
        one_hot_steps.append(char_to_one_hot_vector(gen_char, char_to_index))
    return "".join(generated_name)

In [352]:
class languageModel:
    """Character-level GRU language model that learns to generate names.

    Attributes set by ``__init__``:
        tokenized_data: the list of names returned by ``data_pipeline``.
        _char_to_index: dict mapping each character to its integer id.
        index_to_char: dict mapping each integer id back to its character.
        model: the compiled Keras model from ``create_char_gru_model``.
    """
    def __init__(self, data_pipeline, vocab_dim, hidden_dim, learning_rate):
        """Load the data, build the vocabulary and compile the model.

        Args:
            data_pipeline: zero-argument callable returning the list of names.
            vocab_dim: number of distinct characters; must equal the size of
                the vocabulary found in the data.
            hidden_dim: number of GRU units.
            learning_rate: learning rate for the Adam optimizer.

        Raises:
            ValueError: if ``vocab_dim`` differs from the vocabulary size.
        """
        self.tokenized_data = data_pipeline()
        self._char_to_index, self.index_to_char = create_poke_maps("".join(self.tokenized_data))
        # The one-hot inputs are len(vocabulary) wide while the model expects
        # vocab_dim features; fail early instead of at the first predict/fit.
        if vocab_dim != len(self._char_to_index):
            raise ValueError(
                "vocab_dim is %d but the data contains %d distinct characters"
                % (vocab_dim, len(self._char_to_index))
            )
        self.model = self.create_char_gru_model(vocab_dim, hidden_dim, learning_rate)

    def create_char_gru_model(self, vocab_dim, hidden_dim, learning_rate):
        """Build and compile the GRU model.

        The model takes batches of exactly one sequence with any number of
        timesteps and ``vocab_dim`` features, and outputs ``vocab_dim`` raw
        logits per timestep (the Dense layer has no activation).
        """
        char_input = Input(shape = (None, vocab_dim), batch_size = 1)
        GRU_cell = GRU(hidden_dim, return_sequences=True)(char_input)
        predicted_char = keras.layers.TimeDistributed(Dense(vocab_dim))(GRU_cell)
        model = Model(inputs = char_input, outputs = predicted_char)

        optimizer = keras.optimizers.Adam(
            learning_rate = learning_rate
        )
        # from_logits=True because the output layer emits unnormalised scores
        loss = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True
        )
        # No separate metric: the loss is already reported every epoch, and a
        # bare loss object is not a valid `metrics` list.
        model.compile(
            loss=loss,
            optimizer = optimizer
        )
        return model

    def make_name(self, min_seq_len = 3, max_seq_len = 15):
        """Sample one name from the current model.

        Delegates to the module-level ``make_name`` with this instance's model
        and vocabulary maps, so sampling uses the same mapping the model was
        built with instead of a global ``char_to_index``.
        """
        return make_name(
            self.model,
            self.index_to_char,
            self._char_to_index,
            min_seq_len = min_seq_len,
            max_seq_len = max_seq_len,
        )

    def train_callback_generate_names(self, epoch, logs):
        """Epoch-end callback: print five freshly sampled names."""
        print('\n')
        for _ in range(5):
            print(self.make_name())
        print('\n')

    def fit(self, epochs):
        """Train the model for ``epochs`` epochs.

        After every epoch five sample names are printed and the weights are
        written to ``weights.<epoch>.hdf5`` in the current working directory.
        """
        genNamesDuringTraining = LambdaCallback(
            on_epoch_end=self.train_callback_generate_names
        )

        # The checkpoint callback saves once per epoch by default, so the
        # deprecated `period` argument is not needed.
        modelCheckpointTraining = ModelCheckpoint(
            filepath = os.path.join(os.path.abspath(''), "weights.{epoch:02d}.hdf5"),
            save_weights_only=True
        )

        self.model.fit(
            MyBatchGenerator(self.tokenized_data, self._char_to_index),
            epochs = epochs,
            callbacks = [genNamesDuringTraining, modelCheckpointTraining]
        )

In [359]:
# Scratch check of the checkpoint path template: os.path.abspath('') resolves
# to the current working directory; the "{epoch:02d}" placeholder is left
# unformatted here.
os.path.abspath('') + "/modelnum.{epoch:02d}.hdf5"

'/Users/khaira/Documents/Pokemon-MachineLearning-App/ML_Model_Serving/languageModel/model/modelnum.{epoch:02d}.hdf5'

In [353]:
# Build the model: 34 matches the number of distinct characters shown in the
# vocabulary dump above; 100 GRU units; Adam learning rate 0.001.
lm = languageModel(
    data_pipeline=data_preprocess_pipeline,
    vocab_dim=34,
    hidden_dim=100,
    learning_rate=0.001,
)

In [354]:
# Sample one name before any training has been run on `lm`.
lm.make_name()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3


'psb'

In [355]:
# Train for 10 epochs; five sample names are printed after each epoch.
lm.fit(10)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3

ool
mbrorav
naehelpirmkxa
ian-teotetenc
tena-itxaqiato


Epoch 2/10

oreebanacizi
oratoastci
rly
rskovnceurax
difazaniarboun


Epoch 3/10

ieog
irrorher
filllmeauramia
tabgdrishibavu
ishanotrtaara


Epoch 4/10

ushuzuokosivai
eaur
irer


KeyboardInterrupt: 

In [137]:
# Shape of the first encoded name: (timesteps, vocab).
# NOTE(review): `batchObj` is not created in any visible cell; presumably a
# MyBatchGenerator instance - confirm.
batchObj.x[0].shape

(8, 34)

In [154]:
# Scratch check: score a single all-ones timestep against class id 5.
# NOTE(review): `model` is not defined in any visible cell. Despite its name,
# `probs` is passed with from_logits=True, i.e. treated as raw logits.
probs = model.predict(np.ones((1,1,34)))[0]
keras.losses.sparse_categorical_crossentropy([5],probs, from_logits=True)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([23.683773], dtype=float32)>

In [161]:
# One row of 34 scores: the single timestep fed to the model above.
probs.shape

(1, 34)

In [210]:
# Draw one class id from the scores, interpreted as unnormalised log-probabilities.
tf.random.categorical(probs, 1).numpy()

array([[12]])