Imports

In [15]:
import eng_to_ipa as ipa
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow_hub as hub
from tensorflow.keras.models import  Sequential, Model
from tensorflow.keras.layers import Layer, Concatenate, Input, Masking, LSTM, Embedding, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy


Load Data (data file already created in data_import.py)

In [35]:
file = open('words.txt','r')
lines = file.readlines()
file.close()

file = open('data.csv','w')

file.write('word,pronunciation\n')
for word in lines:
    if ipa.isin_cmu(word):
        line = word.strip('\n') + ',' + ipa.convert(word) + '\n'
        file.write(line)

file.close()

Data Processing

In [73]:
df = pd.read_csv ('data.csv',delimiter=',')

df.word = df.word.astype(str) 
df.pronunciation = df.pronunciation.astype(str) 

#df.applymap(lambda s: s.replace(s,' '.join(str(s))))
df['word'] = df['word'].str.replace('',' ')
df['pronunciation'] = df['pronunciation'].str.replace('',' ')

MAX_NUM_WORDS = 40000

# 10% for val, 10% for test, 70% for train
val_size = int(df.shape[0] * 0.1)
test_size = int(df.shape[0] * 0.1)

# Shuffle the data
df = df.sample(frac=1)
# Split df to test/val/train
test_df = df[:test_size]
val_df = df[test_size:test_size+val_size]
train_df = df[test_size+val_size:]


train_words, train_pronounciations = list(train_df.word), list(train_df.pronunciation)
val_words, val_pronounciations     = list(val_df.word), list(val_df.pronunciation)
test_words, test_pronounciations   = list(test_df.word), list(test_df.pronunciation)


# Check that idces do not overlap
assert set(train_df.index).intersection(set(val_df.index)) == set({})
assert set(test_df.index).intersection(set(train_df.index)) == set({})
assert set(val_df.index).intersection(set(test_df.index)) == set({})
# Check that all idces are present
assert df.shape[0] == len(train_pronounciations) + len(val_pronounciations) + len(test_pronounciations)

# Sizes
print(
    f"Size of initial data: {df.shape[0]}\n"
    f"Train size: {len(train_pronounciations)}\n"
    f"Val size: {len(val_pronounciations)}\n"
    f"Test size: {len(test_prounciations)}\n"
)

for i in range(len(train_pronounciations)):
    train_pronounciations[i] = "<START>" + train_pronounciations[i] + "<END>"

for i in range(len(val_pronounciations)):
    val_pronounciations[i] = "<START>" + val_pronounciations[i] + "<END>"

for i in range(len(test_pronounciations)):
    test_pronounciations[i] = "<START>" + test_pronounciations[i] + "<END>"

ipa_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '')
ipa_tokenizer.fit_on_texts(train_pronounciations)
ipa_int_seq = ipa_tokenizer.texts_to_sequences(train_pronounciations)

ipa_word_to_indx = ipa_tokenizer.word_index

max_ipa_len = max(len(sen) for sen in ipa_int_seq)

padded_tokenized_ipa = tf.keras.preprocessing.sequence.pad_sequences(ipa_int_seq, maxlen = max_ipa_len, padding = 'post', value = 0)

padded_tokenized_ipa.shape



val_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '')
val_tokenizer.fit_on_texts(val_pronounciations)
val_int_seq = val_tokenizer.texts_to_sequences(val_pronounciations)

val_word_to_indx = ipa_tokenizer.word_index

max_val_len = max(len(sen) for sen in val_int_seq)

padded_tokenized_val = tf.keras.preprocessing.sequence.pad_sequences(val_int_seq, maxlen = max_val_len, padding = 'post', value = 0)

padded_tokenized_val.shape


Size of initial data: 40315
Train size: 32253
Val size: 4031
Test size: 4031



(4031, 22)

In [74]:

train_data = tf.data.Dataset.from_tensor_slices((train_words, padded_tokenized_ipa))
valid_data = tf.data.Dataset.from_tensor_slices((val_words, padded_tokenized_val))

def str_split(e, g):
    e = tf.strings.split(e)
    return e, g
 
train_data = train_data.map(str_split)
valid_data = valid_data.map(str_split)
       
embedding_layer = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1")    
    
def embed_english(x, y):
    return embedding_layer(x), y
 
train_data = train_data.map(embed_english)
valid_data = valid_data.map(embed_english)


def remove_long_sentence(e, g):
    return tf.shape(e)[0] <= 13
 
train_data = train_data.filter(remove_long_sentence)
valid_data = valid_data.filter(remove_long_sentence)


def pad_english(e, g):
    return tf.pad(e, paddings = [[13-tf.shape(e)[0],0], [0,0]], mode='CONSTANT', constant_values=0), g
 
train_data = train_data.map(pad_english)
valid_data = valid_data.map(pad_english)

train_data = train_data.batch(16)
valid_data = valid_data.batch(16)

print(train_data.element_spec)
print(valid_data.element_spec)

for e, g in train_data.take(1):
    print(e.shape)
 
for e, g in valid_data.take(1):
    print(g)


(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(None, 24), dtype=tf.int32, name=None))
(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(None, 22), dtype=tf.int32, name=None))
(16, 13, 128)
tf.Tensor(
[[ 1  3 36 18 30 14  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3 11 19 10  4 23  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3 36 32 10  5 31  4  8 15  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3 10 25  5 23 12  7 25  5 17  2  0  0  0  0  0  0  0  0  0  0]
 [ 1 18  3 23 14  7  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3  6  4 16 35  4  6 13  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  9  7 19 31  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  6 18  9  7  9  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 11  4  8  3 23  5 31 37  4  6  2  0  0  0  0  0  0  0  0  0  0]
 [ 1  3  6 25  5 12  8 28 22  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 11  6 25  5  4  6  2  0  0  0  0

In [52]:
class CustomLayer(Layer):
 
    def __init__(self, **kwargs):
        super(CustomLayer, self).__init__(**kwargs)
        self.embed = tf.Variable(initial_value=tf.zeros(shape=(1,128)), trainable=True, dtype='float32')
         
    def call(self, inputs):
        x = tf.tile(self.embed, [tf.shape(inputs)[0], 1])
        x = tf.expand_dims(x, axis=1)
        return tf.concat([inputs, x], axis=1)


In [53]:
custom_layer = CustomLayer()
e, g = next(iter(train_data.take(1)))
print(e.shape)
o = custom_layer(e)
o.shape

(16, 13, 128)


TensorShape([16, 14, 128])

In [54]:
inputs = Input(batch_shape = (None, 13, 128), name='input')
x = CustomLayer(name='custom_layer')(inputs)
x = Masking(mask_value=0, name='masking_layer')(x)
x, h, c = LSTM(units=512, return_state=True, name='lstm')(x)
encoder_model = Model(inputs = inputs, outputs = [h, c], name='encoder')

In [80]:
#tokenizer = Tokenizer(train_words)
#pronounciation_tokenizer = Tokenizer(train_pronounciations)
#print(word_tokenizer.index_word)

{}


In [56]:
class Decoder(Model):
     
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.embed = Embedding(input_dim=len(ipa_tokenizer.index_word)+1, output_dim=128, mask_zero=True, name='embedding_layer')
        self.lstm = LSTM(units = 512, return_state = True, return_sequences = True, name='lstm_layer')
        self.dense = Dense(len(ipa_tokenizer.index_word)+1, name='dense_layer')
         
    def call(self, inputs, hidden_state = None, cell_state = None):
        x = self.embed(inputs)
        x, hidden_state, cell_state = self.lstm(x, initial_state = [hidden_state, cell_state]) \
                                                     if hidden_state is not None and cell_state is not None else self.lstm(x)
        x = self.dense(x)
        return x, hidden_state, cell_state
 
decoder_model = Decoder(name='decoder')
e, g_in = next(iter(train_data.take(1)))
h, c = encoder_model(e)
g_out, h, c = decoder_model(g_in, h, c)
 
print(g_out.shape, h.shape, c.shape)

(16, 22, 39) (16, 512) (16, 512)


In [124]:
def get_ipa_decoder_data(g):
    g1 = g.numpy()
    #print(g1)
    #new_g1 =  np.delete(g1, np.where(g1 == 2), axis = 0)
    '''
    for list in g1:
        for int in range(len(list)):
            if list[int] == 2:
                np.delete(list, [int])
    '''
    
    x, y = g1.shape
    #after delete
    new_g1 = g1[g1 != 2]
    new_g1 = np.reshape(new_g1, (x, y-1))

    #print(new_g1)
    
    g2 = g.numpy()
    new_g2 =  np.delete(g2,0, axis = 1)
    new_g1 = np.pad(new_g1, ((0,0),(1,0)), 'constant')
    g_in = tf.convert_to_tensor(new_g1, dtype=tf.int32)
    
    #print(g_in)
    new_g2 = np.pad(new_g2, ((0,0),(0,1)), 'constant')
    g_out = tf.convert_to_tensor(new_g2, dtype=tf.int32)
    #print(g_out)
    return g_in, g_out

@tf.function
def forward_backward(encoder_model, decoder_model, e, g_in, g_out, loss):
    with tf.GradientTape() as tape:
        h, c = encoder_model(e)
        d_g_out, _, _ = decoder_model(g_in, h, c)
        cur_loss = loss(g_out, d_g_out)
        grads = tape.gradient(cur_loss, encoder_model.trainable_variables + decoder_model.trainable_variables)
    return cur_loss, grads
 
def train_encoder_decoder(encoder_model, decoder_model, num_epochs, train_data, valid_data, valid_steps, 
                          optimizer, loss, grad_fn):
    train_losses = []
    val_loasses = []
    for epoch in range(num_epochs):
        train_epoch_loss_avg = tf.keras.metrics.Mean()
        val_epoch_loss_avg = tf.keras.metrics.Mean()
        for e, g in train_data:
            g_in, g_out = get_ipa_decoder_data(g)
            #print(g_out)
            train_loss, grads = grad_fn(encoder_model, decoder_model, e, g_in, g_out, loss)
            optimizer.apply_gradients(zip(grads, encoder_model.trainable_variables + decoder_model.trainable_variables))
            train_epoch_loss_avg.update_state(train_loss)    
        for e_v, g_v in valid_data.take(valid_steps):
            g_v_in, g_v_out = get_german_decoder_data(g_v)
            val_loss, _ = grad_fn(encoder_model, decoder_model, e_v, g_v_in, g_v_out, loss)
            val_epoch_loss_avg.update_state(val_loss)        
        print(f'epoch: {epoch}, train loss: {train_epoch_loss_avg.result()}, validation loss: {val_epoch_loss_avg.result()}')    
        train_losses.append(train_epoch_loss_avg.result())
        val_loasses.append(val_epoch_loss_avg.result())
    return train_losses, val_loasses
 
optimizer_obj = Adam(learning_rate = 1e-3)
loss_obj = SparseCategoricalCrossentropy(from_logits=True)
train_loss_results, valid_loss_results = train_encoder_decoder(encoder_model, decoder_model, 20, train_data, valid_data, 20,
                                                          optimizer_obj, loss_obj, forward_backward)

InvalidArgumentError: Graph execution error:

Detected at node 'decoder/embedding_layer/embedding_lookup' defined at (most recent call last):
    File "f:\college stuff\python\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "f:\college stuff\python\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "f:\college stuff\python\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "f:\college stuff\python\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "f:\college stuff\python\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "f:\college stuff\python\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "f:\college stuff\python\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "f:\college stuff\python\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "f:\college stuff\python\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "f:\college stuff\python\lib\site-packages\ipykernel\kernelbase.py", line 504, in dispatch_queue
      await self.process_one()
    File "f:\college stuff\python\lib\site-packages\ipykernel\kernelbase.py", line 493, in process_one
      await dispatch(*args)
    File "f:\college stuff\python\lib\site-packages\ipykernel\kernelbase.py", line 400, in dispatch_shell
      await result
    File "f:\college stuff\python\lib\site-packages\ipykernel\kernelbase.py", line 724, in execute_request
      reply_content = await reply_content
    File "f:\college stuff\python\lib\site-packages\ipykernel\ipkernel.py", line 390, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "f:\college stuff\python\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "f:\college stuff\python\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "f:\college stuff\python\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "f:\college stuff\python\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "f:\college stuff\python\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "f:\college stuff\python\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\jojoy\AppData\Local\Temp\ipykernel_35420\1793995889.py", line 63, in <cell line: 63>
      train_loss_results, valid_loss_results = train_encoder_decoder(encoder_model, decoder_model, 20, train_data, valid_data, 20,
    File "C:\Users\jojoy\AppData\Local\Temp\ipykernel_35420\1793995889.py", line 49, in train_encoder_decoder
      train_loss, grads = grad_fn(encoder_model, decoder_model, e, g_in, g_out, loss)
    File "C:\Users\jojoy\AppData\Local\Temp\ipykernel_35420\712443347.py", line 34, in forward_backward
      d_g_out, _, _ = decoder_model(g_in, h, c)
    File "f:\college stuff\python\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\jojoy\AppData\Local\Temp\ipykernel_35420\1055797883.py", line 10, in call
      x = self.embed(inputs)
    File "f:\college stuff\python\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "f:\college stuff\python\lib\site-packages\keras\layers\embeddings.py", line 197, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'decoder/embedding_layer/embedding_lookup'
indices[13,2] = 40 is not in [0, 39)
	 [[{{node decoder/embedding_layer/embedding_lookup}}]] [Op:__inference_forward_backward_110982]

In [65]:
#Tokenize
#word_tokenizer = Tokenizer(train_words)
#pronounciation_tokenizer = Tokenizer(train_pronounciations)


Model

In [17]:
#model = Sequential()
#model.add(Embedding(len(train_words), 512, input_length=20, mask_zero=True))
#model.add(LSTM(512))
#model.add(RepeatVector(20))
#model.add(LSTM(512, return_sequences=True))
#model.add(Dense(len(train_pronounciations), activation='softmax'))
 
#rms = keras.optimizers.RMSprop(learning_rate=0.001)
#model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

#model.summary()


Performance