Imports

In [19]:
import eng_to_ipa as ipa
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow_hub as hub
from tensorflow.keras.models import  Sequential, Model
from tensorflow.keras.layers import Layer, Concatenate, Input, Masking, LSTM, Embedding, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy


Load Data (data file already created in data_import.py)

In [10]:
file = open('words.txt','r')
lines = file.readlines()
file.close()

file = open('data.csv','w')

file.write('word,pronunciation\n')
for word in lines:
    if ipa.isin_cmu(word):
        line = word.strip('\n') + ',' + ipa.convert(word) + '\n'
        file.write(line)

file.close()

UnicodeEncodeError: 'charmap' codec can't encode characters in position 9-10: character maps to <undefined>

Data Processing

In [24]:
df = pd.read_csv ('data.csv',delimiter=',')

df.word = df.word.astype(str) 
df.pronunciation = df.pronunciation.astype(str) 

#df.applymap(lambda s: s.replace(s,' '.join(str(s))))
df['word'] = df['word'].str.replace('',' ')
df['pronunciation'] = df['pronunciation'].str.replace('',' ')

MAX_NUM_WORDS = 40000

# 10% for val, 10% for test, 70% for train
val_size = int(df.shape[0] * 0.1)
test_size = int(df.shape[0] * 0.1)

# Shuffle the data
df = df.sample(frac=1)
# Split df to test/val/train
test_df = df[:test_size]
val_df = df[test_size:test_size+val_size]
train_df = df[test_size+val_size:]


train_words, train_pronounciations = list(train_df.word), list(train_df.pronunciation)
val_words, val_pronounciations     = list(val_df.word), list(val_df.pronunciation)
test_words, test_pronounciations   = list(test_df.word), list(test_df.pronunciation)


# Check that idces do not overlap
assert set(train_df.index).intersection(set(val_df.index)) == set({})
assert set(test_df.index).intersection(set(train_df.index)) == set({})
assert set(val_df.index).intersection(set(test_df.index)) == set({})
# Check that all idces are present
assert df.shape[0] == len(train_pronounciations) + len(val_pronounciations) + len(test_pronounciations)

# Sizes
print(
    f"Size of initial data: {df.shape[0]}\n"
    f"Train size: {len(train_pronounciations)}\n"
    f"Val size: {len(val_pronounciations)}\n"
    f"Test size: {len(test_pronounciations)}\n"
)

for i in range(len(train_pronounciations)):
    train_pronounciations[i] = "<START>" + train_pronounciations[i] + "<END>"

for i in range(len(val_pronounciations)):
    val_pronounciations[i] = "<START>" + val_pronounciations[i] + "<END>"

for i in range(len(test_pronounciations)):
    test_pronounciations[i] = "<START>" + test_pronounciations[i] + "<END>"

ipa_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '')
ipa_tokenizer.fit_on_texts(train_pronounciations)
ipa_int_seq = ipa_tokenizer.texts_to_sequences(train_pronounciations)

ipa_word_to_indx = ipa_tokenizer.word_index

max_ipa_len = max(len(sen) for sen in ipa_int_seq)

padded_tokenized_ipa = tf.keras.preprocessing.sequence.pad_sequences(ipa_int_seq, maxlen = max_ipa_len, padding = 'post', value = 0)

padded_tokenized_ipa.shape



val_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '')
val_tokenizer.fit_on_texts(val_pronounciations)
val_int_seq = val_tokenizer.texts_to_sequences(val_pronounciations)

val_word_to_indx = ipa_tokenizer.word_index

max_val_len = max(len(sen) for sen in val_int_seq)

padded_tokenized_val = tf.keras.preprocessing.sequence.pad_sequences(val_int_seq, maxlen = max_val_len, padding = 'post', value = 0)

padded_tokenized_val.shape


Size of initial data: 40315
Train size: 32253
Val size: 4031
Test size: 4031



(4031, 23)

In [25]:

train_data = tf.data.Dataset.from_tensor_slices((train_words, padded_tokenized_ipa))
valid_data = tf.data.Dataset.from_tensor_slices((val_words, padded_tokenized_val))

def str_split(e, g):
    e = tf.strings.split(e)
    return e, g
 
train_data = train_data.map(str_split)
valid_data = valid_data.map(str_split)
       
embedding_layer = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1")    
    
def embed_english(x, y):
    return embedding_layer(x), y
 
train_data = train_data.map(embed_english)
valid_data = valid_data.map(embed_english)


def remove_long_sentence(e, g):
    return tf.shape(e)[0] <= 13
 
train_data = train_data.filter(remove_long_sentence)
valid_data = valid_data.filter(remove_long_sentence)


def pad_english(e, g):
    return tf.pad(e, paddings = [[13-tf.shape(e)[0],0], [0,0]], mode='CONSTANT', constant_values=0), g
 
train_data = train_data.map(pad_english)
valid_data = valid_data.map(pad_english)

train_data = train_data.batch(16)
valid_data = valid_data.batch(16)

print(train_data.element_spec)
print(valid_data.element_spec)

for e, g in train_data.take(1):
    print(e.shape)
 
for e, g in valid_data.take(1):
    print(g)


(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(None, 24), dtype=tf.int32, name=None))
(TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(None, 23), dtype=tf.int32, name=None))
(16, 13, 128)
tf.Tensor(
[[ 1 23  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3 18 19 37 13 20 23 12  8  5 15  2  0  0  0  0  0  0  0  0  0  0]
 [ 1  4  9  3 16 10 17 15  4  9  7 10 14  2  0  0  0  0  0  0  0  0  0]
 [ 1  3 33 17  7 13  8 31  7  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 18 29  6  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3 18 23 11  4  6 13 21  5  7  4  6 15  2  0  0  0  0  0  0  0  0]
 [ 1  3  9  4  6 37  4  6  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 12  5 10  7  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3  5  9 24  4  9  7  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 11 17 10 27 15  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  3

In [26]:
class CustomLayer(Layer):
 
    def __init__(self, **kwargs):
        super(CustomLayer, self).__init__(**kwargs)
        self.embed = tf.Variable(initial_value=tf.zeros(shape=(1,128)), trainable=True, dtype='float32')
         
    def call(self, inputs):
        x = tf.tile(self.embed, [tf.shape(inputs)[0], 1])
        x = tf.expand_dims(x, axis=1)
        return tf.concat([inputs, x], axis=1)


In [27]:
custom_layer = CustomLayer()
e, g = next(iter(train_data.take(1)))
print(e.shape)
o = custom_layer(e)
o.shape

(16, 13, 128)


TensorShape([16, 14, 128])

In [28]:
inputs = Input(batch_shape = (None, 13, 128), name='input')
x = CustomLayer(name='custom_layer')(inputs)
x = Masking(mask_value=0, name='masking_layer')(x)
x, h, c = LSTM(units=512, return_state=True, name='lstm')(x)
encoder_model = Model(inputs = inputs, outputs = [h, c], name='encoder')

In [80]:
#tokenizer = Tokenizer(train_words)
#pronounciation_tokenizer = Tokenizer(train_pronounciations)
#print(word_tokenizer.index_word)

{}


In [29]:
class Decoder(Model):
     
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.embed = Embedding(input_dim=len(ipa_tokenizer.index_word)+1, output_dim=128, mask_zero=True, name='embedding_layer')
        self.lstm = LSTM(units = 512, return_state = True, return_sequences = True, name='lstm_layer')
        self.dense = Dense(len(ipa_tokenizer.index_word)+1, name='dense_layer')
         
    def call(self, inputs, hidden_state = None, cell_state = None):
        x = self.embed(inputs)
        x, hidden_state, cell_state = self.lstm(x, initial_state = [hidden_state, cell_state]) \
                                                     if hidden_state is not None and cell_state is not None else self.lstm(x)
        x = self.dense(x)
        return x, hidden_state, cell_state
 
decoder_model = Decoder(name='decoder')
e, g_in = next(iter(train_data.take(1)))
h, c = encoder_model(e)
g_out, h, c = decoder_model(g_in, h, c)
 
print(g_out.shape, h.shape, c.shape)

decoder_model.summary()

(16, 24, 41) (16, 512) (16, 512)
Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  multiple                 5248      
                                                                 
 lstm_layer (LSTM)           multiple                  1312768   
                                                                 
 dense_layer (Dense)         multiple                  21033     
                                                                 
Total params: 1,339,049
Trainable params: 1,339,049
Non-trainable params: 0
_________________________________________________________________


In [None]:
def get_ipa_decoder_data(g):
    g1 = g.numpy()
    #print(g1)
    #new_g1 =  np.delete(g1, np.where(g1 == 2), axis = 0)
    '''
    for list in g1:
        for int in range(len(list)):
            if list[int] == 2:
                np.delete(list, [int])
    '''
    
    x, y = g1.shape
    #after delete
    new_g1 = g1[g1 != 2]
    new_g1 = np.reshape(new_g1, (x, y-1))

    #print(new_g1)
    
    g2 = g.numpy()
    new_g2 =  np.delete(g2,0, axis = 1)
    new_g1 = np.pad(new_g1, ((0,0),(1,0)), 'constant')
    g_in = tf.convert_to_tensor(new_g1, dtype=tf.int32)
    
    #print(g_in)
    new_g2 = np.pad(new_g2, ((0,0),(0,1)), 'constant')
    g_out = tf.convert_to_tensor(new_g2, dtype=tf.int32)
    #print(g_out)
    return g_in, g_out

@tf.function
def forward_backward(encoder_model, decoder_model, e, g_in, g_out, loss):
    with tf.GradientTape() as tape:
        h, c = encoder_model(e)
        d_g_out, _, _ = decoder_model(g_in, h, c)
        cur_loss = loss(g_out, d_g_out)
        grads = tape.gradient(cur_loss, encoder_model.trainable_variables + decoder_model.trainable_variables)
    return cur_loss, grads
 
def train_encoder_decoder(encoder_model, decoder_model, num_epochs, train_data, valid_data, valid_steps, 
                          optimizer, loss, grad_fn):
    train_losses = []
    val_loasses = []
    for epoch in range(num_epochs):
        train_epoch_loss_avg = tf.keras.metrics.Mean()
        val_epoch_loss_avg = tf.keras.metrics.Mean()
        for e, g in train_data:
            g_in, g_out = get_ipa_decoder_data(g)
            #print(g_out)
            train_loss, grads = grad_fn(encoder_model, decoder_model, e, g_in, g_out, loss)
            optimizer.apply_gradients(zip(grads, encoder_model.trainable_variables + decoder_model.trainable_variables))
            train_epoch_loss_avg.update_state(train_loss)    
        for e_v, g_v in valid_data.take(valid_steps):
            g_v_in, g_v_out = get_german_decoder_data(g_v)
            val_loss, _ = grad_fn(encoder_model, decoder_model, e_v, g_v_in, g_v_out, loss)
            val_epoch_loss_avg.update_state(val_loss)        
        print(f'epoch: {epoch}, train loss: {train_epoch_loss_avg.result()}, validation loss: {val_epoch_loss_avg.result()}')    
        train_losses.append(train_epoch_loss_avg.result())
        val_loasses.append(val_epoch_loss_avg.result())
    return train_losses, val_loasses
 
optimizer_obj = Adam(learning_rate = 1e-3)
loss_obj = SparseCategoricalCrossentropy(from_logits=True)
train_loss_results, valid_loss_results = train_encoder_decoder(encoder_model, decoder_model, 20, train_data, valid_data, 20,
                                                          optimizer_obj, loss_obj, forward_backward)

import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
plt.xlabel("Epochs", fontsize=14)
plt.ylabel("Loss", fontsize=14)
plt.title('Loss vs epochs')
plt.plot(train_loss_results, label='train')
plt.plot(valid_loss_results, label='valid')
plt.legend()
plt.show()

In [79]:
english = test_words

#print(english)

indices = np.random.choice(len(english), 5)

print(indices)

test_data = tf.data.Dataset.from_tensor_slices(np.array([english[i] for i in indices]))
test_data = test_data.map(tf.strings.split)
test_data = test_data.map(embedding_layer)
test_data = test_data.filter(lambda x: tf.shape(x)[0] <= 13)
test_data = test_data.map(lambda x: tf.pad(x, paddings = [[13-tf.shape(x)[0],0], [0,0]], mode='CONSTANT', constant_values=0))
print(test_data.element_spec)
# TensorSpec(shape=(None, 128), dtype=tf.float32, name=None)

#for x in test_data:
    #print(x)
    
#testing = test_data.take(6)
#for x in testing:
    #print(x)
    
n = 0

start_token = np.array(ipa_tokenizer.texts_to_sequences(['']))
end_token = np.array(ipa_tokenizer.texts_to_sequences(['']))
for e, i in zip(test_data.take(5), indices):
    #print(test_data.take(0))
    
    print(e)
    print()
    print(i)
    
    h, c = encoder_model(tf.expand_dims(e, axis=0))
    g_t = []
    g_in = start_token
    g_out, h, c = decoder_model(g_in, h, c)
    g_t.append('')
    g_out = tf.argmax(g_out, axis=2)
    while g_out != end_token: 
        g_out, h, c = decoder_model(g_in, h, c)
        g_out = tf.argmax(g_out, axis=2)
        g_in = g_out
        g_t.append(ipa_tokenizer.index_word.get(tf.squeeze(g_out).numpy(), 'UNK'))
    print(f'English Text: {english[i]}')
    print(f'German Translation: {" ".join(g_t)}')
    print()
    n += 1

[ 102 3256 3141  838 3471]
TensorSpec(shape=(None, 128), dtype=tf.float32, name=None)
tf.Tensor(
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.12092672  0.11381436 -0.06334078 ...  0.01318553  0.0630509
  -0.13633421]
 [ 0.05008225  0.11326285  0.16089416 ...  0.13769226 -0.00445221
  -0.1637492 ]
 [ 0.11089496  0.05322636 -0.07900235 ...  0.06433398 -0.03737464
  -0.15818046]], shape=(13, 128), dtype=float32)

102


InvalidArgumentError: Exception encountered when calling layer "lstm_layer" (type LSTM).

slice index 0 of dimension 0 out of bounds. [Op:StridedSlice] name: decoder/lstm_layer/strided_slice/

Call arguments received:
  • inputs=tf.Tensor(shape=(1, 0, 128), dtype=float32)
  • mask=tf.Tensor(shape=(1, 0), dtype=bool)
  • training=None
  • initial_state=['tf.Tensor(shape=(1, 512), dtype=float32)', 'tf.Tensor(shape=(1, 512), dtype=float32)']

In [65]:
#Tokenize
#word_tokenizer = Tokenizer(train_words)
#pronounciation_tokenizer = Tokenizer(train_pronounciations)


Model

In [17]:
#model = Sequential()
#model.add(Embedding(len(train_words), 512, input_length=20, mask_zero=True))
#model.add(LSTM(512))
#model.add(RepeatVector(20))
#model.add(LSTM(512, return_sequences=True))
#model.add(Dense(len(train_pronounciations), activation='softmax'))
 
#rms = keras.optimizers.RMSprop(learning_rate=0.001)
#model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

#model.summary()


Performance