In [21]:
from tensorflow import keras
import numpy as np
from datasets import load_dataset
import os.path
%run data_preprocessing.ipynb

In [22]:
# Hyperparameters that were varied while searching for the best architecture
EPOCHS = 1
BATCH_SIZE = 1
OPTIMIZER = 'rmsprop'
LOSS_FUNCTION = 'categorical_crossentropy'
LATENT_DIM = 256
# Number of dataset rows fetched per slice by the training loop
STEP_SIZE = 10

# Globals
# Output alphabet: the lowercase letters plus the space character, in sorted
# order (the space sorts before every letter, so it gets index 0).
target_characters = sorted("abcdefghijklmnopqrstuvwxyz ")
# Lookup from each character to its position in target_characters
target_token_index = {char: position for position, char in enumerate(target_characters)}

# Mutable module-level state; train_model resets all three on every call
image_name = 0
x_processed_data = []
y_processed_data = []

In [23]:
def _is_rejected_audio(features):
    """Return True when `features` is the scalar -1 that marks an unusable clip."""
    # np.isscalar keeps the comparison away from sequences/arrays, where
    # `== -1` is either always False (list) or element-wise and therefore
    # ambiguous in a boolean context (ndarray).
    return bool(np.isscalar(features) and features == -1)


def _encode_batch(features_list, transcripts, num_encoder_tokens, max_decoder_seq_length):
    """One-hot encode a batch of audio feature sequences and transcripts.

    Args:
        features_list: non-empty list of non-empty numeric sequences; each
            value is quantised with ``int(val * 255)``
            (assumes values are normalised to [0, 1] -- TODO confirm against
            audio_file_prep).
        transcripts: list of non-empty character sequences whose characters
            are all keys of ``target_token_index``.
        num_encoder_tokens: width of the encoder one-hot axis.
        max_decoder_seq_length: length of the decoder time axis; longer
            transcripts are truncated to this length.

    Returns:
        Tuple ``(encoder_input_data, decoder_input_data, decoder_target_data)``
        of float32 arrays shaped ``(batch, time, tokens)``.
    """
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(len(features) for features in features_list)
    pad_index = target_token_index[" "]
    batch_size = len(features_list)

    encoder_input_data = np.zeros(
        (batch_size, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )
    decoder_input_data = np.zeros(
        (batch_size, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )
    decoder_target_data = np.zeros(
        (batch_size, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )

    for i, (features, transcript) in enumerate(zip(features_list, transcripts)):
        for t, val in enumerate(features):
            # Clamp so an out-of-range value cannot wrap around to the wrong
            # bin (negative index) or fall off the end of the one-hot axis.
            token = min(max(int(val * 255), 0), num_encoder_tokens - 1)
            encoder_input_data[i, t, token] = 1.0
        # Pad the remaining encoder timesteps with token 0. The length is used
        # explicitly rather than a loop variable left over from the loop above.
        encoder_input_data[i, len(features):, 0] = 1.0

        # Transcripts longer than the decoder window would index past the
        # array, so keep only what fits.
        transcript = transcript[:max_decoder_seq_length]
        for t, char in enumerate(transcript):
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                # decoder_target_data is ahead of decoder_input_data by one
                # timestep and does not include the first character.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        # Pad the rest of both decoder arrays with the space character.
        decoder_input_data[i, len(transcript):, pad_index] = 1.0
        decoder_target_data[i, len(transcript) - 1:, pad_index] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data


def _build_model(num_encoder_tokens, num_decoder_tokens):
    """Create and compile a fresh LSTM encoder-decoder training model."""
    # Define an input sequence and process it.
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder = keras.layers.LSTM(LATENT_DIM, return_state=True)
    _, state_h, state_c = encoder(encoder_inputs)

    # The encoder outputs are discarded; only the states are kept.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

    # The decoder returns full output sequences and its internal states.
    # The states are not used by the training model but are needed at inference.
    decoder_lstm = keras.layers.LSTM(LATENT_DIM, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)

    # The model maps `encoder_input_data` & `decoder_input_data`
    # to `decoder_target_data`.
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=OPTIMIZER, loss=LOSS_FUNCTION, metrics=["accuracy"])
    return model


def train_model(ds, model_path="s2s_model.keras"):
    """Train the seq2seq model on one dataset segment and save it.

    Every row of `ds` is preprocessed with `audio_file_prep` and
    `transcript_prep`; rows whose audio is rejected or whose audio/transcript
    is empty are dropped. The remaining rows are one-hot encoded and used for
    one `fit` call on the model stored at `model_path` (a new model is built
    when that file does not exist). The updated model is written back to
    `model_path`.

    Args:
        ds: iterable of rows, each a mapping with 'path' and 'sentence' keys.
            All rows are used, so a segment shorter than STEP_SIZE is fine.
        model_path: checkpoint file to resume from and save to.

    Returns:
        None. When no row survives preprocessing, nothing is trained or saved.

    Side effects:
        Resets the module-level `image_name`, `x_processed_data` and
        `y_processed_data`, and prints progress messages.
    """
    global image_name
    global x_processed_data
    global y_processed_data

    # NOTE(review): image_name is not read in this function; presumably the
    # preprocessing notebook uses it -- confirm before removing the reset.
    image_name = 0
    x_processed_data = []
    y_processed_data = []

    for row in ds:
        # Each row is fetched once; both preprocessing steps still run for
        # every row, in the original order.
        features = audio_file_prep(row['path'])
        transcript = transcript_prep(row["sentence"])

        # if something about the data is bad, it is left out of the set
        if _is_rejected_audio(features) or len(features) == 0 or len(transcript) == 0:
            continue
        x_processed_data.append(features)
        y_processed_data.append(transcript)

    if not x_processed_data:
        # Nothing to encode: max() over an empty batch and fit() on empty
        # arrays would both fail.
        print("No usable samples in this segment; skipping training")
        return

    num_encoder_tokens = 256
    num_decoder_tokens = len(target_characters)
    max_decoder_seq_length = 200

    encoder_input_data, decoder_input_data, decoder_target_data = _encode_batch(
        x_processed_data, y_processed_data, num_encoder_tokens, max_decoder_seq_length
    )

    if os.path.exists(model_path):
        model = keras.models.load_model(model_path)
    else:
        model = _build_model(num_encoder_tokens, num_decoder_tokens)

    # With a single sample a 0.2 validation split leaves no training data,
    # which Keras rejects; train without validation in that case.
    validation_split = 0.2 if len(x_processed_data) >= 2 else 0.0
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=validation_split,
    )
    # Save model
    model.save(model_path)
    print("Updated Model Saved")

In [24]:
# Feed the Common Voice training split to the model STEP_SIZE rows at a time.
for data in range(110, 100000, STEP_SIZE):
    print("Start of for loop")
    # Split-slice expression for this segment, e.g. "train[110:120]"
    data_segment = f"train[{data}:{data + STEP_SIZE}]"
    print("Data segment: " + data_segment)
    # dataset loaded from hugging face
    ds = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "en",
        split=data_segment,
    )
    print("Dataset loaded")
    train_model(ds)
    print("Model Trained")

Start of for loop
Data segment: train[110:120]


Found cached dataset common_voice_11_0 (/home/hetricke/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/en/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


Dataset loaded


  Z = 10. * np.log10(spec)
  Z = 10. * np.log10(spec)
2023-12-09 10:48:34.061075: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-12-09 10:48:39.997123: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-09 10:48:39.998742: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split