In [24]:
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile
from tempfile import mktemp
from datasets import load_dataset

In [25]:
#Variables that were altered to test for best architecture
EPOCHS =1
BATCH_SIZE = 1
OPTIMIZER = 'rmsprop'
LOSS_FUNCTION = 'categorical_crossentropy'
LATENT_DIM = 256

#Globals
target_characters = []
image_name = 0
x_processed_data = []
y_processed_data = []

In [26]:
#dataset loaded from hugging face
ds = load_dataset("mozilla-foundation/common_voice_11_0", "en")

Found cached dataset common_voice_11_0 (/home/hetricke/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/en/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)
100%|██████████| 5/5 [00:18<00:00,  3.69s/it]


In [27]:
#helper function used to remove silence from beginning and end of audio segments
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    trim_ms = 0

    assert chunk_size > 0
    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
        trim_ms += chunk_size

    return trim_ms

In [28]:
#regulates transcripts and converts it into a list of integers
def transcript_prep(transcription):

    global target_characters

    #removes trailing quotation marks
    if transcription.startswith('"') and transcription.endswith('"'):
        transcription = transcription[1:-1]

    transcription = transcription.lower()
    
    #adds a period to the end of sentence if it doesn't already end in punctuation
    if len(transcription) > 0:
        if transcription[-1] not in [".", "?", "!"]:
            transcription = transcription + "."

    for letter in transcription:
        if letter not in target_characters:
            target_characters.append(letter)

    return list(transcription)


In [29]:
#processes the audio file input for use in the neural network
def audio_file_prep(audio_path, sentence_length):

    global image_name

    #reads in mp3
    mp3_audio = AudioSegment.from_file(audio_path, format="mp3")  # read mp3

    #returns -1 if the mp3 file is empty
    if(round(mp3_audio.duration_seconds) == 0):
        return -1

    #removes silent audio from the beginning and end
    start_trim = detect_leading_silence(mp3_audio)
    end_trim = detect_leading_silence(mp3_audio.reverse())
    duration = len(mp3_audio)    
    trimmed_sound = mp3_audio[start_trim:duration-end_trim]

    #converts the mp3 into a wav file
    wname = mktemp('.wav')
    trimmed_sound.export(wname, format="wav")
    FS, audio_data = wavfile.read(wname)


    #creates a file name for the spectrogram
    file_name = "images/"+ str(image_name) + ".png"
    image_name = image_name + 1

    #creates and saves the spectrogram
    plt.figure()
    plt.specgram(audio_data, Fs=FS, NFFT=128, noverlap=0)  # plot
    plt.axis('off')
    plt.savefig(file_name, bbox_inches='tight')

    #clears the figure for the next audio transcript- otherwise it just overwrites the image
    plt.close()


    #loads the spectrogram and turns it into an array
    img = keras.preprocessing.image.load_img(file_name)
    img_array = keras.preprocessing.image.img_to_array(img)

    #flattens the array and normalizes the data
    new_dim = img_array.shape[0]*img_array.shape[1]
    img_array = img_array.reshape(new_dim, -1)
    img_array = img_array.flatten()
    img_array = img_array.tolist()
    img_array[:] = [x / 255 for x in img_array]

    return img_array


In [39]:
def train_model(beginning_datapoint, ending_datapoint): 
    #pre-processes all the data
    global image_name
    global x_processed_data
    global y_processed_data
    global target_characters

    image_name = 0
    x_processed_data = []
    y_processed_data = []
    
    for i in range(ending_datapoint-beginning_datapoint):
        x_processed_data.append(audio_file_prep(ds['train'][i+beginning_datapoint]['path'], len(ds['train'][i+beginning_datapoint]["sentence"])))
        y_processed_data.append(transcript_prep(ds['train'][i]["sentence"]))


        #if something about the data is bad, it is removed from the set
        if(x_processed_data[len(x_processed_data)-1] == -1 or len(y_processed_data[len(y_processed_data)-1])==0):
            x_processed_data.pop(len(x_processed_data)-1)
            y_processed_data.pop(len(y_processed_data)-1)


    target_characters = sorted(target_characters)
    num_encoder_tokens = 256
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(len(audio) for audio in x_processed_data)
    max_decoder_seq_length = max(len(transcript) for transcript in y_processed_data)

    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

    encoder_input_data = np.zeros( (len(x_processed_data), max_encoder_seq_length, num_encoder_tokens), dtype="float32")
    decoder_input_data = np.zeros( (len(y_processed_data), max_decoder_seq_length, num_decoder_tokens), dtype="float32")
    decoder_target_data = np.zeros( (len(y_processed_data), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (x_datapoint, y_datapoint) in enumerate(zip(x_processed_data, y_processed_data)):
        for t, val in enumerate(x_datapoint):
            #print(str(i)+", "+str(t)+", "+str(val))
            encoder_input_data[i, t, int(val*255)] = 1.0
        encoder_input_data[i, t + 1 :, 0] = 1.0
        for t, char in enumerate(y_datapoint):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
        decoder_target_data[i, t:, target_token_index[" "]] = 1.0

    # Define an input sequence and process it.
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder = keras.layers.LSTM(LATENT_DIM, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)

    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = keras.layers.LSTM(LATENT_DIM, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    model.compile(
        optimizer=OPTIMIZER, loss=LOSS_FUNCTION, metrics=["accuracy"]
    )
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.2,
    )
    # Save model
    model.save("s2s_model.keras")

In [40]:
for data in range(0,len(ds),10):
    train_model(data,data+10)


KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['invalidated', 'other', 'test', 'train', 'validation']"

In [12]:
model = keras.models.load_model("s2s_model.keras")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(LATENT_DIM,))
decoder_state_input_c = keras.Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

2023-12-06 14:30:51.536683: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-06 14:30:51.537798: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-06 14:30:51.539101: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [15]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    #target_seq[0, 0, target_token_index[" "]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [16]:
for seq_index in range(1):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", y_processed_data[seq_index])
    print("Decoded sentence:", decoded_sentence)


2023-12-06 14:41:29.212864: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-06 14:41:29.214818: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-06 14:41:29.217312: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

-
Input sentence: ['t', 'h', 'e', ' ', 't', 'r', 'a', 'c', 'k', ' ', 'a', 'p', 'p', 'e', 'a', 'r', 's', ' ', 'o', 'n', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'p', 'i', 'l', 'a', 't', 'i', 'o', 'n', ' ', 'a', 'l', 'b', 'u', 'm', ' ', '"', 'k', 'r', 'a', 'f', 't', 'w', 'o', 'r', 'k', 's', '"', '.']
Decoded sentence: hheett                                                                                       
