In [1]:
import os
from tempfile import mkstemp
from tempfile import mktemp

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from pydub import AudioSegment
from scipy.io import wavfile
from tensorflow import keras
from tensorflow.keras.utils import pad_sequences

2023-12-06 11:18:47.121912: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameters that were altered to test for the best architecture.
TIME_STEPS = 5  # NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed
EPOCHS = 2  # number of training epochs passed to model.fit
BATCH_SIZE = 1  # batch size passed to model.fit and model.evaluate
OPTIMIZER = 'rmsprop'  # optimizer name passed to model.compile
LOSS_FUNCTION = 'categorical_crossentropy'  # loss name passed to model.compile
LATENT_DIM = 256  # unit count of both the encoder and the decoder LSTM

# Globals that the preprocessing helpers mutate.
target_characters = []  # distinct transcript characters, collected by transcript_prep
image_name = 0  # running counter audio_file_prep uses to name spectrogram files

In [3]:
# Dataset loaded from Hugging Face: the "en" configuration of
# mozilla-foundation/common_voice_11_0, restricted to the first 100 rows of
# the train split. Later cells read each row's 'path' and 'sentence' fields.
ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split='train[:100]')

Found cached dataset common_voice_11_0 (/home/hetricke/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/en/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


In [4]:
#helper function used to remove silence from beginning and end of audio segments
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    """Return how much leading silence `sound` has.

    The audio is scanned from the start in windows of `chunk_size`; scanning
    stops at the first window whose loudness is not below `silence_threshold`.
    To measure trailing silence, pass the reversed audio.

    Args:
        sound: object supporting `len()`, slicing and a `.dBFS` attribute on
            its slices (a pydub AudioSegment in this notebook, which is
            indexed in milliseconds).
        silence_threshold: loudness in dBFS below which a window counts as
            silent.
        chunk_size: window length, in the same unit as `len(sound)`; must be
            positive.

    Returns:
        The offset at which the first non-silent window starts. The result is
        never larger than `len(sound)`, so it can be used directly as a slice
        bound even when the whole clip is silent.
    """
    assert chunk_size > 0

    total = len(sound)
    trim_ms = 0

    # Check the bounds *before* touching the slice so we never measure an
    # empty window past the end of the audio.
    while trim_ms < total and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last step may overshoot when the length is not a multiple of
    # chunk_size; clamp so callers never get an offset beyond the audio.
    return min(trim_ms, total)

In [7]:
#normalises a transcript and converts it into a list of characters
def transcript_prep(transcription):
    """Normalise a transcript and split it into its characters.

    Steps, in order: a pair of enclosing double quotes is stripped, the text
    is lower-cased, and a "." is appended when a non-empty text does not
    already end in ".", "?" or "!".

    Side effect: every character not yet present in the module-level
    `target_characters` list is appended to it, in order of first appearance.

    Args:
        transcription: the raw sentence as a string.

    Returns:
        A list with one single-character string per character of the
        normalised text (an empty list for empty input).
    """
    global target_characters

    # drop the surrounding quotation marks, if the sentence is wrapped in them
    is_quoted = transcription.startswith('"') and transcription.endswith('"')
    text = (transcription[1:-1] if is_quoted else transcription).lower()

    # make sure every non-empty sentence ends with terminal punctuation
    if text and text[-1] not in (".", "?", "!"):
        text += "."

    # register characters that have not been seen before
    for symbol in text:
        if symbol not in target_characters:
            target_characters.append(symbol)

    return list(text)


In [8]:
#processes the audio file input for use in the neural network
def audio_file_prep(audio_path, sentence_length):
    """Convert an mp3 file into a flat list of normalised spectrogram pixels.

    The clip is trimmed of leading/trailing silence, converted to wav,
    rendered as a spectrogram image saved under "images/", and the saved
    image is read back and flattened.

    Side effects: increments the module-level `image_name` counter and writes
    "images/<counter>.png" (the directory is created if missing).

    Args:
        audio_path: path to the mp3 file.
        sentence_length: not used by this function; kept so existing callers
            keep working.

    Returns:
        A flat list of floats (pixel values divided by 255), or the int -1
        when the clip is unusable: its duration rounds to 0 seconds, or
        nothing is left after trimming silence.
    """
    global image_name

    #reads in mp3
    mp3_audio = AudioSegment.from_file(audio_path, format="mp3")

    #returns -1 if the mp3 file is empty
    if round(mp3_audio.duration_seconds) == 0:
        return -1

    #removes silent audio from the beginning and end
    start_trim = detect_leading_silence(mp3_audio)
    end_trim = detect_leading_silence(mp3_audio.reverse())
    duration = len(mp3_audio)
    trimmed_sound = mp3_audio[start_trim:duration - end_trim]

    # A clip that is silent throughout has nothing left to plot; report it
    # with the same sentinel as an empty file so the caller drops it.
    if len(trimmed_sound) == 0:
        return -1

    #converts the mp3 into a wav file
    # mkstemp creates the file atomically (mktemp only returns a name, which
    # is race-prone), and the finally block removes it again so temp files do
    # not pile up across samples.
    fd, wname = mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # export() returns the open output file handle; close it explicitly.
        trimmed_sound.export(wname, format="wav").close()
        FS, audio_data = wavfile.read(wname)
    finally:
        os.remove(wname)

    #creates a file name for the spectrogram
    os.makedirs("images", exist_ok=True)
    file_name = "images/" + str(image_name) + ".png"
    image_name = image_name + 1

    #creates and saves the spectrogram
    # NOTE(review): specgram needs 1-D samples; this assumes mono audio -- confirm.
    fig = plt.figure()
    try:
        plt.specgram(audio_data, Fs=FS, NFFT=128, noverlap=0)
        plt.axis('off')
        plt.savefig(file_name, bbox_inches='tight')
    finally:
        # close the figure so the next clip starts from a clean canvas
        plt.close(fig)

    #loads the spectrogram and turns it into an array
    img = keras.preprocessing.image.load_img(file_name)
    img_array = keras.preprocessing.image.img_to_array(img)

    # Flatten every pixel channel into one long list and scale by 1/255.
    # The division is done on Python floats, exactly as before, so the
    # returned values are unchanged.
    return [pixel / 255 for pixel in img_array.flatten().tolist()]


In [9]:
# Reset the per-run state so re-running this cell starts from scratch.
image_name = 0
x_processed_data = []
y_processed_data = []

# How many dataset rows to preprocess (never more than the dataset holds).
NUM_SAMPLES = 25

#pre-processes all the data
for i in range(min(NUM_SAMPLES, len(ds))):

    # Look the row up once instead of indexing the dataset three times.
    sample = ds[i]
    sentence = sample["sentence"]

    audio_features = audio_file_prep(sample["path"], len(sentence))
    transcript = transcript_prep(sentence)

    #if something about the data is bad, it is left out of the set
    # (audio_file_prep returns the int -1 for an unusable clip)
    if audio_features == -1 or len(transcript) == 0:
        continue

    x_processed_data.append(audio_features)
    y_processed_data.append(transcript)

if not x_processed_data:
    raise ValueError("preprocessing produced no usable samples")

# The encoding cell pads decoder sequences with the space character, so it
# must be part of the vocabulary even if no transcript happened to contain it.
if " " not in target_characters:
    target_characters.append(" ")

target_characters = sorted(target_characters)
num_encoder_tokens = 256  # one bin per possible 8-bit pixel intensity
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(audio) for audio in x_processed_data)
max_decoder_seq_length = max(len(transcript) for transcript in y_processed_data)


In [10]:
# Map every known transcript character to its one-hot index.
target_token_index = {char: i for i, char in enumerate(target_characters)}
# Decoder sequences are padded with the space character.
space_index = target_token_index[" "]

num_samples = len(x_processed_data)
encoder_input_data = np.zeros((num_samples, max_encoder_seq_length, num_encoder_tokens), dtype="float32")
decoder_input_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32")
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32")

for i, (x_datapoint, y_datapoint) in enumerate(zip(x_processed_data, y_processed_data)):
    # --- encoder input: one-hot of each pixel's intensity bin ---
    x_len = len(x_datapoint)
    if x_len:
        # The values were scaled by 1/255 upstream. Round, rather than
        # truncate, when scaling back: the float round-trip can land just
        # below the integer and truncation would pick the neighbouring bin.
        bins = np.rint(np.asarray(x_datapoint, dtype="float64") * 255).astype(np.intp)
        encoder_input_data[i, np.arange(x_len), bins] = 1.0
    # pad the remaining timesteps with bin 0
    encoder_input_data[i, x_len:, 0] = 1.0

    # --- decoder input/target: one-hot of each transcript character ---
    y_len = len(y_datapoint)
    char_ids = np.array([target_token_index[char] for char in y_datapoint], dtype=np.intp)
    if y_len:
        decoder_input_data[i, np.arange(y_len), char_ids] = 1.0
        # decoder_target_data is ahead of decoder_input_data by one timestep,
        # so target step t-1 holds character t.
        # NOTE(review): transcripts carry no explicit start token, so the
        # first character of each transcript never appears as a target --
        # confirm this is intended.
        decoder_target_data[i, np.arange(y_len - 1), char_ids[1:]] = 1.0
    # Pad with spaces, using the sequence length rather than a leaked loop
    # variable so an empty sequence cannot reuse a stale index.
    decoder_input_data[i, y_len:, space_index] = 1.0
    decoder_target_data[i, max(y_len - 1, 0):, space_index] = 1.0



In [11]:
# Encoder-decoder (seq2seq) model: an LSTM encoder reads the one-hot encoded
# spectrogram sequence, and its final states initialise an LSTM decoder that
# outputs a softmax over the transcript characters at every timestep.

# Define an input sequence and process it.
# The time dimension is left as None, so sequences of any length are accepted.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. The returned states are not used
# by the training model below.
# NOTE(review): the original comment says they are used for inference, but no
# inference code is visible in this notebook -- confirm.
decoder_lstm = keras.layers.LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Per-timestep probability distribution over the target characters.
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


2023-12-06 11:21:43.667752: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-12-06 11:21:44.787637: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-06 11:21:44.789905: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-06 11:21:44.7

In [None]:
# Compile the model with the optimizer and loss chosen in the configuration
# cell, and track accuracy during training.
model.compile(
    optimizer=OPTIMIZER, loss=LOSS_FUNCTION, metrics=["accuracy"]
)
# Train on the encoded data; validation_split=0.2 asks Keras to hold out 20%
# of the samples for validation.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)
# Save the trained model to "s2s_model.keras" in the working directory.
model.save("s2s_model.keras")


In [14]:
# Tests the accuracy of the generated model: evaluates each test item
# separately and averages the resulting loss and accuracy.
# NOTE(review): `x_test` and `y_test` are not defined in any cell visible in
# this notebook, and the recorded output below refers to a
# `sequential/simple_rnn` model rather than the LSTM model built above.
# This cell presumably depends on state from cells that are not shown or
# were removed -- confirm it still runs after Restart & Run All.
# NOTE(review): the model defined above takes two inputs (encoder and decoder
# sequences), while a single `x_test[i]` is passed here -- verify.
total_score = 0
total_accuracy = 0
for i in range(len(x_test)):
    # evaluate returns the loss followed by the compiled metric (accuracy)
    score, accuracy = model.evaluate(x_test[i], y_test[i], batch_size=BATCH_SIZE)
    total_score += score
    total_accuracy += accuracy

# Turn the running sums into means (raises ZeroDivisionError if x_test is empty).
total_score /= len(x_test)
total_accuracy /= len(x_test)

print(total_accuracy)

 6/60 [==>...........................] - ETA: 1s - loss: 3.6272 - accuracy: 0.1667

2023-12-05 21:58:31.432894: W tensorflow/core/grappler/utils/graph_view.cc:849] No registered '' OpKernel for CPU devices compatible with node {{node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu}}
	.  Registered:  <no registered kernels>

2023-12-05 21:58:31.440680: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-toposort,tfg-shape-inference{graph-version=0},tfg-prepare-attrs-export)} failed: INVALID_ARGUMENT: Node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu has an empty op name
	when importing GraphDef to MLIR module in GrapplerHook
2023-12-05 21:58:31.450057: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-functional-to-region,tfg.func(tfg-cf-sink),tfg-region-to-functional{force-control-capture=true},tfg-lift-legacy-call,symbol-privatize{},symbol-dce,tfg-prepare-attrs-export)} failed:

0.11888940781354904
