In [1]:
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile
from tempfile import mktemp
import re

2023-12-09 10:39:05.722602: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#helper function used to remove silence from beginning and end of audio segments
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    """Return the length of the silent stretch at the start of `sound`.

    The audio is scanned from the beginning in steps of `chunk_size`; scanning
    stops at the first chunk whose loudness (`.dBFS`) is not below
    `silence_threshold`, or when the end of the audio is reached.

    Args:
        sound: object supporting `len()`, slicing, and a `.dBFS` attribute on
            its slices. The caller in this notebook passes a pydub
            `AudioSegment`, whose length and slice indices are milliseconds.
        silence_threshold: loudness in dBFS below which a chunk is silent.
        chunk_size: scan step, in the same units as `len(sound)`; must be > 0.

    Returns:
        Offset of the first non-silent chunk. For audio that is silent all the
        way through this is `len(sound)`; it never exceeds the audio length.
    """
    assert chunk_size > 0

    total_length = len(sound)
    trim_ms = 0

    # Check the bound first so we never slice past the end of the audio.
    while trim_ms < total_length and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The final step may overshoot when the length is not a multiple of
    # chunk_size; clamp so callers can safely use the value as a slice bound.
    return min(trim_ms, total_length)

In [3]:
#normalizes a transcript (lowercase, letters and spaces only) and converts it into a list of characters
def transcript_prep(transcription):
    """Normalize a transcript and split it into single characters.

    The text is lower-cased, then every character that is not a letter a-z
    or a space is dropped.

    Args:
        transcription: the transcript text.

    Returns:
        A list of one-character strings, in their original order.
    """
    lowered = transcription.lower()

    # keep only lowercase letters and spaces
    letters_and_spaces = re.sub(r'[^a-z ]', '', lowered)

    return [char for char in letters_and_spaces]

In [4]:
#processes the audio file input for use in the neural network
def audio_file_prep(audio_file):
    """Turn an mp3 recording into a flat list of spectrogram pixel values.

    Steps: load the mp3, trim silence from both ends, convert it to wav,
    render a spectrogram to a PNG, then load that PNG and flatten it into a
    list of floats.

    Args:
        audio_file: path to the mp3 file. The spectrogram image is written to
            the same path with the extension replaced by ".png".

    Returns:
        A flat list of the image's pixel values, each divided by 255, or -1
        if the clip's duration rounds to zero seconds.
    """
    # Local imports: the notebook's import cell only brings in `mktemp`.
    import os
    import tempfile

    #reads in mp3
    mp3_audio = AudioSegment.from_file(audio_file, format="mp3")

    #returns -1 if the mp3 file is empty (duration rounds to 0 seconds)
    if round(mp3_audio.duration_seconds) == 0:
        return -1

    #removes silent audio from the beginning and end; trailing silence is
    #measured as the leading silence of the reversed clip
    start_trim = detect_leading_silence(mp3_audio)
    end_trim = detect_leading_silence(mp3_audio.reverse())
    duration = len(mp3_audio)
    trimmed_sound = mp3_audio[start_trim:duration - end_trim]

    #converts the mp3 into a wav file so scipy can read the samples.
    #mkstemp creates the file atomically (mktemp is race-prone), and the
    #temporary file is deleted afterwards instead of being left behind.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # export() returns the file object it opened; close it right away
        trimmed_sound.export(wav_path, format="wav").close()
        sample_rate, audio_data = wavfile.read(wav_path)
    finally:
        os.remove(wav_path)

    #creates a file name for the spectrogram (extension swapped for .png)
    file_name = os.path.splitext(os.fspath(audio_file))[0] + ".png"

    #creates and saves the spectrogram
    # NOTE(review): specgram expects 1-D samples; a stereo mp3 would give a
    # 2-D array here — confirm the inputs are mono.
    fig = plt.figure()
    try:
        plt.specgram(audio_data, Fs=sample_rate, NFFT=128, noverlap=0)
        plt.axis('off')
        plt.savefig(file_name, bbox_inches='tight')
    finally:
        #always release the figure, even if plotting/saving fails, so
        #figures do not pile up across calls
        plt.close(fig)

    #loads the spectrogram and turns it into an array
    img = keras.preprocessing.image.load_img(file_name)
    img_array = keras.preprocessing.image.img_to_array(img)

    #flattens the array and normalizes the data; cast to float64 first so
    #the result matches plain Python float division element for element
    return (img_array.ravel().astype(np.float64) / 255).tolist()
