In [1]:
# Standard library
import os

# Audio libraries
import librosa
import soundfile as sf
import speech_recognition as sr
from openai import OpenAI

# Personal access tokens
from key import openai_key, wit_key

# Utility for collecting execution times
from timer import Timer

In [2]:
# Create an OpenAI client
client = OpenAI(
    api_key=openai_key,
)

# The different voices supported by OpenAI
voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
test_phrase = "Suzy sells sea shells by the sea shore"
voice_paths = []

# Make sure the output directory exists; otherwise writing the first file
# fails with FileNotFoundError on a fresh checkout.
os.makedirs('./waves', exist_ok=True)

# Generate a wav file for each voice and keep track of execution times
for voice in voices:
    voice_path = f'./waves/{voice}_speech.wav'
    voice_paths.append(voice_path)

    # Only the speech request sits inside the Timer block; the write to disk
    # below is not part of the measured time.
    with Timer(voice):
        response = client.audio.speech.create(
            model="tts-1",
            voice=voice,
            input=test_phrase,
            # The endpoint returns MP3 unless told otherwise; request WAV so
            # the file contents actually match the .wav extension.
            response_format="wav",
        )

    # stream_to_file() is deprecated and emits a warning; write_to_file()
    # saves the same content without the warning.
    response.write_to_file(voice_path)

# Print execution times
Timer().report()
Timer().reset()



alloy ran 1 times
	Min time was 1027.3239612579346 at index 0
	Max time was 1027.3239612579346 at index 0
	Average time was 1027.3239612579346 ms
	Total time was 1027.3239612579346 ms

echo ran 1 times
	Min time was 747.3957538604736 at index 0
	Max time was 747.3957538604736 at index 0
	Average time was 747.3957538604736 ms
	Total time was 747.3957538604736 ms

fable ran 1 times
	Min time was 1000.8773803710938 at index 0
	Max time was 1000.8773803710938 at index 0
	Average time was 1000.8773803710938 ms
	Total time was 1000.8773803710938 ms

onyx ran 1 times
	Min time was 21293.11513900757 at index 0
	Max time was 21293.11513900757 at index 0
	Average time was 21293.11513900757 ms
	Total time was 21293.11513900757 ms

nova ran 1 times
	Min time was 21178.551197052002 at index 0
	Max time was 21178.551197052002 at index 0
	Average time was 21178.551197052002 ms
	Total time was 21178.551197052002 ms

shimmer ran 1 times
	Min time was 21204.77604866028 at index 0
	Max time was 21204.7

In [3]:
def wav2text(voice_path, service):
    '''
    Convert the speech in an audio file to text using one recognition service.

    The file at voice_path is re-encoded IN PLACE (loaded with librosa at
    16 kHz and written back with soundfile) before recognition, so the
    original file contents are overwritten.

    Parameters:
        voice_path: path of the audio file to transcribe.
        service: one of 'Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit'.

    Returns:
        The recognized text, or None when the service could not understand
        the audio or the request failed (a message is printed in both cases).

    Raises:
        ValueError: if service is not one of the supported names.
    '''

    # Reject unknown services up front instead of failing later with an
    # unbound `text` variable after the file has already been rewritten.
    supported_services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit')
    if service not in supported_services:
        raise ValueError(
            f"Unsupported service '{service}'; expected one of {supported_services}"
        )

    # Sometimes audio files don't have the necessary headers, this will add them
    x, _ = librosa.load(voice_path, sr=16000)
    sf.write(voice_path, x, 16000)

    r = sr.Recognizer()
    with sr.AudioFile(voice_path) as source:
        audio = r.record(source)  # read the entire audio file

    # The recognize_* calls are what raise UnknownValueError / RequestError,
    # so the try block has to wrap them (not just the final print).
    try:
        with Timer(service):
            if service == 'Sphinx':
                text = r.recognize_sphinx(audio)
            elif service == 'GoogleSpeech':
                text = r.recognize_google(audio)
            elif service == 'GoogleCloudSpeech':
                # You will need a Google Cloud account, enable the Cloud Speech-to-Text API
                # and create a credentials file to use this service. Get started by following
                # these instructions https://cloud.google.com/iam/docs/keys-create-delete#python
                text = r.recognize_google_cloud(audio, 'google_cloud_credentials.json')
            else:  # 'Wit' -- the only remaining supported service
                # You will need to create an account at https://wit.ai/ and generate
                # a 'Client Access Token' for the key argument below.
                text = r.recognize_wit(audio, key=wit_key)
    except sr.UnknownValueError:
        print(f"{service} could not understand audio")
        return None
    except sr.RequestError as e:
        # Interpolate the error directly; the old f-string + .format() mix
        # printed a literal "0" instead of the error.
        print(f"{service} error; {e}")
        return None

    print(f"{service} thinks you said: \"{text}\"")
    return text

In [4]:
# Run every generated clip through each speech-to-text service in turn
services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit')

for voice_path in voice_paths:
    print(f'Converting file: {voice_path}')
    for service in services:
        wav2text(voice_path, service)
    # Blank line between files keeps the per-file results visually grouped
    print('')

# Print execution times
Timer().report()
Timer().reset()

Converting file: ./waves/alloy_speech.wav
Sphinx thinks you said: "susie south sea shells by the seashore"
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
GoogleCloudSpeech thinks you said: "Susie sells seashells by the seashore "
Wit thinks you said: "Susie sells seashells by the seashore"

Converting file: ./waves/echo_speech.wav
Sphinx thinks you said: "susie south sea shells by the seashore"
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
GoogleCloudSpeech thinks you said: "Susie sells seashells by the seashore "
Wit thinks you said: "Susie sells seashells by the seashore"

Converting file: ./waves/fable_speech.wav
Sphinx thinks you said: "susie south sea shells by the seashore"
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
GoogleCloudSpeech thinks you said: "Susie sells seashells by the seashore "
Wit thinks you said: "Susie sells seashells by the seashore"

Converting file: ./waves/onyx_speech.wav
Sphinx thinks y