In [10]:
# Standard Imports
import os
import time

# Utility for collecting execution times
from timer import Timer

# Audio libraries
from openai import OpenAI
import speech_recognition as sr
import librosa
import soundfile as sf
from gtts import gTTS

# Personal access tokens
from key import openai_key, wit_key

In [12]:
# Create an OpenAI client
client = OpenAI(
    api_key=openai_key,
)

# The different voices supported by OpenAI
voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']

# A few test phrases
phrases = ["Suzy sells sea shells by the sea shore", 
           "Peter Piper picked a peck of pickled peppers",
           "The longest word in the English language that doesn't repeat a character is uncopyrightable"]

# Directory that receives every generated audio file
WAVE_DIR = './waves'

# Seconds to sleep before each voice so we don't hit the OpenAI RPM while gathering metrics
OPENAI_PAUSE_SECONDS = 60

# Create the output directory up front so the cell also works on a fresh checkout
os.makedirs(WAVE_DIR, exist_ok=True)

# To keep track of the generated files (reset here so re-running the cell is idempotent)
voice_paths = []

# Generate a wav file for each voice and keep track of execution times
for voice in voices:
    time.sleep(OPENAI_PAUSE_SECONDS)

    for i, phrase in enumerate(phrases):
        voice_paths.append(f'{WAVE_DIR}/{voice}{i}_speech.wav')

        # Each request is timed under the voice's name, so the report groups by voice
        with Timer(voice):
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=voice,
                input=phrase,
                # The endpoint returns MP3 unless told otherwise; request WAV so the
                # file contents actually match the .wav extension used above.
                response_format="wav",
            ) as response:
                response.stream_to_file(voice_paths[-1])

# Generate the same phrases with gTTS for comparison
for i, phrase in enumerate(phrases):
    voice_paths.append(f'{WAVE_DIR}/gTTS{i}_speech.wav')
    with Timer('gTTS'):
        # NOTE(review): gTTS writes MP3 data whatever the extension says. The path
        # keeps the .wav suffix for consistency with the other files; anything that
        # reads it must re-encode first — confirm this is still acceptable.
        gTTS(phrase).save(voice_paths[-1])

# Print execution times
Timer().report()
Timer().reset()



alloy ran 3 times
	Min time was 891.6032314300537 at index 1
	Max time was 1380.3138732910156 at index 2
	Average time was 1083.9062531789143 ms
	Total time was 3251.718759536743 ms

echo ran 3 times
	Min time was 797.9128360748291 at index 1
	Max time was 1190.8059120178223 at index 2
	Average time was 975.0773906707764 ms
	Total time was 2925.232172012329 ms

fable ran 3 times
	Min time was 718.355655670166 at index 1
	Max time was 1566.8666362762451 at index 2
	Average time was 1087.616999944051 ms
	Total time was 3262.8509998321533 ms

onyx ran 3 times
	Min time was 712.6829624176025 at index 0
	Max time was 1308.3128929138184 at index 2
	Average time was 966.5244420369467 ms
	Total time was 2899.57332611084 ms

nova ran 3 times
	Min time was 1436.908483505249 at index 1
	Max time was 1628.9491653442383 at index 0
	Average time was 1503.8784344991047 ms
	Total time was 4511.635303497314 ms

shimmer ran 3 times
	Min time was 1011.2628936767578 at index 1
	Max time was 1372.7216720

In [3]:
def wav2text(voice_path, service):
    '''
    Convert a speech recording to text with one recognition service and print the result.

    The file at ``voice_path`` is first reloaded and rewritten in place at a
    16 kHz sample rate, so the input file is modified by this call. The
    recognition call itself is timed under the service's name via ``Timer``.

    Parameters
    ----------
    voice_path : str
        Path of the audio file to transcribe (overwritten in place, see above).
    service : str
        One of 'Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit' or 'OpenAI'.

    Raises
    ------
    ValueError
        If ``service`` is not one of the supported names. Raised before the
        audio file is touched.

    Notes
    -----
    ``sr.UnknownValueError`` and ``sr.RequestError`` raised by the recognizer
    are caught and reported with a printed message. Errors raised by the
    OpenAI client are not caught here and propagate to the caller.
    '''
    supported_services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit', 'OpenAI')
    if service not in supported_services:
        # Fail early and clearly instead of hitting an unbound `text` later on
        raise ValueError(
            f"Unsupported service {service!r}; expected one of {supported_services}"
        )

    # Sometimes audio files don't have the necessary headers, this will add them
    x, _ = librosa.load(voice_path, sr=16000)
    sf.write(voice_path, x, 16000)

    r = sr.Recognizer()
    with sr.AudioFile(voice_path) as source:
        audio = r.record(source)  # read the entire audio file

    # The recognition calls are what raise UnknownValueError / RequestError,
    # so they have to live inside the try block for the handlers to fire.
    try:
        with Timer(service):
            if service == 'Sphinx':
                text = r.recognize_sphinx(audio)
            elif service == 'GoogleSpeech':
                text = r.recognize_google(audio)
            elif service == 'GoogleCloudSpeech':
                # You will need a Google Cloud account, enable the Cloud Speech-to-Text API
                # and create a credentials file to use this service. Get started by following
                # these instructions https://cloud.google.com/iam/docs/keys-create-delete#python
                text = r.recognize_google_cloud(audio, 'google_cloud_credentials.json')
            elif service == 'Wit':
                # You will need to create an account at https://wit.ai/ and generate 
                # a 'Client Access Token' for the key argument below.
                text = r.recognize_wit(audio, key=wit_key)
            else:  # 'OpenAI' (the only remaining supported service)
                # Context manager guarantees the file handle is closed after upload
                with open(voice_path, "rb") as audio_file:
                    text = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except sr.UnknownValueError:
        print(f"{service} could not understand audio")
    except sr.RequestError as e:
        # Interpolate the exception directly; mixing an f-string with .format()
        # previously rendered a literal "0" instead of the error text.
        print(f"{service} error; {e}")
    else:
        print(f"{service} thinks you said: \"{text}\"")

In [7]:
# Run every generated recording through each speech-to-text service in turn
stt_services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit', 'OpenAI')

for voice_path in voice_paths:
    # Pause between files so the OpenAI RPM limit isn't hit while gathering metrics
    time.sleep(20)

    print(f'Converting file: {voice_path}')
    for service_name in stt_services:
        wav2text(voice_path, service_name)
    print('')

# Show the collected execution times, then clear them
Timer().report()
Timer().reset()

Converting file: ./waves/alloy0_speech.wav
Sphinx thinks you said: "susie south sea shells by the seashore"
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
GoogleCloudSpeech thinks you said: "Susie sells seashells by the seashore "
Wit thinks you said: "Susie sells seashells by the seashore"
OpenAI thinks you said: "Suzy sells seashells by the seashore."

Converting file: ./waves/alloy1_speech.wav
Sphinx thinks you said: "peter piper picked a peck of pickled peppers"
GoogleSpeech thinks you said: "Peter Piper picked a peck of pickled peppers"
GoogleCloudSpeech thinks you said: "Peter Piper picked a peck of pickled peppers "
Wit thinks you said: "Peter piper picked a peck of pickled peppers"
OpenAI thinks you said: "Peter Piper picked a peck of pickled peppers."

Converting file: ./waves/alloy2_speech.wav
Sphinx thinks you said: "the longest word in the english language that doesn't repeat a character is on a copy ride a bull"
GoogleSpeech thinks you said: "the lon