In [1]:
# Standard Imports
import os
import time

# Utility for collecting execution times
from timer import Timer

# Audio libraries
from openai import OpenAI
import speech_recognition as sr
import librosa
import soundfile as sf

# Personal access tokens
from key import openai_key, wit_key

In [2]:
# Create an OpenAI client
client = OpenAI(
    api_key=openai_key,
)

# The different voices supported by OpenAI
voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
phrases = ["Suzy sells sea shells by the sea shore",
           "Peter Piper picked a peck of pickled peppers",
           "The longest word in the English language that doesn't repeat a character is uncopyrightable"]
voice_paths = []

# Make sure the output directory exists so a fresh checkout can write into it
os.makedirs('./waves', exist_ok=True)

# Generate a wav file for each voice/phrase pair and keep track of execution times
for voice in voices:
    for phrase_number, phrase in enumerate(phrases, start=1):
        voice_path = f'./waves/{voice}{phrase_number}_speech.wav'

        # Only the API request sits inside the Timer block; the disk write is outside it
        with Timer(voice):
            response = client.audio.speech.create(
                model="tts-1",
                voice=voice,
                input=phrase,
                # The API defaults to mp3; request wav so the content matches the file name
                response_format="wav",
            )

        # stream_to_file() is deprecated and emits a warning;
        # write_to_file() saves the same audio without it.
        response.write_to_file(voice_path)

        # Record the path only once the file is on disk, so an interrupted or
        # failed request never leaves a dangling entry for later cells to open.
        voice_paths.append(voice_path)

    time.sleep(60) # Sleep so we don't hit the OpenAI RPM while gathering metrics

# Print execution times
Timer().report()
Timer().reset()

KeyboardInterrupt: 

In [None]:
def wav2text(voice_path, service):
    '''
    Convert the speech in an audio file to text with the chosen recognizer.

    The file at ``voice_path`` is first reloaded at 16 kHz and written back to
    the same path (the original file is overwritten), then transcribed. The
    outcome is printed, and the recognition call is timed under the service name.

    Parameters:
        voice_path: path of the audio file to transcribe (rewritten in place).
        service: one of 'Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech' or 'Wit'.

    Returns:
        The recognized text, or None when the service could not understand the
        audio or the request to the service failed.

    Raises:
        ValueError: if ``service`` is not one of the supported names.
    '''

    # Reject unknown services up front, before the audio file is touched
    supported_services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit')
    if service not in supported_services:
        raise ValueError(
            f"Unknown service '{service}'; expected one of {', '.join(supported_services)}"
        )

    # Sometimes audio files don't have the necessary headers, this will add them
    samples, _ = librosa.load(voice_path, sr=16000)
    sf.write(voice_path, samples, 16000)

    r = sr.Recognizer()
    with sr.AudioFile(voice_path) as source:
        audio = r.record(source)  # read the entire audio file

    # The recognizers raise on failure, so the calls themselves must sit
    # inside the try block for the handlers below to ever run.
    try:
        with Timer(service):
            if service == 'Sphinx':
                text = r.recognize_sphinx(audio)
            elif service == 'GoogleSpeech':
                text = r.recognize_google(audio)
            elif service == 'GoogleCloudSpeech':
                # You will need a Google Cloud account, enable the Cloud Speech-to-Text API
                # and create a credentials file to use this service. Get started by following
                # these instructions https://cloud.google.com/iam/docs/keys-create-delete#python
                text = r.recognize_google_cloud(audio, 'google_cloud_credentials.json')
            else:  # 'Wit' (the only remaining validated option)
                # You will need to create an account at https://wit.ai/ and generate
                # a 'Client Access Token' for the key argument below.
                text = r.recognize_wit(audio, key=wit_key)
    except sr.UnknownValueError:
        print(f"{service} could not understand audio")
        return None
    except sr.RequestError as e:
        print(f"{service} error; {e}")
        return None

    print(f"{service} thinks you said: \"{text}\"")
    return text

In [None]:
# Run every generated audio file through each speech-to-text service in turn
for voice_path in voice_paths:
    print(f'Converting file: {voice_path}')
    for stt_service in ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit'):
        wav2text(voice_path, stt_service)
    print('')

# Show the collected execution times, then clear them
Timer().report()
Timer().reset()