In [1]:
# Standard Imports
import time
import os

# Utility for collecting execution times
from timer import Timer

# Audio libraries
from openai import OpenAI
import speech_recognition as sr
import librosa
import soundfile as sf
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play

# LLM
from chatgpt import get_completion

# Personal access tokens
from key import openai_key, wit_key

# Create the output directory for generated audio files
# (exist_ok=True makes re-running this cell safe when the directory already exists)
os.makedirs('./waves', exist_ok=True)

In [2]:
# Create an OpenAI client
client = OpenAI(
    api_key=openai_key,
)

# The different voices supported by OpenAI
voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']

# A few test phrases
phrases = ["Suzy sells sea shells by the sea shore", 
           "Peter Piper picked a peck of pickled peppers",
           "The longest word in the English language that doesn't repeat a character is uncopyrightable"]

# Seconds to pause before every OpenAI request so we don't hit the
# OpenAI requests-per-minute limit while gathering metrics
OPENAI_PAUSE_SECONDS = 20

# To keep track of the generated files (reset here so re-running the cell
# does not accumulate duplicate paths)
voice_paths = []

# Generate a wav file for each phrase and voice
for i, phrase in enumerate(phrases):
    gtts_wav_path = f'./waves/gTTS{i}_speech.wav'
    gtts_mp3_path = f'./waves/gTTS{i}_speech.mp3'
    voice_paths.append(gtts_wav_path)

    with Timer('gTTS'):
        # gTTS saves in mp3 format, so write it to a real .mp3 file first and
        # convert that to wav, instead of storing MP3 bytes under a .wav name
        # and converting the file onto itself. The conversion stays inside the
        # timer so the measurement covers the same work as before.
        gTTS(phrase).save(gtts_mp3_path)
        AudioSegment.from_mp3(gtts_mp3_path).export(gtts_wav_path, format="wav")

    # The intermediate mp3 is no longer needed once the wav exists
    os.remove(gtts_mp3_path)

    for voice in voices:
        time.sleep(OPENAI_PAUSE_SECONDS)
        openai_path = f'./waves/{voice}{i}_speech.wav'
        voice_paths.append(openai_path)

        with Timer(voice):
            # NOTE(review): no response_format is passed, so the service's
            # default encoding is what gets written under the .wav name
            # (presumably MP3 - confirm against the OpenAI docs). The
            # normalization cell that follows rewrites every file as real WAV.
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=voice,
                input=phrase
            ) as response:
                response.stream_to_file(openai_path)

# Print execution times
Timer().report()
Timer().reset()



gTTS ran 3 times
	Min time was 551.0010719299316 at index 0
	Max time was 759.230375289917 at index 2
	Average time was 683.2358837127686 ms
	Total time was 2049.7076511383057 ms

alloy ran 3 times
	Min time was 946.4118480682373 at index 0
	Max time was 1213.5865688323975 at index 2
	Average time was 1080.2051226298015 ms
	Total time was 3240.6153678894043 ms

echo ran 3 times
	Min time was 878.2362937927246 at index 0
	Max time was 1188.518762588501 at index 2
	Average time was 1048.9604473114014 ms
	Total time was 3146.881341934204 ms

fable ran 3 times
	Min time was 874.5748996734619 at index 1
	Max time was 1220.367193222046 at index 2
	Average time was 1008.6244742075603 ms
	Total time was 3025.8734226226807 ms

onyx ran 3 times
	Min time was 914.8581027984619 at index 1
	Max time was 1360.6395721435547 at index 2
	Average time was 1116.1525249481201 ms
	Total time was 3348.4575748443604 ms

nova ran 3 times
	Min time was 1154.7186374664307 at index 0
	Max time was 1315.1674270

In [3]:
# Sometimes audio files don't have the necessary headers. Loading each file and
# writing it back in place at a fixed sample rate regenerates them.
TARGET_SAMPLE_RATE = 16000

for wav_path in voice_paths:
    samples, _unused_rate = librosa.load(wav_path, sr=TARGET_SAMPLE_RATE)
    sf.write(wav_path, samples, TARGET_SAMPLE_RATE)

In [4]:
# Initialize speech recognition object
# (module-level on purpose: wav2text() reads this global `recognizer`, and the
# microphone loop passes it to recognize_speech_from_mic())
recognizer = sr.Recognizer()

In [5]:
def wav2text(voice_path, service):
    '''
    Convert the speech in an audio file to text with the chosen service,
    print the outcome and record the recognition time under `service`.

    Parameters:
        voice_path: path of the audio file to transcribe
        service:    one of 'Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech',
                    'Wit' or 'OpenAI'

    Returns:
        The transcribed text, or None when the service could not understand
        the audio or could not be reached (a message is printed in both cases).

    Raises:
        ValueError: if `service` is not one of the supported names.
    '''
    supported_services = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit', 'OpenAI')
    if service not in supported_services:
        # Fail with a clear message instead of an unbound `text` further down
        raise ValueError(f"Unknown service '{service}', expected one of {supported_services}")

    with sr.AudioFile(voice_path) as source:
        audio = recognizer.record(source)  # read the entire audio file

    # The recognizer calls are what raise UnknownValueError / RequestError,
    # so they have to sit inside the try block for the handlers to fire.
    # Errors raised by the OpenAI client are not speech_recognition errors
    # and still propagate to the caller.
    try:
        with Timer(service):
            if service == 'Sphinx':
                text = recognizer.recognize_sphinx(audio)
            elif service == 'GoogleSpeech':
                text = recognizer.recognize_google(audio)
            elif service == 'GoogleCloudSpeech':
                # You will need a Google Cloud account, enable the Cloud Speech-to-Text API
                # and create a credentials file to use this service. Get started by following
                # these instructions https://cloud.google.com/iam/docs/keys-create-delete#python
                text = recognizer.recognize_google_cloud(audio, 'google_cloud_credentials.json')
            elif service == 'Wit':
                # You will need to create an account at https://wit.ai/ and generate 
                # a 'Client Access Token' for the key argument below.
                text = recognizer.recognize_wit(audio, key=wit_key)
            else:  # 'OpenAI'
                # OpenAI takes the file itself rather than the recorded audio;
                # the context manager closes the handle once the request is done
                with open(voice_path, "rb") as audio_file:
                    text = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except sr.UnknownValueError:
        print(f"{service} could not understand audio")
        return None
    except sr.RequestError as e:
        print(f"{service} error; {e}")
        return None

    print(f"{service} thinks you said: \"{text}\"")
    return text

In [6]:
# Convert all the voices back to text.
# Services that are left out on purpose:
#   'Sphinx'            - didn't work well on Linux and gives error on Windows
#   'GoogleCloudSpeech' - not one of the best contenders and gives error on Windows
# (No pause between files here; add a time.sleep() at the top of the loop if
# the OpenAI RPM limit is hit while gathering metrics.)
active_services = ('GoogleSpeech', 'Wit', 'OpenAI')

for voice_path in voice_paths:
    print(f'Reading file: {voice_path}')
    for service_name in active_services:
        wav2text(voice_path, service_name)
    print('')

# Print execution times
Timer().report()
Timer().reset()

Reading file: ./waves/gTTS0_speech.wav
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
Wit thinks you said: "Suzy sells seashells by the seashore"
OpenAI thinks you said: "Suzy sells seashells by the seashore"

Reading file: ./waves/alloy0_speech.wav
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
Wit thinks you said: "Susie sells seashells by the seashore"
OpenAI thinks you said: "Suzy sells seashells by the seashore."

Reading file: ./waves/echo0_speech.wav
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
Wit thinks you said: "Susie sells seashells by the seashore"
OpenAI thinks you said: "Suzy sells seashells by the seashore."

Reading file: ./waves/fable0_speech.wav
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
Wit thinks you said: "Suzy sells seashells by the seashore"
OpenAI thinks you said: "Susie sells seashells by the seashore."

Reading file: ./waves/onyx0_speech.wav
GoogleSpeech thinks y

In [7]:
# List available microphones
# (the list position of the wanted device is presumably what sr.Microphone
# expects as `device_index` - confirm against the SpeechRecognition docs)
sr.Microphone.list_microphone_names()

['Microsoft Sound Mapper - Input',
 'Microphone (2- G935 Gaming Head',
 'Microphone (HD Pro Webcam C920)',
 'Microsoft Sound Mapper - Output',
 'Speakers (Realtek(R) Audio)',
 'Speakers (2- G935 Gaming Headse',
 'ASUS VE278 (NVIDIA High Definit',
 'ASUS VE278 (NVIDIA High Definit',
 'VS278 (NVIDIA High Definition A',
 'Realtek Digital Output (Realtek',
 'Primary Sound Capture Driver',
 'Microphone (2- G935 Gaming Headset)',
 'Microphone (HD Pro Webcam C920)',
 'Primary Sound Driver',
 'Speakers (Realtek(R) Audio)',
 'Speakers (2- G935 Gaming Headset)',
 'ASUS VE278 (NVIDIA High Definition Audio)',
 'ASUS VE278 (NVIDIA High Definition Audio)',
 'VS278 (NVIDIA High Definition Audio)',
 'Realtek Digital Output (Realtek(R) Audio)',
 'Speakers (2- G935 Gaming Headset)',
 'Speakers (Realtek(R) Audio)',
 'ASUS VE278 (NVIDIA High Definition Audio)',
 'ASUS VE278 (NVIDIA High Definition Audio)',
 'VS278 (NVIDIA High Definition Audio)',
 'Realtek Digital Output (Realtek(R) Audio)',
 'Microphone 

In [8]:
# Initialize microphone object with appropriate device
# NOTE: device_index=2 is specific to the machine this notebook was run on
# (entry 2 of the listing above is 'Microphone (HD Pro Webcam C920)');
# pick the index that matches your own device list.
microphone = sr.Microphone(device_index=2)

In [9]:
def recognize_speech_from_mic(recognizer, microphone):
    """ Capture one utterance from the microphone and transcribe it

    Parameters:
        recognizer: an `sr.Recognizer` instance
        microphone: an `sr.Microphone` instance

    Returns a dictionary with three keys:
    "success": False only when the recognition API could not be reached, True otherwise
               (it stays True when the speech was simply unrecognizable)
    "error": `None` if no error occurred, otherwise a string describing the problem
             (API could not be reached, or speech was unrecognizable)
    "transcription": the transcribed text, or `None` if nothing could be transcribed

    Raises:
        TypeError: if either argument is not of the expected type
    """

    # Validate the argument types up front
    if not isinstance(recognizer, sr.Recognizer):
        raise TypeError("`recognizer` must be `Recognizer` instance")
    if not isinstance(microphone, sr.Microphone):
        raise TypeError("`microphone` must be `Microphone` instance")

    # Calibrate against the ambient noise first, then record from the microphone
    with microphone as mic_source:
        recognizer.adjust_for_ambient_noise(mic_source)
        captured_audio = recognizer.listen(mic_source)

    # Start from the "all good, nothing transcribed yet" state and adjust it
    # according to how the recognition attempt turns out
    succeeded = True
    error_message = None
    transcript = None

    try:
        transcript = recognizer.recognize_google(captured_audio)
    except sr.RequestError:
        # API was unreachable or unresponsive
        succeeded = False
        error_message = "API unavailable"
    except sr.UnknownValueError:
        # Speech was unintelligible; the request itself still counts as successful
        error_message = "Unable to recognize speech"

    return {
        "success": succeeded,
        "error": error_message,
        "transcription": transcript,
    }

In [14]:
def communicate(phrase):
    """Speak `phrase` out loud.

    The phrase is synthesized with gTTS into a temporary mp3 file in the
    'waves' directory, played back, and the temporary file is removed again.

    Parameters:
        phrase: the text to speak; empty or whitespace-only text is skipped
                because there is nothing to synthesize
    """
    if not phrase or not phrase.strip():
        return

    # Build the path with os.path.join so it is not tied to the Windows separator
    temp_file = os.path.join('waves', 'temp.mp3')
    try:
        gTTS(phrase).save(temp_file)
        audio = AudioSegment.from_mp3(temp_file)
        play(audio)
    finally:
        # Clean up even when synthesis or playback fails; the existence check
        # avoids masking the original error if the file was never written
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Voice assistant loop: listen, react to the 'hello' / 'goodbye' keywords and
# send anything else to the LLM. Runs until 'goodbye' is heard.
while True:
    guess = recognize_speech_from_mic(recognizer, microphone)['transcription']
    if guess is None:
        # Nothing was transcribed (unintelligible speech or API unavailable)
        communicate("Sorry, I didn't understand that")
        continue

    # Match keywords case-insensitively so a transcript such as "Hello" or
    # "Goodbye" is still recognized; the original text is what goes to the LLM.
    # Note this is a substring match: any utterance containing the keyword triggers it.
    spoken = guess.lower()
    if 'hello' in spoken:
        communicate('Hello, how can I help')
    elif 'goodbye' in spoken:
        communicate('See you next time')
        break
    else:
        print(f'Prompt: {guess} \n')
        response = get_completion(guess)

        print(f'{response}\n\n')
        communicate(response)

User: what's the wavelength of the color green 

The color green typically falls within a wavelength range of about 495 to 570 nanometers in the visible light spectrum. This range can vary slightly depending on the specific shade of green being considered. For example, a more yellowish-green might be closer to 570 nanometers, while a more bluish-green might be closer to 495 nanometers.


User: spoken language on the planet 

As of my last update, the most widely spoken language on the planet by the number of native speakers is Mandarin Chinese. Mandarin is the official language of China and Taiwan and is one of the four official languages of Singapore. It is part of the Sino-Tibetan language family.

However, if considering the total number of speakers (native plus second-language speakers), English often ranks as the most spoken language globally. English is widely used as a second language and is the official or one of the official languages in many countries around the world, making