In [1]:
# Standard Imports
import time
import os
import torch

# Utility for collecting execution times
from timer import Timer

# Audio libraries
from openai import OpenAI
import speech_recognition as sr
import librosa
import soundfile as sf
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
from transformers import pipeline

# LLM
from chatgpt import get_completion

# Personal access tokens
from key import openai_key, wit_key

# Create the output directory that the generated wav files are written to.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('./waves', exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Create an OpenAI client
client = OpenAI(api_key=openai_key)

# The different voices supported by OpenAI
voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']

# A few test phrases
phrases = ["Suzy sells sea shells by the sea shore", 
           "Peter Piper picked a peck of pickled peppers",
           "The longest word in the English language that doesn't repeat a character is uncopyrightable"]

# To keep track of the generated files. Rebuilt from scratch on every run of
# this cell so that re-running it does not accumulate duplicate paths.
voice_paths = []

# Generate a wav file for each phrase and voice
for i, phrase in enumerate(phrases):
    gtts_path = f'./waves/gTTS{i}_speech.wav'
    voice_paths.append(gtts_path)

    with Timer('gTTS'):
        # gTTS saves in mp3 format, so the file is read back and re-exported
        # in place as wav. The conversion is deliberately inside the timer,
        # as it is part of the cost of getting a wav file out of gTTS.
        gTTS(phrase).save(gtts_path)
        AudioSegment.from_mp3(gtts_path).export(gtts_path, format="wav")

    for voice in voices:
        # Uncomment the sleep to stay under the OpenAI requests-per-minute
        # limit while gathering metrics
        #time.sleep(20)
        voice_path = f'./waves/{voice}{i}_speech.wav'
        voice_paths.append(voice_path)

        with Timer(voice):
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=voice,
                input=phrase,
                # Ask for wav explicitly: the API returns mp3 by default,
                # which would leave mp3 data behind a .wav file name.
                response_format="wav",
            ) as response:
                response.stream_to_file(voice_path)

# Print execution times
Timer().report()
Timer().reset()



gTTS ran 3 times
	Min time was 410.99071502685547 at index 2
	Max time was 571.526288986206 at index 1
	Average time was 492.14943250020343 ms
	Total time was 1476.4482975006104 ms

alloy ran 3 times
	Min time was 1183.842420578003 at index 0
	Max time was 1483.2725524902344 at index 2
	Average time was 1311.5046819051106 ms
	Total time was 3934.514045715332 ms

echo ran 3 times
	Min time was 837.2936248779297 at index 0
	Max time was 1292.8078174591064 at index 1
	Average time was 1045.8238919576008 ms
	Total time was 3137.4716758728027 ms

fable ran 3 times
	Min time was 978.9559841156006 at index 1
	Max time was 1525.5086421966553 at index 2
	Average time was 1275.2206325531006 ms
	Total time was 3825.6618976593018 ms

onyx ran 3 times
	Min time was 711.677074432373 at index 1
	Max time was 1143.1801319122314 at index 2
	Average time was 920.8787282307943 ms
	Total time was 2762.636184692383 ms

nova ran 3 times
	Min time was 851.9587516784668 at index 0
	Max time was 2004.4822692

In [3]:
# Some of the generated files lack the headers that readers expect. Decoding
# each one and writing it straight back out adds them; as a side effect every
# file ends up resampled to 16 kHz.
for wav_file in voice_paths:
    waveform = librosa.load(wav_file, sr=16000)[0]
    sf.write(wav_file, waveform, samplerate=16000)

In [4]:
# Initialize speech recognition object. This one instance is shared: wav2text
# reads it as a global and the microphone loop passes it in explicitly.
recognizer = sr.Recognizer()

In [5]:
# Service names that wav2text knows how to dispatch to
_WAV2TEXT_SERVICES = ('Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit', 'OpenAI', 'Whisper')

# Local Whisper pipelines keyed by device, filled in on first use so the model
# is loaded once instead of on every wav2text call
_whisper_transcribers = {}


def _get_whisper_transcriber():
    '''
    Return the local Whisper pipeline for the current device, building it on first use
    '''
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    if device not in _whisper_transcribers:
        _whisper_transcribers[device] = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base.en", device=device
        )
    return _whisper_transcribers[device]


def wav2text(voice_path, service):
    '''
    Method to convert speech to text

    The recognition call is wrapped in Timer(service) so that execution times
    are collected per service. The result (or the failure) is printed.

    Args:
        voice_path: Path of the audio file to transcribe.
        service: One of 'Sphinx', 'GoogleSpeech', 'GoogleCloudSpeech', 'Wit',
            'OpenAI' or 'Whisper'.

    Returns:
        The transcribed text, or None when the speech_recognition backend
        could not understand the audio or could not complete the request.

    Raises:
        ValueError: If `service` is not one of the supported names.
        Errors raised by the OpenAI client or the local Whisper pipeline are
        not caught here and propagate to the caller.
    '''
    # Reject unknown services up front; otherwise nothing would ever assign
    # `text` and the failure would surface later as an unbound local.
    if service not in _WAV2TEXT_SERVICES:
        raise ValueError(
            f"Unknown service {service!r}; expected one of {', '.join(_WAV2TEXT_SERVICES)}"
        )

    # Load the inputs (and the Whisper model, when needed) before the timer
    # starts so that only the recognition call itself is measured.
    with sr.AudioFile(voice_path) as source:
        audio = recognizer.record(source)  # read the entire audio file
    data, _ = sf.read(voice_path, dtype='float32')
    transcriber = _get_whisper_transcriber() if service == 'Whisper' else None

    try:
        with Timer(service):
            if service == 'Sphinx':
                text = recognizer.recognize_sphinx(audio)
            elif service == 'GoogleSpeech':
                text = recognizer.recognize_google(audio)
            elif service == 'GoogleCloudSpeech':
                # You will need a Google Cloud account, enable the Cloud Speech-to-Text API
                # and create a credentials file to use this service. Get started by following
                # these instructions https://cloud.google.com/iam/docs/keys-create-delete#python
                text = recognizer.recognize_google_cloud(audio, 'google_cloud_credentials.json')
            elif service == 'Wit':
                # You will need to create an account at https://wit.ai/ and generate 
                # a 'Client Access Token' for the key argument below.
                text = recognizer.recognize_wit(audio, key=wit_key)
            elif service == 'OpenAI':
                # Context manager so the file handle is closed after the upload
                with open(voice_path, "rb") as audio_file:
                    text = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text
            else:  # 'Whisper'
                # Only the raw samples are passed, so this assumes the file is
                # already at the sampling rate the model expects - TODO confirm
                text = transcriber(data)["text"]
    except sr.UnknownValueError:
        print(f"{service} could not understand audio")
        return None
    except sr.RequestError as e:
        print(f"{service} error; {e}")
        return None

    print(f"{service} thinks you said: \"{text}\"")
    return text

In [6]:

# Speech-to-text services to compare, in the order they are run for each file.
# Left out on purpose:
#   'Sphinx'            - Didn't work well on Linux and gives error on Windows
#   'GoogleCloudSpeech' - Not one of the best contenders and gives error on Windows
services = ('GoogleSpeech', 'Wit', 'OpenAI', 'Whisper')

# Convert all the voices back to text
for voice_path in voice_paths:
    #time.sleep(20) # Sleep so we don't hit the OpenAI RPM while gathering metrics

    print(f'Reading file: {voice_path}')
    for service in services:
        wav2text(voice_path, service)
    print('')

# Print execution times
Timer().report()
Timer().reset()

Reading file: ./waves/gTTS0_speech.wav
GoogleSpeech thinks you said: "Susie sells seashells by the seashore"
Wit thinks you said: "Suzy sells seashells by the seashore"
OpenAI thinks you said: "Susie sells seashells by the seashore."


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Whisper thinks you said: " Susie sells seashells by the Seashore."

Reading file: ./waves/gTTS1_speech.wav
GoogleSpeech thinks you said: "Peter Piper picked a peck of pickled peppers"
Wit thinks you said: "Peter Piper picked a pec of pickled peppers"
OpenAI thinks you said: "Peter Piper picked a peck of pickled peppers."
Whisper thinks you said: " Peter Piper picked a peck of pickled peppers."

Reading file: ./waves/gTTS2_speech.wav
GoogleSpeech thinks you said: "the longest word in the English language that doesn't repeat a character is uncopyrightable"
Wit thinks you said: "The longest word in the English language that doesn't repeat a character is on copyrightable"
OpenAI thinks you said: "The longest word in the English language that doesn't repeat a character is uncopyrightable."
Whisper thinks you said: " The longest word in the English language that doesn't repeat a character is uncopuritable."



GoogleSpeech ran 3 times
	Min time was 737.8661632537842 at index 1
	Max time was 

In [None]:
# List available microphones. As the last expression in the cell, the returned
# names are shown as the cell output so a suitable input device can be picked.
sr.Microphone.list_microphone_names()

In [8]:
# Initialize microphone object with appropriate device
# NOTE(review): device_index=2 is hard-coded and presumably specific to the
# machine this was written on - confirm it against your own microphone list.
microphone = sr.Microphone(device_index=2)

In [9]:
def recognize_speech_from_mic(recognizer, microphone):
    """ Record one utterance from the microphone and transcribe it

    Args:
        recognizer: The `sr.Recognizer` used to listen and to transcribe.
        microphone: The `sr.Microphone` to record from.

    Returns:
        A dictionary with three keys:
        "success": False when the recognition request failed with a RequestError, True otherwise
        "error": `None` if no error occurred, otherwise a message saying that the API was unavailable or that the speech could not be recognized
        "transcription": The transcribed text, or `None` if nothing was transcribed

    Raises:
        TypeError: If either argument is not an instance of the expected class.
    """

    # Refuse arguments of the wrong type before touching the audio device
    if not isinstance(recognizer, sr.Recognizer):
        raise TypeError("`recognizer` must be `Recognizer` instance")
    if not isinstance(microphone, sr.Microphone):
        raise TypeError("`microphone` must be `Microphone` instance")

    # Calibrate against the ambient noise, then capture from the microphone
    with microphone as mic_source:
        recognizer.adjust_for_ambient_noise(mic_source)
        captured = recognizer.listen(mic_source)

    # Optimistic defaults, overwritten below depending on how recognition goes
    succeeded, problem, heard = True, None, None
    try:
        heard = recognizer.recognize_google(captured)
    except sr.RequestError:
        # API was unreachable or unresponsive
        succeeded, problem = False, "API unavailable"
    except sr.UnknownValueError:
        # Speech was unintelligible; the request itself still counts as a success
        problem = "Unable to recognize speech"

    return {
        "success": succeeded,
        "error": problem,
        "transcription": heard,
    }

In [None]:
def communicate(phrase):
    """ Speak a phrase out loud

    The phrase is synthesized with gTTS into a temporary mp3 file inside the
    waves directory, played back, and the temporary file is then deleted.

    Args:
        phrase: The text to speak.
    """
    # Build the path with os.path.join so it resolves inside the waves
    # directory on every platform, not only where '\\' is the path separator.
    temp_file = os.path.join('waves', 'temp.mp3')
    try:
        gTTS(phrase).save(temp_file)
        audio = AudioSegment.from_mp3(temp_file)
        play(audio)
    finally:
        # Clean up even when synthesis or playback fails. The file may not
        # exist yet if saving itself was what failed.
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Voice assistant loop: listen, then greet, say goodbye and stop, or forward
# anything else to the LLM and speak its answer.
while True:
    guess = recognize_speech_from_mic(recognizer, microphone)['transcription']
    # Match the keywords on a lower-cased copy so that a capitalized
    # transcription ("Hello", "Goodbye") is still recognized; the original
    # text is what gets sent to the LLM.
    heard = guess.lower() if guess is not None else ''
    if guess is None:
        communicate("Sorry, I didn't understand that")
    elif 'hello' in heard:
        communicate('Hello, how can I help')  
    elif 'goodbye' in heard:
        communicate('See you next time')
        break
    else:
        print(f'Prompt: {guess} \n')
        response = get_completion(guess)
        
        print(f'{response}\n\n')
        communicate(response)