## Set Up Environment


In [None]:
from dotenv import load_dotenv
import os

# Load the .env file so the os.getenv() lookups below can read its values
load_dotenv()


def mask_secret(value, visible=4):
    """Return a display-safe rendering of a secret.

    Args:
        value: The secret string, or None/empty when it is not configured.
        visible: Number of trailing characters left readable for long secrets.

    Returns:
        "<not set>" for a missing value, "****" for short secrets, otherwise
        "****" followed by the last ``visible`` characters.
    """
    if not value:
        return "<not set>"
    # Short secrets (or visible <= 0) are hidden completely; note that
    # value[-0:] would otherwise expose the whole string.
    if visible <= 0 or len(value) <= visible * 2:
        return "****"
    return "****" + value[-visible:]


# Access environment variables.
# Keys and tokens are printed masked so they are not stored in the notebook's
# saved output; endpoints and regions are printed as-is.
speech_key = os.getenv('SPEECH_KEY')
print(f'SPEECH_KEY: {mask_secret(speech_key)}')
speech_endpoint = os.getenv('SPEECH_ENDPOINT')
print(f'SPEECH_ENDPOINT: {speech_endpoint}')
speech_region = os.getenv('SPEECH_REGION')
print(f'REGION: {speech_region}')

gpt_key = os.getenv('GPT_KEY')
print(f'GPT_KEY: {mask_secret(gpt_key)}')
gpt_endpoint = os.getenv('GPT_ENDPOINT')
print(f'GPT_ENDPOINT: {gpt_endpoint}')
gpt_region = os.getenv('OPENAI_REGION')
print(f'REGION: {gpt_region}')

llama_token = os.getenv('LLAMA_TOKEN')
print(f'LLAMA_TOKEN: {mask_secret(llama_token)}')
	

## Speech-To-Text Azure

In [None]:

import azure.cognitiveservices.speech as speechsdk

# Module-level Speech SDK configuration built from the key and region read from the environment;
# recognize_speech() below reads this global.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
def recognize_speech(language="fr-FR"):
    """Recognize a single utterance from the default microphone.

    Uses the module-level ``speech_config`` (built from the "SPEECH_KEY" and
    "SPEECH_REGION" environment variables) and sets its recognition language
    on every call.

    Args:
        language: Locale assigned to ``speech_config.speech_recognition_language``.
            Defaults to "fr-FR", the value that was previously hard-coded.

    Returns:
        The recognized text, or None when nothing was recognized or the
        recognition was canceled (details are printed in both cases).
    """
    speech_config.speech_recognition_language = language

    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Speak into your microphone.")
    speech_recognition_result = speech_recognizer.recognize_once_async().get()
    reason = speech_recognition_result.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print("You: {}".format(speech_recognition_result.text))
        return speech_recognition_result.text

    if reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")
    return None

In [None]:
# TODO: this code works, but we need to capture the audio from the microphone/user, not from the TTS output

import azure.cognitiveservices.speech as speechsdk
import wave
import io

class CustomAudioOutputStream(speechsdk.audio.PullAudioOutputStream):
    """Audio output stream backed by an in-memory byte buffer.

    Bytes appended with ``write`` are handed back by ``read`` in the order
    they were written. A separate read offset is kept because a single
    ``BytesIO`` cursor sits at the end of the data after every write, which
    made a following ``read`` return nothing.

    NOTE(review): nothing in this notebook calls ``write`` — confirm how the
    Speech SDK is expected to feed data into this stream.
    """

    def __init__(self):
        super().__init__()
        self._buffer = io.BytesIO()
        self._read_pos = 0  # offset of the first byte not yet returned by read()

    def read(self, size):
        """Return up to ``size`` unread bytes (empty bytes when none are left)."""
        self._buffer.seek(self._read_pos)
        data = self._buffer.read(size)
        self._read_pos = self._buffer.tell()
        return data

    def write(self, data):
        """Append ``data`` to the end of the buffer."""
        # Always append at the end, regardless of where the last read stopped
        self._buffer.seek(0, io.SEEK_END)
        self._buffer.write(data)

    def close(self):
        """Release the underlying buffer."""
        self._buffer.close()

# Azure TTS Configuration
# Note: this rebinds the module-level `speech_config` and `audio_config` names.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
audio_config = speechsdk.audio.AudioOutputConfig(stream=CustomAudioOutputStream())

# Create TTS synthesizer that sends its output to the custom stream config
synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

# Synthesize some speech and block until the async call has finished
synthesis_result = synthesizer.speak_text_async("Hello, this is a test of Azure TTS.").get()

# Use the custom stream to pipe audio data elsewhere
# NOTE(review): confirm that AudioOutputConfig exposes the stream as `.stream`.
output_stream = audio_config.stream
# Single read of at most 1024 bytes; anything beyond that is not fetched here.
output_data = output_stream.read(1024)  # Example read data

# Now you can pipe `output_data` to a file, another program, etc.
# The WAV header is written as 1 channel, 2 bytes per sample, 16000 frames per second.
# NOTE(review): presumably this matches the synthesizer's output format — confirm.
with wave.open("output.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(16000)
    wf.writeframes(output_data)


In [None]:
# TODO: Make this work so that we can access the properties exposed by the class 'azure.cognitiveservices.speech.SpeechRecognitionResult'

import azure.cognitiveservices.speech as speechsdk
import wave

# Azure Speech SDK configuration
# Note: this rebinds the module-level `speech_config` and `audio_config` names.

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)

# Use the default microphone for audio capture
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

# Create a speech recognizer with microphone as the audio source
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

# Accumulates the audio payload of every event handled by the callback below
audio_chunks = []


def handle_audio_event(evt):
    """Recognizer callback: keep the event's audio payload when it is non-empty.

    Reads ``evt.result.audio`` and appends it to the module-level
    ``audio_chunks`` list; falsy payloads are ignored.

    NOTE(review): confirm that the recognition result actually exposes an
    ``audio`` attribute.
    """
    chunk = evt.result.audio
    if not chunk:
        return
    audio_chunks.append(chunk)

import time

# Subscribe to recognized audio events
speech_recognizer.recognized.connect(handle_audio_event)

# Start continuous recognition
print("Listening to microphone. Press Ctrl+C to stop...")
speech_recognizer.start_continuous_recognition()

try:
    while True:
        # Sleep between checks instead of spinning on `pass`, which kept a
        # CPU core fully busy while doing no work in this loop.
        time.sleep(0.1)
except KeyboardInterrupt:
    # Stop recognition on interrupt
    speech_recognizer.stop_continuous_recognition()
    print("Stopping...")

# Combine audio chunks into a single byte stream
audio_bytes = b''.join(audio_chunks)

# Save the audio to a .wav file
output_filename = "output.wav"
with wave.open(output_filename, 'wb') as wf:
    wf.setnchannels(1)  # Mono audio
    wf.setsampwidth(2)  # 16-bit audio
    wf.setframerate(16000)  # 16 kHz sample rate
    wf.writeframes(audio_bytes)

print(f"Audio saved to {output_filename}")


## MyProsody Speech Rate Detection

We need to clone the myprosody repository. I chose to clone it locally inside this repo and added it to the `.gitignore`.


In [None]:
import myprosody as mysp
import io
import sys

# Redirect the print output
def detect_speech_rate(wav: str, myprosody_dir: str = r"../myprosody/myprosody") -> float:
    """Measure the speech rate of a recording with myprosody.

    ``mysp.myspsr`` reports its result by printing, so stdout is captured
    while it runs and the number following ``rate_of_speech=`` is parsed out
    of the captured text.

    Args:
        wav: Audio file name; its extension is stripped before the name is
            handed to ``mysp.myspsr``.
        myprosody_dir: Second argument passed to ``mysp.myspsr``. Defaults to
            the directory that was previously hard-coded.

    Returns:
        The parsed rate as a float, or 8.0 when no rate is found in the
        captured output (the output is printed in that case).
    """
    import contextlib
    import os
    import re

    # splitext only drops the final extension, so names such as "my.clip.wav"
    # are no longer truncated at the first dot.
    recording_name = os.path.splitext(wav)[0]

    # redirect_stdout restores whatever stream was active before the call,
    # rather than forcing sys.__stdout__ (which differs from sys.stdout when
    # the environment has installed its own stream).
    captured_output = io.StringIO()
    with contextlib.redirect_stdout(captured_output):
        mysp.myspsr(recording_name, myprosody_dir)
    output = captured_output.getvalue()

    # TODO: Figure out what Azure TTS's default speech rate is with myprosody
    fallback_syl_sec = 8.0

    # NOTE(review): the exact text printed by myspsr is not visible here; the
    # pattern accepts a number optionally followed by other text on the line.
    match = re.search(r"rate_of_speech=\s*([-+]?\d+(?:\.\d+)?)", output)
    if match is None:
        print(output)
        return fallback_syl_sec
    return float(match.group(1))

## Language Model

### OpenAI GPT

In [None]:
def generate_response(prompt):
    """Ask the chat model for a reply to ``prompt`` and return its text.

    The request carries a fixed system message followed by ``prompt`` as the
    user message, and the content of the first returned choice is returned.

    NOTE(review): ``openai_client`` and ``deployment_id`` are not defined in
    the visible part of this notebook — confirm where they come from.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai_client.chat_completions.create(
        deployment_id=deployment_id,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

### Llama (test)

In [None]:
import subprocess


# Run the huggingface-cli login command with the token read from the environment.
# Fail early with a clear message when the token is missing (subprocess would
# otherwise raise a TypeError on the None argument), and use check=True so a
# failed login raises instead of being silently ignored.
if not llama_token:
    raise RuntimeError("LLAMA_TOKEN is not set; add it to your .env file before logging in.")
subprocess.run(["huggingface-cli", "login", "--token", llama_token], check=True)


In [None]:
# Download the "original/*" files of meta-llama/Meta-Llama-3-8B-Instruct into ./Meta-Llama-3-8B-Instruct.
# NOTE(review): the pipeline cell below loads "meta-llama/Llama-3.1-8B" by model id, not this
# directory — confirm which model is intended.
!huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --include "original/*" --local-dir Meta-Llama-3-8B-Instruct

In [None]:
import transformers
import torch

# Model identifier handed to the text-generation pipeline below
model_id = "meta-llama/Llama-3.1-8B"

# Build a text-generation pipeline for model_id with bfloat16 weights and
# automatic device placement.
# NOTE(review): `use_auth_token` is, as far as I know, deprecated in recent transformers
# releases in favour of `token` — confirm against the installed version.
pipeline = transformers.pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto", use_auth_token=True
)


In [None]:

# Smoke test: run the text-generation pipeline on a short prompt and display the result
pipeline("Hey how are you doing today?")

## Text-to-Speech

In [None]:
def speak_response(response, rate="20%", voice="fr-FR-VivienneMultilingualNeural"):
    """Convert text to speech using Azure Speech SDK.

    Wraps ``response`` in an SSML document with a prosody rate and synthesizes
    it with a synthesizer built from the module-level ``speech_config``.

    Args:
        response: Text to speak. It is XML-escaped before being placed in the
            SSML, so characters such as ``&`` and ``<`` no longer produce a
            malformed document.
        rate: Value of the SSML ``<prosody rate>`` attribute. Defaults to the
            previously hard-coded "20%".
        voice: Voice name used in the SSML and set on ``speech_config``.
            Defaults to the previously hard-coded voice.

    Returns:
        The synthesized audio bytes on success, otherwise None (cancellation
        details are printed).
    """
    from xml.sax.saxutils import escape, quoteattr

    # Configure the shared speech_config BEFORE creating the synthesizer so
    # the settings are in place when it is constructed.
    speech_config.speech_synthesis_language = "fr-FR"
    speech_config.speech_synthesis_voice_name = voice
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    # Define SSML with speaking rate. quoteattr() returns the value already
    # wrapped in quotes; escape() protects the text content.
    # NOTE(review): xml:lang is "en-US" while the voice is French — confirm intended.
    ssml_string = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" 
    xmlns:mstts="http://www.w3.org/2001/mstts" 
    	xml:lang="en-US">
    		<voice name={quoteattr(str(voice))}>
        <prosody rate={quoteattr(str(rate))}>{escape(str(response))}.</prosody>
    </voice>
    </speak>"""

    result = synthesizer.speak_ssml_async(ssml_string).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized successfully.")
        return result.audio_data

    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")
    return None

# Smoke test: synthesize a short French sentence with speak_response().
# (Plain-text alternative kept for reference: synthesizer.speak_text_async(response))

speak_response("Moi? Je vais bien, merci!")

In [None]:
def main():
    """Main voice agent loop.

    Repeatedly calls ``recognize_speech()``. The transcript is currently
    unused because the response-generation and speech-synthesis steps below
    are commented out. The loop has no exit condition of its own.
    """
    while True:
        user_input = recognize_speech()
        # Disabled pipeline (not executed): generate a reply and speak it.
        # if user_input:
        #     # Generate a response using Azure OpenAI Service
        #     response = generate_response(user_input)
        #     print(f"Agent: {response}")

        #     # Speak the response
        #     speak_response(response)
        # else:
        #     print("Could not understand input. Please try again.")

# Entry-point guard: start the loop when this code runs with __name__ == "__main__"
if __name__ == "__main__":
    main()

In [None]:
# import azure.cognitiveservices.speech as speechsdk

# def process_recognized_text(text):
#     """
#     This function receives recognized speech as text.
#     You can customize it to store, process, or send the output elsewhere.
#     """
#     print(f"Recognized Text: {text}")

# def recognize_and_pipe_text():
#     # Set up Azure Speech config
#     speech_key = "YourAzureSpeechKey"
#     region = "YourAzureRegion"
#     speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=region)

#     # Use microphone as input (can replace with a specific audio file if needed)
#     audio_input = speechsdk.audio.AudioConfig(use_default_microphone_input=True)

#     # Create the SpeechRecognizer object
#     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#     # Event triggered when a speech recognition result is received
#     def recognized_handler(event):
#         if event.result.reason == speechsdk.ResultReason.RecognizedSpeech:
#             recognized_text = event.result.text
#             # Send recognized text to another function
#             process_recognized_text(recognized_text)
#         elif event.result.reason == speechsdk.ResultReason.NoMatch:
#             print("No speech could be recognized.")

#     # Connect events to handlers
#     speech_recognizer.recognized.connect(recognized_handler)

#     # Start continuous recognition
#     print("Start speaking...")
#     speech_recognizer.start_continuous_recognition()

#     # Keep recognizing (use stop_continuous_recognition() to exit cleanly)
#     try:
#         while True:
#             pass  # Keeps the application running to listen for speech input
#     except KeyboardInterrupt:
#         print("Stopping recognition.")
#         speech_recognizer.stop_continuous_recognition()

# recognize_and_pipe_text()
