## Azure Speech Service Capabilities - Text-to-Speech (TTS), Speech-to-Text (STT) and Speech Synthesis with SSML

### Installing Utilities and Libraries

In [None]:
%pip install azure-cognitiveservices-speech

### Setting Up the Environment Variables

In [None]:
import os 
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()

# Endpoints and key for the Azure Speech resource; each is None when the variable is unset.
stt_endpoint = os.environ.get('STT_ENDPOINT')
tts_endpoint = os.environ.get('TTS_ENDPOINT')
speech_key = os.environ.get('SPEECH_KEY')

# Surface missing configuration here rather than as an opaque SDK error in a later cell.
# This only warns: a reader who needs just STT or just TTS may leave the other endpoint unset.
missing_vars = [
    var_name
    for var_name, value in (
        ('STT_ENDPOINT', stt_endpoint),
        ('TTS_ENDPOINT', tts_endpoint),
        ('SPEECH_KEY', speech_key),
    )
    if not value
]
if missing_vars:
    print("Warning: environment variable(s) not set: {}".format(", ".join(missing_vars)))

### Creating the Azure Speech-to-Text Client

In [None]:
import azure.cognitiveservices.speech as speechsdk

# Recognizer settings: authenticate with the Speech key against the STT endpoint
# and recognize US English.
speech_config_stt = speechsdk.SpeechConfig(
    subscription=speech_key,
    endpoint=stt_endpoint,
)
speech_config_stt.speech_recognition_language = "en-US"

# Take audio input from the default microphone.
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(
    audio_config=audio_config,
    speech_config=speech_config_stt,
)

### Speech-to-Text (STT) Implementation

In [None]:
print("Speak into your microphone.")
# Single-shot recognition: wait for the asynchronous call to finish and fetch its result.
speech_recognition_result = speech_recognizer.recognize_once_async().get()

# Report the outcome according to the reason attached to the result.
if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
    print(f"Recognized: {speech_recognition_result.text}")
elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
    print(f"No speech could be recognized: {speech_recognition_result.no_match_details}")
elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = speech_recognition_result.cancellation_details
    print(f"Speech Recognition canceled: {cancellation_details.reason}")
    # Only an Error cancellation gets the extra details and the configuration hint.
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print(f"Error details: {cancellation_details.error_details}")
        print("Did you set the speech resource key and endpoint values?")

### Creating the Azure Text-to-Speech Client

In [None]:
import azure.cognitiveservices.speech as speechsdk

# Synthesizer settings: same Speech key, but pointed at the TTS endpoint.
speech_config_tts = speechsdk.SpeechConfig(
    subscription=speech_key,
    endpoint=tts_endpoint,
)

# Send synthesized audio to the default speaker.
# Note: this rebinds `audio_config`, which the STT cell used for the microphone input.
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

### Setting the Voice for Text-to-Speech

Some other voices to try (assign one to `speech_synthesis_voice_name`):
- en-US-SteffanNeural
- zh-CN-XiaoxiaoNeural
- fr-FR-DeniseNeural

In [None]:
# Voice used for plain-text synthesis; replace it with any supported voice name
# (for example one of those listed above).
speech_config_tts.speech_synthesis_voice_name = "en-US-Ava:DragonHDLatestNeural"

# Build the synthesizer from the TTS configuration and the speaker output configuration.
speech_synthesizer = speechsdk.SpeechSynthesizer(
    audio_config=audio_config,
    speech_config=speech_config_tts,
)

### Text-to-Speech (TTS) Implementation

In [None]:
# Read a line of text from the user and speak it through the configured synthesizer.
print("Enter some text that you want to speak >")
text = input()

# Wait for the asynchronous synthesis request to finish and fetch its result.
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()

if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print(f"Speech synthesized for text [{text}]")
elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = speech_synthesis_result.cancellation_details
    print(f"Speech synthesis canceled: {cancellation_details.reason}")
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        # The details and the configuration hint are printed only when details are present.
        if cancellation_details.error_details:
            print(f"Error details: {cancellation_details.error_details}")
            print("Did you set the speech resource key and endpoint values?")

### SSML for Advanced TTS Features

### Two Speaker Dialogue Example

In [18]:
# Two <voice> elements in one SSML document: each line of the dialogue names its own voice.
ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
    <voice name="en-US-Ava:DragonHDLatestNeural">
        Good morning!
    </voice>
    <voice name="en-US-Andrew:DragonHDLatestNeural">
        Good morning to you too Ava!
    </voice>
</speak>
"""

# Synthesize speech from SSML
result = speech_synthesizer.speak_ssml_async(ssml).get()

# Report the outcome instead of finishing silently when the request is canceled.
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized from SSML.")
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        if cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Check the SSML markup, the voice name, and the speech resource key and endpoint values.")

### Multi-Speaker Conversation

In [19]:
# A single multi-talker voice; each <mstts:turn> inside <mstts:dialog> selects a speaker by name.
ssml = """
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='en-US-MultiTalker-Ava-Andrew:DragonHDLatestNeural'>
        <mstts:dialog>
            <mstts:turn speaker="ava">Hello, Andrew! How's your day going?</mstts:turn>
            <mstts:turn speaker="andrew">Hey Ava! It's been great, just exploring some AI advancements in communication.</mstts:turn>
            <mstts:turn speaker="ava">That sounds interesting! What kind of projects are you working on?</mstts:turn>
            <mstts:turn speaker="andrew">Well, we've been experimenting with text-to-speech applications, including turning emails into podcasts.</mstts:turn>
            <mstts:turn speaker="ava">Wow, that could really improve content accessibility! Are you looking for collaborators?</mstts:turn>
            <mstts:turn speaker="andrew">Absolutely! We're open to testing new ideas and seeing how AI can enhance communication.</mstts:turn>
        </mstts:dialog>
    </voice>
</speak>
"""

# Synthesize speech from SSML
result = speech_synthesizer.speak_ssml_async(ssml).get()

# Report the outcome instead of finishing silently when the request is canceled.
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized from SSML.")
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        if cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Check the SSML markup, the voice name, and the speech resource key and endpoint values.")

### Chinese Expressive Speech Example

In [20]:
# <mstts:express-as> requests the "sad" speaking style at style degree 2 for a zh-CN voice.
ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">
    <voice name="zh-CN-XiaomoNeural">
        <mstts:express-as style="sad" styledegree="2">
            快走吧，路上一定要注意安全，早去早回。
        </mstts:express-as>
    </voice>
</speak>
"""

# Synthesize speech from SSML
result = speech_synthesizer.speak_ssml_async(ssml).get()

# Report the outcome instead of finishing silently when the request is canceled.
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized from SSML.")
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        if cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Check the SSML markup, the voice name, and the speech resource key and endpoint values.")

### Role Playing with Different Styles

In [26]:
# Narration in the voice's default style, with two quoted lines wrapped in <mstts:express-as>
# to request a different role and style for each character.
# NOTE(review): confirm that en-US-JennyNeural supports the `role` attribute and the "gentle"
# style; if it does not, the quoted lines may be spoken without the requested effect.
ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-JennyNeural">
        The daughter saw her father walk in and asked:
        <mstts:express-as role="YoungAdultFemale" style="shouting">
            "You arrived quickly. How did you get here?"
        </mstts:express-as>
        The father put down his briefcase and said:
        <mstts:express-as role="OlderAdultMale" style="gentle">
            "I just took a taxi. The road was quite smooth."
        </mstts:express-as>
    </voice>
</speak>
"""

# Synthesize speech from SSML
result = speech_synthesizer.speak_ssml_async(ssml).get()

# Report the outcome instead of finishing silently when the request is canceled.
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized from SSML.")
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        if cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Check the SSML markup, the voice name, and the speech resource key and endpoint values.")