In [1]:
import os
import threading

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from openai import AzureOpenAI

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()
# Azure OpenAI connection settings; os.getenv returns None for any variable that is not set.
AZURE_OPENAI_ENDPOINT=os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY=os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_API_VERSION=os.getenv("AZURE_OPENAI_API_VERSION")
# Name of the chat deployment passed as `model=` by the helper functions below.
AZURE_OPENAI_GPT4o_DEPLOYMENT=os.getenv("AZURE_OPENAI_GPT4o_DEPLOYMENT")

# Initialize the Azure OpenAI client shared by the helper functions in this notebook.
client = AzureOpenAI(
  azure_endpoint = AZURE_OPENAI_ENDPOINT, 
  api_key=AZURE_OPENAI_KEY,  
  api_version=AZURE_OPENAI_API_VERSION
)

# NOTE: these messages only echo the configuration; no request has been sent to the service yet.
print(f"Model: {AZURE_OPENAI_GPT4o_DEPLOYMENT}; API Version:{AZURE_OPENAI_API_VERSION}")
print("Azure OpenAI model is ready to use!")

Model: gpt-4o; API Version:2024-10-21
Azure OpenAI model is ready to use!


In [2]:
# Azure Speech credentials, read from the environment (None when a variable is unset).
SPEECH_KEY = os.getenv("SPEECH_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION")
engine_name = "test"

# One SpeechConfig is shared by every recognizer and by the synthesizer in this notebook.
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
# Set up Azure Text-to-Speech language
speech_config.speech_synthesis_language = "en-US"
# Set up Azure Speech-to-Text language recognition
speech_config.speech_recognition_language = "en-US"
# Use an absolute path for the log file
log_file_path = os.path.abspath("./log/log.txt")
# Create the log directory up front so the SDK is never handed a log path whose
# parent directory is missing.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, log_file_path)


# Set up the voice configuration
# NOTE(review): per the Azure docs an explicit voice name takes precedence over
# speech_synthesis_language - confirm this is the intended voice for translated text.
speech_config.speech_synthesis_voice_name = "en-US-JennyMultilingualNeural"
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# Report readiness only after the configuration and synthesizer were created successfully.
print(f"Speech Service in {SPEECH_REGION} is ready to use!")

Speech Service in australiaeast is ready to use!


In [3]:
# Define the speech-to-text function using continuous recognition
def speech_to_text():
    """Transcribe speech from the default microphone using continuous recognition.

    Recognition keeps running until the recognizer raises a ``session_stopped``
    or ``canceled`` event. Every finalized utterance is printed as it arrives
    and collected for the return value.

    Returns:
        str: All recognized utterances joined by single spaces; an empty
        string when nothing was recognized.
    """
    # Set up the audio configuration
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    # Create a speech recognizer bound to the shared speech configuration
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Say something...")

    # Signalled by stop_cb; the caller blocks on it instead of spinning on a flag.
    done = threading.Event()
    all_results = []

    def stop_cb(evt):
        """Mark recognition as finished (session stopped or canceled)."""
        print('CLOSING on {}'.format(evt))
        done.set()

    def recognized_cb(evt):
        """Collect a finalized utterance, or report that nothing matched."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(evt.result.text))
            all_results.append(evt.result.text)
        elif evt.result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(evt.result.no_match_details))

    speech_recognizer.recognized.connect(recognized_cb)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    try:
        # Sleep-wait rather than busy-loop; the short timeout keeps the wait
        # responsive to a kernel interrupt.
        while not done.wait(timeout=0.5):
            pass
    finally:
        # Always release the recognizer, even when the wait is interrupted.
        speech_recognizer.stop_continuous_recognition()
    return " ".join(all_results)

In [4]:
# Define the text-to-speech function
def text_to_speech(text):
    """Speak ``text`` aloud through the module-level ``speech_synthesizer``.

    Args:
        text: The text to synthesize.

    Returns:
        bool: True when synthesis completed; False when the service reported
        any other outcome or an exception was raised. Failures are printed
        rather than propagated.
    """
    try:
        result = speech_synthesizer.speak_text_async(text).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Text-to-speech conversion successful.")
            return True

        print(f"Error synthesizing audio: {result}")
        # The result object's repr does not explain *why* synthesis failed;
        # surface the cancellation reason and error text when available.
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            print(f"Speech synthesis canceled: {details.reason}")
            if details.reason == speechsdk.CancellationReason.Error and details.error_details:
                print(f"Error details: {details.error_details}")
        return False
    except Exception as ex:
        # Deliberate best-effort: report the problem and let the notebook continue.
        print(f"Error synthesizing audio: {ex}")
        return False

In [5]:
def evaluate_sentiment(text):
    """Ask the Azure OpenAI chat deployment to classify the sentiment of ``text``.

    A fixed system prompt instructs the model to answer with the sentiment
    category only; the request is sent with ``temperature=0``.

    Args:
        text: The text to classify; sent as the user message.

    Returns:
        The message content of the first choice in the completion response.
    """
    instructions = """
    You are an AI assistant that helps recognize the sentiment in a given text.
    1. Evaluate the given text and provide the category of the sentiment as either positive, negative, or neutral.
    2. Do not provide any additional examples to the output, just the category.
    """

    chat_messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4o_DEPLOYMENT,
        messages=chat_messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
def speech_from_file_short(filename):
    """Recognize a single utterance from an audio file.

    Uses ``recognize_once()``, which per the original notes returns after one
    utterance (ended by silence, or after roughly 30 seconds of audio), so it
    suits short command/query style input. For long, multi-utterance audio
    use continuous recognition instead.

    Args:
        filename: Path of the audio file used as recognizer input.

    Returns:
        str: The recognized text, or "" when nothing matched or recognition
        was canceled (the reason is printed in both cases).
    """
    file_input = speechsdk.audio.AudioConfig(filename=filename)
    # The recognition language is set on the shared config before the recognizer is built.
    speech_config.speech_recognition_language = "en-US"
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=file_input)

    outcome = recognizer.recognize_once()
    reason = outcome.reason

    # Success: hand the text straight back.
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        return outcome.text

    if reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
        return ""

    if reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))

    return ""


In [7]:
# For long-running multi-utterance recognition, use start_continuous_recognition() instead.
def speech_from_file_full(filename):
    """Transcribe an entire audio file using continuous recognition.

    Recognition runs until the recognizer raises a ``session_stopped`` or
    ``canceled`` event. Every finalized utterance is printed as it arrives
    and collected for the return value.

    Args:
        filename: Path of the audio file used as recognizer input.

    Returns:
        str: All recognized utterances joined by single spaces; an empty
        string when nothing was recognized.
    """
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    speech_config.speech_recognition_language = "en-US"

    # Creates a speech recognizer using a file as audio input
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # List to store the recognized text
    all_results = []
    # Completion signal. It must exist BEFORE recognition starts: a stop/cancel
    # event that fires immediately would otherwise be lost and the wait below
    # would never end.
    done = threading.Event()

    def recognized_cb(evt):
        """Collect a finalized utterance, or report that nothing matched."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(evt.result.text))
            all_results.append(evt.result.text)
        elif evt.result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(evt.result.no_match_details))

    def stop_cb(evt):
        """Mark recognition as finished (session stopped or canceled)."""
        print('CLOSING on {}'.format(evt))
        done.set()

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(recognized_cb)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous recognition
    speech_recognizer.start_continuous_recognition()
    try:
        # Sleep-wait rather than busy-loop; the short timeout keeps the wait
        # responsive to a kernel interrupt.
        while not done.wait(timeout=0.5):
            pass
    finally:
        # Always release the recognizer, even when the wait is interrupted.
        speech_recognizer.stop_continuous_recognition()

    # Return the concatenated results
    return " ".join(all_results)

In [8]:
def extract_entities(text):
    """Extract entities from ``text`` as JSON via the Azure OpenAI chat deployment.

    The system prompt asks for JSON-only output and shows an example with the
    keys ``caller_name`` and ``call_purpose``; the request sets
    ``response_format`` to ``json_object``.

    Args:
        text: The text to analyse; sent as the user message.

    Returns:
        The message content of the first choice in the completion response
        (not parsed by this function).
    """
    extraction_prompt = """
             Extract the entities from the text and provide only JSON output.
             
             {caller_name: "John", call_purpose: "meeting"}

             """
    conversation = [
        {"role": "system", "content": extraction_prompt},
        {"role": "user", "content": text},
    ]

    completion = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4o_DEPLOYMENT,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [9]:
def translate(text, target_language):
    """Translate ``text`` into ``target_language`` via the Azure OpenAI chat deployment.

    The target language is spliced into the system prompt, which also tells
    the model to drop filler words and smooth out unnatural sentence breaks.
    The request is sent with ``temperature=0``.

    Args:
        text: The text to translate; sent as the user message.
        target_language: Language name concatenated into the system prompt
            (must be a ``str``).

    Returns:
        The message content of the first choice in the completion response.
    """
    translator_prompt = """You are a helpful assistant that translates text into """ + target_language + """.
    Answer in a clear and concise manner only translating the text. 
    Ignore filler words like 'ng', 'uh', etc. and any unnatural sentence breaks. 
    Ensure the translation flows smoothly and natually while preserving the intended meaning.
    Text:
    """

    conversation = [
        {"role": "system", "content": translator_prompt},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4o_DEPLOYMENT,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
# Full transcription: run continuous recognition over the whole recording.
# Each recognized segment is printed by the callback as it arrives.
source = "./data/Call1_separated_16k_health_insurance.wav"
transcription_full = speech_from_file_full(source)
print(f"Transcription: {transcription_full}")

Recognized: Hello, thank you for calling Contoso, who am I speaking with today? Hi, my name is Mary Rondo. I'm trying to enroll myself with Contuso. Hi Mary. Uh, are you calling because you need health insurance? Yes, Yeah, I'm calling to sign up for insurance. Great. Uh, if you can answer a few questions, uh, we can get you signed up in a jiffy. OK, So, uh, what's your full name?
Recognized: Uh, so Mary Beth Rondo, last name is R like Romeo, O like ocean, N like Nancy, DD like Dog, and O like Ocean again.
Recognized: Rondo got it. And what's the best callback number in case we get disconnected?
Recognized: I only have a cell phone, so I can give you that. Yeah, that would be fine. Sure. So it's 234554 and then 9312. Got it. So to confirm, it's 234-554-9312.
Recognized: Yep, that's right. Excellent. Let's get some additional information from your from your application. Do you have a job?
Recognized: Yes, I am self-employed. OK, so then you have a Social Security number as well? Yes, I 

In [11]:
# Translate the full transcription to Chinese using OpenAI
translated_text_full = translate(transcription_full, "Chinese")
print(f"Translated to Chinese: {translated_text_full}")

Translated to Chinese: 您好，感谢您致电Contoso，请问今天我在和谁通话？您好，我叫Mary Rondo。我想注册Contuso。你好，Mary。请问您是因为需要健康保险而打电话吗？是的，我打电话是为了注册保险。太好了。如果您能回答几个问题，我们可以很快为您注册。好的，那么，您的全名是什么？是Mary Beth Rondo，姓氏是R像Romeo，O像ocean，N像Nancy，DD像Dog，O像Ocean。Rondo，明白了。以防我们断线，您最好的回拨号码是多少？我只有手机，所以我可以给您这个号码。好的，那就行。好的，是234554然后是9312。明白了。确认一下，是234-554-9312。是的，没错。太好了。让我们从您的申请中获取一些额外信息。您有工作吗？是的，我是自雇人士。好的，那么您也有社会安全号码吗？是的，我有。好的，请问您的社会安全号码是多少？当然，是412256789。抱歉，您刚才说的是A25还是A225？您刚才断了一下。是22，所以是412，然后再一个2，然后是5。好的，非常感谢。请问您的电子邮件地址是什么？是的，itsmaryrondo@gmail.com。就是我的名字和姓氏@gmail.com。没有句号，没有破折号。好的。这是最后一个问题。让我记录您的信息，我会立即为您注册。感谢您致电Contoso，我会立即为您注册。我们的代理将在大约24小时内给您回电以确认您的申请。听起来不错，谢谢。绝对的。如果您还有其他需要，请拨打1-800-555-5564分机123联系我们。非常感谢您致电Contessa。实际上，我还有一个问题。好的，当然。我想知道，我会收到一张实体卡作为保险证明吗？默认情况下是数字会员卡，但如果您愿意，我们可以寄给您一张实体卡。好的。请在准备好后邮寄给我，我想寄到我的地址。好的，是2660号A单元，Maple Ave. SE，Lansing，邮政编码是48823。好的，我已经在您的档案中做了记录。太好了，非常感谢。非常欢迎。感谢您致电Contoso，祝您有美好的一天。


In [None]:
# Convert the translated full transcription to speech using text-to-speech.
# In Azure Speech, Speech Synthesis Markup Language (SSML) can be used to specify different voices.
text_to_speech(translated_text_full)

In [12]:
# speech_from_file_short uses the recognize_once() method, which is designed for short,
# single utterances (roughly 30 seconds of audio at most); it is used here for demo purposes,
# so only the opening of the call is transcribed.
source = "./data/Call1_separated_16k_health_insurance.wav"
transcription_short = speech_from_file_short(source)
print(f"Transcription: {transcription_short}")

Transcription: Hello, thank you for calling Contoso, who am I speaking with today? Hi, my name is Mary Rondo. I'm trying to enroll myself with Contuso. Hi Mary. Uh, are you calling because you need health insurance? Yes, Yeah, I'm calling to sign up for insurance. Great. Uh, if you can answer a few questions, uh, we can get you signed up in a jiffy. OK. Umm, So, uh, what's your full name?


In [13]:
# Evaluate the sentiment of the short transcription using OpenAI
response = evaluate_sentiment(transcription_short)
print(f"Sentiment: {response}")

Sentiment: Neutral


In [14]:
# Extract entities from the short transcription using OpenAI; the result is a JSON string.
entities = extract_entities(transcription_short)
print(f"Entities: {entities}")

Entities: {
  "caller_name": "Mary Rondo",
  "call_purpose": "sign up for insurance"
}


In [19]:
# Translate the short transcription to Chinese using OpenAI
translated_text_short = translate(transcription_short, "Chinese")
print(f"Translated to Chinese: {translated_text_short}")

Translated to Chinese: 您好，感谢您致电Contoso，请问我今天在和谁通话？您好，我叫玛丽·隆多。我想注册Contoso。您好，玛丽。请问您是因为需要健康保险而打电话吗？是的，我打电话是为了注册保险。好的，如果您能回答几个问题，我们可以很快为您注册。好的，那么，您的全名是什么？


In [None]:
# Convert the translated short transcription to speech using text-to-speech
text_to_speech(translated_text_short)