### In this notebook, I show an example of how to use the OpenAI Whisper and GPT-4o models with the AutoGen AssistantAgent and UserProxyAgent to transcribe and translate an audio file that contains a clip from a podcast.
#### The source language is English and the target language is Chinese. The transcribe_text_from_audio function performs the transcription, while the translate_text function is registered with the agents and invoked through function calling to perform the translation.

#### Step 1: Import required libraries and define variables.

In [0]:
from typing_extensions import List, Annotated
import autogen
import os
import whisper
from openai import AzureOpenAI
from autogen import AssistantAgent, UserProxyAgent, register_function
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from pydub import AudioSegment
from pydub.silence import split_on_silence

# --- Azure Key Vault access -------------------------------------------------
# The vault name is read from the KEY_VAULT_NAME environment variable so that
# no secret or endpoint is hard-coded in the notebook.
keyVaultName = os.environ["KEY_VAULT_NAME"]
KVUri = f"https://{keyVaultName}.vault.azure.net"

credential = DefaultAzureCredential()
client = SecretClient(vault_url=KVUri, credential=credential)

# --- Azure OpenAI settings (fetched once from Key Vault) ---------------------
azure_openai_endpoint = client.get_secret(name="aoai-endpoint").value
azure_openai_api_key = client.get_secret(name="aoai-api-key").value
azure_openai_deploymentname = client.get_secret(name="aoai-deploymentname").value

# --- Translation direction ---------------------------------------------------
source_language = "English"
target_language = "Chinese"

# --- Local media paths -------------------------------------------------------
# NOTE: these are absolute, machine-specific paths; adjust them for your
# environment before running the notebook.
video_file = "C:\\source\\github\\whisper-transcription\\peppa pig video clip.mp4"
audio_file = "C:\\source\\github\\whisper-transcription\\output_audio.wav"
podcast_filepath = "C:\\source\\github\\whisper-transcription\\PodcastSnippet.mp3"

#### Step 2: Define and configure the agents.

In [0]:
# LLM configuration handed to the AssistantAgent.
# The deployment name, API key and endpoint were already read from Key Vault in
# Step 1, so they are reused here instead of being fetched a second time.
llm_config = {
    "config_list": [
        {
            "model": azure_openai_deploymentname,
            "api_key": azure_openai_api_key,
            "base_url": azure_openai_endpoint,
            "api_type": "azure",
            "api_version": "2024-02-15-preview",
        }
    ],
    # Value passed through as the "timeout" setting of the LLM config.
    "timeout": 120,
}


# Assistant agent: backed by the LLM configuration above and instructed to rely
# only on the functions registered with it.
assistant = AssistantAgent(
    llm_config=llm_config,
    name="Assistant",
    system_message="For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done",
)

# User proxy agent: never asks a human for input, does not execute code, and
# stops the conversation once a message containing "terminate" arrives.
user_proxy = UserProxyAgent(
    name="User",
    human_input_mode="NEVER",
    code_execution_config=False,
    max_consecutive_auto_reply=10,
    # A message whose content is None is never treated as a termination signal.
    is_termination_msg=lambda msg: (
        msg.get("content", "") is not None
        and "terminate" in msg.get("content", "").lower()
    ),
)

In [0]:
def transcribe_text_from_audio(
    filepath: Annotated[str, "path of the audio file"]
) -> str:
    """
    Convert an MP3 file to WAV with pydub and transcribe it with the Whisper
    "small" model.

    Args:
        filepath: Path of the MP3 audio file to transcribe.

    Returns:
        The transcript text on success. If the file cannot be found or any
        other error occurs, the error is printed and an empty string is
        returned, so callers always receive a ``str``.

    Side effects:
        Writes the converted audio to ``audio.wav`` in the current working
        directory.
    """
    try:
        # Extract audio from the podcast file and re-encode it as WAV.
        audio = AudioSegment.from_mp3(file=filepath)
        out_file = "audio.wav"
        audio.export(out_file, format="wav")
        # Load the model and run the transcription on the converted file.
        model = whisper.load_model("small")
        result = model.transcribe(audio=out_file, verbose=False)
        return result["text"]
    except FileNotFoundError:
        print("The specified audio file could not be found.")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
    # Reached only after a handled failure: honour the declared return type
    # instead of implicitly returning None.
    return ""

In [0]:
# Functions can be registered either with the register_for_llm and register_for_execution decorators (used below) or with the register_function helper.

@user_proxy.register_for_execution()
@assistant.register_for_llm(
    description="using translate_text function to translate the script"
)
def translate_text(
    source_language: Annotated[str, "source language"],
    target_language: Annotated[str, "target language"],
) -> str:
    """
    Transcribe the podcast audio and translate the transcript via Azure OpenAI.

    The function is registered with the assistant (so the LLM can request it)
    and with the user proxy (which executes it). It transcribes the file at
    the module-level ``podcast_filepath`` and sends the transcript to the
    configured Azure OpenAI deployment for translation.

    Args:
        source_language: Language of the transcribed text.
        target_language: Language to translate the text into.

    Returns:
        The translated text. If the transcription yields no text, a short
        explanatory message is returned instead. If the model returns no
        choices or no content, an empty string is returned.
    """
    # Transcribe first; a failed transcription may yield None or an empty
    # string, so normalise before stripping to avoid an AttributeError.
    input_text = (transcribe_text_from_audio(filepath=podcast_filepath) or "").strip()
    if not input_text:
        return "Transcription failed or produced no text, so there is nothing to translate."

    client_aoai = AzureOpenAI(
        api_key=azure_openai_api_key,
        azure_endpoint=azure_openai_endpoint,
        api_version="2024-02-15-preview",
    )
    response = client_aoai.chat.completions.create(
        model=azure_openai_deploymentname,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {
                "role": "user",
                # The trailing space keeps the two adjacent f-strings from
                # running the language name into the word "audio".
                "content": f"Directly translate the following {source_language} text to a pure {target_language} "
                f"audio text without additional explanation.: '{input_text}'",
            },
        ],
        max_tokens=1500,
    )
    # Always return a string, even when the response has no choices or the
    # message content is None.
    if not response.choices:
        return ""
    return response.choices[0].message.content or ""

#### Start the chat

In [0]:
# Start the conversation between the user proxy and the assistant.
# The prompt names podcast_filepath because that is the file the registered
# translate_text function actually transcribes.
agent_result = user_proxy.initiate_chat(
    assistant,
    message=f"For the provided podcast file in {podcast_filepath}, recognize the speech and transfer it into a script file, "
    f"then translate from {source_language} text to a {target_language} text. ",
)

In [0]:
# Print the summary attribute of the result returned by initiate_chat.
print(agent_result.summary)