In [None]:
#
# Summary:
#

#
# This Python script uses Microsoft's Speech-to-Text API to convert audio files stored in Azure cloud storage into text transcripts.
# Here's what it does:
#
# 1. Loads configuration details, such as API keys and container locations.
# 2. Submits a transcription job to Microsoft's speech service, pointing to the cloud location of the audio files.
# 3. Waits for the transcription to complete.
# 4. Stores the resulting text files in a specified Azure storage container.
#
# It applies dictated and automatic punctuation, masks profanity, and includes word-level timestamps in the transcripts.
#

# 
# This notebook requires the following environment variables (for example, in a .env file):
# 

# SPEECH_API_VERSION=2024-11-15
# SPEECH_API_KEY=TODO
# SPEECH_API_REGION=eastasia
# SPEECH_API_LOCALE=en-US
# SPEECH_TO_TEXT_INPUT_CONTAINER_SAS_URI=TODO
# SPEECH_TO_TEXT_OUTPUT_CONTAINER_SAS_URI=TODO


In [None]:
import os
import swagger_client
import time
from dotenv import load_dotenv

# Display name attached to every transcription job this notebook submits.
NAME = "Simple transcription"
# The job description deliberately reuses the same text as the display name.
DESCRIPTION = NAME

def convert_audio_to_text(
    config: dict[str, str],
    *,
    poll_interval_seconds: float = 5,
    timeout_seconds: "float | None" = None,
):
    """Submit a batch transcription job and block until it finishes.

    The job transcribes the audio found in the input container and asks the
    service to write its results to the output container.

    Args:
        config: Settings with the keys ``api_key``, ``api_region``,
            ``api_version``, ``api_locale``, ``input_container_sas`` and
            ``output_container_sas``.
        poll_interval_seconds: Seconds to wait before each status check.
        timeout_seconds: Maximum seconds to wait for the job to reach a
            terminal state. ``None`` (the default) waits indefinitely.

    Raises:
        RuntimeError: The service reported the transcription as ``Failed``.
        TimeoutError: ``timeout_seconds`` elapsed before the job finished.
    """
    speech_config = swagger_client.Configuration()
    speech_config.api_key["Ocp-Apim-Subscription-Key"] = config["api_key"]
    speech_config.host = (
        f"https://{config['api_region']}.api.cognitive.microsoft.com/speechtotext"
    )

    client = swagger_client.ApiClient(speech_config)
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # NOTE(review): time_to_live_hours presumably controls how long the
    # service keeps the job before cleaning it up; confirm against the API docs.
    client_props = swagger_client.TranscriptionProperties(time_to_live_hours=6)
    client_props.word_level_timestamps_enabled = True
    client_props.display_from_word_level_timestamps_enabled = True
    client_props.punctuation_mode = "DictatedAndAutomatic"
    client_props.profanity_filter_mode = "Masked"
    client_props.destination_container_url = config["output_container_sas"]

    transcription_definition = transcribe_from_container(
        config["input_container_sas"],
        client_props,
        config["api_locale"],
    )

    # Only the response headers are needed: they carry the new job's URL.
    _, _, headers = api.transcriptions_submit_with_http_info(
        transcription=transcription_definition,
        api_version=config["api_version"],
    )

    # The id is the last path segment of the Location URL, without any
    # query string.
    transcription_id = headers["location"].split("/")[-1].split("?")[0]

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = (
        None if timeout_seconds is None else time.monotonic() + timeout_seconds
    )

    while True:
        # Wait first: the job was submitted a moment ago.
        time.sleep(poll_interval_seconds)

        transcription = api.transcriptions_get(
            transcription_id, api_version=config["api_version"]
        )

        if transcription.status == "Succeeded":
            return

        if transcription.status == "Failed":
            # The error details may be absent; never let a missing attribute
            # mask the actual failure.
            properties = getattr(transcription, "properties", None)
            error = getattr(properties, "error", None)
            message = (
                getattr(error, "message", None)
                or "no error details were returned by the service"
            )
            raise RuntimeError(f"Transcription failed: {message}")

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Transcription {transcription_id} did not finish within "
                f"{timeout_seconds} seconds (last status: {transcription.status})"
            )

def get_configuration() -> dict[str, str]:
    """Collect the speech-service settings from environment variables.

    Returns:
        A dict keyed by setting name. A value is ``None`` when the
        corresponding environment variable is not set.
    """
    # Setting name -> environment variable that supplies it.
    env_names = {
        "api_version": "SPEECH_API_VERSION",
        "api_key": "SPEECH_API_KEY",
        "api_region": "SPEECH_API_REGION",
        "api_locale": "SPEECH_API_LOCALE",
        "input_container_sas": "SPEECH_TO_TEXT_INPUT_CONTAINER_SAS_URI",
        "output_container_sas": "SPEECH_TO_TEXT_OUTPUT_CONTAINER_SAS_URI",
    }
    return {setting: os.getenv(env_name) for setting, env_name in env_names.items()}

def transcribe_from_container(
    uri: str,
    properties: "swagger_client.TranscriptionProperties",
    locale: str,
) -> "swagger_client.Transcription":
    """Build the transcription request for all audio in a storage container.

    Args:
        uri: URL of the container holding the audio, passed to the request
            as ``content_container_url``.
        properties: Transcription options to attach to the request.
        locale: Locale of the audio, passed through to the request.

    Returns:
        A ``swagger_client.Transcription`` definition, named and described
        by the module-level ``NAME`` and ``DESCRIPTION`` constants. Nothing
        is submitted to the service here.
    """
    return swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=locale,
        content_container_url=uri,
        properties=properties,
    )

def main():
    """Run the notebook workflow: load settings, then transcribe the audio.

    Progress is reported on standard output before each step.
    """
    print("Notebook Cell Running...")

    # Environment variables must be loaded before the settings are read.
    print("Loading configuration...")
    load_dotenv()
    settings = get_configuration()

    # Blocks until the transcription job has finished.
    print("Converting audio to text...")
    convert_audio_to_text(settings)

    print("Done!")

# Run the whole workflow as soon as this notebook cell is executed.
main()