In [None]:
%pip install azure-storage-blob
%pip install python-dotenv

In [None]:
# 
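# This notebook requires the following environment variables (e.g. in a .env file):
# 

# SAS URI of the container that holds the input audio files:
# SEGMENT_AUDIO_INPUT_CONTAINER_SAS_URI=
# SAS URI of the container that holds the input text files:
# SEGMENT_TEXT_INPUT_CONTAINER_SAS_URI=
# Blob-name prefix used to select the text files to download (leave empty for all blobs):
# SEGMENT_TEXT_INPUT_PATH=
# SAS URI of the output container:
# SEGMENT_OUTPUT_CONTAINER_SAS_URI=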


In [None]:
import os
from azure.storage.blob import ContainerClient

from dotenv import load_dotenv
load_dotenv()

# Notebook settings, looked up in the process environment after load_dotenv()
# has run. Each value is None when its variable is not set.

# SAS URI of the container holding the input audio.
AUDIO_INPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_AUDIO_INPUT_CONTAINER_SAS_URI")
# SAS URI of the container holding the input text files.
TEXT_INPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_TEXT_INPUT_CONTAINER_SAS_URI")
# Blob-name prefix used to select which text blobs are downloaded.
TEXT_INPUT_PATH = os.environ.get("SEGMENT_TEXT_INPUT_PATH")
# SAS URI of the output container.
OUTPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_OUTPUT_CONTAINER_SAS_URI")

def get_container_client_from_sas(sas_uri):
    """Get a container client directly from a container SAS URI.

    Args:
        sas_uri: Container SAS URI as a string. In this notebook the value
            comes from an environment variable and is ``None`` when that
            variable is unset.

    Returns:
        The client returned by ``ContainerClient.from_container_url(sas_uri)``.

    Raises:
        ValueError: If ``sas_uri`` is not a string or is empty/blank.
    """
    # Fail fast with an actionable message: a missing environment variable
    # arrives here as None and would otherwise surface as an error that does
    # not mention configuration at all.
    if not isinstance(sas_uri, str) or not sas_uri.strip():
        raise ValueError(
            "Container SAS URI is missing or empty; check that the corresponding "
            "SEGMENT_*_CONTAINER_SAS_URI variable is set in the environment or .env file."
        )
    return ContainerClient.from_container_url(sas_uri)

def download_blobs(container_client, prefix=None, local_folder="downloads"):
    """Download all blobs from a container with an optional prefix.

    Files are stored flat: each blob is written into ``local_folder`` under
    the last path component of its blob name, so virtual directories in the
    container are not recreated locally.

    Args:
        container_client: Object providing ``list_blobs(name_starts_with=...)``
            and ``download_blob(name)``.
        prefix: Only blobs whose name starts with this string are listed.
            ``None`` or ``""`` lists every blob.
        local_folder: Destination directory; created if it does not exist.

    Returns:
        A list of ``{"blob_name": ..., "local_path": ...}`` dicts, one per
        downloaded blob, in the order the blobs were listed.
    """
    os.makedirs(local_folder, exist_ok=True)

    downloaded_files = []
    blob_list = list(container_client.list_blobs(name_starts_with=prefix or ""))

    print(f"Found {len(blob_list)} blobs to download")

    seen_local_paths = set()
    for blob in blob_list:
        file_name = os.path.basename(blob.name)
        if not file_name:
            # A name ending in "/" has no file component; joining it would
            # yield the folder itself and open() would fail on a directory.
            print(f"Skipping {blob.name}: not a file")
            continue

        local_file_path = os.path.join(local_folder, file_name)
        if local_file_path in seen_local_paths:
            # Flattening can map blobs from different virtual directories to
            # the same local file; make the overwrite visible.
            print(f"Warning: {blob.name} overwrites an earlier download at {local_file_path}")
        seen_local_paths.add(local_file_path)

        print(f"Downloading {blob.name} to {local_file_path}")

        # Start the download before creating the file so a failed request
        # does not leave an empty file behind.
        blob_data = container_client.download_blob(blob.name)
        with open(local_file_path, "wb") as file:
            # Write straight into the file instead of first materialising
            # the whole blob in memory with readall().
            blob_data.readinto(file)

        downloaded_files.append({
            "blob_name": blob.name,
            "local_path": local_file_path
        })

    return downloaded_files

# Build one container client per SAS URI read from the environment.
audio_input_container_client = get_container_client_from_sas(AUDIO_INPUT_CONTAINER_SAS_URI)
text_input_container_client = get_container_client_from_sas(TEXT_INPUT_CONTAINER_SAS_URI)
# Not used in this cell; presumably consumed by later cells (not visible here).
output_container_client = get_container_client_from_sas(OUTPUT_CONTAINER_SAS_URI)

# Local working directories, relative to the notebook's current directory.
audio_folder = "audio_downloads"
translation_folder = "text_downloads"
output_folder = "segmented_audio"

# Create all three up front; exist_ok=True makes re-running the cell safe.
os.makedirs(audio_folder, exist_ok=True)
os.makedirs(translation_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# Download every blob in the audio container (no prefix filter) into audio_folder.
audio_files = download_blobs(audio_input_container_client, prefix=None, local_folder=audio_folder)
print(f"Downloaded {len(audio_files)} audio files")

# Download the text blobs whose names start with TEXT_INPUT_PATH into translation_folder.
translation_files = download_blobs(text_input_container_client, prefix=TEXT_INPUT_PATH, local_folder=translation_folder)
print(f"Downloaded {len(translation_files)} text files")