In [None]:
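#
# This notebook requires the following environment variables
# (they can be defined in a .env file, which is loaded with python-dotenv):
#

# Local directory under which a per-package working folder is created.
# PACKAGE_LOCAL_WORKING_DIR=
# SAS URI of the blob container the audio files are downloaded from.
# PACKAGE_INPUT_CONTAINER_SAS_URI=
# SAS URI of the blob container used for output.
# PACKAGE_OUTPUT_CONTAINER_SAS_URI=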


In [None]:
import os
import uuid
from azure.storage.blob import ContainerClient

from dotenv import load_dotenv
load_dotenv()

# Every execution of this cell produces a new random (UUID4) package name.
PACKAGE_NAME = f"{uuid.uuid4()}"

# Settings are read from the process environment; each is None when unset.
LOCAL_WORKING_DIR = os.environ.get("PACKAGE_LOCAL_WORKING_DIR")
INPUT_CONTAINER_SAS_URI = os.environ.get("PACKAGE_INPUT_CONTAINER_SAS_URI")
OUTPUT_CONTAINER_SAS_URI = os.environ.get("PACKAGE_OUTPUT_CONTAINER_SAS_URI")

def get_container_client_from_sas(sas_uri):
    """Get a container client directly from a container SAS URI.

    Args:
        sas_uri: Full URL of the container, including its SAS token.

    Returns:
        The client produced by ``ContainerClient.from_container_url``.

    Raises:
        ValueError: If ``sas_uri`` is not a string or is empty/blank, which is
            what happens when the corresponding environment variable is unset.
    """
    # Validate up front so a missing setting is reported as a configuration
    # problem. The URI itself is never echoed because it embeds a secret token.
    if not isinstance(sas_uri, str) or not sas_uri.strip():
        raise ValueError(
            "Container SAS URI is missing or empty; "
            "check the corresponding environment variable."
        )
    return ContainerClient.from_container_url(sas_uri.strip())

def _parse_duration(metadata):
    """Return the 'duration' metadata value as a float, or None if unusable."""
    try:
        duration = float(metadata.get('duration', 0))
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself. It compares False
    # against any threshold, so it would otherwise slip past the duration filter.
    if duration != duration:
        return None
    return duration


def _unique_local_path(local_folder, blob_name, taken_paths):
    """Pick a local path for blob_name that no earlier blob in this run uses."""
    candidate = os.path.join(local_folder, os.path.basename(blob_name))
    if os.path.normcase(candidate) not in taken_paths:
        return candidate

    # Same basename in another virtual folder: fall back to the flattened
    # blob path, then to a numeric suffix if even that is already in use.
    flattened = blob_name.replace("\\", "/").strip("/").replace("/", "_")
    stem, extension = os.path.splitext(flattened)
    candidate = os.path.join(local_folder, flattened)
    counter = 1
    while os.path.normcase(candidate) in taken_paths:
        candidate = os.path.join(local_folder, f"{stem}_{counter}{extension}")
        counter += 1
    return candidate


def download_filtered_audio_blobs(container_client, file_extensions=('.mp3', '.wav'), max_duration=15, local_folder="downloads"):
    """Download the audio blobs of a container that pass all filters.

    A blob is downloaded only when its name ends with one of
    ``file_extensions`` (case-insensitive), its ``duration`` metadata is a
    number strictly below ``max_duration`` (a missing value counts as 0), and
    its ``text`` metadata is non-empty. Every skipped blob is reported on stdout.

    Args:
        container_client: Object exposing ``list_blobs()`` and
            ``get_blob_client(name)``.
        file_extensions: A single suffix or any iterable of suffixes.
        max_duration: Exclusive upper bound for the ``duration`` metadata.
        local_folder: Target directory; created if it does not exist.

    Returns:
        A list of dicts with the keys ``blob_name``, ``local_path``,
        ``duration`` and ``text``, one per downloaded blob.
    """
    os.makedirs(local_folder, exist_ok=True)

    # str.endswith accepts only a str or a tuple, and blob names are lowercased
    # before matching, so normalise the suffixes to a lowercase tuple.
    if isinstance(file_extensions, str):
        file_extensions = (file_extensions,)
    allowed_suffixes = tuple(extension.lower() for extension in file_extensions)

    downloaded_files = []
    taken_paths = set()
    blob_list = list(container_client.list_blobs())

    print(f"Found {len(blob_list)} total blobs in container")

    for blob in blob_list:
        # Filter by file extension
        if not blob.name.lower().endswith(allowed_suffixes):
            print(f"Skipping {blob.name} - unsupported file extension")
            continue

        blob_client = container_client.get_blob_client(blob.name)
        props = blob_client.get_blob_properties()
        # Guard against a missing metadata mapping.
        metadata = props.metadata or {}

        # Filter by duration in metadata
        duration = _parse_duration(metadata)
        if duration is None:
            print(f"Skipping {blob.name} - invalid duration metadata")
            continue

        if duration >= max_duration:
            print(f"Skipping {blob.name}: duration {duration}s >= {max_duration}s")
            continue

        # Filter by presence of text metadata
        text = metadata.get('text', '')

        if not text:
            print(f"Skipping {blob.name} - no text metadata found")
            continue

        local_file_path = _unique_local_path(local_folder, blob.name, taken_paths)
        print(f"Downloading {blob.name} to {local_file_path}")

        # Fetch the content before opening the target file so that a failed
        # download does not leave an empty file behind.
        content = blob_client.download_blob().readall()
        with open(local_file_path, "wb") as file:
            file.write(content)
        taken_paths.add(os.path.normcase(local_file_path))

        downloaded_files.append({
            "blob_name": blob.name,
            "local_path": local_file_path,
            "duration": duration,
            "text": text
        })

    print(f"Downloaded {len(downloaded_files)} files that match the criteria")
    return downloaded_files

#
# Main
#

print("Starting packaging process for package: ", PACKAGE_NAME)

# Fail fast, naming the missing variables, instead of letting an unset value
# surface later as an opaque TypeError/ValueError. Only the variable names are
# reported, never their values, because the SAS URIs embed secret tokens.
_required_settings = {
    "PACKAGE_LOCAL_WORKING_DIR": LOCAL_WORKING_DIR,
    "PACKAGE_INPUT_CONTAINER_SAS_URI": INPUT_CONTAINER_SAS_URI,
    "PACKAGE_OUTPUT_CONTAINER_SAS_URI": OUTPUT_CONTAINER_SAS_URI,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

input_container_client = get_container_client_from_sas(INPUT_CONTAINER_SAS_URI)
output_container_client = get_container_client_from_sas(OUTPUT_CONTAINER_SAS_URI)

# Each run works in its own sub-folder, named after the package, below the
# configured working directory. os.makedirs also creates the parent directory.
package_working_dir = os.path.join(LOCAL_WORKING_DIR, PACKAGE_NAME)
os.makedirs(package_working_dir, exist_ok=True)

audio_files = download_filtered_audio_blobs(
    input_container_client,
    file_extensions=('.mp3', '.wav'),
    max_duration=15,
    local_folder=package_working_dir
)




Starting packaging process for package:  d62074f0-ee09-4809-8fed-52b5f13c2aa5
Found 914 total blobs in container
Downloading FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg000.mp3 to ../../temp\d62074f0-ee09-4809-8fed-52b5f13c2aa5\FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg000.mp3
Downloading FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg001.mp3 to ../../temp\d62074f0-ee09-4809-8fed-52b5f13c2aa5\FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg001.mp3
Downloading FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg002.mp3 to ../../temp\d62074f0-ee09-4809-8fed-52b5f13c2aa5\FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg002.mp3
Downloading FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean_seg003.mp3 to ../../temp\d62074f0-ee09-4809-8fed-52b5f13c2aa5\FY25 Kickoff Event SE&O - Meeting Recording audio only

KeyboardInterrupt: 