In [None]:
#
# Summary:
#

#
# This Python script creates a ZIP package of short audio clips (less than 15 seconds) from an Azure Blob Storage container and generates a corresponding manifest file.
# Here's what it does:
#
# 1. Loads environment configuration (such as working directories and container SAS URIs).
# 2. Downloads eligible audio files (duration < 15s and with metadata text) from Azure.
# 3. Creates a ZIP archive of the downloaded files.
# 4. Generates a manifest .txt file listing each audio filename and its transcription.
# 5. Uploads the ZIP file and manifest back to an output Azure container.
# 6. Cleans up temporary local directories used for processing.
#
# The output is a compact, labeled package of short audio samples and their text—ideal for datasets, training, or delivery.
#

# 
# This Notebook requires the following environment variables (.env file):
# 

# PACKAGE_LOCAL_WORKING_DIR=
# PACKAGE_INPUT_CONTAINER_SAS_URI=
# PACKAGE_OUTPUT_CONTAINER_SAS_URI=


In [None]:
import os
import shutil
import time
import uuid
import zipfile
from azure.storage.blob import BlobBlock, BlobClient, ContainerClient
from dotenv import load_dotenv

MAX_RETRIES = 3
BASE_DELAY = 1

def create_manifest(blobs: list[dict], dest: str):
    """Write a tab-separated manifest describing *blobs* to *dest*.

    One line is written per entry, in order: the entry's ``"name"`` with its
    final extension removed, a tab, the entry's ``"text"``, and a newline.

    Args:
        blobs: Entries providing at least the ``"name"`` and ``"text"`` keys.
        dest: Path of the manifest file to create (overwritten if present).
    """
    # Explicit UTF-8 so transcriptions with non-ASCII characters are written
    # the same way on every platform instead of using the locale default.
    with open(dest, 'w', encoding='utf-8') as file:
        for blob in blobs:
            filename = os.path.splitext(blob["name"])[0]
            # Single quotes inside the f-string: reusing the enclosing quote
            # character is a SyntaxError on Python versions before 3.12.
            file.write(f"{filename}\t{blob['text']}\n")

def download_package_blobs(client: ContainerClient, dest: str, max_duration: float = 15) -> list[dict]:
    """Download every eligible blob of the container into *dest*.

    A blob is eligible when its ``duration`` metadata parses as a number
    strictly below *max_duration* and its ``text`` metadata is non-empty.
    A blob without ``duration`` metadata is treated as having duration 0.

    Each download is attempted up to ``MAX_RETRIES`` times, sleeping
    ``BASE_DELAY * 2 ** (attempt - 1)`` seconds between attempts; the error of
    the final attempt is re-raised.

    Args:
        client: Container client to list and download from.
        dest: Existing local directory that receives the files.
        max_duration: Exclusive upper bound on the ``duration`` metadata value.

    Returns:
        One dict per downloaded blob with the keys ``"name"``, ``"path"``,
        ``"duration"`` and ``"text"``.
    """
    results = []

    # Ask the listing itself to return metadata so that no additional
    # get_blob_properties() round-trip is needed for every blob.
    for blob in client.list_blobs(include=["metadata"]):
        meta = blob.metadata or {}

        try:
            duration = float(meta.get("duration", 0))
        except ValueError:
            continue

        # Written as "not <" rather than ">=" so a NaN duration is skipped too.
        if not duration < max_duration:
            continue

        text = meta.get("text", "")
        if not text:
            continue

        local_blob_path = os.path.join(dest, blob.name)
        # Blob names may contain "/" (virtual folders); make sure the matching
        # local sub-directory exists before opening the file for writing.
        os.makedirs(os.path.dirname(local_blob_path), exist_ok=True)

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # "wb" truncates, so a retry overwrites any partial download.
                with open(local_blob_path, "wb") as file:
                    stream = client.download_blob(blob.name)
                    for chunk in stream.chunks():
                        file.write(chunk)
                break
            except Exception as e:
                if attempt == MAX_RETRIES:
                    raise
                print(f"download_blobs() error: {str(e)}")
                # Exponential back-off: BASE_DELAY, 2x, 4x, ...
                delay = BASE_DELAY * (2 ** (attempt - 1))
                time.sleep(delay)

        results.append({
            "name": blob.name,
            "path": local_blob_path,
            "duration": duration,
            "text": text,
        })

    return results

def get_configuration() -> dict[str, str]:
    """Read the notebook's settings from environment variables.

    Returns:
        A dict with the keys ``"local_working_dir"`` (normalised with
        ``os.path.normpath``), ``"input_container_sas"`` and
        ``"output_container_sas"``.

    Raises:
        ValueError: If any required variable is unset or empty. The message
            names every missing variable.
    """
    env_names = {
        "local_working_dir": 'PACKAGE_LOCAL_WORKING_DIR',
        "input_container_sas": 'PACKAGE_INPUT_CONTAINER_SAS_URI',
        "output_container_sas": 'PACKAGE_OUTPUT_CONTAINER_SAS_URI',
    }
    configuration = {key: os.getenv(name) for key, name in env_names.items()}

    # Fail fast with a readable message instead of a TypeError from
    # normpath(None) or a None SAS URI surfacing later in the run.
    missing = [env_names[key] for key, value in configuration.items() if not value]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    configuration["local_working_dir"] = os.path.normpath(configuration["local_working_dir"])
    return configuration

def get_container_client_from_sas(sas_uri: str) -> ContainerClient:
    """Return a ``ContainerClient`` built from the container URL *sas_uri*."""
    container_client = ContainerClient.from_container_url(sas_uri)
    return container_client

def get_package_and_manifest_names() -> tuple[str, str]:
    """Generate matching file names for a package and its manifest.

    Returns:
        A ``(package_name, manifest_name)`` pair: the same freshly generated
        UUID4 string followed by ``.zip`` and ``.txt`` respectively.
    """
    unique_id = str(uuid.uuid4())
    return f"{unique_id}.zip", f"{unique_id}.txt"

def get_temp_dir(path: str) -> str:
    """Create a new sub-directory of *path* named by a random UUID4.

    Missing parent directories are created as well.

    Returns:
        The path of the created directory.
    """
    unique_name = str(uuid.uuid4())
    temp_path = os.path.join(path, unique_name)
    os.makedirs(temp_path, exist_ok=True)
    return temp_path

def remove_temp_dir(path: str):
    """Delete the directory *path* together with everything inside it.

    Errors from the removal are not suppressed.
    """
    shutil.rmtree(path, ignore_errors=False)

def upload_blobs(client: ContainerClient, source: str):
    """Upload each regular file directly inside *source* to the container.

    Sub-directories of *source* are skipped. Every file is read in 4 MiB
    pieces; each piece is staged as a block whose id is the zero-padded
    six-digit block index, and the block list is committed once the whole
    file has been read. The blob is named after the file.

    A failed upload is restarted from the beginning of the file, up to
    ``MAX_RETRIES`` attempts, sleeping ``BASE_DELAY * 2 ** (attempt - 1)``
    seconds between attempts; the error of the final attempt is re-raised.
    """
    block_bytes = 4 * 1024 * 1024

    for entry in os.listdir(source):
        entry_path = os.path.join(source, entry)
        if not os.path.isfile(entry_path):
            continue

        target = client.get_blob_client(entry)

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # Start from an empty block list on every attempt.
                staged = []
                with open(entry_path, "rb") as handle:
                    while data := handle.read(block_bytes):
                        ident = f"{len(staged):06d}"
                        target.stage_block(block_id=ident, data=data)
                        staged.append(BlobBlock(block_id=ident))

                target.commit_block_list(staged)
                break
            except Exception as e:
                if attempt == MAX_RETRIES:
                    raise
                print(f"upload_blobs() error: {str(e)}")
                time.sleep(BASE_DELAY * (2 ** (attempt - 1)))

def zip_package_blobs(blobs: list[any], dest: str):
    """Create a DEFLATE-compressed ZIP archive at *dest* from *blobs*.

    Each entry's local file (``"path"``) is stored in the archive under the
    entry's ``"name"``. An existing file at *dest* is overwritten.
    """
    archive = zipfile.ZipFile(dest, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for entry in blobs:
            source_path = entry["path"]
            archive.write(source_path, arcname=entry["name"])

def main():
    """Run the packaging pipeline end to end.

    Steps: load the ``.env`` configuration, create two local temp directories
    (input and output), download the eligible blobs, build the ZIP package and
    its manifest in the output directory, and upload the output directory's
    files to the output container.

    The temp directories that were created are removed even when a step
    fails; the original error still propagates to the caller.
    """
    print("Notebook Cell Running...")

    print("Loading configuration...")
    load_dotenv()
    configuration = get_configuration()

    # Track what has actually been created so the cleanup in `finally`
    # only touches existing directories.
    temp_dirs = []
    try:
        print("Creating local input and output directories...")
        local_input_dir = get_temp_dir(configuration["local_working_dir"])
        temp_dirs.append(local_input_dir)
        local_output_dir = get_temp_dir(configuration["local_working_dir"])
        temp_dirs.append(local_output_dir)

        print("Obtaining input and output container clients...")
        input_client = get_container_client_from_sas(configuration['input_container_sas'])
        output_client = get_container_client_from_sas(configuration['output_container_sas'])

        print("Downloading input files locally...")
        blobs = download_package_blobs(input_client, local_input_dir)

        print("Generating package and manifest names...")
        package_name, manifest_name = get_package_and_manifest_names()

        print("Creating package...")
        zip_package_blobs(blobs, os.path.join(local_output_dir, package_name))

        print("Creating manifest...")
        create_manifest(blobs, os.path.join(local_output_dir, manifest_name))

        print("Uploading output files to cloud...")
        upload_blobs(output_client, local_output_dir)
    finally:
        # Runs on success and on failure so downloaded audio and partial
        # output never accumulate in the working directory.
        print("Cleaning local input and output directories...")
        for temp_dir in temp_dirs:
            remove_temp_dir(temp_dir)

    print("Done!")

# Run the pipeline only when this code is executed directly (as a script or
# in a notebook cell, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()