In [None]:
#
# Summary:
#

#
# This Python script automates the process of converting video files stored in Microsoft Azure cloud storage into audio files (MP3 format). 
# Here's what it does:
#
# 1. Downloads video files from a cloud storage container.
# 2. Converts each video file into an audio file using FFmpeg.
# 3. Uploads the resulting audio files back to a different cloud storage container.
# 4. Cleans up temporary files used during the process.
#
# It also includes retry logic to handle occasional failures during file download or upload.
#

# 
# This Notebook requires the following environment variables (.env file):
# 

# VIDEO_TO_AUDIO_LOCAL_WORKING_DIR=TODO
# VIDEO_TO_AUDIO_INPUT_CONTAINER_SAS_URI=TODO
# VIDEO_TO_AUDIO_OUTPUT_CONTAINER_SAS_URI=TODO

In [None]:
import os
import shutil
import subprocess
import time
import uuid
from azure.storage.blob import BlobBlock, BlobClient, ContainerClient
from dotenv import load_dotenv

# Maximum number of attempts made per blob in download_blobs() and
# upload_blobs(); the error from the final attempt is re-raised.
MAX_RETRIES = 3
# Base delay, in seconds, of the exponential backoff between attempts
# (the wait is BASE_DELAY * 2 ** (attempt - 1)).
BASE_DELAY = 1

def convert_video_to_audio_files(source: str, dest: str):
    """Convert every file directly inside ``source`` to an MP3 file in ``dest``.

    Each regular file found in ``source`` is handed to the ``ffmpeg``
    executable. The output file keeps the input's base name with its
    extension replaced by ``.mp3``. Sub-directories and other non-file
    entries are skipped, the same way ``upload_blobs`` skips them.

    Args:
        source: Directory containing the input video files.
        dest: Existing directory that receives the generated ``.mp3`` files.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (``check=True``).
        FileNotFoundError: If ``source`` does not exist or the ``ffmpeg``
            executable cannot be found.
    """
    for file_name in os.listdir(source):
        local_path = os.path.join(source, file_name)
        if not os.path.isfile(local_path):
            # Only regular files can be fed to ffmpeg.
            continue

        stem = os.path.splitext(file_name)[0]
        output_path = os.path.join(dest, f"{stem}.mp3")

        command = [
            "ffmpeg",
            # Never wait for keyboard input (e.g. an overwrite prompt); in a
            # notebook that would hang the cell instead of failing.
            "-nostdin",
            "-i", local_path,
            # Audio quality setting passed through to the encoder.
            "-q:a", "0",
            # Keep only the audio stream(s) of the input.
            "-map", "a",
            output_path,
        ]
        subprocess.run(command, check=True)

def download_blobs(client: ContainerClient, dest: str):
    """Download every blob listed by ``client`` into the directory ``dest``.

    Each blob is streamed chunk by chunk into ``dest/<blob name>``. A failed
    download is retried up to ``MAX_RETRIES`` times in total, sleeping
    ``BASE_DELAY * 2 ** (attempt - 1)`` seconds between attempts.

    Args:
        client: Container client used to list and download the blobs.
        dest: Local directory that receives the downloaded files.

    Raises:
        Exception: Whatever error the final attempt for a blob raised.
    """
    blobs = list(client.list_blobs())
    for blob in blobs:
        local_blob_path = os.path.join(dest, blob.name)

        # A blob name may contain "/" (virtual folders). Without the parent
        # directory, open() below would fail on every attempt.
        parent_dir = os.path.dirname(local_blob_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # "wb" truncates the file, so a retry restarts from scratch
                # rather than appending to a partial download.
                with open(local_blob_path, "wb") as file:
                    stream = client.download_blob(blob.name)
                    for chunk in stream.chunks():
                        file.write(chunk)

                break
            except Exception as e:
                if attempt == MAX_RETRIES:
                    raise
                print(f"download_blobs() error: {str(e)}")
                # Exponential backoff before the next attempt.
                delay = BASE_DELAY * (2 ** (attempt - 1))
                time.sleep(delay)

def get_configuration() -> dict[str, str]:
    """Read the notebook's settings from environment variables.

    Returns:
        A dict with the keys ``local_working_dir`` (normalised with
        ``os.path.normpath``), ``input_container_sas`` and
        ``output_container_sas``.

    Raises:
        ValueError: If any of the required environment variables is not set.
            The message lists every missing variable.
    """
    env_names = {
        "local_working_dir": "VIDEO_TO_AUDIO_LOCAL_WORKING_DIR",
        "input_container_sas": "VIDEO_TO_AUDIO_INPUT_CONTAINER_SAS_URI",
        "output_container_sas": "VIDEO_TO_AUDIO_OUTPUT_CONTAINER_SAS_URI",
    }
    values = {key: os.getenv(name) for key, name in env_names.items()}

    # Fail fast with a readable message instead of letting normpath(None)
    # raise a TypeError or handing None to the container client later on.
    missing = [env_names[key] for key, value in values.items() if value is None]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    values["local_working_dir"] = os.path.normpath(values["local_working_dir"])
    return values

def get_container_client_from_sas(sas_uri: str) -> ContainerClient:
    """Build a ``ContainerClient`` from a container URL.

    Args:
        sas_uri: Container URL passed to
            ``ContainerClient.from_container_url``.

    Returns:
        The client created for that URL.
    """
    container_client = ContainerClient.from_container_url(sas_uri)
    return container_client

def get_temp_dir(path: str) -> str:
    """Create a uniquely named sub-directory of ``path`` and return its path.

    The directory name is a random UUID (``uuid.uuid4``). Missing parent
    directories are created as well.

    Args:
        path: Directory under which the new directory is created.

    Returns:
        The path of the newly created directory.
    """
    unique_name = str(uuid.uuid4())
    temp_dir = os.path.join(path, unique_name)
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir

def remove_temp_dir(path: str):
    """Delete the directory ``path`` together with everything inside it.

    Args:
        path: Directory to remove.

    Raises:
        OSError: If the tree cannot be removed (errors are not ignored).
    """
    shutil.rmtree(path, ignore_errors=False)

def upload_blobs(client: ContainerClient, source: str):
    """Upload every regular file directly inside ``source`` as a block blob.

    Each file is read in 4 MiB pieces; every piece is staged as a block and
    the block list is committed once the whole file has been read. The blob
    is named after the file. A failed upload is restarted from the first
    block, up to ``MAX_RETRIES`` attempts in total, sleeping
    ``BASE_DELAY * 2 ** (attempt - 1)`` seconds between attempts.

    Args:
        client: Container client for the destination container.
        source: Local directory whose files are uploaded. Entries that are
            not regular files are skipped.

    Raises:
        Exception: Whatever error the final attempt for a file raised.
    """
    block_size = 4 * 1024 * 1024  # bytes read and staged per block

    for entry_name in os.listdir(source):
        path_on_disk = os.path.join(source, entry_name)
        if not os.path.isfile(path_on_disk):
            continue

        target_blob = client.get_blob_client(entry_name)

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # Start with an empty list on every attempt so block ids are
                # numbered from zero again.
                staged_blocks = []

                with open(path_on_disk, "rb") as handle:
                    while piece := handle.read(block_size):
                        # Zero-padded running index, e.g. "000000", "000001".
                        block_id = str(len(staged_blocks)).zfill(6)
                        target_blob.stage_block(block_id=block_id, data=piece)
                        staged_blocks.append(BlobBlock(block_id=block_id))

                target_blob.commit_block_list(staged_blocks)
            except Exception as error:
                if attempt == MAX_RETRIES:
                    raise
                print(f"upload_blobs() error: {str(error)}")
                time.sleep(BASE_DELAY * (2 ** (attempt - 1)))
            else:
                # Upload succeeded; move on to the next file.
                break

def main():
    """Run the whole video-to-audio pipeline.

    Steps: load the configuration (after ``load_dotenv()``), create two
    temporary working directories, download the input blobs, convert them
    with ffmpeg, upload the results, and remove the temporary directories.

    The temporary directories are removed even when one of the steps raises,
    so a failed run does not leave downloaded or converted files behind.
    The original exception still propagates to the caller.
    """
    print("Notebook Cell Running...")

    print("Loading configuration...")
    load_dotenv()
    configuration = get_configuration()

    print("Creating local input and output directories...")
    # Track what was actually created so cleanup also works if creating the
    # second directory fails.
    created_dirs = []
    try:
        local_input_dir = get_temp_dir(configuration["local_working_dir"])
        created_dirs.append(local_input_dir)
        local_output_dir = get_temp_dir(configuration["local_working_dir"])
        created_dirs.append(local_output_dir)

        print("Obtaining input and output container clients...")
        input_client = get_container_client_from_sas(configuration['input_container_sas'])
        output_client = get_container_client_from_sas(configuration['output_container_sas'])

        print("Downloading input files locally...")
        download_blobs(input_client, local_input_dir)

        print("Converting video to audio...")
        convert_video_to_audio_files(local_input_dir, local_output_dir)

        print("Uploading output files to cloud...")
        upload_blobs(output_client, local_output_dir)
    finally:
        print("Cleaning local input and output directories...")
        for created_dir in created_dirs:
            remove_temp_dir(created_dir)

    print("Done!")

# Run the pipeline as soon as this notebook cell is executed.
main()