In [None]:
!apt-get update -qq
!apt-get install -qq ffmpeg

In [None]:
%pip install azure-storage-blob
%pip install ffmpeg-python
%pip install python-dotenv

In [None]:
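#
# This notebook requires the following environment variables, typically
# supplied through a .env file. SEGMENT_TEXT_INPUT_PATH is a blob-name prefix
# and may be left empty to list every blob in the text container.
#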

# SEGMENT_LOCAL_WORKING_DIR=
# SEGMENT_AUDIO_INPUT_CONTAINER_SAS_URI=
# SEGMENT_TEXT_INPUT_CONTAINER_SAS_URI=
# SEGMENT_TEXT_INPUT_PATH=
# SEGMENT_OUTPUT_CONTAINER_SAS_URI=


In [1]:
import json
import os
from urllib.parse import quote

import ffmpeg
from azure.storage.blob import ContainerClient, ContentSettings
from dotenv import load_dotenv
load_dotenv()

# Runtime configuration, looked up in the process environment.
# Any variable that is not set resolves to None.
LOCAL_WORKING_DIR = os.environ.get("SEGMENT_LOCAL_WORKING_DIR")
AUDIO_INPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_AUDIO_INPUT_CONTAINER_SAS_URI")
TEXT_INPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_TEXT_INPUT_CONTAINER_SAS_URI")
TEXT_INPUT_PATH = os.environ.get("SEGMENT_TEXT_INPUT_PATH")
OUTPUT_CONTAINER_SAS_URI = os.environ.get("SEGMENT_OUTPUT_CONTAINER_SAS_URI")

def get_container_client_from_sas(sas_uri):
    """Get a container client directly from a container SAS URI.

    Args:
        sas_uri: Container URL including its SAS token, as a string.

    Returns:
        The object returned by ``ContainerClient.from_container_url``.

    Raises:
        ValueError: If ``sas_uri`` is not a string or is blank. This is the
            usual symptom of an environment variable that was never set, so
            the error is raised here instead of deep inside the SDK.
    """
    if not isinstance(sas_uri, str) or not sas_uri.strip():
        # Deliberately do not echo the value: a SAS URI is a credential.
        raise ValueError(
            "Container SAS URI is missing or empty; check the "
            "SEGMENT_*_CONTAINER_SAS_URI environment variables."
        )
    return ContainerClient.from_container_url(sas_uri.strip())

def download_blobs(container_client, prefix=None, local_folder="downloads", overwrite=False):
    """Download all blobs from a container with an optional prefix.

    Each blob is stored in ``local_folder`` under the last path component of
    its name, so blobs that share a file name in different virtual folders
    end up at the same local path.

    Args:
        container_client: Client exposing ``list_blobs(name_starts_with=...)``
            and ``download_blob(name)``.
        prefix: Only blobs whose name starts with this value are listed.
            ``None`` or an empty string lists every blob.
        local_folder: Destination directory; created if it does not exist.
        overwrite: When ``False`` (default) a file that already exists locally
            is reused and not fetched again; pass ``True`` to always
            re-download.

    Returns:
        A list of ``{"blob_name": ..., "local_path": ...}`` dicts, one per
        blob that is available locally after the call.
    """
    os.makedirs(local_folder, exist_ok=True)

    downloaded_files = []
    blob_list = list(container_client.list_blobs(name_starts_with=prefix if prefix else ""))

    print(f"Found {len(blob_list)} blobs to download")

    for blob in blob_list:
        file_name = os.path.basename(blob.name)
        if not file_name:
            # A name ending in "/" is a folder placeholder, not a file.
            print(f"Skipping {blob.name}: not a file")
            continue

        local_file_path = os.path.join(local_folder, file_name)

        if not overwrite and os.path.isfile(local_file_path):
            print(f"Using existing {local_file_path} for {blob.name}")
        else:
            print(f"Downloading {blob.name} to {local_file_path}")
            # Write to a temporary name first so an interrupted download never
            # leaves a truncated file that a later run would mistake for complete.
            partial_path = local_file_path + ".part"
            try:
                with open(partial_path, "wb") as file:
                    container_client.download_blob(blob.name).readinto(file)
                os.replace(partial_path, local_file_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        downloaded_files.append({
            "blob_name": blob.name,
            "local_path": local_file_path
        })

    return downloaded_files

def match_audio_and_translation_files(audio_files, translation_files):
    """Match audio files with their corresponding translation files.

    For every audio entry the translation is chosen in this order:

    1. A translation whose file name, minus its last extension, equals the
       audio file name with or without its own extension (``talk.wav`` pairs
       with ``talk.wav.json`` or ``talk.json``).
    2. Otherwise the first translation whose file name merely contains the
       audio base name.

    Preferring the exact match keeps ``rec1.wav`` from being paired with
    ``rec10.wav.json`` just because that file is listed first.

    Args:
        audio_files: Iterable of dicts with a ``"blob_name"`` key.
        translation_files: Iterable of dicts with a ``"blob_name"`` key.

    Returns:
        A list of ``{"audio_file": ..., "translation_file": ...}`` dicts, in
        the order of ``audio_files``. Audio entries without a match are
        reported with ``print`` and left out.
    """
    matched_pairs = []

    for audio_file in audio_files:
        audio_filename = os.path.basename(audio_file["blob_name"])
        audio_basename = os.path.splitext(audio_filename)[0]

        exact_match = None
        partial_match = None

        # An empty name would be a substring of every translation name.
        if audio_basename:
            for candidate in translation_files:
                candidate_name = os.path.basename(candidate["blob_name"])
                candidate_stem = os.path.splitext(candidate_name)[0]

                if candidate_stem in (audio_filename, audio_basename):
                    exact_match = candidate
                    break
                if partial_match is None and audio_basename in candidate_name:
                    partial_match = candidate

        chosen = exact_match if exact_match is not None else partial_match

        if chosen is not None:
            matched_pairs.append({
                "audio_file": audio_file,
                "translation_file": chosen
            })
        else:
            print(f"No matching translation found for {audio_basename}")

    return matched_pairs

def parse_translation_file(translation_file_path):
    """Load a recognition-result JSON file and return its timed phrases.

    Only entries under the top-level ``"recognizedPhrases"`` key that carry
    both ``"offsetInTicks"`` and ``"durationInTicks"`` are used. The text of
    a phrase is the ``"display"`` value of its first ``"nBest"`` entry, or an
    empty string when there is none.

    Args:
        translation_file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        A list of ``{"start_time", "end_time", "text"}`` dicts with times in
        seconds; empty when the file has no ``"recognizedPhrases"`` key.
    """
    ticks_per_second = 10000000  # ticks are 100-nanosecond units

    with open(translation_file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if "recognizedPhrases" not in payload:
        return []

    timed_phrases = []
    for entry in payload["recognizedPhrases"]:
        if not ("offsetInTicks" in entry and "durationInTicks" in entry):
            continue

        begin = float(entry["offsetInTicks"]) / ticks_per_second
        length = float(entry["durationInTicks"]) / ticks_per_second

        has_candidates = "nBest" in entry and len(entry["nBest"]) > 0
        spoken = entry["nBest"][0].get("display", "") if has_candidates else ""

        timed_phrases.append({
            "start_time": begin,
            "end_time": begin + length,
            "text": spoken,
        })

    return timed_phrases

def segment_audio_file(audio_file_path, segments, output_folder):
    """Segment an audio file based on the provided timestamps using FFmpeg.

    Each segment is written as ``<audio base name>_segNNN.mp3`` inside
    ``output_folder``, where ``NNN`` is the segment's zero-based position in
    ``segments``.

    Args:
        audio_file_path: Path of the source audio file.
        segments: Sequence of dicts with ``"start_time"`` and ``"end_time"``
            in seconds.
        output_folder: Directory for the generated files; created if missing.

    Returns:
        A list of ``{"file_path": ..., "segment": ...}`` dicts, one per
        segment that was actually produced. Segments with a non-positive
        duration, and segments whose FFmpeg run failed, are reported with
        ``print`` and left out.
    """
    base_filename = os.path.splitext(os.path.basename(audio_file_path))[0]
    os.makedirs(output_folder, exist_ok=True)
    segmented_files = []

    for i, segment in enumerate(segments):
        start_time = segment["start_time"]
        end_time = segment["end_time"]
        duration = end_time - start_time

        output_filename = f"{base_filename}_seg{i:03d}.mp3"
        output_path = os.path.join(output_folder, output_filename)

        if duration <= 0:
            # Nothing to cut; asking FFmpeg for a zero-length clip is pointless.
            print(f"Skipping segment {i+1}/{len(segments)}: non-positive duration")
            continue

        try:
            # Use FFmpeg to extract the segment
            (
                ffmpeg
                .input(audio_file_path, ss=start_time, t=duration)
                .output(output_path, acodec='libmp3lame', q=2)
                .run(quiet=True, overwrite_output=True)
            )
        except Exception as e:
            # Best effort per segment: report the failure and keep going.
            message = f"Error creating segment {i+1}: {str(e)}"
            detail = getattr(e, "stderr", None)
            if isinstance(detail, bytes):
                detail = detail.decode("utf-8", errors="replace")
            if isinstance(detail, str) and detail.strip():
                # The captured stderr is long; its last line carries the reason.
                message += f" ({detail.strip().splitlines()[-1]})"
            print(message)
            continue

        segmented_files.append({
            "file_path": output_path,
            "segment": segment
        })

        print(f"Created segment {i+1}/{len(segments)}: {output_filename}")

    return segmented_files

def upload_segmented_files(container_client, segmented_files, base_folder="segments"):
    """Upload segmented audio files to the output container with metadata.

    Every blob is uploaded with content type ``audio/mpeg`` and the metadata
    keys ``start_time``, ``end_time``, ``duration`` and ``text``. The text is
    cut to 256 characters and then percent-encoded (UTF-8) wherever it
    contains ``%``, control characters or non-ASCII characters, because
    metadata travels as HTTP header values. Plain printable ASCII text is
    stored unchanged; ``urllib.parse.unquote`` restores the original.

    Args:
        container_client: Client exposing ``upload_blob(...)``.
        segmented_files: Iterable of ``{"file_path": ..., "segment": ...}``
            dicts, where ``segment`` has ``"start_time"``, ``"end_time"`` and
            ``"text"``.
        base_folder: Virtual folder placed in front of each blob name; an
            empty value uploads to the container root.

    Returns:
        The list of blob names that were uploaded. Entries whose local file
        does not exist are reported with ``print`` and left out.
    """
    # Printable ASCII that may stay literal in a metadata value. "%" is left
    # out on purpose so that decoding the stored value is unambiguous.
    literal_chars = " !\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
    audio_settings = ContentSettings(content_type="audio/mpeg")
    folder = base_folder.strip("/") if base_folder else ""

    uploaded_files = []

    for file_info in segmented_files:
        file_path = file_info["file_path"]
        file_name = os.path.basename(file_path)
        blob_name = f"{folder}/{file_name}" if folder else file_name

        if not os.path.isfile(file_path):
            # One missing segment should not abort the remaining uploads.
            print(f"Skipping {file_name}: local file not found at {file_path}")
            continue

        # Extract segment info for metadata
        segment = file_info["segment"]
        metadata = {
            "start_time": str(segment["start_time"]),
            "end_time": str(segment["end_time"]),
            "duration": str(segment["end_time"] - segment["start_time"]),
            # Truncate before encoding so an escape sequence is never cut in half.
            "text": quote(segment["text"][:256], safe=literal_chars)
        }

        print(f"Uploading {file_name} to {blob_name}")

        with open(file_path, "rb") as data:
            container_client.upload_blob(
                name=blob_name,
                data=data,
                overwrite=True,
                metadata=metadata,
                content_settings=audio_settings
            )

        uploaded_files.append(blob_name)

    return uploaded_files

#
# Main
#

# Pipeline driver: list/download the inputs, pair each audio file with its
# translation file, derive segments from each pair, then upload the results.
print("Starting audio segmentation process...")

# One client per container, each built from the SAS URI read from the environment.
audio_input_container_client = get_container_client_from_sas(AUDIO_INPUT_CONTAINER_SAS_URI)
text_input_container_client = get_container_client_from_sas(TEXT_INPUT_CONTAINER_SAS_URI)
output_container_client = get_container_client_from_sas(OUTPUT_CONTAINER_SAS_URI)

# Local layout under the working directory.
# NOTE(review): os.path.join raises TypeError when LOCAL_WORKING_DIR is None,
# i.e. when SEGMENT_LOCAL_WORKING_DIR is not set.
audio_folder = os.path.join(LOCAL_WORKING_DIR, "audio_downloads")
translation_folder = os.path.join(LOCAL_WORKING_DIR, "text_downloads")
output_folder = os.path.join(LOCAL_WORKING_DIR, "segmented_audio")

os.makedirs(audio_folder, exist_ok=True)
os.makedirs(translation_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# Audio files: prefix=None, so every blob in the audio container is listed
audio_files = download_blobs(audio_input_container_client, prefix=None, local_folder=audio_folder)
print(f"Downloaded {len(audio_files)} audio files")

# Translation files: only blobs whose name starts with TEXT_INPUT_PATH
translation_files = download_blobs(text_input_container_client, prefix=TEXT_INPUT_PATH, local_folder=translation_folder)
print(f"Downloaded {len(translation_files)} text files")

# Match audio files with translation files (pairing is done on file names)
matched_pairs = match_audio_and_translation_files(audio_files, translation_files)
print(f"Found {len(matched_pairs)} matching audio-translation pairs")

# Process each matched pair, collecting the segments of all pairs in one list
all_segmented_files = []

for pair in matched_pairs:
    audio_path = pair["audio_file"]["local_path"]
    translation_path = pair["translation_file"]["local_path"]
    
    print(f"\nProcessing {os.path.basename(audio_path)} with {os.path.basename(translation_path)}")
    
    # Parse the translation file to get segments
    segments = parse_translation_file(translation_path)
    print(f"Found {len(segments)} segments in translation file")
    
    if segments:
        # Segment the audio file
        segmented_files = segment_audio_file(audio_path, segments, output_folder)
        all_segmented_files.extend(segmented_files)
    else:
        print("No segments found, skipping audio segmentation")

# Upload only when at least one segment was collected. The empty base folder
# means blob names carry no folder prefix.
if all_segmented_files:
    uploaded_files = upload_segmented_files(output_container_client, all_segmented_files, "")
    print(f"Uploaded {len(uploaded_files)} segmented files to output container")

# NOTE(review): printed unconditionally; per-segment errors reported earlier
# do not change this message.
print("Audio segmentation process completed successfully.")

Starting audio segmentation process...
Found 3 blobs to download
Downloading FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean.wav to ../../temp\audio_downloads\FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean.wav
Downloading SE&O_Onboarding_Part 2_V01_clean.wav to ../../temp\audio_downloads\SE&O_Onboarding_Part 2_V01_clean.wav
Downloading Welcome_to_the_Digital_GTM_Day_V07_clean.wav to ../../temp\audio_downloads\Welcome_to_the_Digital_GTM_Day_V07_clean.wav
Downloaded 3 audio files
Found 3 blobs to download
Downloading 8529afd3-f1ca-4475-96b6-a48cc9aa10ab/clean-audio/FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean.wav.json to ../../temp\text_downloads\FY25 Kickoff Event SE&O - Meeting Recording audio only edited (1)_clean.wav.json
Downloading 8529afd3-f1ca-4475-96b6-a48cc9aa10ab/clean-audio/SE&O_Onboarding_Part 2_V01_clean.wav.json to ../../temp\text_downloads\SE&O_Onboarding_Part 2_V01_clean.wav.json
Downloading 8529