# Multilingual Long-Form Alignment Demo

This notebook demonstrates **TorchAudio Long-Form Aligner** across 8 languages using real-world audio and text data.

## Languages Covered
1. **English** - Meta Q1 2025 Earnings Call (~1 hour, 9K words)
2. **Portuguese** - Orpheu Poetry (17 min, 18K-word book)
3. **Chinese** - Analects of Confucius (11.5 min, 15K chars)
4. **Japanese** - Kaze Tachinu novel (57 min, 57K chars)
5. **Hindi** - Universal Declaration of Human Rights (17.5 min)
6. **Korean** - Universal Declaration of Human Rights (12 min)
7. **Filipino (Tagalog)** - Universal Declaration of Human Rights (17.5 min)
8. **Zhuang** - A chapter from the Gospel of Luke (15.5 min, low-resource language)

## Key Features Demonstrated
- **Simple 3-line API**: `align_long_audio(audio, text)`
- **Automatic text normalization**: PDF parsing, romanization, number expansion
- **Long-form handling**: Segments audio, aligns with fuzzy matching, stitches with LIS
- **Interactive verification**: Listen to aligned segments word-by-word

## Requirements
- k2 (WFST library)
- lis (longest increasing subsequence)
- Language-specific: uroman, cutlet (Japanese), zhon (CJK)

## Setup

In [None]:
# =============================================================================
# Install Dependencies (auto-detect k2 version)
# =============================================================================

import subprocess
import sys

def _find_k2_requirement(index_url, build_tag, torch_major_minor):
    """Look up a k2 wheel for this build in a k2 HTML wheel index.

    Args:
        index_url: URL of the wheel index page to download.
        build_tag: Build flavour as it appears in wheel names, e.g.
            "cuda12.4" or "cpu".
        torch_major_minor: PyTorch version as "major.minor", e.g. "2.5".

    Returns:
        A pip requirement such as
        "k2==1.24.4.dev20251030+cuda12.4.torch2.5.0", or None when the
        index lists no wheel for this combination.

    Raises:
        Any error raised while fetching or decoding the index page is
        propagated to the caller.
    """
    import re
    import urllib.request

    with urllib.request.urlopen(index_url, timeout=10) as response:
        html = response.read().decode('utf-8')

    # Wheel names look like: k2-1.24.4.dev20251030+cuda12.4.torch2.5.0
    pattern = (
        rf'k2-[\d.]+dev\d+\+{re.escape(build_tag)}'
        rf'\.torch{re.escape(torch_major_minor)}\.\d+'
    )
    matches = re.findall(pattern, html)
    if not matches:
        return None
    # The last match on the page is taken as the newest build.
    return matches[-1].replace('k2-', 'k2==')


def install_k2_if_needed():
    """Check if k2 is available, if not, install the correct version.

    The wheel is chosen from the k2 CUDA or CPU index according to the
    installed PyTorch version and, when CUDA is available, the CUDA version
    PyTorch was built with. If no exact match is found (or the index cannot
    be fetched) a plain ``k2`` requirement is tried against the same index.

    Returns:
        True if k2 was already importable or pip reported success,
        False if the pip installation failed.
    """
    try:
        import k2  # noqa: F401
    except ImportError:
        pass
    else:
        print("k2 already installed:")
        # Capture and print so the output shows up in the notebook cell.
        shown = subprocess.run(
            [sys.executable, "-m", "pip", "show", "k2"],
            capture_output=True,
            text=True,
        )
        print(shown.stdout)
        return True

    # Get system info
    import torch
    torch_version = torch.__version__.split('+')[0]  # e.g., "2.5.0"
    torch_major_minor = '.'.join(torch_version.split('.')[:2])  # e.g., "2.5"
    cuda_available = torch.cuda.is_available()
    cuda_version = torch.version.cuda if cuda_available else None

    print(f"PyTorch: {torch_version}")
    print(f"CUDA available: {cuda_available}")
    if cuda_version:
        print(f"CUDA version: {cuda_version}")

    # Determine which k2 to install
    if cuda_available and cuda_version:
        # GPU version
        cuda_major_minor = '.'.join(cuda_version.split('.')[:2])  # e.g., "12.4"
        index_url = "https://k2-fsa.github.io/k2/cuda.html"
        build_tag = f"cuda{cuda_major_minor}"
        extra_args = []
        no_match_messages = [
            f"No exact match found for CUDA {cuda_major_minor} + PyTorch {torch_major_minor}",
            "Trying generic GPU install...",
        ]
        print(f"\nLooking for k2 with CUDA {cuda_major_minor} and PyTorch {torch_major_minor}...")
    else:
        # CPU version (installed without dependencies, as before)
        index_url = "https://k2-fsa.github.io/k2/cpu.html"
        build_tag = "cpu"
        extra_args = ["--no-deps"]
        no_match_messages = [
            f"No exact match found for PyTorch {torch_major_minor}",
        ]
        print(f"\nLooking for k2 CPU version for PyTorch {torch_major_minor}...")

    # Fall back to an unpinned requirement unless the index has an exact match.
    requirement = "k2"
    try:
        found = _find_k2_requirement(index_url, build_tag, torch_major_minor)
    except Exception as e:
        print(f"Could not fetch index: {e}")
    else:
        if found:
            print(f"Found: {found}")
            requirement = found
        else:
            for message in no_match_messages:
                print(message)

    # Run pip through the current interpreter so the package lands in the
    # environment this code is running in; a list avoids shell quoting.
    cmd = [sys.executable, "-m", "pip", "install", requirement, *extra_args, "-f", index_url]
    print(f"\nInstalling: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("k2 installed successfully!")
        return True
    else:
        print(f"Installation failed: {result.stderr}")
        return False

def install_other_deps():
    """Install other required dependencies.

    Each package gets its own quiet pip invocation so that one failure does
    not stop the remaining installs. A failed install is reported with a
    warning and otherwise ignored.
    """
    deps = [
        "pytorch-lightning",
        "cmudict",
        "g2p_en",
        "pydub",
        "pypdf",
        "git+https://github.com/huangruizhe/lis.git",
    ]
    for dep in deps:
        try:
            # Use the running interpreter's pip and pass arguments as a list
            # (no shell), so the package is installed where it will be imported.
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", dep],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError):
            # Only installation failures are swallowed; KeyboardInterrupt
            # and similar still propagate.
            print(f"Warning: Failed to install {dep}")

# Run installation. install_k2_if_needed() returns False when pip failed,
# so report that instead of unconditionally claiming success.
k2_installed = install_k2_if_needed()
install_other_deps()
if k2_installed:
    print("\nDependency installation complete.")
else:
    print("\nDependency installation finished, but k2 could not be installed (see messages above).")

In [None]:
# =============================================================================
# Setup: Clone Repository and Configure Imports
# =============================================================================

import sys
import os
from pathlib import Path

# ===== CONFIGURATION =====
GITHUB_REPO = "https://github.com/huangruizhe/torchaudio_aligner.git"
BRANCH = "dev"
# =========================

def setup_imports():
    """Locate (or fetch) the aligner sources and put them on ``sys.path``.

    On Google Colab the repository is cloned to ``/content/torchaudio_aligner``
    (or updated if it is already there) and ``/content/data`` is used for data.
    Elsewhere, a ``src`` directory containing an ``alignment`` subdirectory is
    searched for next to, or one level above, the current working directory.

    Returns:
        Tuple ``(src_path, data_path)`` of strings.

    Raises:
        RuntimeError: On Colab, if the repository could not be cloned.
        FileNotFoundError: Locally, if no suitable ``src`` directory exists.
    """
    import subprocess  # local import: this cell does not import it itself

    def run_git(*args, cwd=None):
        """Run one git command and return True if it exited with status 0."""
        try:
            return subprocess.run(["git", *args], cwd=cwd).returncode == 0
        except OSError:
            # e.g. the git executable is not available
            return False

    IN_COLAB = 'google.colab' in sys.modules

    if IN_COLAB:
        repo_path = '/content/torchaudio_aligner'
        src_path = f'{repo_path}/src'
        data_path = '/content/data'

        if not os.path.exists(repo_path):
            print(f"Cloning repository (branch: {BRANCH})...")
            # Without the sources nothing below can work, so fail here with
            # a clear message rather than at the later import.
            if not run_git("clone", "-b", BRANCH, GITHUB_REPO, repo_path):
                raise RuntimeError(
                    f"git clone of {GITHUB_REPO} (branch {BRANCH}) failed"
                )
        else:
            print(f"Updating repository (branch: {BRANCH})...")
            # Same short-circuit behaviour as chaining the commands with '&&'.
            updated = (
                run_git("fetch", "origin", cwd=repo_path)
                and run_git("checkout", BRANCH, cwd=repo_path)
                and run_git("pull", "origin", BRANCH, cwd=repo_path)
            )
            if not updated:
                # Updating is best-effort: keep using the existing checkout.
                print("Warning: could not update repository; using existing checkout")

        os.makedirs(data_path, exist_ok=True)
    else:
        possible_paths = [
            Path(".").absolute().parent / "src",
            Path(".").absolute() / "src",
        ]
        src_path = None
        for p in possible_paths:
            if p.exists() and (p / "alignment").exists():
                src_path = str(p.absolute())
                break
        if src_path is None:
            raise FileNotFoundError("src directory not found")

        # Data lives in an "examples" directory two levels above src.
        # NOTE(review): that is a sibling of the directory containing src,
        # not a directory inside it -- confirm this is the intended location.
        data_path = str(Path(src_path).parent.parent / "examples")
        os.makedirs(data_path, exist_ok=True)
        print(f"Running locally from: {src_path}")

    if src_path not in sys.path:
        sys.path.insert(0, src_path)

    return src_path, data_path

src_path, data_path = setup_imports()

import torch
import torchaudio
import logging

# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)

# The one-call alignment API; importable because setup_imports() put the
# source directory on sys.path.
from api import align_long_audio

# Environment banner: blank line, then the details framed by two rules.
print()
print(
    "=" * 60,
    f"PyTorch: {torch.__version__}",
    f"TorchAudio: {torchaudio.__version__}",
    f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}",
    f"Data path: {data_path}",
    "=" * 60,
    sep="\n",
)

In [None]:
# Report which optional packages can be imported in this environment.
print("Checking dependencies...")

# Availability flags; flipped to True only after a successful import.
K2_AVAILABLE = False
LIS_AVAILABLE = False

try:
    import k2
except ImportError:
    print("k2: NOT AVAILABLE - install with pip")
else:
    K2_AVAILABLE = True
    print("k2: available")

try:
    import lis
except ImportError:
    print("lis: NOT AVAILABLE - pip install git+https://github.com/huangruizhe/lis.git")
else:
    LIS_AVAILABLE = True
    print("lis: available")

try:
    from pypdf import PdfReader
except ImportError:
    print("pypdf: NOT AVAILABLE - pip install pypdf")
else:
    print("pypdf: available")

try:
    from pydub import AudioSegment
except ImportError:
    print("pydub: NOT AVAILABLE - pip install pydub")
else:
    print("pydub: available")

In [None]:
!pip install pytorch-lightning
!pip install cmudict g2p_en
!pip install pydub
!pip install git+https://github.com/huangruizhe/lis.git
!pip install torchcodec

In [None]:
# =============================================================================
# Import Audio Preview Functions from Library
# =============================================================================

from visualization_utils import play_random, play_words_sequential, play_segment

# Note: result.summary() is built-in, no need for custom show_alignment_summary()

print("Audio preview functions imported from visualization_utils")

---
## Language 1: English

**Source**: Meta Q1 2025 Earnings Call
- Audio: ~1 hour recording from SeekingAlpha
- Text: ~9,200 words from Meta's investor relations PDF

This demonstrates alignment of a real-world earnings call against a noisy transcript (PDF artifacts, headers, etc.).

In [None]:
# Download English data
# wget flags: -q (quiet), -nc (do not re-download when the target file already
# exists), -O (write to the given path). {data_path} is filled in by IPython
# from the Python variable of that name.
!wget -q -nc https://static.seekingalpha.com/cdn/s3/transcripts_audio/4780182.mp3 -O {data_path}/meta_earnings.mp3
!wget -q -nc https://s21.q4cdn.com/399680738/files/doc_financials/2025/q1/Transcripts/META-Q1-2025-Earnings-Call-Transcript-1.pdf -O {data_path}/meta_earnings.pdf

# NOTE: printed unconditionally; the wget exit status is not checked.
print("English data downloaded!")

In [None]:
# English: align the earnings-call recording with its PDF transcript.
result_en = align_long_audio(
    language="eng",
    audio=f"{data_path}/meta_earnings.mp3",
    text=f"{data_path}/meta_earnings.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_en.summary())

In [None]:
import random

# Play a randomly chosen segment (num_words=30) of the English alignment.
display(play_random(result_en, num_words=30)[0])

# Then play words one by one from a random starting index (num_words=8);
# the start is clamped at 0 for results shorter than 10 items.
start = random.randint(0, max(len(result_en) - 10, 0))
play_words_sequential(result_en, num_words=8, start_idx=start)

In [None]:
# Download Portuguese data
!wget -q -nc https://ia801705.us.archive.org/7/items/orpheu_no1_2010_librivox/orpheuno1_46__128kb.mp3 -O {data_path}/portuguese_orpheu.mp3

# Download and parse text
import requests
from bs4 import BeautifulSoup

url = "https://www.gutenberg.org/cache/epub/23620/pg23620-images.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
text_pt = soup.get_text().replace("\r\n", "\n")

# Save text
with open(f"{data_path}/portuguese_orpheu.txt", "w", encoding="utf-8") as f:
    f.write(text_pt)

print(f"Portuguese data downloaded! Text: {len(text_pt.split())} words")

# Listen to a random segment
display(play_random(result_pt, num_words=30)[0])

In [None]:
# Download Chinese data
# wget flags: -q (quiet), -nc (do not re-download when the target file already
# exists), -O (write to the given path). {data_path} is filled in by IPython
# from the Python variable of that name.
!wget -q -nc https://ia801307.us.archive.org/15/items/lun_yu_0801_librivox/lunyu_14_confucius.mp3 -O {data_path}/chinese_confucius.mp3
!wget -q -nc https://www.with.org/analects_ch.pdf -O {data_path}/chinese_confucius.pdf

# NOTE: printed unconditionally; the wget exit status is not checked.
print("Chinese data downloaded!")

In [None]:
# Chinese: align the recording with the PDF text.
result_zh = align_long_audio(
    language="cmn",  # Mandarin, ISO 639-3
    audio=f"{data_path}/chinese_confucius.mp3",
    text=f"{data_path}/chinese_confucius.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_zh.summary())

In [None]:
# Download Japanese data
!wget -q -nc https://ia803207.us.archive.org/30/items/kazetachinu_ek_librivox/kazetachinu_03_hori.mp3 -O {data_path}/japanese_kaze.mp3

# Download and parse Japanese text (with proper encoding)
import urllib.request
import html

url = "https://www.aozora.gr.jp/cards/001030/files/4803_14204.html"
response = urllib.request.urlopen(url)
html_bytes = response.read()

try:
    text_ja = html_bytes.decode('utf-8')
except:
    try:
        text_ja = html_bytes.decode('shiftjis')
    except:
        text_ja = html_bytes.decode('shift_jisx0213')

text_ja = html.unescape(text_ja)
soup = BeautifulSoup(text_ja, "html.parser")
text_ja = soup.get_text().replace("\r\n", "\n")

with open(f"{data_path}/japanese_kaze.txt", "w", encoding="utf-8") as f:
    f.write(text_ja)

print(f"Japanese data downloaded! Text: {len(text_ja)} characters")

In [None]:
# Japanese: align the recording with the saved text file.
result_ja = align_long_audio(
    language="jpn",  # ISO 639-3
    audio=f"{data_path}/japanese_kaze.mp3",
    text=f"{data_path}/japanese_kaze.txt",
    verbose=True,
)

# Print the result's own summary() report.
print(result_ja.summary())

In [None]:
# Hindi: align the recording with the PDF text.
result_hi = align_long_audio(
    language="hin",  # ISO 639-3
    audio=f"{data_path}/hindi_udhr.mp3",
    text=f"{data_path}/hindi_udhr.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_hi.summary())

# Play a randomly chosen segment (num_words=30).
display(play_random(result_hi, num_words=30)[0])

In [None]:
# Korean: align the recording with the PDF text.
result_ko = align_long_audio(
    language="kor",  # ISO 639-3
    audio=f"{data_path}/korean_udhr.mp3",
    text=f"{data_path}/korean_udhr.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_ko.summary())

# Play a randomly chosen segment (num_words=30).
display(play_random(result_ko, num_words=30)[0])

In [None]:
# Download Filipino data
# wget flags: -q (quiet), -nc (do not re-download when the target file already
# exists), -O (write to the given path). {data_path} is filled in by IPython
# from the Python variable of that name.
!wget -q -nc https://ia800906.us.archive.org/24/items/universal_declaration_librivox/human_rights_un_fil_alnl.mp3 -O {data_path}/filipino_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250110125503/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/tgl.pdf -O {data_path}/filipino_udhr.pdf

# NOTE: printed unconditionally; the wget exit status is not checked.
print("Filipino data downloaded!")

In [None]:
# Filipino: align the recording with the PDF text.
result_fil = align_long_audio(
    language="tgl",  # Tagalog, ISO 639-3
    audio=f"{data_path}/filipino_udhr.mp3",
    text=f"{data_path}/filipino_udhr.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_fil.summary())

In [None]:
# Download Zhuang data
# wget flags: -q (quiet), -nc (do not re-download when the target file already
# exists), -O (write to the given path). {data_path} is filled in by IPython
# from the Python variable of that name.
!wget -q -nc "https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/media_stream/encodings/audio_download_mp3_orig_qual/499-.mp3" -O {data_path}/zhuang_luke.mp3
!wget -q -nc https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/uploads/Luhzaz.pdf -O {data_path}/zhuang_luke.pdf

# NOTE: printed unconditionally; the wget exit status is not checked.
print("Zhuang data downloaded!")

In [None]:
# Zhuang: align the recording with the PDF text.
result_za = align_long_audio(
    language="zha",  # Zhuang (generic), ISO 639-3
    audio=f"{data_path}/zhuang_luke.mp3",
    text=f"{data_path}/zhuang_luke.pdf",
    verbose=True,
)

# Print the result's own summary() report.
print(result_za.summary())

In [None]:
# Export all results
import os

# All exported files go into one "exports" folder under the data directory.
export_dir = f"{data_path}/exports"
os.makedirs(export_dir, exist_ok=True)

for name, result, _ in results:
    try:
        # File-name stem: lower-cased name with spaces replaced by underscores.
        prefix = name.lower().replace(" ", "_")

        # (save method, destination) pairs, written in this order:
        # Audacity labels, JSON, SRT subtitles.
        for saver_name, destination in (
            ("save_audacity_labels", f"{export_dir}/{prefix}_labels.txt"),
            ("save_json", f"{export_dir}/{prefix}_alignment.json"),
            ("save_srt", f"{export_dir}/{prefix}_subtitles.srt"),
        ):
            getattr(result, saver_name)(destination)

        print(f"{name}: exported to {export_dir}/{prefix}_*")
    except Exception as e:
        # A failure for one entry is reported and the loop moves on.
        print(f"{name}: export failed - {e}")

print(f"\nAll exports saved to: {export_dir}")