# Multilingual Long-Form Alignment Demo

This notebook demonstrates **TorchAudio Long-Form Aligner** across 8 languages using real-world audio and text data.

## Languages Covered
1. **English** - Meta Q1 2025 Earnings Call (~1 hour, 9K words)
2. **Portuguese** - Orpheu Poetry (17 min, 18K words book)
3. **Chinese** - Analects of Confucius (11.5 min, 15K chars)
4. **Japanese** - Kaze Tachinu novel (57 min, 57K chars)
5. **Hindi** - Universal Declaration of Human Rights (17.5 min)
6. **Korean** - Universal Declaration of Human Rights (12 min)
7. **Filipino (Tagalog)** - Universal Declaration of Human Rights (17.5 min)
8. **Zhuang** - Bible Luke chapter (15.5 min, low-resource language)

## Key Features Demonstrated
- **Simple 3-line API**: `align_long_audio(audio, text, language)`
- **Automatic text normalization**: PDF parsing, romanization, number expansion
- **Long-form handling**: Segments audio, aligns with fuzzy matching, stitches with LIS
- **Interactive verification**: Listen to aligned segments word-by-word

## Requirements
- k2 (WFST library)
- lis (longest increasing subsequence)
- Language-specific: uroman, cutlet (Japanese), zhon (CJK)

## Setup

In [None]:
# =============================================================================
# Install Dependencies
# =============================================================================

import subprocess
import sys

def install_k2_if_needed():
    """Install k2 with pip unless it is already importable.

    The wheel index is chosen from what PyTorch reports: the CUDA index when
    a CUDA runtime is available, otherwise the CPU index (installed with
    ``--no-deps``).

    Returns:
        bool: True if k2 was already importable or pip exited with status 0,
        False if the pip invocation failed.
    """
    try:
        import k2
    except ImportError:
        pass
    else:
        print(f"k2 already installed: {k2.__version__}")
        return True

    import torch

    # torch.version.cuda is only consulted when a CUDA device is usable.
    cuda_version = torch.version.cuda if torch.cuda.is_available() else None
    print(f"PyTorch: {torch.__version__}, CUDA: {cuda_version}")

    # Invoke pip through the running interpreter so the package lands in the
    # kernel's environment, and pass an argument list so no shell is involved.
    cmd = [sys.executable, "-m", "pip", "install", "k2"]
    if cuda_version:
        cmd += ["-f", "https://k2-fsa.github.io/k2/cuda.html"]
    else:
        cmd += ["--no-deps", "-f", "https://k2-fsa.github.io/k2/cpu.html"]
    # NOTE(review): no k2 version is pinned, so pip chooses the wheel itself;
    # confirm the selected build matches the installed torch/CUDA versions.

    print(f"Installing k2: {' '.join(cmd)}")
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        print(f"k2 installation failed (pip exit code {completed.returncode})")
        return False
    return True

# Install all dependencies
install_k2_if_needed()

!pip install -q pytorch-lightning pydub pypdf
!pip install -q git+https://github.com/huangruizhe/lis.git
!pip install -q uroman-python zhon  # For romanization
!pip install -q cutlet unidic-lite  # For Japanese
!pip install -q num2words  # For number expansion

print("\nDependencies installed!")

In [None]:
# =============================================================================
# Setup: Clone Repository and Configure Imports
# =============================================================================

import sys
import os
import subprocess
from pathlib import Path

GITHUB_REPO = "https://github.com/huangruizhe/torchaudio_aligner.git"
BRANCH = "dev"

# Colab registers the google.colab module before user code runs.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    repo_path = '/content/torchaudio_aligner'
    src_path = f'{repo_path}/src'
    data_path = '/content/data'

    if not os.path.exists(repo_path):
        print(f"Cloning repository (branch: {BRANCH})...")
        clone = subprocess.run(["git", "clone", "-b", BRANCH, GITHUB_REPO, repo_path])
        if clone.returncode != 0:
            # Without the sources the imports below cannot work; stop here
            # with a clear message instead of a later ImportError.
            raise RuntimeError(
                f"git clone of {GITHUB_REPO} (branch {BRANCH}) failed "
                f"with exit code {clone.returncode}"
            )
    else:
        print("Updating repository...")
        pull = subprocess.run(["git", "pull", "origin", BRANCH], cwd=repo_path)
        if pull.returncode != 0:
            # An existing checkout is still usable, so only warn.
            print(
                f"Warning: git pull failed (exit code {pull.returncode}); "
                "continuing with the existing checkout"
            )

    os.makedirs(data_path, exist_ok=True)
else:
    # Running locally: src/ and examples/ are resolved relative to the parent
    # of the current working directory.
    src_path = str(Path(".").absolute().parent / "src")
    data_path = str(Path(".").absolute().parent / "examples")
    os.makedirs(data_path, exist_ok=True)

# Make the aligner sources importable without installing them as a package.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import torch
import torchaudio
import IPython.display as ipd
from pydub import AudioSegment
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("=" * 60)
print(f"PyTorch: {torch.__version__}")
print(f"TorchAudio: {torchaudio.__version__}")
print(f"Device: {device}")
print(f"Data path: {data_path}")
print("=" * 60)

In [None]:
# =============================================================================
# Import TorchAudio Aligner
# =============================================================================

# `api` and `visualization_utils` are imported by bare module name; presumably
# they resolve through the src_path entry the setup cell puts on sys.path, so
# that cell must run first — confirm against the repository layout.
from api import align_long_audio, AlignmentResult, AlignedWord
from visualization_utils import preview_word_seconds, preview_segment_seconds

print("TorchAudio Aligner imported successfully!")

In [None]:
# =============================================================================
# Helper Functions for Demo
# =============================================================================

def play_random_segment(result, audio_file, num_words=30):
    """
    Play a randomly chosen run of consecutive aligned words.

    Args:
        result: AlignmentResult from align_long_audio()
        audio_file: Path to the audio file
        num_words: Number of words to play; capped at the number of
            aligned words available

    Returns:
        An IPython.display.Audio object for the selected span (mono).

    Raises:
        ValueError: If ``result`` contains no words or ``num_words`` < 1.
    """
    total = len(result)
    if total == 0:
        raise ValueError("Cannot play a segment: the alignment result contains no words")
    if num_words < 1:
        raise ValueError(f"num_words must be at least 1, got {num_words}")
    num_words = min(num_words, total)

    # randint is inclusive on both ends, so the window always fits.
    start_idx = random.randint(0, total - num_words)
    words = result.words[start_idx:start_idx + num_words]

    # Get time range (use start_seconds() method)
    t1 = words[0].start_seconds()
    t2 = words[-1].end_seconds()

    # Load and slice audio (the slice bounds below are in milliseconds)
    audio = AudioSegment.from_file(audio_file)
    segment = audio[t1 * 1000:t2 * 1000].set_channels(1)

    # Display text; the printed range is inclusive, matching play_word_by_word
    text = " ".join(w.word for w in words)
    print(f"\nPlaying words {start_idx} to {start_idx + num_words - 1}:")
    print(f"Time: {t1:.2f}s - {t2:.2f}s")
    print(f"\nText: {text[:200]}{'...' if len(text) > 200 else ''}")

    return ipd.Audio(segment.get_array_of_samples(), rate=segment.frame_rate)


def play_word_by_word(result, audio_file, start_idx=0, num_words=10):
    """
    Play words one by one with their text.

    Each clip runs from the word's start to the next word's start; the last
    clip ends 0.3 s after the final word's end.

    Args:
        result: AlignmentResult from align_long_audio()
        audio_file: Path to the audio file
        start_idx: Starting word index (expected to be non-negative)
        num_words: Maximum number of words to play

    Returns:
        None. Audio widgets are rendered with display().
    """
    words = result.words[start_idx:start_idx + num_words]
    if not words:
        # Nothing in range: skip decoding the (possibly hour-long) audio file.
        print(f"\nNo aligned words to play from index {start_idx}.")
        return

    audio = AudioSegment.from_file(audio_file).set_channels(1)

    # Report the range actually played, which may be shorter than requested.
    print(f"\nPlaying words {start_idx} to {start_idx + len(words) - 1}:")
    print("=" * 50)

    for i, word in enumerate(words):
        # Get end time (use next word's start or add buffer)
        if i + 1 < len(words):
            end_time = words[i + 1].start_seconds()
        else:
            end_time = word.end_seconds() + 0.3

        start_time = word.start_seconds()

        # Display word info; fall back to .word when display_text is absent
        display_text = word.display_text if hasattr(word, 'display_text') else word.word
        print(f"\n[{start_idx + i}] '{display_text}' ({start_time:.2f}s - {end_time:.2f}s):")

        # Play audio segment (the slice bounds below are in milliseconds)
        segment = audio[start_time * 1000:end_time * 1000]
        display(ipd.Audio(segment.get_array_of_samples(), rate=segment.frame_rate))


def show_alignment_summary(result, name="Alignment"):
    """
    Print an overview of an alignment result.

    Always prints a banner and the aligned-word count. When at least one word
    is present, also prints the covered time span and the timings of the
    first and last five words.

    Args:
        result: Object supporting len() and a ``words`` sequence whose items
            provide ``word``, ``start_seconds()`` and ``end_seconds()``
        name: Label used in the banner line
    """
    rule = "=" * 60
    total = len(result)

    print(f"\n{rule}")
    print(f"{name} Results")
    print(f"{rule}")
    print(f"Total aligned words: {total}")

    if total <= 0:
        return

    def _list_words(label, items):
        # One line per word: quoted text followed by its start/end in seconds.
        print(f"\n{label}:")
        for item in items:
            print(f"  '{item.word}': {item.start_seconds():.2f}s - {item.end_seconds():.2f}s")

    aligned = result.words
    begin = aligned[0].start_seconds()
    finish = aligned[-1].end_seconds()
    print(f"Time span: {begin:.2f}s - {finish:.2f}s ({finish - begin:.1f}s)")

    _list_words("First 5 words", aligned[:5])
    _list_words("Last 5 words", aligned[-5:])

---
## Language 1: English

**Source**: Meta Q1 2025 Earnings Call
- Audio: ~1 hour recording from SeekingAlpha
- Text: ~9,200 words from Meta's investor relations PDF

This demonstrates alignment of a real-world earnings call against a noisy transcript (PDF artifacts, headers, etc.).

In [None]:
# Download English data
# Fetches the call audio (MP3) and the transcript (PDF) into data_path.
# NOTE(review): with -nc an existing target file is presumably left untouched,
# so a truncated file from an interrupted run may need deleting by hand — confirm.
!wget -q -nc https://static.seekingalpha.com/cdn/s3/transcripts_audio/4780182.mp3 -O {data_path}/meta_earnings.mp3
!wget -q -nc https://s21.q4cdn.com/399680738/files/doc_financials/2025/q1/Transcripts/META-Q1-2025-Earnings-Call-Transcript-1.pdf -O {data_path}/meta_earnings.pdf

# Printed unconditionally; the wget exit status is not checked.
print("English data downloaded!")

In [None]:
# Align English - Just 3 lines!
# The transcript is passed as a PDF path; "eng" is the language code for English.
result_en = align_long_audio(
    audio=f"{data_path}/meta_earnings.mp3",
    text=f"{data_path}/meta_earnings.pdf",
    language="eng",
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_en, "English (Meta Earnings Call)")

In [None]:
# Listen to a random segment
# play_random_segment picks the 30-word window at random and returns an Audio
# widget, which display() renders.
display(play_random_segment(result_en, f"{data_path}/meta_earnings.mp3", num_words=30))

In [None]:
# Word-by-word listening (pick a random starting point)
# The start index leaves a 10-word margin before the end, which is more than
# the 8 words played; max(0, ...) keeps the bound valid for short results.
start = random.randint(0, max(0, len(result_en) - 10))
play_word_by_word(result_en, f"{data_path}/meta_earnings.mp3", start_idx=start, num_words=8)

---
## Language 2: Portuguese

**Source**: Orpheu Poetry Book (LibriVox)
- Audio: 17 minutes chapter "Ode Triunfal"
- Text: 18K words from Project Gutenberg

Demonstrates alignment with romanization (uroman for Portuguese diacritics).

In [None]:
# Download Portuguese data
!wget -q -nc https://ia801705.us.archive.org/7/items/orpheu_no1_2010_librivox/orpheuno1_46__128kb.mp3 -O {data_path}/portuguese_orpheu.mp3

# Download and parse text
import requests
from bs4 import BeautifulSoup

url = "https://www.gutenberg.org/cache/epub/23620/pg23620-images.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
text_pt = soup.get_text().replace("\r\n", "\n")

# Save text
with open(f"{data_path}/portuguese_orpheu.txt", "w", encoding="utf-8") as f:
    f.write(text_pt)

print(f"Portuguese data downloaded! Text: {len(text_pt.split())} words")

In [None]:
# Align Portuguese
# The text file holds the text extracted from the whole Gutenberg page, while
# the audio is a single chapter.
result_pt = align_long_audio(
    audio=f"{data_path}/portuguese_orpheu.mp3",
    text=f"{data_path}/portuguese_orpheu.txt",
    language="por",  # ISO 639-3 code
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_pt, "Portuguese (Orpheu Poetry)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_pt, f"{data_path}/portuguese_orpheu.mp3", num_words=30))

---
## Language 3: Chinese (Mandarin)

**Source**: Analects of Confucius (LibriVox)
- Audio: 11.5 minutes chapter
- Text: 15K Chinese characters

Demonstrates character-level alignment for CJK languages with romanization.

In [None]:
# Download Chinese data
# Fetches the chapter audio (MP3) and the text (PDF) into data_path.
!wget -q -nc https://ia801307.us.archive.org/15/items/lun_yu_0801_librivox/lunyu_14_confucius.mp3 -O {data_path}/chinese_confucius.mp3
!wget -q -nc https://www.with.org/analects_ch.pdf -O {data_path}/chinese_confucius.pdf

# Printed unconditionally; the wget exit status is not checked.
print("Chinese data downloaded!")

In [None]:
# Align Chinese
# The text is passed as a PDF path.
result_zh = align_long_audio(
    audio=f"{data_path}/chinese_confucius.mp3",
    text=f"{data_path}/chinese_confucius.pdf",
    language="cmn",  # ISO 639-3 code for Mandarin
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_zh, "Chinese (Analects of Confucius)")

In [None]:
# Listen to a random segment
# num_words counts entries of result_zh.words (presumably characters for
# Chinese — confirm against the aligner's tokenization).
display(play_random_segment(result_zh, f"{data_path}/chinese_confucius.mp3", num_words=30))

---
## Language 4: Japanese

**Source**: Kaze Tachinu (The Wind Rises) novel (LibriVox)
- Audio: 57.5 minutes chapter
- Text: 57K Japanese characters from Aozora Bunko

Demonstrates alignment with Japanese morphological analysis (cutlet for romaji).

In [None]:
# Download Japanese data
!wget -q -nc https://ia803207.us.archive.org/30/items/kazetachinu_ek_librivox/kazetachinu_03_hori.mp3 -O {data_path}/japanese_kaze.mp3

# Download and parse Japanese text (with proper encoding)
import urllib.request
import html

url = "https://www.aozora.gr.jp/cards/001030/files/4803_14204.html"
response = urllib.request.urlopen(url)
html_bytes = response.read()

try:
    text_ja = html_bytes.decode('utf-8')
except:
    try:
        text_ja = html_bytes.decode('shiftjis')
    except:
        text_ja = html_bytes.decode('shift_jisx0213')

text_ja = html.unescape(text_ja)
soup = BeautifulSoup(text_ja, "html.parser")
text_ja = soup.get_text().replace("\r\n", "\n")

with open(f"{data_path}/japanese_kaze.txt", "w", encoding="utf-8") as f:
    f.write(text_ja)

print(f"Japanese data downloaded! Text: {len(text_ja)} characters")

In [None]:
# Align Japanese
# The text file is the UTF-8 text extracted from the Aozora Bunko page by the
# download cell.
result_ja = align_long_audio(
    audio=f"{data_path}/japanese_kaze.mp3",
    text=f"{data_path}/japanese_kaze.txt",
    language="jpn",  # ISO 639-3 code
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_ja, "Japanese (Kaze Tachinu)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_ja, f"{data_path}/japanese_kaze.mp3", num_words=30))

---
## Language 5: Hindi

**Source**: Universal Declaration of Human Rights (LibriVox)
- Audio: 17.5 minutes
- Text: 2K Hindi words (from PDF with OCR)

Demonstrates alignment with OCR-extracted text and Devanagari script romanization.

In [None]:
# Download Hindi data
# The audio comes from archive.org; the PDF is fetched from a Wayback Machine
# snapshot of the OHCHR translation.
!wget -q -nc https://www.archive.org/download/human_rights_02_0908_librivox/human_rights_un_hin_brc.mp3 -O {data_path}/hindi_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250623004015/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/hnd.pdf -O {data_path}/hindi_udhr.pdf

# Printed unconditionally; the wget exit status is not checked.
print("Hindi data downloaded!")

In [None]:
# Note: Hindi PDF requires OCR. For this demo, we'll use uroman on the extracted text.
# In production, you might use easyocr for scanned PDFs.
# NOTE(review): this cell performs no OCR itself; the PDF path goes straight to
# align_long_audio, so text extraction depends on the aligner — confirm.

result_hi = align_long_audio(
    audio=f"{data_path}/hindi_udhr.mp3",
    text=f"{data_path}/hindi_udhr.pdf",
    language="hin",  # ISO 639-3 code
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_hi, "Hindi (UDHR)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_hi, f"{data_path}/hindi_udhr.mp3", num_words=30))

---
## Language 6: Korean

**Source**: Universal Declaration of Human Rights (LibriVox)
- Audio: 12 minutes
- Text: 1.3K Korean words from PDF

Demonstrates alignment with Hangul script romanization.

In [None]:
# Download Korean data
# The audio comes from archive.org; the PDF is fetched from a Wayback Machine
# snapshot of the OHCHR translation.
!wget -q -nc https://ia800906.us.archive.org/24/items/universal_declaration_librivox/human_rights_un_kkn_lsj.mp3 -O {data_path}/korean_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250114234231/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/kkn.pdf -O {data_path}/korean_udhr.pdf

# Printed unconditionally; the wget exit status is not checked.
print("Korean data downloaded!")

In [None]:
# Align Korean
# The text is passed as a PDF path.
result_ko = align_long_audio(
    audio=f"{data_path}/korean_udhr.mp3",
    text=f"{data_path}/korean_udhr.pdf",
    language="kor",  # ISO 639-3 code
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_ko, "Korean (UDHR)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_ko, f"{data_path}/korean_udhr.mp3", num_words=30))

---
## Language 7: Filipino (Tagalog)

**Source**: Universal Declaration of Human Rights (LibriVox)
- Audio: 17.5 minutes
- Text: 2K words from PDF

Demonstrates alignment for Filipino/Tagalog, which uses the Latin script.

In [None]:
# Download Filipino data
# The audio comes from archive.org; the PDF is fetched from a Wayback Machine
# snapshot of the OHCHR translation.
!wget -q -nc https://ia800906.us.archive.org/24/items/universal_declaration_librivox/human_rights_un_fil_alnl.mp3 -O {data_path}/filipino_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250110125503/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/tgl.pdf -O {data_path}/filipino_udhr.pdf

# Printed unconditionally; the wget exit status is not checked.
print("Filipino data downloaded!")

In [None]:
# Align Filipino
# The text is passed as a PDF path.
result_fil = align_long_audio(
    audio=f"{data_path}/filipino_udhr.mp3",
    text=f"{data_path}/filipino_udhr.pdf",
    language="tgl",  # ISO 639-3 code for Tagalog
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_fil, "Filipino/Tagalog (UDHR)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_fil, f"{data_path}/filipino_udhr.mp3", num_words=30))

---
## Language 8: Zhuang (Low-Resource)

**Source**: Bible - Book of Luke (Southern Zhuang translation)
- Audio: 15.5 minutes chapter
- Text: 21K words from PDF

Demonstrates alignment for a **low-resource language**. Zhuang is spoken in southern China. Northern Zhuang is in the MMS training data, but Southern Zhuang is **not**, so this example tests cross-dialect transfer.

In [None]:
# Download Zhuang data
# The audio URL is quoted in the shell command; both files are written into data_path.
!wget -q -nc "https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/media_stream/encodings/audio_download_mp3_orig_qual/499-.mp3" -O {data_path}/zhuang_luke.mp3
!wget -q -nc https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/uploads/Luhzaz.pdf -O {data_path}/zhuang_luke.pdf

# Printed unconditionally; the wget exit status is not checked.
print("Zhuang data downloaded!")

In [None]:
# Align Zhuang
# Note: Zhuang uses Latin script with tone marks, so minimal romanization needed
# The text is passed as a PDF path.
result_za = align_long_audio(
    audio=f"{data_path}/zhuang_luke.mp3",
    text=f"{data_path}/zhuang_luke.pdf",
    language="zha",  # ISO 639-3 code for Zhuang (generic)
    verbose=True,
)

# Print the word count, covered time span, and first/last five aligned words.
show_alignment_summary(result_za, "Zhuang (Bible - Luke)")

In [None]:
# Listen to a random segment
# The 30-word window is chosen at random inside play_random_segment.
display(play_random_segment(result_za, f"{data_path}/zhuang_luke.mp3", num_words=30))

---
## Summary

We've demonstrated alignment across 8 languages:

| Language | Script | Romanization | Audio Length | Text Size |
|----------|--------|--------------|--------------|----------|
| English | Latin | None needed | ~60 min | ~9K words |
| Portuguese | Latin | uroman | ~17 min | ~18K words |
| Chinese | Han | uroman | ~11 min | ~15K chars |
| Japanese | Mixed | cutlet | ~57 min | ~57K chars |
| Hindi | Devanagari | uroman | ~17 min | ~2K words |
| Korean | Hangul | uroman | ~12 min | ~1.3K words |
| Filipino | Latin | uroman | ~17 min | ~2K words |
| Zhuang | Latin | None needed | ~15 min | ~21K words |

### Key Takeaways

1. **Simple API**: `align_long_audio(audio, text, language)` handles everything
2. **Automatic preprocessing**: PDF parsing, text normalization, romanization
3. **Robust to noise**: Works with real-world noisy transcripts (PDF artifacts, headers, etc.)
4. **Low-resource languages**: The MMS model transfers to language varieties absent from its training data (e.g., Southern Zhuang)
5. **Long-form support**: Handles hours of audio efficiently with segment-and-stitch approach

In [None]:
# Final summary table
print("\n" + "=" * 70)
print("ALIGNMENT SUMMARY")
print("=" * 70)
print(f"{'Language':<15} {'Words Aligned':<15} {'Coverage':<12} {'Duration':<12}")
print("-" * 70)

# (label, name of the notebook variable holding the result, audio file name)
_result_specs = [
    ("English", "result_en", "meta_earnings.mp3"),
    ("Portuguese", "result_pt", "portuguese_orpheu.mp3"),
    ("Chinese", "result_zh", "chinese_confucius.mp3"),
    ("Japanese", "result_ja", "japanese_kaze.mp3"),
    ("Hindi", "result_hi", "hindi_udhr.mp3"),
    ("Korean", "result_ko", "korean_udhr.mp3"),
    ("Filipino", "result_fil", "filipino_udhr.mp3"),
    ("Zhuang", "result_za", "zhuang_luke.mp3"),
]

# Look each result up by name so a language whose alignment cell was skipped
# or failed becomes a table row (result is None) instead of a NameError here.
results = [
    (name, globals().get(var_name), f"{data_path}/{file_name}")
    for name, var_name, file_name in _result_specs
]

for name, result, _audio_path in results:
    if result is None:
        print(f"{name:<15} {'NOT RUN':<15}")
        continue
    try:
        words = len(result)
        if words > 0:
            # Span from the first aligned word's start to the last one's end.
            duration = result.words[-1].end_seconds() - result.words[0].start_seconds()
            duration_str = f"{duration/60:.1f} min"
        else:
            duration_str = "N/A"
    except Exception as e:
        # Keep the table going, but show what went wrong.
        print(f"{name:<15} {'ERROR':<15} {e}")
    else:
        # The Coverage column is not computed yet and is always shown as N/A.
        print(f"{name:<15} {words:<15} {'N/A':<12} {duration_str:<12}")

print("=" * 70)

---
## Export Results

Save each alignment result as Audacity labels, JSON, and SRT subtitles.

In [None]:
# Export all results
import os

export_dir = f"{data_path}/exports"
os.makedirs(export_dir, exist_ok=True)

# Languages whose export raised, so the final message can be accurate.
failed = []

for name, result, _ in results:
    # File-name prefix derived from the language label, e.g. "english".
    prefix = name.lower().replace(" ", "_")
    try:
        # Audacity labels
        result.save_audacity_labels(f"{export_dir}/{prefix}_labels.txt")

        # JSON
        result.save_json(f"{export_dir}/{prefix}_alignment.json")

        # SRT subtitles
        result.save_srt(f"{export_dir}/{prefix}_subtitles.srt")
    except Exception as e:
        # Best effort: one failing language must not stop the others.
        failed.append(name)
        print(f"{name}: export failed - {e}")
    else:
        print(f"{name}: exported to {export_dir}/{prefix}_*")

if failed:
    print(f"\nExports saved to: {export_dir} (failed: {', '.join(failed)})")
else:
    print(f"\nAll exports saved to: {export_dir}")