# Multilingual Long-Form Alignment Demo

This notebook demonstrates **TorchAudio Long-Form Aligner** across 8 languages using real-world audio and text data.

## Languages Covered
1. **English** - Meta Q1 2025 Earnings Call (~1 hour, 9K words)
2. **Portuguese** - Orpheu Poetry (17 min, 18K words book)
3. **Chinese** - Analects of Confucius (11.5 min, 15K chars)
4. **Japanese** - Kaze Tachinu novel (57.5 min, 57K chars)
5. **Hindi** - Universal Declaration of Human Rights (17.5 min)
6. **Korean** - Universal Declaration of Human Rights (12 min)
7. **Filipino (Tagalog)** - Universal Declaration of Human Rights (17.5 min)
8. **Zhuang** - Bible Luke chapter (15.5 min, low-resource language)

## Key Features Demonstrated
- **Simple 3-line API**: `align_long_audio(audio, text)`
- **Automatic text normalization**: PDF parsing, romanization, number expansion
- **Long-form handling**: Segments audio, aligns with fuzzy matching, stitches with LIS
- **Interactive verification**: Listen to aligned segments word-by-word

## Setup

> **WARNING: DO NOT MODIFY THE SETUP CELLS BELOW (cell-2, cell-3, cell-4) UNLESS ABSOLUTELY NECESSARY.**
>
> These setup cells are carefully tested to work with Colab's environment and match the pattern used in `test_longform_alignment.ipynb` and `test_alignment.ipynb`. Changes can cause subtle issues like kernel crashes, CUDA conflicts, or module caching problems.

In [None]:
# =============================================================================
# Install Dependencies (auto-detect k2 version)
# =============================================================================
# Uses the install_utils module for k2 and dependency installation.
# The module auto-detects PyTorch/CUDA versions and installs matching k2.
#
# WARNING: DO NOT MODIFY THIS CELL UNLESS ABSOLUTELY NECESSARY.
# =============================================================================

import sys
import os

# Colab only: fetch (or refresh) the repository so that `misc.install_utils`
# can be imported from its `src` directory.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    GITHUB_REPO = "https://github.com/huangruizhe/torchaudio_aligner.git"
    BRANCH = "dev"
    repo_path = '/content/torchaudio_aligner'
    src_path = f'{repo_path}/src'
    
    # Fresh runtime: clone the branch. Existing checkout: fetch and hard-reset
    # to origin/BRANCH, which discards any local edits in the checkout.
    # The exit status of os.system is not inspected, so a failed git command
    # only becomes visible when the import below fails.
    if not os.path.exists(repo_path):
        os.system(f'git clone -b {BRANCH} {GITHUB_REPO} {repo_path}')
    else:
        os.system(f'cd {repo_path} && git fetch origin && git checkout {BRANCH} && git reset --hard origin/{BRANCH}')
    
    # Put the repo sources first on sys.path so they win over installed copies.
    if src_path not in sys.path:
        sys.path.insert(0, src_path)

# NOTE(review): outside Colab nothing above touches sys.path, so this import
# only succeeds if `misc` is already importable — confirm how local runs are
# expected to reach it.
from misc.install_utils import install_k2_if_needed, install_other_deps

# Both helpers come from the project's install_utils module; the header of
# this cell describes them as auto-detecting the PyTorch/CUDA versions.
install_k2_if_needed()
install_other_deps()
print("\nDependency installation complete.")

In [None]:
# =============================================================================
# Setup: Clone Repository and Configure Imports
# =============================================================================

import sys
import os
import importlib
from pathlib import Path

GITHUB_REPO = "https://github.com/huangruizhe/torchaudio_aligner.git"
BRANCH = "dev"

def setup_imports():
    """Locate (or fetch) the aligner sources and make them importable.

    In Colab (detected via ``'google.colab' in sys.modules``) the repository
    is cloned to ``/content/torchaudio_aligner`` or, when that directory
    already exists, fetched and hard-reset to ``origin/BRANCH``;
    ``/content/data`` is created for downloads.

    Otherwise a ``src`` directory that contains an ``alignment`` subdirectory
    is searched for, first at ``<cwd>/../src`` and then at ``<cwd>/src``.

    Returns:
        tuple: ``(src_path, data_path)``, both strings. ``src_path`` is
        inserted at the front of ``sys.path`` if it is not already listed.

    Raises:
        FileNotFoundError: outside Colab, when neither candidate ``src``
            directory qualifies.
    """
    IN_COLAB = 'google.colab' in sys.modules
    if IN_COLAB:
        repo_path = '/content/torchaudio_aligner'
        src_path = f'{repo_path}/src'
        data_path = '/content/data'
        # The exit status of os.system is not checked in either branch.
        if not os.path.exists(repo_path):
            os.system(f'git clone -b {BRANCH} {GITHUB_REPO} {repo_path}')
        else:
            # Always pull latest code; the hard reset discards local edits
            # made inside the checkout.
            os.system(f'cd {repo_path} && git fetch origin && git checkout {BRANCH} && git reset --hard origin/{BRANCH}')
        os.makedirs(data_path, exist_ok=True)
    else:
        # Candidate locations, in priority order: sibling of the current
        # working directory's parent, then inside the working directory.
        possible_paths = [Path(".").absolute().parent / "src", Path(".").absolute() / "src"]
        src_path = None
        for p in possible_paths:
            # Requiring an `alignment` subdirectory avoids picking up an
            # unrelated folder that merely happens to be named `src`.
            if p.exists() and (p / "alignment").exists():
                src_path = str(p.absolute())
                break
        if src_path is None:
            raise FileNotFoundError("src directory not found")
        # NOTE(review): this resolves to an `examples` directory two levels
        # above `src`, i.e. next to the directory that contains `src` rather
        # than inside it — confirm that location is intended.
        data_path = str(Path(src_path).parent.parent / "examples")
        os.makedirs(data_path, exist_ok=True)
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
    return src_path, data_path

def clear_module_cache(src_path):
    """Remove cached modules loaded from ``src_path`` to force fresh imports.

    A module is considered stale when its ``__file__`` is a string that
    contains ``src_path`` as a substring. Stale entries are dropped from
    ``sys.modules`` so the next ``import`` re-executes the source file.

    Args:
        src_path: Directory whose modules should be evicted. An empty or
            ``None`` value evicts nothing.

    Returns:
        None. The number of evicted modules is printed.
    """
    if not src_path:
        # An empty needle is a substring of every path; without this guard
        # every file-backed module in the interpreter would be evicted.
        print("Cleared 0 cached modules")
        return

    stale_names = []
    # Iterate over a snapshot: reading attributes on lazily-loading modules
    # can trigger imports, and mutating sys.modules while iterating the live
    # dict raises "RuntimeError: dictionary changed size during iteration".
    for name, module in list(sys.modules.items()):
        module_file = getattr(module, '__file__', None)
        # Namespace packages and built-ins have no usable __file__.
        if isinstance(module_file, str) and src_path in module_file:
            stale_names.append(name)

    for name in stale_names:
        # pop() tolerates an entry that disappeared since the snapshot.
        sys.modules.pop(name, None)
    print(f"Cleared {len(stale_names)} cached modules")

# Resolve the source and data directories; this also puts src_path on sys.path.
src_path, data_path = setup_imports()

# Clear old cached modules to ensure fresh code is loaded
# (matters when this cell is re-run after the checkout was updated).
clear_module_cache(src_path)

import torch
import torchaudio
import gc
import logging
# INFO level on the root logger, so library log records at INFO and above show up.
logging.basicConfig(level=logging.INFO)

# Import with fresh modules (not cached). The reload calls re-execute `api`
# and `visualization_utils` once more right after they are imported.
import api
import visualization_utils
importlib.reload(api)
importlib.reload(visualization_utils)

# Names used by the per-language cells. `play_words_sequential` is imported
# here but not called in the cells shown in this notebook section.
from api import align_long_audio
from visualization_utils import play_random, play_words_sequential

def clear_gpu():
    """Release memory between alignments.

    Runs Python garbage collection first; when CUDA is available, also
    empties PyTorch's CUDA cache and then synchronizes the device.
    """
    gc.collect()
    if not torch.cuda.is_available():
        # CPU-only runtime: nothing device-side to release.
        return
    cuda = torch.cuda
    cuda.empty_cache()
    cuda.synchronize()

# Environment banner: library versions, selected device, and the data
# directory, framed by two 60-character rules (one item per output line).
print(
    "=" * 60,
    f"PyTorch: {torch.__version__}",
    f"TorchAudio: {torchaudio.__version__}",
    f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}",
    f"Data path: {data_path}",
    "=" * 60,
    sep="\n",
)

In [None]:
# Extra packages for the text side of the demo. All installs are unpinned,
# so the versions depend on when the cell is run.
# Romanization package (the intro lists romanization as part of text normalization).
!pip install uroman-python
# System packages: poppler utilities plus Tesseract OCR with Hindi, Japanese,
# Simplified Chinese and Korean language data. `apt-get` assumes a
# Debian/Ubuntu-style environment such as Colab.
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-hin tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-kor
# Python wrappers — presumably used for PDF-to-image conversion and OCR by the
# aligner's PDF handling; confirm against the project's text frontend.
!pip install pdf2image pytesseract
# HTML parsing for the Portuguese and Japanese text downloads.
!pip install beautifulsoup4
# NOTE(review): not referenced directly in this notebook — presumably needed
# by torchaudio for audio decoding; confirm.
!pip install torchcodec
from bs4 import BeautifulSoup

---
## Language 1: English

**Source**: Meta Q1 2025 Earnings Call (~1 hour, 9K words)

In [None]:
# Download English data: the earnings-call audio (MP3) and transcript (PDF).
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path.
!wget -q -nc https://static.seekingalpha.com/cdn/s3/transcripts_audio/4780182.mp3 -O {data_path}/meta_earnings.mp3
!wget -q -nc https://s21.q4cdn.com/399680738/files/doc_financials/2025/q1/Transcripts/META-Q1-2025-Earnings-Call-Transcript-1.pdf -O {data_path}/meta_earnings.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("English data downloaded!")

In [None]:
# Align the earnings-call MP3 against the transcript PDF written by the
# download cell. "eng" is the language code handed to the aligner (looks like
# ISO 639-3 — confirm against the API). The result is kept in `result_en` for
# the playback, summary and export cells.
result_en = align_long_audio(
    audio=f"{data_path}/meta_earnings.mp3",
    text=f"{data_path}/meta_earnings.pdf",
    language="eng",
    verbose=True,
)
print(result_en.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils. No seed is set here, so the selection may differ per run.
display(play_random(result_en, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 2: Portuguese

**Source**: Orpheu Poetry Book - "Ode Triunfal" (17 min of audio; the full book text is ~18K words)

In [None]:
!wget -q -nc https://ia801705.us.archive.org/7/items/orpheu_no1_2010_librivox/orpheuno1_46__128kb.mp3 -O {data_path}/portuguese_orpheu.mp3

import requests
from bs4 import BeautifulSoup
url = "https://www.gutenberg.org/cache/epub/23620/pg23620-images.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
text_pt = soup.get_text().replace("\r\n", "\n")
with open(f"{data_path}/portuguese_orpheu.txt", "w", encoding="utf-8") as f:
    f.write(text_pt)
print(f"Portuguese data downloaded! Text: {len(text_pt.split())} words")

In [None]:
# Align the LibriVox MP3 against the plain-text book saved by the download
# cell. "por" is the language code handed to the aligner (looks like
# ISO 639-3 — confirm against the API). Kept in `result_pt` for later cells.
result_pt = align_long_audio(
    audio=f"{data_path}/portuguese_orpheu.mp3",
    text=f"{data_path}/portuguese_orpheu.txt",
    language="por",
    verbose=True,
)
print(result_pt.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils.
display(play_random(result_pt, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 3: Chinese (Mandarin)

**Source**: Analects of Confucius (11.5 min, 15K chars)

In [None]:
# Audio (LibriVox MP3) and text (PDF) for the Analects.
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path.
!wget -q -nc https://ia801307.us.archive.org/15/items/lun_yu_0801_librivox/lunyu_14_confucius.mp3 -O {data_path}/chinese_confucius.mp3
!wget -q -nc https://www.with.org/analects_ch.pdf -O {data_path}/chinese_confucius.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("Chinese data downloaded!")

In [None]:
# Align the Analects MP3 against the PDF written by the download cell.
# "cmn" is the language code handed to the aligner (looks like ISO 639-3 for
# Mandarin — confirm against the API). Kept in `result_zh` for later cells.
result_zh = align_long_audio(
    audio=f"{data_path}/chinese_confucius.mp3",
    text=f"{data_path}/chinese_confucius.pdf",
    language="cmn",
    verbose=True,
)
print(result_zh.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned units; confirm in
# visualization_utils.
display(play_random(result_zh, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 4: Japanese

**Source**: Kaze Tachinu novel (57.5 min, 57K chars)

In [None]:
!wget -q -nc https://ia803207.us.archive.org/30/items/kazetachinu_ek_librivox/kazetachinu_03_hori.mp3 -O {data_path}/japanese_kaze.mp3

import urllib.request
import html
url = "https://www.aozora.gr.jp/cards/001030/files/4803_14204.html"
response = urllib.request.urlopen(url)
html_bytes = response.read()
try:
    text_ja = html_bytes.decode('utf-8')
except:
    text_ja = html_bytes.decode('shiftjis')
text_ja = html.unescape(text_ja)
soup = BeautifulSoup(text_ja, "html.parser")
text_ja = soup.get_text().replace("\r\n", "\n")
with open(f"{data_path}/japanese_kaze.txt", "w", encoding="utf-8") as f:
    f.write(text_ja)
print(f"Japanese data downloaded! Text: {len(text_ja)} characters")

In [None]:
# Align the LibriVox MP3 against the plain-text novel saved by the download
# cell. "jpn" is the language code handed to the aligner (looks like
# ISO 639-3 — confirm against the API). Kept in `result_ja` for later cells.
result_ja = align_long_audio(
    audio=f"{data_path}/japanese_kaze.mp3",
    text=f"{data_path}/japanese_kaze.txt",
    language="jpn",
    verbose=True,
)
print(result_ja.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned units; confirm in
# visualization_utils.
display(play_random(result_ja, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 5: Hindi

**Source**: Universal Declaration of Human Rights (17.5 min)

In [None]:
# Audio (LibriVox MP3) and text (PDF, fetched from a web.archive.org snapshot
# of the OHCHR translation).
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path.
!wget -q -nc https://www.archive.org/download/human_rights_02_0908_librivox/human_rights_un_hin_brc.mp3 -O {data_path}/hindi_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250623004015/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/hnd.pdf -O {data_path}/hindi_udhr.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("Hindi data downloaded!")

In [None]:
# Align the UDHR MP3 against the PDF written by the download cell.
# "hin" is the language code handed to the aligner (looks like ISO 639-3 —
# confirm against the API). Kept in `result_hi` for later cells.
result_hi = align_long_audio(
    audio=f"{data_path}/hindi_udhr.mp3",
    text=f"{data_path}/hindi_udhr.pdf",
    language="hin",
    verbose=True,
)
print(result_hi.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils.
display(play_random(result_hi, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 6: Korean

**Source**: Universal Declaration of Human Rights (12 min)

In [None]:
# Audio (LibriVox MP3) and text (PDF, fetched from a web.archive.org snapshot
# of the OHCHR translation).
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path.
!wget -q -nc https://ia800906.us.archive.org/24/items/universal_declaration_librivox/human_rights_un_kkn_lsj.mp3 -O {data_path}/korean_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250114234231/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/kkn.pdf -O {data_path}/korean_udhr.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("Korean data downloaded!")

In [None]:
# Align the UDHR MP3 against the PDF written by the download cell.
# "kor" is the language code handed to the aligner (looks like ISO 639-3 —
# confirm against the API). Kept in `result_ko` for later cells.
result_ko = align_long_audio(
    audio=f"{data_path}/korean_udhr.mp3",
    text=f"{data_path}/korean_udhr.pdf",
    language="kor",
    verbose=True,
)
print(result_ko.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils.
display(play_random(result_ko, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 7: Filipino (Tagalog)

**Source**: Universal Declaration of Human Rights (17.5 min)

In [None]:
# Audio (LibriVox MP3) and text (PDF, fetched from a web.archive.org snapshot
# of the OHCHR translation).
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path.
!wget -q -nc https://ia800906.us.archive.org/24/items/universal_declaration_librivox/human_rights_un_fil_alnl.mp3 -O {data_path}/filipino_udhr.mp3
!wget -q -nc https://web.archive.org/web/20250110125503/https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/tgl.pdf -O {data_path}/filipino_udhr.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("Filipino data downloaded!")

In [None]:
# Align the UDHR MP3 against the PDF written by the download cell.
# "tgl" (Tagalog) is the language code handed to the aligner (looks like
# ISO 639-3 — confirm against the API). Kept in `result_fil` for later cells.
result_fil = align_long_audio(
    audio=f"{data_path}/filipino_udhr.mp3",
    text=f"{data_path}/filipino_udhr.pdf",
    language="tgl",
    verbose=True,
)
print(result_fil.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils.
display(play_random(result_fil, num_words=30)[0])
# Free cached memory before the next language is aligned.
clear_gpu()

---
## Language 8: Zhuang (Low-Resource)

**Source**: Bible - Book of Luke (15.5 min, 21K words)

In [None]:
# Audio (MP3) and text (PDF) from zhuangfuyin.org.
# wget flags: -q quiet, -nc do not re-download when the target already exists,
# -O write to the given path under data_path. The audio URL is quoted for the
# shell.
!wget -q -nc "https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/media_stream/encodings/audio_download_mp3_orig_qual/499-.mp3" -O {data_path}/zhuang_luke.mp3
!wget -q -nc https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/uploads/Luhzaz.pdf -O {data_path}/zhuang_luke.pdf
# Printed unconditionally; it does not confirm that either download succeeded.
print("Zhuang data downloaded!")

In [None]:
# Align the Luke MP3 against the PDF written by the download cell.
# "zha" is the language code handed to the aligner (looks like ISO 639-3 —
# confirm against the API). Kept in `result_za` for later cells.
result_za = align_long_audio(
    audio=f"{data_path}/zhuang_luke.mp3",
    text=f"{data_path}/zhuang_luke.pdf",
    language="zha",
    verbose=True,
)
print(result_za.summary())

In [None]:
# Show the first element returned by play_random for a 30-word request —
# presumably an audio player over randomly chosen aligned words; confirm in
# visualization_utils.
display(play_random(result_za, num_words=30)[0])
# Free cached memory now that the last alignment has been inspected.
clear_gpu()

---
## Summary

In [None]:
# Summary table: one row per language with the number of aligned words and
# the time span between the first and last aligned word.
print("\n" + "=" * 70)
print("ALIGNMENT SUMMARY")
print("=" * 70)
print(f"{'Language':<15} {'Words Aligned':<15} {'Duration':<12}")
print("-" * 70)

# (display name, notebook variable that holds that language's result).
# Results are looked up by name so that skipping a language section does not
# abort the whole summary with a NameError.
result_vars = [
    ("English", "result_en"),
    ("Portuguese", "result_pt"),
    ("Chinese", "result_zh"),
    ("Japanese", "result_ja"),
    ("Hindi", "result_hi"),
    ("Korean", "result_ko"),
    ("Filipino", "result_fil"),
    ("Zhuang", "result_za"),
]

# `results` is reused by the export cell; it lists only the languages whose
# alignment exists in this kernel session.
results = []
for name, var_name in result_vars:
    result = globals().get(var_name)
    if result is None:
        print(f"{name:<15} {'NOT RUN':<15} {'N/A':<12}")
        continue
    results.append((name, result))
    try:
        words = len(result)
        if words > 0:
            # Span from the start of the first word to the end of the last.
            duration = result.words[-1].end_seconds() - result.words[0].start_seconds()
            duration_str = f"{duration/60:.1f} min"
        else:
            duration_str = "N/A"
        print(f"{name:<15} {words:<15} {duration_str:<12}")
    except Exception as e:
        # Keep the table going, but show what went wrong instead of hiding it.
        print(f"{name:<15} {'ERROR':<15} {type(e).__name__}: {e}")

print("=" * 70)

---
## Export Results

In [None]:
import os

# Write every alignment in `results` (built by the summary cell) in three
# formats: Audacity labels, JSON, and SRT subtitles.
export_dir = f"{data_path}/exports"
os.makedirs(export_dir, exist_ok=True)

failed = []
for name, result in results:
    try:
        # File prefix derived from the display name, e.g. "English" -> "english".
        prefix = name.lower().replace(" ", "_")
        result.save_audacity_labels(f"{export_dir}/{prefix}_labels.txt")
        result.save_json(f"{export_dir}/{prefix}_alignment.json")
        result.save_srt(f"{export_dir}/{prefix}_subtitles.srt")
        print(f"{name}: exported")
    except Exception as e:
        # One failing language must not stop the others; remember it so the
        # closing message stays accurate.
        failed.append(name)
        print(f"{name}: export failed - {e}")

if not results:
    print("\nNo alignment results to export.")
elif failed:
    print(f"\n{len(results) - len(failed)} of {len(results)} exports saved to: {export_dir} (failed: {', '.join(failed)})")
else:
    print(f"\nAll exports saved to: {export_dir}")