# Download All Resources for Testing

This notebook downloads all audio and text resources used in Tutorial.py for testing.
Resources are downloaded as-is (HTML, PDF, MP3) without processing.

Run this notebook once to cache all resources locally.

In [None]:
import os
import urllib.request
from pathlib import Path

# Create resources directory (relative to the current working directory)
RESOURCES_DIR = Path("resources")
RESOURCES_DIR.mkdir(exist_ok=True)


def download_file(url, filename, force=False):
    """Download ``url`` to ``RESOURCES_DIR / filename`` unless it is cached.

    The payload is first written to a ``<filename>.part`` sibling and only
    renamed onto the final path once the transfer has completed, so an
    interrupted download can never be mistaken for a cached resource on the
    next run.

    Args:
        url: Location to fetch; anything ``urllib.request.urlretrieve``
            accepts.
        filename: Name of the file to create inside ``RESOURCES_DIR``.
        force: When true, download again even if the file already exists.

    Returns:
        The path of the local file, or ``None`` if the download failed
        (the error is printed, not raised).
    """
    filepath = RESOURCES_DIR / filename
    # A zero-byte file is the residue of an aborted transfer, not a usable
    # cached resource, so it is fetched again.
    if not force and filepath.exists() and filepath.stat().st_size > 0:
        print(f"✅ Already exists: {filename}")
        return filepath

    print(f"⬇️  Downloading: {filename}")
    partial = filepath.with_name(filepath.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        # Same-directory rename: the final name appears only when complete.
        partial.replace(filepath)
        size_mb = filepath.stat().st_size / (1024 * 1024)
        print(f"   ✅ Downloaded: {size_mb:.2f} MB")
        return filepath
    except Exception as e:
        # Best-effort by design: report the failure and let the notebook
        # carry on with the remaining resources.
        if partial.exists():
            partial.unlink()
        print(f"   ❌ Failed: {e}")
        return None


print(f"Resources will be saved to: {RESOURCES_DIR.absolute()}")

## 1. English Resources

In [None]:
# Section banner: rule, title, rule.
print("=" * 60, "English: Meta Q1 2025 Earnings Call", "=" * 60, sep="\n")

# Transcript text (PDF)
download_file(
    url="https://s21.q4cdn.com/399680738/files/doc_financials/2025/q1/Transcripts/META-Q1-2025-Earnings-Call-Transcript-1.pdf",
    filename="en_meta_q1_2025_transcript.pdf",
)

# Audio (MP3) is not fetched here: it may require authentication or may not
# be directly downloadable, so only a pointer to the SeekingAlpha page is shown.
print(
    "\n⚠️  Audio for Meta earnings call requires manual download from SeekingAlpha",
    "   URL: https://seekingalpha.com/article/4780182-meta-platforms-inc-meta-q1-2025-earnings-call-transcript",
    sep="\n",
)

In [None]:
# Section banner: rule, title, rule.
print("=" * 60, "English: Walden by Henry David Thoreau", "=" * 60, sep="\n")

# Book text (HTML)
download_file(
    url="https://www.gutenberg.org/cache/epub/205/pg205-images.html",
    filename="en_walden.html",
)

# Chapter 1 audio (MP3) from LibriVox
download_file(
    url="https://www.archive.org/download/walden_librivox/walden_01_thoreau.mp3",
    filename="en_walden_ch01.mp3",
)

## 2. Portuguese Resources

In [None]:
print("=" * 60)
print("Portuguese: Orpheu no.1")
print("=" * 60)

# Text (HTML)
download_file(
    "https://www.gutenberg.org/cache/epub/23620/pg23620-images.html",
    "pt_orpheu_no1.html"
)

# Audio (MP3) - Chapter 46 "Ode Triunfal"
# Uses the /download/<item>/<file> form (as the Walden and Hindi audio do)
# instead of a URL pinned to one numbered archive.org storage host.
download_file(
    "https://www.archive.org/download/orpheu_no1_2010_librivox/orpheuno1_46__128kb.mp3",
    "pt_orpheu_ch46.mp3"
)

## 3. Chinese Resources

In [None]:
print("=" * 60)
print("Chinese: 論語 Analects of Confucius")
print("=" * 60)

# Text (PDF) - Traditional Chinese
download_file(
    "https://www.with.org/analects_ch.pdf",
    "zh_analects.pdf"
)

# Audio (MP3) - Chapter 14 from LibriVox
# Uses the /download/<item>/<file> form (as the Walden and Hindi audio do)
# instead of a URL pinned to one numbered archive.org storage host.
download_file(
    "https://www.archive.org/download/lun_yu_0801_librivox/lunyu_14_confucius.mp3",
    "zh_analects_ch14.mp3"
)

## 4. Japanese Resources

In [None]:
print("=" * 60)
print("Japanese: 風立ちぬ Kaze Tachinu by Hori Tatsuo")
print("=" * 60)

# Text (HTML) - Aozora Bunko (special encoding)
# NOTE(review): `html` is not used in this cell; the import is kept in case
# later code relies on it being in the notebook namespace.
import urllib.request
import html

url = "https://www.aozora.gr.jp/cards/001030/files/4803_14204.html"
filepath = RESOURCES_DIR / "ja_kaze_tachinu.html"

if filepath.exists():
    print("✅ Already exists: ja_kaze_tachinu.html")
else:
    print("⬇️  Downloading: ja_kaze_tachinu.html")
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            html_bytes = response.read()
        # Save raw bytes to preserve encoding
        filepath.write_bytes(html_bytes)
        size_kb = filepath.stat().st_size / 1024
        print(f"   ✅ Downloaded: {size_kb:.2f} KB")
    except Exception as e:
        # Best-effort: report and continue with the audio download below.
        print(f"   ❌ Failed: {e}")

# Audio (MP3) - Chapter 3 from LibriVox
# Uses the /download/<item>/<file> form (as the Walden and Hindi audio do)
# instead of a URL pinned to one numbered archive.org storage host.
download_file(
    "https://www.archive.org/download/kazetachinu_ek_librivox/kazetachinu_03_hori.mp3",
    "ja_kaze_tachinu_ch03.mp3"
)

## 5. Hindi Resources

In [None]:
# Section banner: rule, title, rule.
print(
    "=" * 60,
    "Hindi: Universal Declaration of Human Rights (UDHR)",
    "=" * 60,
    sep="\n",
)

# UDHR Hindi translation (PDF); the pages contain images, so OCR is needed
# to get text out of it.
download_file(
    url="https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/hnd.pdf",
    filename="hi_udhr.pdf",
)

# UDHR Hindi reading (MP3) from LibriVox
download_file(
    url="https://www.archive.org/download/human_rights_02_0908_librivox/human_rights_un_hin_brc.mp3",
    filename="hi_udhr.mp3",
)

## 6. Korean Resources

In [None]:
print("=" * 60)
print("Korean: Universal Declaration of Human Rights (UDHR)")
print("=" * 60)

# Text (PDF) - UDHR Korean translation
download_file(
    "https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/kkn.pdf",
    "ko_udhr.pdf"
)

# Audio (MP3) - UDHR Korean from LibriVox
# Uses the /download/<item>/<file> form (as the Walden and Hindi audio do)
# instead of a URL pinned to one numbered archive.org storage host.
download_file(
    "https://www.archive.org/download/universal_declaration_librivox/human_rights_un_kkn_lsj.mp3",
    "ko_udhr.mp3"
)

## 7. Filipino (Tagalog) Resources

In [None]:
print("=" * 60)
print("Filipino (Tagalog): Universal Declaration of Human Rights (UDHR)")
print("=" * 60)

# Text (PDF) - UDHR Filipino translation
download_file(
    "https://www.ohchr.org/sites/default/files/UDHR/Documents/UDHR_Translations/tgl.pdf",
    "tl_udhr.pdf"
)

# Audio (MP3) - UDHR Filipino from LibriVox
# Uses the /download/<item>/<file> form (as the Walden and Hindi audio do)
# instead of a URL pinned to one numbered archive.org storage host.
download_file(
    "https://www.archive.org/download/universal_declaration_librivox/human_rights_un_fil_alnl.mp3",
    "tl_udhr.mp3"
)

## 8. Zhuang Resources

In [None]:
# Section banner: rule, title, rule.
print(
    "=" * 60,
    "Zhuang: Luke in Bible (Southern Zhuang - Low-resource language)",
    "=" * 60,
    sep="\n",
)

# Luke in Zhuang (PDF)
download_file(
    url="https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/uploads/Luhzaz.pdf",
    filename="za_luke.pdf",
)

# Luke chapter 1 in Zhuang (MP3)
download_file(
    url="https://www.zhuangfuyin.org/sites/www.zhuangfuyin.org/files/media_stream/encodings/audio_download_mp3_orig_qual/499-.mp3",
    filename="za_luke_ch01.mp3",
)

## Summary

In [None]:
print("=" * 60)
print("DOWNLOAD SUMMARY")
print("=" * 60)

# List all downloaded files (everything directly inside RESOURCES_DIR)
files = sorted(RESOURCES_DIR.glob("*"))
total_size = 0

print(f"\n{'Filename':<40} {'Size':>10}")
print("-" * 52)

for f in files:
    # Sub-directories are skipped; only regular files are listed and summed.
    if f.is_file():
        size = f.stat().st_size
        total_size += size
        # Show MB above 1 MiB, KB otherwise.
        if size > 1024 * 1024:
            size_str = f"{size / (1024 * 1024):.2f} MB"
        else:
            size_str = f"{size / 1024:.2f} KB"
        print(f"{f.name:<40} {size_str:>10}")

print("-" * 52)
# Right-align the total in the same 10-wide column as the per-file sizes.
total_str = f"{total_size / (1024 * 1024):.2f} MB"
print(f"{'Total':<40} {total_str:>10}")
print(f"\n✅ All resources saved to: {RESOURCES_DIR.absolute()}")

In [None]:
# Lookup table: display label -> the files downloaded for that language plus
# two per-language settings. Each row below is
# (label, text file, audio file, lang_code, cjk_split).
RESOURCE_MAP = {
    label: {
        "text": text_file,
        "audio": audio_file,
        "lang_code": lang_code,
        "cjk_split": cjk_split,
    }
    for label, text_file, audio_file, lang_code, cjk_split in (
        # Audio is None: it requires a manual download.
        ("English (Meta)", "en_meta_q1_2025_transcript.pdf", None, None, False),
        ("English (Walden)", "en_walden.html", "en_walden_ch01.mp3", None, False),
        ("Portuguese", "pt_orpheu_no1.html", "pt_orpheu_ch46.mp3", "por", False),
        ("Chinese", "zh_analects.pdf", "zh_analects_ch14.mp3", "cmn", True),
        ("Japanese", "ja_kaze_tachinu.html", "ja_kaze_tachinu_ch03.mp3", "jpn", True),
        ("Hindi", "hi_udhr.pdf", "hi_udhr.mp3", "hin", False),
        ("Korean", "ko_udhr.pdf", "ko_udhr.mp3", "kor", False),
        ("Filipino", "tl_udhr.pdf", "tl_udhr.mp3", "tgl", False),
        # lang_code is None: Latin script already.
        ("Zhuang", "za_luke.pdf", "za_luke_ch01.mp3", None, False),
    )
}

print(
    "Resource mapping saved. Use RESOURCE_MAP to access file paths.",
    "\nExample:",
    sep="\n",
)
print("  RESOURCE_MAP['Japanese']['text'] = " + RESOURCE_MAP["Japanese"]["text"])
print("  RESOURCE_MAP['Japanese']['audio'] = " + RESOURCE_MAP["Japanese"]["audio"])