# Wiktionary Etymology Scraper

This notebook uses the `wiktionary_scraper` module to scrape and visualize etymological relationships between languages.

It supports two types of etymological relationships:
- **Borrowed terms**: Words that one language borrowed from another
- **Derived terms**: Words that are derived from words in another language

## Setup

In [None]:
# Imports
import json
import time
from pathlib import Path
from urllib.parse import unquote

import numpy as np
import pandas as pd

# Import the wiktionary_scraper module
import wiktionary_scraper as ws

# Optional dependency: `chime` plays a sound when a long scrape finishes.
# CHIME_AVAILABLE records whether the import (and theme selection) succeeded
# so later cells can decide whether to call into it.
try:
    import chime
    chime.theme('pokemon')
except ImportError:
    CHIME_AVAILABLE = False
else:
    CHIME_AVAILABLE = True

if not CHIME_AVAILABLE:
    print("Note: Install 'chime' package for sound notifications: pip install chime")

In [None]:
# Configuration: on-disk locations of the scraped term dictionaries.
# Both JSON files live directly inside DATA_DIR.
DATA_DIR = Path(".")
BORROWED_TERMS_FILE = DATA_DIR.joinpath("borrowed_terms.json")
DERIVED_TERMS_FILE = DATA_DIR.joinpath("derived_terms.json")

## Part 1: Borrowed Terms

### Scrape Borrowed Terms

This cell scrapes all borrowed terms from Wiktionary. **Warning**: This can take 30+ minutes to complete.

If `borrowed_terms.json` already exists, skip this cell and load from the file in the next section.

In [None]:
# Scrape borrowed terms.
# The scrape is slow, so it only runs when no cached borrowed_terms.json is
# present; this keeps "Restart & Run All" from re-scraping and overwriting the
# cache. Delete the file to force a fresh scrape.
if BORROWED_TERMS_FILE.exists():
    print(f"Found existing {BORROWED_TERMS_FILE}; skipping scrape.")
    print("Load it with the next cell, or delete the file to re-scrape.")
else:
    start_time = time.time()

    borrowed_terms = ws.scrape_etymological_terms(
        category_type="borrowed",
        save_path=str(BORROWED_TERMS_FILE),
        verbose=True
    )

    elapsed = time.time() - start_time
    print(f"\nScraping completed in {elapsed/60:.1f} minutes")

    # Audible notification, only when the optional chime package was imported
    if CHIME_AVAILABLE:
        chime.success()

### Load Borrowed Terms (from existing file)

If you already have `borrowed_terms.json`, load it here instead of re-scraping.

In [None]:
# Restore the borrowed-terms dictionary from its JSON cache and summarise it
if not BORROWED_TERMS_FILE.exists():
    print(f"File not found: {BORROWED_TERMS_FILE}")
    print("Run the scraping cell above first.")
else:
    borrowed_terms = ws.load_terms_from_json(str(BORROWED_TERMS_FILE))
    print(f"Loaded {len(borrowed_terms)} categories")
    print(f"Total terms: {sum(map(len, borrowed_terms.values())):,}")

    # Rank categories from most to fewest terms, then keep only the ten largest
    print("\nTop 10 categories by term count:")
    sorted_cats = sorted(
        borrowed_terms.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    del sorted_cats[10:]
    for cat, urls in sorted_cats:
        print(f"  {cat}: {len(urls):,} terms")

### Visualize Borrowed Terms Heatmap

This creates a heatmap showing which languages (debtors) borrowed the most terms from which other languages (creditors).

In [None]:
# Build the borrowed-terms heatmap (top 50 languages) and render it.
# The guard keeps the cell from raising NameError when nothing has been loaded.
if 'borrowed_terms' not in locals():
    print("Load borrowed_terms first (see cells above)")
else:
    heatmap = ws.create_language_heatmap(borrowed_terms, category_type="borrowed", top_n=50)
    display(heatmap)

## Part 2: Derived Terms

### Scrape Derived Terms

This cell scrapes all derived terms from Wiktionary. **Warning**: This can take 30+ minutes to complete.

If `derived_terms.json` already exists, skip this cell and load from the file in the next section.

In [None]:
# Scrape derived terms.
# The scrape is slow, so it only runs when no cached derived_terms.json is
# present; this keeps "Restart & Run All" from re-scraping and overwriting the
# cache. Delete the file to force a fresh scrape.
if DERIVED_TERMS_FILE.exists():
    print(f"Found existing {DERIVED_TERMS_FILE}; skipping scrape.")
    print("Load it with the next cell, or delete the file to re-scrape.")
else:
    start_time = time.time()

    derived_terms = ws.scrape_etymological_terms(
        category_type="derived",
        save_path=str(DERIVED_TERMS_FILE),
        verbose=True
    )

    elapsed = time.time() - start_time
    print(f"\nScraping completed in {elapsed/60:.1f} minutes")

    # Audible notification, only when the optional chime package was imported
    if CHIME_AVAILABLE:
        chime.success()

### Load Derived Terms (from existing file)

If you already have `derived_terms.json`, load it here instead of re-scraping.

In [None]:
# Restore the derived-terms dictionary from its JSON cache and summarise it
if not DERIVED_TERMS_FILE.exists():
    print(f"File not found: {DERIVED_TERMS_FILE}")
    print("Run the scraping cell above first.")
else:
    derived_terms = ws.load_terms_from_json(str(DERIVED_TERMS_FILE))
    print(f"Loaded {len(derived_terms)} categories")
    print(f"Total terms: {sum(map(len, derived_terms.values())):,}")

    # Rank categories from most to fewest terms, then keep only the ten largest
    print("\nTop 10 categories by term count:")
    sorted_cats = sorted(
        derived_terms.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    del sorted_cats[10:]
    for cat, urls in sorted_cats:
        print(f"  {cat}: {len(urls):,} terms")

### Visualize Derived Terms Heatmap

This creates a heatmap showing which languages (recipients) have the most terms derived from which other languages (sources).

In [None]:
# Build the derived-terms heatmap (top 50 languages) and render it.
# The guard keeps the cell from raising NameError when nothing has been loaded.
if 'derived_terms' not in locals():
    print("Load derived_terms first (see cells above)")
else:
    heatmap = ws.create_language_heatmap(derived_terms, category_type="derived", top_n=50)
    display(heatmap)

## Part 3: Comparative Analysis (Optional)

Compare borrowed vs derived terms to understand different patterns of linguistic influence.

In [None]:
# Compare borrowed vs derived terms
def extract_languages(terms_dict, pattern):
    """Collect every language named in the category keys of ``terms_dict``.

    A key that contains ``pattern`` is split at the first occurrence of it;
    the text before and the text after are both added to the result. Names
    are percent-decoded and underscores are turned into spaces, matching how
    the per-recipient counts elsewhere in this notebook label languages.
    Keys that do not contain ``pattern`` are ignored.

    Args:
        terms_dict: Mapping whose keys are category names.
        pattern: Separator substring, e.g. ``"_terms_borrowed_from_"``.

    Returns:
        A set of readable language names (both sides of the separator).
    """
    languages = set()
    for category in terms_dict:
        if pattern not in category:
            continue
        # maxsplit=1 guarantees exactly two pieces even if the separator repeats
        recipient, source = category.split(pattern, 1)
        for name in (recipient, source):
            languages.add(unquote(name).replace("_", " "))
    return languages


if 'borrowed_terms' in locals() and 'derived_terms' in locals():
    # Count total terms
    borrowed_total = sum(len(v) for v in borrowed_terms.values())
    derived_total = sum(len(v) for v in derived_terms.values())

    print("Comparison Summary")
    print("=" * 50)
    print(f"Borrowed terms: {len(borrowed_terms):,} categories, {borrowed_total:,} total terms")
    print(f"Derived terms:  {len(derived_terms):,} categories, {derived_total:,} total terms")
    print()

    # Languages appearing on either side of each relationship
    borrowed_langs = extract_languages(borrowed_terms, "_terms_borrowed_from_")
    derived_langs = extract_languages(derived_terms, "_terms_derived_from_")

    print(f"Languages with borrowed terms: {len(borrowed_langs)}")
    print(f"Languages with derived terms:  {len(derived_langs)}")
    print(f"Languages in both:             {len(borrowed_langs & derived_langs)}")
    print()

    # Find languages only in one category
    only_borrowed = borrowed_langs - derived_langs
    only_derived = derived_langs - borrowed_langs

    if only_borrowed:
        print(f"Languages only in borrowed: {len(only_borrowed)}")
        print(f"  Examples: {', '.join(sorted(only_borrowed)[:10])}")
        print()

    if only_derived:
        print(f"Languages only in derived: {len(only_derived)}")
        print(f"  Examples: {', '.join(sorted(only_derived)[:10])}")
else:
    print("Load both borrowed_terms and derived_terms to run this analysis")

In [None]:
# Find languages with most borrowing vs derivation
def count_by_recipient(terms_dict, pattern):
    """Total the number of terms per recipient language.

    The recipient is the part of a category key before ``pattern``; it is
    percent-decoded and its underscores are turned into spaces. Keys that do
    not contain ``pattern`` are ignored.

    Args:
        terms_dict: Mapping of category name to a sized collection of terms.
        pattern: Separator substring, e.g. ``"_terms_borrowed_from_"``.

    Returns:
        An int64 ``pd.Series`` indexed by language name, sorted from the
        largest count to the smallest. Empty input gives an empty int64 Series.
    """
    counts = {}
    for category, urls in terms_dict.items():
        if pattern not in category:
            continue
        recipient = unquote(category.split(pattern, 1)[0]).replace("_", " ")
        counts[recipient] = counts.get(recipient, 0) + len(urls)
    # Explicit dtype keeps an empty result numeric instead of object-typed
    return pd.Series(counts, dtype="int64").sort_values(ascending=False)


if 'borrowed_terms' in locals() and 'derived_terms' in locals():
    borrowed_by_lang = count_by_recipient(borrowed_terms, "_terms_borrowed_from_")
    derived_by_lang = count_by_recipient(derived_terms, "_terms_derived_from_")

    # Create comparison DataFrame; a language missing from one side counts as 0
    comparison = pd.DataFrame({
        'borrowed': borrowed_by_lang,
        'derived': derived_by_lang
    }).fillna(0).astype(int)

    comparison['total'] = comparison['borrowed'] + comparison['derived']
    # borrowed_pct is NaN for a language whose total is 0
    comparison['borrowed_pct'] = (comparison['borrowed'] / comparison['total'] * 100).round(1)
    comparison = comparison.sort_values('total', ascending=False)

    print("Top 20 languages by total etymological terms (borrowed + derived)")
    display(comparison.head(20))
else:
    print("Load both borrowed_terms and derived_terms to run this analysis")