# Wikipedia Dataset and Embeddings - Example Usage

This notebook demonstrates how to use the Wikipedia dataset utilities and embedding functions.

## 1. Setup and Installation

First, make sure you have installed the required dependencies:

```bash
pip install -r requirements.txt
```

In [None]:
# Import the utilities
import itertools
import re

import numpy as np

from wikipedia_dataset import (
    download_wikipedia_dataset,
    parse_page_content,
    parse_multiple_pages,
    get_page_by_title
)
from embeddings import (
    load_embedding_model,
    embed_text,
    embed_page_content,
    compare_embeddings,
    compare_multiple_embeddings,
    find_most_similar
)

## 2. Download Wikipedia Dataset

⚠️ **Note**: This step downloads a large dataset and may take a significant amount of time on the first run.

In [None]:
# Fetch the English ("en") Wikipedia dataset through the project helper,
# then report how many articles the returned object contains.
dataset = download_wikipedia_dataset("en")
article_count = len(dataset)
print(f"Downloaded dataset with {article_count} articles")

## 3. Parse Wikipedia Pages

In [None]:
# Parse the first dataset record into its title and lead text, then show the
# title and a preview of the lead limited to its first 200 characters.
first_record = dataset[0]
title, lead = parse_page_content(first_record)
lead_preview = lead[:200]
print(f"Title: {title}")
print(f"Lead content (first 200 chars): {lead_preview}...")

In [None]:
# Parse a batch of pages (num_pages=10) and print a numbered list showing
# each title together with the length of its lead text.
pages = parse_multiple_pages(dataset, num_pages=10)
for position, (title, lead) in enumerate(pages, start=1):
    lead_length = len(lead)
    print(f"{position}. {title} - {lead_length} characters")

## 4. Load Embedding Model

In [None]:
# Load the embedding model (this may take a moment on first run).
# `model` is reused by every embedding cell below, so this cell must run
# before any of them.
model = load_embedding_model()
print("Model loaded successfully!")

## 5. Create Embeddings

In [None]:
# Embed one sentence and inspect the result: the input text, the shape of
# the returned embedding, and its first five values.
text = "Python is a high-level programming language."
embedding = embed_text(text, model)
print(
    f"Text: {text}",
    f"Embedding shape: {embedding.shape}",
    f"First 5 values: {embedding[:5]}",
    sep="\n",
)

In [None]:
# Batch embedding: hand embed_text a list of strings in a single call and
# report how many texts went in and the shape of what came back.
texts = [
    "Python is a programming language",
    "Java is a programming language",
    "The cat sat on the mat",
]
embeddings = embed_text(texts, model)
text_count = len(texts)
print(f"Embedded {text_count} texts")
print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# Embed a Wikipedia page: parse the first record into title and lead text,
# then pass both to embed_page_content to get one embedding for the page.
first_record = dataset[0]
title, lead = parse_page_content(first_record)
page_embedding = embed_page_content(title, lead, model)
print(f"Page: {title}", f"Embedding shape: {page_embedding.shape}", sep="\n")

## 6. Compare Embeddings

In [None]:
# Compare embeddings pairwise: two phrasings of the same topic and one
# unrelated phrase, each embedded separately and then scored against the
# other two with compare_embeddings.
text1, text2, text3 = (
    "Python programming language",
    "Programming in Python",
    "Cooking pasta recipes",
)

emb1, emb2, emb3 = (embed_text(sentence, model) for sentence in (text1, text2, text3))

sim_1_2 = compare_embeddings(emb1, emb2)
sim_1_3 = compare_embeddings(emb1, emb3)
sim_2_3 = compare_embeddings(emb2, emb3)

# Echo the inputs, followed by a blank separator line.
for number, sentence in enumerate((text1, text2, text3), start=1):
    print(f"Text {number}: {sentence}")
print()

# Report each pair's score to four decimal places.
for pair_label, score in (
    ("1 vs 2", sim_1_2),
    ("1 vs 3", sim_1_3),
    ("2 vs 3", sim_2_3),
):
    print(f"Similarity ({pair_label}): {score:.4f}")

## 7. Compare Multiple Embeddings (Pairwise)

In [None]:
# Compute pairwise similarities for a small set of terms and print them as a
# tab-separated matrix: a header row of the terms, then one row per term.
texts = ["Python", "Java", "JavaScript", "Cooking"]
embeddings = embed_text(texts, model)
similarities = compare_multiple_embeddings(embeddings)

print("Pairwise similarities:")
print("\t" + "\t".join(texts))
for i, text in enumerate(texts):
    # similarities is indexed as [row, column]; every value, including the
    # last one in a row, is followed by a tab.
    row_cells = "".join(f"{similarities[i, j]:.3f}\t" for j in range(len(texts)))
    print(f"{text}\t{row_cells}")

## 8. Find Most Similar Pages

In [None]:
# Build the search corpus: parse a batch of pages (num_pages=50) and embed
# every page that has lead text, keeping titles and embeddings index-aligned.
pages = parse_multiple_pages(dataset, num_pages=50)
page_embeddings, page_titles = [], []

for title, lead in pages:
    if not lead:
        # A page with an empty lead is left out of the corpus.
        continue
    embedding = embed_page_content(title, lead, model)
    page_titles.append(title)
    page_embeddings.append(embedding)

# Collect the per-page embeddings into a single NumPy array.
page_embeddings_array = np.array(page_embeddings)
embedded_count = len(page_embeddings)
print(f"Embedded {embedded_count} pages")

In [None]:
# Semantic search: embed a free-text query and ask find_most_similar for the
# five best matches among the page embeddings built earlier.
query_text = "Computer science and programming"
query_embedding = embed_text(query_text, model)

top_matches = find_most_similar(query_embedding, page_embeddings_array, top_k=5)

print(f"Query: {query_text}\n", "Most similar pages:", sep="\n")
# Each match is an (index, score) pair; the index selects the page title.
for match_index, match_score in top_matches:
    matched_title = page_titles[match_index]
    print(f"{match_score:.4f} - {matched_title}")

## 9. Compare Wikipedia Pages

In [None]:
# Compare the embeddings of articles about specific programming languages.
#
# A bounded number of pages is scanned, and for each keyword the first page
# whose title contains that keyword as a standalone term is kept. Pages found
# for at least two keywords are then embedded and compared pairwise.

# Number of pages to scan; raise this if too few keywords are matched.
NUM_PAGES_TO_SCAN = 1000

target_keywords = ["Python", "Java", "JavaScript", "C++"]

# A plain substring test would file a "JavaScript" title under "Java", so the
# keyword must not be embedded in a longer word. Lookarounds are used instead
# of \b because "C++" ends in non-word characters, where \b never matches.
# Matching stays case-insensitive.
keyword_patterns = {
    keyword: re.compile(rf"(?<!\w){re.escape(keyword)}(?!\w)", re.IGNORECASE)
    for keyword in target_keywords
}

pages = parse_multiple_pages(dataset, num_pages=NUM_PAGES_TO_SCAN)

found_pages = {}
for title, lead in pages:
    if len(found_pages) == len(target_keywords):
        # Every keyword already has a page; no need to scan further.
        break
    if not lead:
        # A page with no lead text has nothing meaningful to embed.
        continue
    for keyword, pattern in keyword_patterns.items():
        if keyword not in found_pages and pattern.search(title):
            found_pages[keyword] = (title, lead)
            # A page is assigned to at most one keyword.
            break

# Embed and compare found pages
if len(found_pages) >= 2:
    print("Found pages:")
    for keyword, (found_title, found_lead) in found_pages.items():
        print(f"  {keyword}: {found_title}")

    # Embed each page exactly once instead of once per pair it appears in.
    found_embeddings = {
        keyword: embed_page_content(found_title, found_lead, model)
        for keyword, (found_title, found_lead) in found_pages.items()
    }

    print("\nComparing embeddings:")
    # combinations() yields each unordered pair once, in discovery order.
    for kw1, kw2 in itertools.combinations(found_embeddings, 2):
        similarity = compare_embeddings(found_embeddings[kw1], found_embeddings[kw2])
        print(f"  {kw1} vs {kw2}: {similarity:.4f}")
else:
    print(f"Not enough pages found in the first {NUM_PAGES_TO_SCAN} entries.")
    print("Try increasing the num_pages parameter or use different keywords.")