# Word Embeddings Tutorial: From Theory to Practice

This comprehensive tutorial covers the fundamentals of word embeddings, with a hands-on PyTorch implementation.

## Table of Contents
1. [Introduction to Word Embeddings](#introduction)
2. [Traditional vs Modern Approaches](#traditional-vs-modern)
3. [Word2Vec Implementation from Scratch](#word2vec-implementation)
4. [Working with Pre-trained GloVe Embeddings](#glove-embeddings)
5. [Interactive Demonstrations](#demonstrations)
6. [Training Custom Embeddings](#custom-training)
7. [Real-world Applications](#applications)
8. [Semantic Search Foundation](#semantic-search)

## Requirements
- Python 3.13+
- PyTorch 2.8+
- NumPy, Matplotlib, Scikit-learn
- NLTK for text preprocessing

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import re
import random
import collections
from collections import Counter, defaultdict
import urllib.request
import zipfile
import os
from typing import List, Dict, Tuple, Optional

# Seed every random number generator the notebook draws from (PyTorch, NumPy,
# and the standard library) for reproducibility.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
# `device` is read by later cells when moving models and batches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Plot defaults: Matplotlib's default style, seaborn's "husl" colour palette,
# and a (12, 8) default figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Introduction to Word Embeddings {#introduction}

Word embeddings are dense vector representations of words that capture semantic relationships. Unlike one-hot encoding, embeddings:

- **Capture semantic similarity**: Similar words have similar vectors
- **Enable vector arithmetic**: Mathematical operations reveal relationships
- **Reduce dimensionality**: Dense representations (50-300D) vs sparse one-hot
- **Transfer learning**: Pre-trained embeddings work across tasks

### Key Concepts
- **Distributional Hypothesis**: Words appearing in similar contexts have similar meanings
- **Context Window**: Surrounding words that define meaning
- **Embedding Space**: High-dimensional space where semantic relationships are preserved

In [None]:
# Simple demonstration of the distributional hypothesis.
# Toy sentences: "cat", "kitten" and "feline" each sit in a
# "The <animal> <verb> on the <surface>" frame, while "dog" and "puppy"
# sit in an "A <animal> <verb> in the <place>" frame.
sample_sentences = [
    "The cat sat on the mat",
    "A dog ran in the park", 
    "The kitten played on the carpet",
    "A puppy walked in the garden",
    "The feline rested on the rug"
]

def extract_context_words(sentences, target_word, window_size=2):
    """Count the words found within ``window_size`` positions of ``target_word``.

    Each sentence is lower-cased and split on whitespace. For every occurrence
    of ``target_word`` the tokens up to ``window_size`` places to its left and
    right (the target itself excluded) are tallied.

    Returns:
        A ``Counter`` mapping each neighbouring word to how often it was seen.
    """
    neighbour_counts = Counter()

    for raw_sentence in sentences:
        tokens = raw_sentence.lower().split()
        for position, token in enumerate(tokens):
            if token != target_word:
                continue
            window_start = max(0, position - window_size)
            window_end = min(len(tokens), position + window_size + 1)
            left_side = tokens[window_start:position]
            right_side = tokens[position + 1:window_end]
            neighbour_counts.update(left_side + right_side)

    return neighbour_counts

# Collect the neighbours of three target words from the toy sentences
cat_contexts = extract_context_words(sample_sentences, "cat")
dog_contexts = extract_context_words(sample_sentences, "dog")
kitten_contexts = extract_context_words(sample_sentences, "kitten")

# Print each counter as a plain dict
for label, counts in (
    ("cat", cat_contexts),
    ("dog", dog_contexts),
    ("kitten", kitten_contexts),
):
    print(f"Context words for '{label}':", dict(counts))

print("\nNotice how 'cat' and 'kitten' share more context words than 'cat' and 'dog'")
print("This is the foundation of word embeddings!")

## 2. Traditional vs Modern Approaches {#traditional-vs-modern}

### Traditional Approaches
- **One-hot encoding**: Sparse, no semantic information
- **Co-occurrence matrices**: Capture context statistics, but are very high-dimensional, sparse, and computationally expensive
- **TF-IDF**: Good for document similarity, poor for semantic similarity

### Modern Approaches
- **Word2Vec**: Efficient neural approach (Skip-gram, CBOW)
- **GloVe**: Global statistical information + local context
- **FastText**: Subword information for out-of-vocabulary words

In [None]:
# Comparison of traditional vs modern approaches
# Toy six-word vocabulary; a word's position in this list is the index that
# create_one_hot sets to 1.
vocab = ["cat", "dog", "kitten", "puppy", "car", "truck"]
vocab_size = len(vocab)

# One-hot encoding example
def create_one_hot(word, vocab):
    """Return the one-hot vector of ``word`` over ``vocab``.

    The result has ``len(vocab)`` entries; the entry at the word's position in
    ``vocab`` is 1 and the rest are 0. A word missing from ``vocab`` yields an
    all-zero vector.
    """
    encoding = np.zeros(len(vocab))
    if word not in vocab:
        return encoding
    position = vocab.index(word)
    encoding[position] = 1
    return encoding

# Encode three vocabulary words as one-hot vectors
cat_onehot, dog_onehot, kitten_onehot = (
    create_one_hot(word, vocab) for word in ("cat", "dog", "kitten")
)

print("One-hot vectors (sparse, no semantic information):")
print(f"cat:    {cat_onehot}")
print(f"dog:    {dog_onehot}")
print(f"kitten: {kitten_onehot}")

# Dot products between the one-hot vectors
cat_dog_sim = cat_onehot @ dog_onehot
cat_kitten_sim = cat_onehot @ kitten_onehot

print("\nOne-hot similarities:")
print(f"cat-dog similarity: {cat_dog_sim}")
print(f"cat-kitten similarity: {cat_kitten_sim}")
print("Problem: All word pairs have zero similarity!")

# Hand-written 4-D vectors standing in for the embeddings learned later
print("\nDense embeddings (what we'll create):")
mock_embeddings = {
    word: np.array(values)
    for word, values in (
        ("cat", [0.2, -0.1, 0.8, -0.3]),
        ("dog", [0.1, -0.2, 0.7, -0.1]),
        ("kitten", [0.3, -0.1, 0.9, -0.4]),
        ("car", [-0.5, 0.8, 0.1, 0.2]),
    )
}

def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Returns 0.0 when either vector has zero norm, instead of dividing 0 by 0
    (which would produce NaN and a NumPy RuntimeWarning).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# Compare "cat" with a close relative, a related animal, and an unrelated object
cat_vector = mock_embeddings["cat"]
cat_dog_sim_dense = cosine_sim(cat_vector, mock_embeddings["dog"])
cat_kitten_sim_dense = cosine_sim(cat_vector, mock_embeddings["kitten"])
cat_car_sim_dense = cosine_sim(cat_vector, mock_embeddings["car"])

for pair_label, pair_score in (
    ("cat-dog", cat_dog_sim_dense),
    ("cat-kitten", cat_kitten_sim_dense),
    ("cat-car", cat_car_sim_dense),
):
    print(f"{pair_label} similarity: {pair_score:.3f}")
print("Success: Similar words (cat-kitten) have higher similarity!")

## 3. Word2Vec Implementation from Scratch {#word2vec-implementation}

Word2Vec uses shallow neural networks to learn word embeddings. It comes in two architectures:

### Skip-gram
- Input: Center word
- Output: Context words
- Better for rare words

### CBOW (Continuous Bag of Words)
- Input: Context words
- Output: Center word
- Faster training, better for frequent words

In [None]:
class TextPreprocessor:
    """Normalise raw sentences and map their words to integer ids.

    Words seen fewer than ``min_count`` times (over everything passed to
    ``build_vocab`` so far) are left out of the vocabulary.
    """

    def __init__(self, min_count=5):
        self.min_count = min_count
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.word_counts = Counter()
        self.vocab_size = 0

    def clean_text(self, text: str) -> str:
        """Lower-case ``text``, drop every character that is not a-z or
        whitespace, and collapse whitespace runs to single spaces."""
        lowered = text.lower()
        letters_only = re.sub(r'[^a-z\s]', '', lowered)
        return ' '.join(letters_only.split())

    def build_vocab(self, corpus: List[str]) -> None:
        """Count the words of ``corpus`` and rebuild the id mappings.

        Counts accumulate in ``self.word_counts``; ids are assigned in the
        order words were first counted, skipping words below ``min_count``.
        """
        for sentence in corpus:
            self.word_counts.update(self.clean_text(sentence).split())

        # Assign consecutive ids to the words that pass the frequency filter
        kept = {}
        for token, frequency in self.word_counts.items():
            if frequency >= self.min_count:
                kept[token] = len(kept)

        self.word_to_idx = kept
        self.idx_to_word = {index: token for token, index in kept.items()}
        self.vocab_size = len(kept)

        print(f"Built vocabulary with {self.vocab_size} words")
        print(f"Most common words: {self.word_counts.most_common(10)}")

    def encode_sentence(self, sentence: str) -> List[int]:
        """Return the ids of the in-vocabulary words of ``sentence``, in order.

        Words that are not in the vocabulary are silently dropped.
        """
        lookup = self.word_to_idx
        tokens = self.clean_text(sentence).split()
        return [lookup[token] for token in tokens if token in lookup]

# Small hand-written corpus used to demonstrate preprocessing and training
sample_corpus = [
    "The cat sat on the mat",
    "A dog ran in the park",
    "The cat played with the dog",
    "Dogs and cats are pets",
    "The park has many dogs running",
    "Cats like to sit on mats",
    "The dog and cat are friends",
    "Animals play in the park daily",
    "The mat is comfortable for cats",
    "Dogs love to run and play",
]

# min_count=1 keeps every word, which the tiny corpus needs
preprocessor = TextPreprocessor(min_count=1)
preprocessor.build_vocab(sample_corpus)

# Round-trip the first three sentences through the vocabulary
print("\nEncoded sentences:")
for sentence in sample_corpus[:3]:
    encoded = preprocessor.encode_sentence(sentence)
    decoded = [preprocessor.idx_to_word[word_id] for word_id in encoded]
    print(f"Original: {sentence}")
    print(f"Encoded:  {encoded}")
    print(f"Decoded:  {decoded}")
    print()

In [None]:
class Word2VecDataset(Dataset):
    """Skip-gram training set of (center id, context id) pairs.

    All pairs are generated eagerly in the constructor and kept in
    ``self.pairs``.
    """

    def __init__(self, corpus: List[str], preprocessor: TextPreprocessor,
                 window_size: int = 2):
        self.preprocessor = preprocessor
        self.window_size = window_size
        self.pairs = self._create_training_pairs(corpus)

    def _create_training_pairs(self, corpus: List[str]) -> List[Tuple[int, int]]:
        """Pair every encoded word with each neighbour inside the window.

        Sentences are encoded with the preprocessor, so out-of-vocabulary
        words are dropped before the window is applied.
        """
        radius = self.window_size
        training_pairs = []

        for sentence in corpus:
            token_ids = self.preprocessor.encode_sentence(sentence)
            sentence_length = len(token_ids)

            for center_pos, center_id in enumerate(token_ids):
                window_start = max(0, center_pos - radius)
                window_end = min(sentence_length, center_pos + radius + 1)

                for context_pos in range(window_start, window_end):
                    if context_pos == center_pos:
                        # A word is never its own context
                        continue
                    training_pairs.append((center_id, token_ids[context_pos]))

        return training_pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two scalar ``torch.long`` tensors."""
        center_id, context_id = self.pairs[idx]
        center_tensor = torch.tensor(center_id, dtype=torch.long)
        context_tensor = torch.tensor(context_id, dtype=torch.long)
        return center_tensor, context_tensor

# Build the skip-gram pairs and a shuffling loader over them
dataset = Word2VecDataset(sample_corpus, preprocessor, window_size=2)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print(f"Created {len(dataset)} training pairs")
print("Sample pairs:")
# Slice rather than index 0..4 so a corpus that yields fewer than five pairs
# prints what it has instead of raising IndexError.
for center_idx, context_idx in dataset.pairs[:5]:
    center_word = preprocessor.idx_to_word[center_idx]
    context_word = preprocessor.idx_to_word[context_idx]
    print(f"  {center_word} -> {context_word}")

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram Word2Vec model with separate center and context tables.

    ``in_embeddings`` holds the vectors used for center words and
    ``out_embeddings`` the vectors used for context words; both have shape
    (vocab_size, embedding_dim).
    """

    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Center-word table first, then context-word table
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self._init_embeddings()

    def _init_embeddings(self):
        """Re-draw both tables uniformly from ±0.5 / embedding_dim."""
        bound = 0.5 / self.embedding_dim
        for table in (self.in_embeddings, self.out_embeddings):
            nn.init.uniform_(table.weight, -bound, bound)

    def forward(self, center_words: torch.Tensor, context_words: torch.Tensor) -> torch.Tensor:
        """Score each (center, context) pair by the dot product of its vectors.

        Both inputs are index tensors of shape (batch_size,); the result has
        shape (batch_size,).
        """
        center_vectors = self.in_embeddings(center_words)      # (batch_size, embedding_dim)
        context_vectors = self.out_embeddings(context_words)   # (batch_size, embedding_dim)
        return (center_vectors * context_vectors).sum(dim=1)   # (batch_size,)

    def get_word_embedding(self, word_idx: int) -> torch.Tensor:
        """Return the center-word vector at ``word_idx``, detached from autograd."""
        return self.in_embeddings.weight[word_idx].detach()

    def get_all_embeddings(self) -> torch.Tensor:
        """Return the whole center-word table, detached from autograd."""
        return self.in_embeddings.weight.detach()

# Initialize model
# The model is moved to `device`, and Adam optimises every parameter of it.
embedding_dim = 50
model = SkipGramModel(preprocessor.vocab_size, embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

print(f"Initialized Skip-gram model:")
print(f"  Vocabulary size: {preprocessor.vocab_size}")
print(f"  Embedding dimension: {embedding_dim}")
# Sums the element counts of all parameter tensors of the model.
print(f"  Total parameters: {sum(p.numel() for p in model.parameters())}")

In [None]:
def train_word2vec(model, dataloader, optimizer, num_epochs=100, negative_samples=5):
    """Train a skip-gram model with uniformly sampled negatives.

    Args:
        model: Module called as ``model(center_ids, context_ids)`` that returns
            one score per pair and exposes a ``vocab_size`` attribute.
        dataloader: Iterable of ``(center_words, context_words)`` index-tensor
            batches.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``dataloader``.
        negative_samples: Random negative words drawn per positive pair. With
            0, only the positive-pair loss is optimised.

    Returns:
        A list with the mean batch loss of each epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Run on whatever device the model already lives on rather than depending
    # on a module-level `device` global being defined and in sync.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    losses = []

    print("Training Word2Vec model...")

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0

        for center_words, context_words in dataloader:
            center_words = center_words.to(run_device)
            context_words = context_words.to(run_device)
            batch_size = center_words.size(0)

            optimizer.zero_grad()

            # Positive samples (actual context words): push scores up
            pos_scores = model(center_words, context_words)
            loss = -F.logsigmoid(pos_scores).mean()

            if negative_samples > 0:
                # Negative samples (uniformly random words): push scores down.
                # Each center word is repeated once per negative drawn for it.
                neg_words = torch.randint(
                    0, model.vocab_size, (batch_size * negative_samples,), device=run_device
                )
                center_repeated = center_words.repeat_interleave(negative_samples)
                neg_scores = model(center_repeated, neg_words)
                loss = loss + (-F.logsigmoid(-neg_scores).mean())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Previously surfaced as an opaque ZeroDivisionError
            raise ValueError("dataloader produced no batches; nothing to train on")

        avg_loss = epoch_loss / num_batches
        losses.append(avg_loss)

        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    print("Training completed!")
    return losses

# Train the model
# `losses` holds one mean loss value per epoch (200 entries here).
losses = train_word2vec(model, dataloader, optimizer, num_epochs=200)

# Plot training loss
# One point per epoch, so the x-axis is the epoch index.
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.title('Word2Vec Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

## 4. Working with Pre-trained GloVe Embeddings {#glove-embeddings}

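GloVe (Global Vectors) combines global co-occurrence statistics with local context information. The loader below can download the real pre-trained vectors, but to keep the demo lightweight `load_glove_subset` generates synthetic, cluster-structured stand-in vectors instead of reading the GloVe file.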

In [None]:
class GloVeEmbeddings:
    """Loader and utilities for GloVe-style embeddings.

    NOTE: ``load_glove_subset`` does not read a GloVe file. It fabricates
    cluster-structured vectors for a fixed demo vocabulary so the rest of the
    tutorial can run without the download.
    """

    def __init__(self):
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.embeddings = None
        self.embedding_dim = 0

    def download_glove(self, dim=50):
        """Download and extract ``glove.6B.{dim}d.txt`` unless it already exists.

        Returns:
            The path of the text file in the current directory.
        """
        filename = f"glove.6B.{dim}d.txt"
        filepath = os.path.join(".", filename)

        if not os.path.exists(filepath):
            print(f"Downloading GloVe {dim}d embeddings...")
            url = "http://nlp.stanford.edu/data/glove.6B.zip"
            archive = "glove.6B.zip"

            try:
                urllib.request.urlretrieve(url, archive)
                with zipfile.ZipFile(archive, 'r') as zip_ref:
                    zip_ref.extract(filename)
            finally:
                # Remove the (possibly partial) archive even when the download
                # or the extraction fails.
                if os.path.exists(archive):
                    os.remove(archive)
            print("Download completed!")

        return filepath

    def load_glove_subset(self, words_to_load=None, dim=50):
        """Build synthetic demo embeddings and store them on the instance.

        Words of the same semantic cluster share a random centre plus
        per-word noise; any extra ``words_to_load`` get independent random
        vectors.

        Args:
            words_to_load: Optional extra words to add to the demo vocabulary.
            dim: Dimensionality of the generated vectors.

        Returns:
            The embedding matrix as a float32 tensor of shape (num_words, dim).
        """
        # Semantic clusters; together they make up the demo vocabulary
        clusters = {
            'animals': ["cat", "dog", "kitten", "puppy", "animal", "pet"],
            'royalty': ["king", "queen", "man", "woman", "royal", "person"],
            'vehicles': ["car", "truck", "vehicle", "drive", "road", "traffic"],
            'emotions': ["happy", "sad", "joy", "anger", "emotion", "feeling"],
            'sizes': ["big", "small", "large", "tiny", "size", "scale"],
            'quality': ["good", "bad", "excellent", "terrible", "quality"]
        }
        demo_words = [word for members in clusters.values() for word in members]

        if words_to_load:
            demo_words.extend(words_to_load)

        # A private generator seeded with 42 yields the same numbers as
        # reseeding the global NumPy RNG would, without clobbering the
        # caller's global random state.
        rng = np.random.RandomState(42)
        self.embedding_dim = dim
        embeddings_dict = {}

        # Cluster members: shared centre plus smaller per-word noise
        for members in clusters.values():
            cluster_center = rng.randn(dim) * 0.5
            for word in members:
                embeddings_dict[word] = cluster_center + rng.randn(dim) * 0.3

        # Extra words that belong to no cluster
        for word in demo_words:
            if word not in embeddings_dict:
                embeddings_dict[word] = rng.randn(dim) * 0.5

        # Row i of the matrix is the vector of the i-th inserted word
        self.word_to_idx = {word: idx for idx, word in enumerate(embeddings_dict)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        # Stack into one ndarray first: building a tensor from a list of
        # ndarrays is slow and makes PyTorch emit a UserWarning.
        embedding_matrix = np.stack(list(embeddings_dict.values()))
        self.embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

        print(f"Loaded {len(self.word_to_idx)} GloVe embeddings ({dim}D)")
        return self.embeddings

    def get_embedding(self, word: str) -> Optional[torch.Tensor]:
        """Return the vector of ``word``, or ``None`` if it is unknown."""
        idx = self.word_to_idx.get(word)
        if idx is None:
            return None
        return self.embeddings[idx]

    def similarity(self, word1: str, word2: str) -> float:
        """Return the cosine similarity of two words (0.0 if either is unknown)."""
        emb1 = self.get_embedding(word1)
        emb2 = self.get_embedding(word2)

        if emb1 is None or emb2 is None:
            return 0.0

        cos_sim = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0))
        return cos_sim.item()

# Load GloVe embeddings
# load_glove_subset stores the matrix on `glove` and also returns it.
glove = GloVeEmbeddings()
glove_embeddings = glove.load_glove_subset(dim=100)

# Test similarities
# Pairs range from same-cluster words (e.g. cat/dog) to a cross-cluster
# pair (cat/car) for contrast.
print("\nGloVe Similarity Examples:")
test_pairs = [
    ("cat", "dog"),
    ("cat", "kitten"),
    ("king", "queen"),
    ("car", "truck"),
    ("happy", "sad"),
    ("big", "large"),
    ("cat", "car")
]

for word1, word2 in test_pairs:
    sim = glove.similarity(word1, word2)
    print(f"{word1} - {word2}: {sim:.3f}")

## 5. Interactive Demonstrations {#demonstrations}

Let's explore the fascinating properties of word embeddings through interactive demonstrations.

In [None]:
class EmbeddingAnalyzer:
    """Explore an embedding matrix: lookups, nearest neighbours, vector arithmetic.

    ``embeddings`` is a tensor whose rows are addressed by the integer ids in
    ``word_to_idx``; ``idx_to_word`` is the inverse mapping.
    """

    def __init__(self, embeddings, word_to_idx, idx_to_word):
        self.embeddings = embeddings
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

    def get_embedding(self, word: str) -> Optional[torch.Tensor]:
        """Return the vector of ``word``, or ``None`` if it is unknown."""
        idx = self.word_to_idx.get(word)
        if idx is None:
            return None
        return self.embeddings[idx]

    def find_nearest_neighbors(self, word: str, k: int = 5) -> List[Tuple[str, float]]:
        """Return up to ``k`` (word, cosine similarity) pairs closest to ``word``.

        The query word itself is never returned. Unknown words, ``k <= 0`` and
        single-word vocabularies yield an empty list; ``k`` larger than the
        number of other words is clamped.
        """
        word_idx = self.word_to_idx.get(word)
        if word_idx is None:
            return []

        query = self.embeddings[word_idx]
        similarities = F.cosine_similarity(query.unsqueeze(0), self.embeddings).clone()

        # Mask the query row explicitly. Relying on it being ranked first
        # breaks when another row ties with it (e.g. duplicate vectors).
        similarities[word_idx] = float('-inf')

        # torch.topk raises if asked for more entries than exist
        k = min(k, similarities.numel() - 1)
        if k <= 0:
            return []

        top_k = torch.topk(similarities, k)
        return [
            (self.idx_to_word[idx.item()], score.item())
            for score, idx in zip(top_k.values, top_k.indices)
        ]

    def vector_arithmetic(self, positive: List[str], negative: Optional[List[str]] = None,
                          k: int = 5) -> List[Tuple[str, float]]:
        """Rank words by similarity to sum(positive) - sum(negative).

        Unknown input words are ignored. Input words are excluded from the
        result. Returns an empty list when ``k <= 0`` or when none of the
        input words is known, since a zero query vector has no meaningful
        neighbours.
        """
        if negative is None:
            negative = []
        if k <= 0:
            return []

        result_vector = torch.zeros_like(self.embeddings[0])
        found_any = False

        # Add positive vectors
        for word in positive:
            emb = self.get_embedding(word)
            if emb is not None:
                result_vector += emb
                found_any = True

        # Subtract negative vectors
        for word in negative:
            emb = self.get_embedding(word)
            if emb is not None:
                result_vector -= emb
                found_any = True

        if not found_any:
            return []

        similarities = F.cosine_similarity(result_vector.unsqueeze(0), self.embeddings)

        # Walk the ranking best-first, skipping the input words
        excluded_words = set(positive) | set(negative)
        top_words = []
        for idx in torch.argsort(similarities, descending=True).tolist():
            word = self.idx_to_word[idx]
            if word in excluded_words:
                continue
            top_words.append((word, similarities[idx].item()))
            if len(top_words) >= k:
                break

        return top_words

    def solve_analogy(self, a: str, b: str, c: str, k: int = 5) -> List[Tuple[str, float]]:
        """Solve "a is to b as c is to ?" by ranking words near b - a + c."""
        return self.vector_arithmetic(positive=[b, c], negative=[a], k=k)

# Test with our trained Word2Vec embeddings
# The analyzer works on the model's center-word table and the vocabulary
# mappings built by the preprocessor.
w2v_analyzer = EmbeddingAnalyzer(
    model.get_all_embeddings(),
    preprocessor.word_to_idx,
    preprocessor.idx_to_word
)

print("=== Word2Vec Embeddings Analysis ===")
print("\nNearest neighbors:")
test_words = ["cat", "dog", "park"]
for word in test_words:
    # Only query words that made it into the vocabulary
    if word in preprocessor.word_to_idx:
        neighbors = w2v_analyzer.find_nearest_neighbors(word, k=3)
        print(f"{word}: {[(w, f'{s:.3f}') for w, s in neighbors]}")

# Test with GloVe embeddings
glove_analyzer = EmbeddingAnalyzer(
    glove.embeddings,
    glove.word_to_idx,
    glove.idx_to_word
)

print("\n=== GloVe Embeddings Analysis ===")
print("\nNearest neighbors:")
for word in ["cat", "king", "happy"]:
    neighbors = glove_analyzer.find_nearest_neighbors(word, k=3)
    print(f"{word}: {[(w, f'{s:.3f}') for w, s in neighbors]}")

print("\nVector arithmetic examples:")
# King - man + woman = queen
# solve_analogy(a, b, c) ranks candidates for b - a + c.
result = glove_analyzer.solve_analogy("man", "king", "woman", k=3)
print(f"man:king :: woman:? = {[(w, f'{s:.3f}') for w, s in result]}")

# Cat + big = ?
result = glove_analyzer.vector_arithmetic(["cat", "big"], k=3)
print(f"cat + big = {[(w, f'{s:.3f}') for w, s in result]}")

In [None]:
def visualize_embeddings(embeddings, word_to_idx, idx_to_word, method='tsne', words_to_plot=None):
    """Scatter-plot word embeddings after reducing them to two dimensions.

    Args:
        embeddings: Tensor whose rows are addressed by the ids in ``word_to_idx``.
        word_to_idx: Mapping from word to row id.
        idx_to_word: Mapping from row id to word.
        method: ``'tsne'`` or ``'pca'``.
        words_to_plot: Words to show; defaults to the first 30 vocabulary
            words. Words missing from ``word_to_idx`` are skipped.

    Raises:
        ValueError: If ``method`` is not recognised, or fewer than two of the
            requested words are in the vocabulary.
    """
    # Fail on a bad method before doing any work
    if method not in ('tsne', 'pca'):
        raise ValueError("Method must be 'tsne' or 'pca'")

    if words_to_plot is None:
        words_to_plot = list(word_to_idx)[:30]

    indices = [word_to_idx[word] for word in words_to_plot if word in word_to_idx]
    if len(indices) < 2:
        # With fewer than two points the perplexity below would be <= 0 and a
        # two-component projection is not defined.
        raise ValueError("Need at least two in-vocabulary words to visualize")

    # detach() and cpu() make this work for tensors that track gradients or
    # live on the GPU; Tensor.numpy() rejects both.
    selected_embeddings = embeddings[indices].detach().cpu().numpy()
    selected_words = [idx_to_word[idx] for idx in indices]

    if method == 'tsne':
        # Perplexity must stay below the number of samples
        reducer = TSNE(n_components=2, random_state=42, perplexity=min(5, len(selected_words)-1))
        coords = reducer.fit_transform(selected_embeddings)
        title = 't-SNE Visualization of Word Embeddings'
    else:
        reducer = PCA(n_components=2, random_state=42)
        coords = reducer.fit_transform(selected_embeddings)
        title = f'PCA Visualization of Word Embeddings\n(Explained variance: {reducer.explained_variance_ratio_.sum():.2%})'

    plt.figure(figsize=(12, 10))
    plt.scatter(coords[:, 0], coords[:, 1], alpha=0.7, s=100)

    # Label every point with its word, offset slightly from the marker
    for i, word in enumerate(selected_words):
        plt.annotate(word, (coords[i, 0], coords[i, 1]),
                     xytext=(5, 5), textcoords='offset points',
                     ha='left', va='bottom', fontsize=10)

    plt.title(title, fontsize=14, fontweight='bold')
    plt.xlabel(f'{method.upper()} Component 1')
    plt.ylabel(f'{method.upper()} Component 2')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Visualize GloVe embeddings
# The same vocabulary is projected twice, first with PCA and then with t-SNE,
# so the two layouts can be compared.
print("Visualizing GloVe embeddings...")
visualize_embeddings(
    glove.embeddings, 
    glove.word_to_idx, 
    glove.idx_to_word, 
    method='pca'
)

visualize_embeddings(
    glove.embeddings, 
    glove.word_to_idx, 
    glove.idx_to_word, 
    method='tsne'
)

In [None]:
def create_similarity_heatmap(analyzer, words):
    """Draw a heatmap of pairwise cosine similarities between ``words``.

    ``analyzer`` must provide ``word_to_idx`` and ``get_embedding``. Cells
    involving a word the analyzer does not know are left at 0.
    """
    size = len(words)
    similarity_matrix = np.zeros((size, size))
    known_words = analyzer.word_to_idx

    for row, row_word in enumerate(words):
        if row_word not in known_words:
            continue
        for col, col_word in enumerate(words):
            if col_word not in known_words:
                continue
            row_vector = analyzer.get_embedding(row_word).unsqueeze(0)
            col_vector = analyzer.get_embedding(col_word).unsqueeze(0)
            similarity_matrix[row, col] = F.cosine_similarity(row_vector, col_vector).item()

    # Diverging colour map centred on zero similarity
    heatmap_options = dict(
        xticklabels=words,
        yticklabels=words,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, **heatmap_options)
    plt.title('Word Similarity Heatmap')
    plt.tight_layout()
    plt.show()

# Create similarity heatmap for selected words
selected_words = ["cat", "dog", "kitten", "puppy", "king", "queen", "man", "woman", "car", "truck"]
# Keep only the words the analyzer knows, so no row or column stays all-zero.
available_words = [w for w in selected_words if w in glove_analyzer.word_to_idx]

print(f"Creating similarity heatmap for words: {available_words}")
create_similarity_heatmap(glove_analyzer, available_words)

## 6. Training Custom Embeddings {#custom-training}

Let's create a more sophisticated training setup for custom embeddings with a larger corpus.

In [None]:
def generate_sample_corpus(size=1000):
    """Generate ``size`` random template-based sentences for training.

    Each sentence is a randomly chosen template whose ``{category}``
    placeholders are filled with random words of that category. Every
    occurrence of a placeholder gets its own independent draw, so templates
    that repeat a category (e.g. two ``{adjective}`` slots) are filled
    completely.

    Uses the global ``random`` module state; seed it for reproducible output.

    Returns:
        A list of ``size`` sentences.
    """
    templates = [
        "The {animal} {verb} in the {location}",
        "A {size} {animal} {verb} {adverb}",
        "The {color} {object} is {adjective}",
        "{person} {verb} the {object} {adverb}",
        "In the {location}, {animal}s {verb} {adverb}",
        "The {adjective} {object} {verb} {location}",
        "{person} saw a {size} {color} {animal}",
        "Every {animal} {verb} when {condition}",
        "The {location} has many {color} {object}s",
        "{size} {animal}s are {adjective} and {adjective}"
    ]

    words = {
        'animal': ['cat', 'dog', 'bird', 'fish', 'rabbit', 'mouse', 'lion', 'tiger', 'elephant', 'bear'],
        'verb': ['runs', 'jumps', 'sleeps', 'eats', 'plays', 'walks', 'sits', 'stands', 'flies', 'swims'],
        'location': ['park', 'house', 'garden', 'forest', 'river', 'mountain', 'field', 'street', 'beach', 'cave'],
        'size': ['big', 'small', 'large', 'tiny', 'huge', 'little', 'giant', 'miniature'],
        'color': ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'orange', 'purple', 'gray'],
        'object': ['car', 'house', 'tree', 'flower', 'book', 'chair', 'table', 'window', 'door', 'ball'],
        'adjective': ['beautiful', 'ugly', 'fast', 'slow', 'happy', 'sad', 'bright', 'dark', 'clean', 'dirty'],
        'person': ['John', 'Mary', 'Bob', 'Alice', 'Tom', 'Sarah', 'Mike', 'Lisa', 'David', 'Emma'],
        'adverb': ['quickly', 'slowly', 'quietly', 'loudly', 'carefully', 'happily', 'sadly', 'gently'],
        'condition': ['hungry', 'tired', 'excited', 'scared', 'curious', 'bored', 'surprised']
    }

    corpus = []
    for _ in range(size):
        sentence = random.choice(templates)

        for category, word_list in words.items():
            placeholder = f'{{{category}}}'
            # Replace one occurrence at a time so a repeated placeholder is
            # fully filled, with a fresh random word for each slot. A single
            # replace(..., 1) would leave the second slot as literal text.
            while placeholder in sentence:
                sentence = sentence.replace(placeholder, random.choice(word_list), 1)

        corpus.append(sentence)

    return corpus

# Generate larger corpus
large_corpus = generate_sample_corpus(2000)
print(f"Generated corpus with {len(large_corpus)} sentences")
print("\nSample sentences:")
# Slice instead of indexing 0..4: same output here, and still safe if the
# corpus size is ever reduced below five.
for sentence in large_corpus[:5]:
    print(f"  {sentence}")

# Train new embeddings on larger corpus
# min_count=3 drops words seen fewer than three times.
large_preprocessor = TextPreprocessor(min_count=3)
large_preprocessor.build_vocab(large_corpus)

large_dataset = Word2VecDataset(large_corpus, large_preprocessor, window_size=3)
large_dataloader = DataLoader(large_dataset, batch_size=64, shuffle=True)

# Initialize larger model
large_model = SkipGramModel(large_preprocessor.vocab_size, embedding_dim=100).to(device)
large_optimizer = optim.Adam(large_model.parameters(), lr=0.005)

print(f"\nTraining model on {len(large_dataset)} word pairs")
print(f"Vocabulary size: {large_preprocessor.vocab_size}")

In [None]:
# Train the larger model
# `large_losses` holds one mean loss value per epoch.
large_losses = train_word2vec(large_model, large_dataloader, large_optimizer, num_epochs=150)

# Analyze the trained embeddings
large_analyzer = EmbeddingAnalyzer(
    large_model.get_all_embeddings(),
    large_preprocessor.word_to_idx,
    large_preprocessor.idx_to_word
)

print("\n=== Analysis of Custom Trained Embeddings ===")
print("\nNearest neighbors:")
test_words = ["cat", "dog", "big", "small", "red", "blue"]
for word in test_words:
    # Skip words that were filtered out of the vocabulary
    if word in large_preprocessor.word_to_idx:
        neighbors = large_analyzer.find_nearest_neighbors(word, k=4)
        print(f"{word}: {[(w, f'{s:.3f}') for w, s in neighbors]}")

print("\nVector arithmetic:")
# Test semantic relationships
# Each example runs only when all of its words are in the vocabulary.
if all(w in large_preprocessor.word_to_idx for w in ["big", "small", "cat"]):
    result = large_analyzer.vector_arithmetic(["cat", "big"], ["small"], k=3)
    print(f"cat + big - small = {[(w, f'{s:.3f}') for w, s in result]}")

if all(w in large_preprocessor.word_to_idx for w in ["lion", "cat", "dog"]):
    result = large_analyzer.solve_analogy("cat", "lion", "dog", k=3)
    print(f"cat:lion :: dog:? = {[(w, f'{s:.3f}') for w, s in result]}")

## 7. Real-world Applications {#applications}

Let's explore practical applications of word embeddings in real-world scenarios.

In [None]:
class DocumentSimilarity:
    """Document similarity using word embeddings.

    A document is lower-cased and split on whitespace. Every token for which
    the analyzer returns an embedding contributes one vector; tokens without
    an embedding are ignored. The vectors are pooled into one document vector
    and documents are compared with cosine similarity.

    The analyzer is expected to provide ``get_embedding(word)`` (a tensor, or
    ``None`` for unknown words) and an ``embeddings`` tensor whose second
    dimension is the embedding size.
    """

    # Pooling strategies accepted by ``document_embedding``.
    _METHODS = ('mean', 'sum', 'max')

    def __init__(self, analyzer):
        self.analyzer = analyzer

    def document_embedding(self, document: str, method='mean') -> torch.Tensor:
        """Create document embedding by aggregating word embeddings.

        Args:
            document: Raw text; tokenized with ``lower().split()``.
            method: ``'mean'``, ``'sum'`` or ``'max'`` (element-wise maximum).

        Returns:
            The pooled vector, or an all-zero vector of the embedding size
            when no token of the document has an embedding.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        # Validate up front so a bad method is reported even for documents
        # that contain no known words (previously those returned zeros).
        if method not in self._METHODS:
            raise ValueError("Method must be 'mean', 'sum', or 'max'")

        vectors = []
        for word in document.lower().split():
            emb = self.analyzer.get_embedding(word)
            if emb is not None:
                vectors.append(emb)

        if not vectors:
            # new_zeros keeps the dtype and device of the embedding table so
            # the fallback can be compared with real embeddings.
            table = self.analyzer.embeddings
            return table.new_zeros(table.size(1))

        stacked = torch.stack(vectors)
        if method == 'mean':
            return stacked.mean(dim=0)
        if method == 'sum':
            return stacked.sum(dim=0)
        # torch.max along a dim returns (values, indices); keep the values.
        return stacked.max(dim=0)[0]

    def similarity(self, doc1: str, doc2: str, method: str = 'mean') -> float:
        """Calculate the cosine similarity between two documents.

        Args:
            doc1: First document.
            doc2: Second document.
            method: Pooling method forwarded to ``document_embedding``.

        Returns:
            Cosine similarity of the two pooled document vectors.
        """
        emb1 = self.document_embedding(doc1, method)
        emb2 = self.document_embedding(doc2, method)

        return F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()

    def find_similar_documents(self, query: str, documents: List[str], k: int = 3,
                               method: str = 'mean') -> List[Tuple[str, float]]:
        """Find k most similar documents to a query.

        Args:
            query: Query text.
            documents: Candidate documents.
            k: Maximum number of results; values <= 0 yield an empty list.
            method: Pooling method forwarded to ``document_embedding``.

        Returns:
            ``(document, similarity)`` pairs, highest similarity first.
        """
        # The query embedding is computed once and reused for every document.
        query_emb = self.document_embedding(query, method).unsqueeze(0)

        scored = []
        for doc in documents:
            doc_emb = self.document_embedding(doc, method).unsqueeze(0)
            scored.append((doc, F.cosine_similarity(query_emb, doc_emb).item()))

        # list.sort is stable, so tied documents keep their input order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        # Clamp k: a negative k would otherwise slice from the end of the list.
        return scored[:max(k, 0)]

# Document-similarity demo driven by glove_analyzer
doc_sim = DocumentSimilarity(glove_analyzer)

sample_documents = [
    "The cat sat on the mat and slept peacefully",
    "A dog ran quickly through the green park",
    "The king ruled his kingdom with great wisdom",
    "A small kitten played with a ball of yarn",
    "The queen wore a beautiful golden crown",
    "Cars and trucks drive on the busy highway",
    "Happy children played games in the sunny park",
    "The royal family lived in a magnificent palace",
]

query = "cute animals playing"
print(f"Query: '{query}'")
print("\nMost similar documents:")

# Rank the sample documents against the query and list the best five
similar_docs = doc_sim.find_similar_documents(query, sample_documents, k=5)
for rank, (doc, sim) in enumerate(similar_docs, start=1):
    print(f"{rank}. {doc} (similarity: {sim:.3f})")

In [None]:
class WordClustering:
    """Cluster words based on their embeddings.

    The analyzer is expected to provide ``get_embedding(word)``, returning a
    tensor or ``None`` for words it cannot embed.
    """

    def __init__(self, analyzer):
        self.analyzer = analyzer

    def cluster_words(self, words: List[str], n_clusters: int = 3) -> Dict[int, List[str]]:
        """Cluster words using K-means.

        Args:
            words: Candidate words; those without an embedding are dropped.
            n_clusters: Number of K-means clusters to fit.

        Returns:
            Mapping from cluster id to the words assigned to it. When fewer
            words have embeddings than ``n_clusters``, no clustering is run
            and every embeddable word is returned under cluster ``0``.
        """
        vectors = []
        valid_words = []

        for word in words:
            emb = self.analyzer.get_embedding(word)
            if emb is not None:
                # Tensor.numpy() raises for tensors that require grad or live
                # on an accelerator, so detach and move to CPU first.
                vectors.append(emb.detach().cpu().numpy())
                valid_words.append(word)

        if len(vectors) < n_clusters:
            return {0: valid_words}

        # Imported lazily so the early-return path above works without sklearn.
        from sklearn.cluster import KMeans

        # Fixed random_state keeps the assignment reproducible between runs.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(np.stack(vectors))

        # Group words by cluster; cast the numpy integer labels to plain int
        # so the keys match the declared Dict[int, ...] return type.
        clusters = defaultdict(list)
        for word, label in zip(valid_words, cluster_labels):
            clusters[int(label)].append(word)

        return dict(clusters)

# Word-clustering demo driven by glove_analyzer
clustering = WordClustering(glove_analyzer)

words_to_cluster = [
    # Animals
    "cat", "dog", "kitten", "puppy", "animal", "pet",
    # People/Royalty
    "king", "queen", "royal", "man", "woman", "person",
    # Vehicles
    "car", "truck", "vehicle", "road", "drive",
    # Emotions
    "happy", "sad", "joy", "anger", "emotion", "feeling",
    # Sizes
    "big", "small", "large", "tiny", "size",
]

print("Clustering words based on embeddings:")
clusters = clustering.cluster_words(words_to_cluster, n_clusters=5)

for cluster_id, words in clusters.items():
    print(f"\nCluster {cluster_id}: {words}")

    # A centroid is only reported for clusters with at least two words
    if len(words) <= 1:
        continue

    # Collect the members' embeddings, dropping any the analyzer cannot supply
    cluster_embeddings = [
        emb
        for emb in map(glove_analyzer.get_embedding, words)
        if emb is not None
    ]
    if not cluster_embeddings:
        continue

    # Centroid = mean of the member vectors
    centroid = torch.stack(cluster_embeddings).mean(dim=0)

    # Score every vocabulary embedding against the centroid, keep the best five
    similarities = F.cosine_similarity(centroid.unsqueeze(0), glove_analyzer.embeddings)
    top_indices = torch.topk(similarities, 5).indices
    centroid_words = [glove_analyzer.idx_to_word[i] for i in top_indices.tolist()]
    print(f"  Centroid neighbors: {centroid_words}")

## 8. Semantic Search Foundation {#semantic-search}

Let's build the foundation for semantic search using word embeddings.

In [None]:
class SemanticSearchEngine:
    """Simple semantic search engine using word embeddings.

    Documents are stored next to the mean of their word embeddings; a query
    is embedded the same way and documents are ranked by cosine similarity.

    The analyzer is expected to provide ``get_embedding(word)`` (a tensor, or
    ``None`` for unknown words) and an ``embeddings`` tensor whose second
    dimension is the embedding size.
    """

    def __init__(self, analyzer):
        self.analyzer = analyzer
        # Parallel lists: documents[i] is described by document_embeddings[i].
        self.documents = []
        self.document_embeddings = []

    def add_document(self, doc_id: str, content: str):
        """Add a document to the search index.

        Args:
            doc_id: Identifier reported back in search results.
            content: Document text; embedded once, at insertion time.
        """
        self.documents.append({'id': doc_id, 'content': content})
        self.document_embeddings.append(self._create_document_embedding(content))

    def _embed_words(self, words: List[str]) -> List[Tuple[str, torch.Tensor]]:
        """Return ``(word, embedding)`` pairs for the words the analyzer knows."""
        pairs = []
        for word in words:
            emb = self.analyzer.get_embedding(word)
            if emb is not None:
                pairs.append((word, emb))
        return pairs

    def _create_document_embedding(self, text: str) -> torch.Tensor:
        """Create embedding for a document.

        Returns:
            Mean of the embeddings of the known words in ``text`` (lower-cased,
            whitespace-split), or an all-zero vector when none is known.
        """
        pairs = self._embed_words(text.lower().split())

        if not pairs:
            # new_zeros keeps the dtype and device of the embedding table so
            # the fallback stacks cleanly with real document embeddings.
            table = self.analyzer.embeddings
            return table.new_zeros(table.size(1))

        return torch.stack([emb for _, emb in pairs]).mean(dim=0)

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Search for documents similar to the query.

        Args:
            query: Query text.
            k: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            Dicts with ``'id'``, ``'content'`` and ``'score'`` (cosine
            similarity), best match first. Empty when nothing is indexed.
        """
        # torch.topk rejects a negative k, so handle non-positive k here.
        if k <= 0 or not self.document_embeddings:
            return []

        query_emb = self._create_document_embedding(query)

        # Score all indexed documents in a single batched call.
        doc_emb_tensor = torch.stack(self.document_embeddings)
        similarities = F.cosine_similarity(query_emb.unsqueeze(0), doc_emb_tensor)

        top_k = torch.topk(similarities, min(k, len(self.documents)))

        results = []
        for score, idx in zip(top_k.values.tolist(), top_k.indices.tolist()):
            doc = self.documents[idx]
            results.append({
                'id': doc['id'],
                'content': doc['content'],
                'score': score
            })

        return results

    def explain_search(self, query: str, doc_content: str) -> Dict:
        """Explain why a document matches a query.

        For each query word with an embedding, the most similar document word
        (by cosine similarity) is reported.

        Returns:
            Dict with ``'query'``, ``'document'``, ``'word_matches'`` (a list
            of ``(query_word, document_word, similarity)`` tuples) and
            ``'overall_score'``, the cosine similarity of the two mean
            embeddings as a float -- the same measure ``search`` ranks by.
        """
        # Look up the document's embeddings once instead of once per query word.
        doc_pairs = self._embed_words(doc_content.lower().split())

        word_similarities = []
        for q_word, q_emb in self._embed_words(query.lower().split()):
            best_match = None
            best_score = float('-inf')

            for d_word, d_emb in doc_pairs:
                sim = F.cosine_similarity(q_emb.unsqueeze(0), d_emb.unsqueeze(0)).item()
                # Strict comparison keeps the first document word on ties.
                if sim > best_score:
                    best_score = sim
                    best_match = d_word

            if best_match is not None:
                word_similarities.append((q_word, best_match, best_score))

        query_emb = self._create_document_embedding(query)
        doc_emb = self._create_document_embedding(doc_content)

        return {
            'query': query,
            'document': doc_content,
            'word_matches': word_similarities,
            # Cosine (not a raw dot product) so the value agrees with search().
            'overall_score': F.cosine_similarity(query_emb.unsqueeze(0), doc_emb.unsqueeze(0)).item()
        }

# Semantic-search demo driven by glove_analyzer
search_engine = SemanticSearchEngine(glove_analyzer)

# (document id, text) pairs to index
sample_docs = [
    ("doc1", "Cats are wonderful pets that love to play and sleep"),
    ("doc2", "Dogs are loyal animals that enjoy running in parks"),
    ("doc3", "The king and queen ruled their kingdom wisely"),
    ("doc4", "Small kittens are adorable and playful creatures"),
    ("doc5", "Royal families live in magnificent palaces"),
    ("doc6", "Happy children play games in sunny weather"),
    ("doc7", "Cars and trucks transport people and goods"),
    ("doc8", "Emotions like joy and sadness are part of life"),
]

for doc_id, content in sample_docs:
    search_engine.add_document(doc_id, content)

print(f"Added {len(sample_docs)} documents to search engine")

# Queries to run against the index
test_queries = [
    "cute animals",
    "royal palace",
    "happy kids",
    "vehicle transportation",
]

for query in test_queries:
    print(f"\n=== Search: '{query}' ===")
    results = search_engine.search(query, k=3)

    for position, hit in enumerate(results, start=1):
        print(f"{position}. {hit['content']} (score: {hit['score']:.3f})")

    # Nothing to explain when the search came back empty
    if not results:
        continue

    # Word-level breakdown for the highest-ranked document
    explanation = search_engine.explain_search(query, results[0]['content'])
    print("\nExplanation for top result:")
    for q_word, d_word, score in explanation['word_matches']:
        print(f"  '{q_word}' matches '{d_word}' (similarity: {score:.3f})")

## Summary and Next Steps

In this tutorial, we've covered:

### What We Learned
1. **Word Embedding Fundamentals**: How embeddings capture semantic relationships
2. **Word2Vec Implementation**: Skip-gram model from scratch using PyTorch
3. **GloVe Embeddings**: Working with pre-trained global vectors
4. **Vector Arithmetic**: Mathematical operations revealing word relationships
5. **Visualization**: PCA and t-SNE for understanding embedding spaces
6. **Real-world Applications**: Document similarity, clustering, and semantic search

### Key Insights
- **Distributional Hypothesis**: Words in similar contexts have similar meanings
- **Dense Representations**: Much more efficient and informative than sparse one-hot vectors
- **Transfer Learning**: Pre-trained embeddings work across different tasks
- **Semantic Relationships**: Vector arithmetic captures linguistic relationships

### Next Steps
1. **Modern Transformers**: Move to contextualized embeddings (BERT, GPT)
2. **Sentence Embeddings**: Learn sentence-transformers for document-level semantics
3. **Multilingual Models**: Explore cross-lingual embeddings
4. **Production Systems**: Scale semantic search to large document collections
5. **Domain Adaptation**: Fine-tune embeddings for specific domains

### Engineering Takeaways
- Start with pre-trained embeddings for quick prototyping
- Custom training pays off for domain-specific applications
- Visualizations are crucial for understanding and debugging
- Semantic search enables powerful information retrieval systems

In [None]:
# Final demonstration: Interactive word exploration
def interactive_word_explorer(analyzer, word):
    """Print a report on one word's embedding.

    The report lists the word's five nearest neighbors, basic statistics of
    its embedding vector (dimension, L2 norm, mean, standard deviation) and
    the top result of a few fixed vector-arithmetic probes in which the word
    is the subtracted term. A probe is skipped when any word it needs is
    missing from ``analyzer.word_to_idx``.

    Args:
        analyzer: Object exposing ``word_to_idx``, ``find_nearest_neighbors``,
            ``get_embedding`` and ``vector_arithmetic``.
        word: The word to explore.

    Returns:
        None. If ``word`` is not in the vocabulary, a message is printed and
        nothing else happens.
    """
    vocabulary = analyzer.word_to_idx
    if word not in vocabulary:
        print(f"Word '{word}' not found in vocabulary")
        return

    print(f"=== Exploring '{word}' ===")

    # --- nearest neighbors ---
    closest = analyzer.find_nearest_neighbors(word, k=5)
    print("\nNearest neighbors:")
    for neighbor, score in closest:
        print(f"  {neighbor}: {score:.3f}")

    # --- statistics of the raw vector ---
    vector = analyzer.get_embedding(word)
    print("\nEmbedding properties:")
    print(f"  Dimension: {vector.size(0)}")
    print(f"  L2 norm: {torch.norm(vector).item():.3f}")
    print(f"  Mean value: {vector.mean().item():.3f}")
    print(f"  Std deviation: {vector.std().item():.3f}")

    # --- arithmetic probes: (added words, label); `word` is always subtracted ---
    probes = (
        (["king", "man"], "royal analogy"),
        (["big", "small"], "size analogy"),
        (["happy", "sad"], "emotion analogy"),
    )

    print("\nAnalogy tests:")
    for pos_words, description in probes:
        neg_words = [word]
        if any(w not in vocabulary for w in pos_words + neg_words):
            continue

        result = analyzer.vector_arithmetic(pos_words, neg_words, k=3)
        if not result:
            continue

        best = result[0]
        added = ' + '.join(pos_words)
        removed = ' - '.join(neg_words)
        print(f"  {description}: {added} - {removed} = {best[0]} ({best[1]:.3f})")

# Run the explorer on a few sample words, printing a divider after each report
words_to_explore = ["cat", "king", "happy"]
for word in words_to_explore:
    interactive_word_explorer(glove_analyzer, word)
    print(f"\n{'=' * 50}\n")

# Closing messages
print("Tutorial completed! You now have a solid foundation in word embeddings.")
print("Next: Explore sentence-transformers and modern contextualized embeddings!")