# Word Embeddings: PyTorch vs TensorFlow

**Learning Objectives:**
- Master embedding layer creation in both frameworks
- Compare embedding training and usage patterns
- Learn pre-trained embedding integration
- Understand embedding visualization and analysis

**Prerequisites:** Text preprocessing, tensor operations

**Estimated Time:** 35 minutes

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Add src to path
sys.path.append(os.path.join('..', '..', 'src'))

from foundations.data_utils import get_tutorial_text_data
from foundations.preprocessing import TextPreprocessor
from utils.comparison_tools import create_side_by_side_comparison

# Probe the optional deep-learning backends. Each probe is independent, so
# the rest of the notebook can branch on the two *_AVAILABLE flags.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
except ImportError:
    PYTORCH_AVAILABLE = False
    print("‚ùå PyTorch not available")
else:
    PYTORCH_AVAILABLE = True
    print(f"‚úÖ PyTorch {torch.__version__} available")

try:
    import tensorflow as tf
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("‚ùå TensorFlow not available")
else:
    TENSORFLOW_AVAILABLE = True
    print(f"‚úÖ TensorFlow {tf.__version__} available")

# Seed NumPy and every backend that imported successfully with the same value.
_SEED = 42
np.random.seed(_SEED)
if PYTORCH_AVAILABLE:
    torch.manual_seed(_SEED)
if TENSORFLOW_AVAILABLE:
    tf.random.set_seed(_SEED)

## 1. Creating Embedding Layers

Comparing how to create and initialize embedding layers.

In [None]:
# Section banner.
print("=" * 60, "CREATING EMBEDDING LAYERS", "=" * 60, sep="\n")

# Load the tutorial corpus; the helper's result is indexed by 'texts' and
# 'labels' (exact element types are defined by the project helper).
text_data = get_tutorial_text_data(num_samples=200)
texts, labels = text_data['texts'], text_data['labels']

# Fit the vocabulary on the corpus.
preprocessor = TextPreprocessor()
preprocessor.build_vocabulary(texts, min_freq=2, max_vocab_size=1000)

# Sizes shared by every embedding layer created below.
embedding_dim = 50
vocab_size = preprocessor.vocab_size

print(f"Vocabulary size: {vocab_size}")
print(f"Embedding dimension: {embedding_dim}")

# PyTorch: build an embedding table and look up a few indices.
if PYTORCH_AVAILABLE:
    print("\nüî• PyTorch Embedding Layer:")

    # padding_idx=0 reserves row 0 as the padding entry.
    pt_embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        padding_idx=0,
    )

    print(
        f"Embedding layer: {pt_embedding}",
        f"Weight shape: {pt_embedding.weight.shape}",
        f"Padding index: {pt_embedding.padding_idx}",
        sep="\n",
    )

    # Look up three ordinary indices followed by the padding index.
    sample_indices = torch.tensor([1, 5, 10, 0])
    embedded = pt_embedding(sample_indices)

    print(f"\nSample indices: {sample_indices}")
    print(f"Embedded shape: {embedded.shape}")
    # The last looked-up row belongs to index 0; show its first five values.
    print(f"Padding vector (should be zeros): {embedded[-1][:5]}...")

# TensorFlow embedding layer
if TENSORFLOW_AVAILABLE:
    print("\nüü† TensorFlow Embedding Layer:")

    # Create embedding layer.
    # NOTE: mask_zero=True is the closest Keras counterpart of PyTorch's
    # padding_idx=0, but it is not equivalent: Keras only produces a boolean
    # mask telling downstream layers to skip index 0. Row 0 of the weight
    # matrix is initialised like every other row and is not forced to zero.
    tf_embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=True  # Mark index 0 as padding via a mask
    )

    # Calling the layer once builds its weights
    sample_indices = tf.constant([1, 5, 10, 0])
    embedded = tf_embedding(sample_indices)
    # The mask is what actually carries the padding information in Keras
    padding_mask = tf_embedding.compute_mask(sample_indices)

    print(f"Embedding layer: {tf_embedding}")
    print(f"Weight shape: {tf_embedding.weights[0].shape}")
    print(f"Mask zero: {tf_embedding.mask_zero}")

    print(f"\nSample indices: {sample_indices}")
    print(f"Embedded shape: {embedded.shape}")
    print(f"Padding mask (False marks padding): {padding_mask}")
    print(f"Padding vector (masked, not zeroed): {embedded[3][:5]}...")  # First 5 dims

# Side-by-side comparison: source snippets shown to the reader.
# The PyTorch snippet re-zeroes the padding row after the custom init,
# because nn.init.normal_ fills the whole weight matrix (padding row
# included) and would otherwise contradict the "zero vector" comment.
pytorch_embedding_code = """
import torch
import torch.nn as nn

# Create embedding layer
embedding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=0  # Index 0 will be zero vector
)

# Initialize with custom weights (optional)
nn.init.normal_(embedding.weight, mean=0, std=0.1)
with torch.no_grad():
    # The init above also overwrote the padding row - zero it again
    embedding.weight[embedding.padding_idx].zero_()

# Use embedding
indices = torch.tensor([1, 5, 10, 0])
embedded_vectors = embedding(indices)
print(f"Shape: {embedded_vectors.shape}")

# Freeze embeddings (optional)
embedding.weight.requires_grad = False
"""

tensorflow_embedding_code = """
import tensorflow as tf

# Create embedding layer
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,  # Index 0 will be masked
    embeddings_initializer='normal'
)

# Use embedding
indices = tf.constant([1, 5, 10, 0])
embedded_vectors = embedding(indices)
print(f"Shape: {embedded_vectors.shape}")

# Freeze embeddings (optional)
embedding.trainable = False
"""

# Render both snippets under one title and print the result.
# (The exact layout is decided by the project helper.)
comparison_view = create_side_by_side_comparison(
    pytorch_embedding_code,
    tensorflow_embedding_code,
    "Embedding Layer Creation",
)
print(comparison_view)

## 2. Training Embeddings

Training embeddings as part of a simple classification model.

In [None]:
# Section banner (preceded by a blank line).
print("\n" + "=" * 60, "TRAINING EMBEDDINGS", "=" * 60, sep="\n")

# Encode every text with the fitted vocabulary (max_length=20) and convert
# both inputs and targets to NumPy arrays.
sequences = np.array(preprocessor.texts_to_sequences(texts, max_length=20))
labels_array = np.array(labels)

print(f"Data shape: {sequences.shape}")
print(f"Labels shape: {labels_array.shape}")
print(f"Number of classes: {len(set(labels))}")

# PyTorch training
if PYTORCH_AVAILABLE:
    print("\nüî• PyTorch Embedding Training:")

    class SimpleClassifier(nn.Module):
        """Bag-of-embeddings classifier: embed, mean-pool, dropout, linear.

        Index 0 is the padding token. Pooling averages over the
        non-padding positions only, so the prediction for a sequence does
        not depend on how much padding was appended to it. This is the same
        behaviour as a masked Keras ``GlobalAveragePooling1D``.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            num_classes: Number of output logits.
        """

        def __init__(self, vocab_size, embedding_dim, num_classes):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
            self.fc = nn.Linear(embedding_dim, num_classes)
            self.dropout = nn.Dropout(0.2)

        def forward(self, x):
            """Return class logits for a batch of token-index sequences.

            Args:
                x: Integer tensor of shape (batch_size, seq_len).

            Returns:
                Tensor of shape (batch_size, num_classes) with raw logits.
            """
            embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)

            # 1.0 for real tokens, 0.0 for padding; trailing axis broadcasts
            # over the embedding dimension.
            token_mask = (x != self.embedding.padding_idx).unsqueeze(-1).to(embedded.dtype)

            # Clamp so an all-padding sequence yields a zero vector rather
            # than dividing by zero.
            token_counts = token_mask.sum(dim=1).clamp(min=1.0)
            pooled = (embedded * token_mask).sum(dim=1) / token_counts  # (batch_size, embedding_dim)
            pooled = self.dropout(pooled)

            return self.fc(pooled)  # (batch_size, num_classes)

    # Model, loss and optimizer.
    pt_model = SimpleClassifier(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        num_classes=len(set(labels)),
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(pt_model.parameters(), lr=0.001)

    print(f"Model: {pt_model}")

    # Inputs and targets as int64 tensors.
    X_train = torch.tensor(sequences, dtype=torch.long)
    y_train = torch.tensor(labels_array, dtype=torch.long)

    # Five full-batch gradient steps; the loss is reported on even epochs.
    pt_model.train()
    for epoch in range(5):
        optimizer.zero_grad()

        outputs = pt_model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()

        optimizer.step()

        if not epoch % 2:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    # Export the learned embedding table as a NumPy array.
    pt_trained_embeddings = pt_model.embedding.weight.detach().numpy()
    print(f"\nTrained embeddings shape: {pt_trained_embeddings.shape}")

# TensorFlow: same architecture expressed as a Keras Sequential model.
if TENSORFLOW_AVAILABLE:
    print("\nüü† TensorFlow Embedding Training:")

    # Embedding -> average pooling -> dropout -> softmax classifier.
    tf_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            mask_zero=True,
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(rate=0.2),
        tf.keras.layers.Dense(units=len(set(labels)), activation='softmax'),
    ])

    tf_model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    print("Model summary:")
    # Build with an explicit (batch, 20) input shape before summarising.
    tf_model.build(input_shape=(None, 20))
    tf_model.summary()

    # Fit for five epochs in mini-batches of 32.
    history = tf_model.fit(
        x=sequences,
        y=labels_array,
        batch_size=32,
        epochs=5,
        verbose=1,
    )

    # First weight array of the first layer is the embedding matrix.
    tf_trained_embeddings = tf_model.layers[0].get_weights()[0]
    print(f"\nTrained embeddings shape: {tf_trained_embeddings.shape}")

# Compare embedding similarities
if PYTORCH_AVAILABLE and TENSORFLOW_AVAILABLE:
    print("\nüìä Embedding Analysis:")

    # Find most similar words to a target word
    def find_similar_words(embeddings, word_to_idx, idx_to_word, target_word, top_k=5):
        """Return the words whose embeddings are closest to ``target_word``.

        Closeness is cosine similarity. Rows with zero norm (for example a
        zeroed padding row) have no defined cosine similarity and are never
        returned. The target word itself is excluded explicitly by index
        rather than by assuming it sorts first.

        Args:
            embeddings: 2-D array, one embedding vector per vocabulary index.
            word_to_idx: Mapping from word to row index.
            idx_to_word: Mapping from row index to word.
            target_word: Word to find neighbours for.
            top_k: Maximum number of neighbours to return.

        Returns:
            List of ``(word, similarity)`` tuples in descending similarity
            order. Empty when ``target_word`` is unknown or its embedding
            has zero norm.
        """
        if target_word not in word_to_idx:
            return []

        embeddings = np.asarray(embeddings)
        target_idx = word_to_idx[target_word]
        target_embedding = embeddings[target_idx]

        dots = embeddings @ target_embedding
        denominators = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(target_embedding)

        # Divide only where the denominator is non-zero. Undefined entries
        # stay at -inf so they sort last and never become NaN (NaN would
        # sort ahead of every real score once the order is reversed).
        similarities = np.full(dots.shape, -np.inf, dtype=float)
        defined = denominators > 0
        similarities[defined] = dots[defined] / denominators[defined]

        # Rank on a copy in which the target itself can never win.
        ranking_scores = similarities.copy()
        ranking_scores[target_idx] = -np.inf
        candidates = np.argsort(ranking_scores)[::-1][:max(top_k, 0)]

        return [
            (idx_to_word[int(idx)], similarities[idx])
            for idx in candidates
            if np.isfinite(ranking_scores[idx]) and int(idx) in idx_to_word
        ]

    # Probe word: "good" when the vocabulary has it, otherwise the entry at
    # position 10 of the word-to-index mapping.
    vocab_lookup = preprocessor.word_to_idx
    if "good" in vocab_lookup:
        test_word = "good"
    else:
        test_word = list(vocab_lookup)[10]

    print(f"\nWords similar to '{test_word}':")

    # Nearest neighbours according to the PyTorch-trained table.
    pt_similar = find_similar_words(
        pt_trained_embeddings,
        vocab_lookup,
        preprocessor.idx_to_word,
        test_word,
    )
    print(f"PyTorch: {pt_similar}")

    # Nearest neighbours according to the TensorFlow-trained table.
    tf_similar = find_similar_words(
        tf_trained_embeddings,
        vocab_lookup,
        preprocessor.idx_to_word,
        test_word,
    )
    print(f"TensorFlow: {tf_similar}")

## 3. Embedding Visualization

Visualizing learned embeddings using dimensionality reduction.

In [None]:
# Section banner (preceded by a blank line).
print("\n" + "=" * 60, "EMBEDDING VISUALIZATION", "=" * 60, sep="\n")

# Choose which trained table to plot: PyTorch takes precedence, TensorFlow
# is the fallback, and None means there is nothing to draw.
if PYTORCH_AVAILABLE:
    embeddings_to_plot, framework_name = pt_trained_embeddings, "PyTorch"
elif TENSORFLOW_AVAILABLE:
    embeddings_to_plot, framework_name = tf_trained_embeddings, "TensorFlow"
else:
    print("No frameworks available for visualization")
    embeddings_to_plot = None

if embeddings_to_plot is not None:
    # Take up to `top_words` vocabulary entries, starting at index 1 so the
    # padding slot (index 0) is skipped.
    top_words = 50
    upper_bound = min(top_words + 1, len(preprocessor.idx_to_word))
    word_indices = list(range(1, upper_bound))
    selected_words = [preprocessor.idx_to_word[idx] for idx in word_indices]
    selected_embeddings = embeddings_to_plot[word_indices]

    print(f"Visualizing {len(selected_words)} words using {framework_name} embeddings")

    # Linear 2-D projection.
    pca = PCA(n_components=2)
    embeddings_2d_pca = pca.fit_transform(selected_embeddings)

    # t-SNE is only attempted for 30 words or fewer; otherwise the second
    # panel is omitted.
    embeddings_2d_tsne = None
    if len(selected_words) <= 30:
        tsne = TSNE(
            n_components=2,
            random_state=42,
            perplexity=min(5, len(selected_words) - 1),
        )
        embeddings_2d_tsne = tsne.fit_transform(selected_embeddings)

    # One panel for PCA, plus a second one when t-SNE was computed. With a
    # single panel, wrap the lone Axes in a list so indexing works either way.
    has_tsne = embeddings_2d_tsne is not None
    fig, axes = plt.subplots(1, 2 if has_tsne else 1, figsize=(15, 6))
    if not has_tsne:
        axes = [axes]

    # --- PCA panel ---
    pca_axis = axes[0]
    pca_axis.scatter(embeddings_2d_pca[:, 0], embeddings_2d_pca[:, 1], alpha=0.7)

    # Label only the first 20 points.
    for word, point in zip(selected_words[:20], embeddings_2d_pca):
        pca_axis.annotate(word, (point[0], point[1]),
                          xytext=(5, 5), textcoords='offset points', fontsize=8)

    pc1_share, pc2_share = pca.explained_variance_ratio_
    pca_axis.set_title(f'{framework_name} Embeddings - PCA')
    pca_axis.set_xlabel(f'PC1 (explained variance: {pc1_share:.2%})')
    pca_axis.set_ylabel(f'PC2 (explained variance: {pc2_share:.2%})')
    pca_axis.grid(True, alpha=0.3)

    # --- t-SNE panel (only when it was computed) ---
    if has_tsne:
        tsne_axis = axes[1]
        tsne_axis.scatter(embeddings_2d_tsne[:, 0], embeddings_2d_tsne[:, 1], alpha=0.7)

        # Every point gets a label here.
        for word, point in zip(selected_words, embeddings_2d_tsne):
            tsne_axis.annotate(word, (point[0], point[1]),
                               xytext=(5, 5), textcoords='offset points', fontsize=8)

        tsne_axis.set_title(f'{framework_name} Embeddings - t-SNE')
        tsne_axis.set_xlabel('t-SNE 1')
        tsne_axis.set_ylabel('t-SNE 2')
        tsne_axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Summary statistics over the plotted vectors.
    print("\nEmbedding Statistics:")
    embedding_norms = np.linalg.norm(selected_embeddings, axis=1)
    print(f"Mean embedding norm: {np.mean(embedding_norms):.4f}")
    print(f"Std embedding norm: {np.std(embedding_norms):.4f}")
    print(f"PCA explained variance (first 2 components): {pca.explained_variance_ratio_[:2].sum():.2%}")

# Embedding analysis
print("\nüìà Embedding Quality Analysis:")

if PYTORCH_AVAILABLE and TENSORFLOW_AVAILABLE:
    # Compare embedding spaces
    def embedding_similarity_analysis(emb1, emb2, name1, name2):
        """Print and return the mean row-wise cosine similarity of two tables.

        Row 0 (the padding slot) is dropped from both tables and the rest
        are compared index by index. If the tables differ in length, only
        the overlapping rows are compared. Pairs in which either vector has
        zero norm have no defined cosine similarity and are left out of the
        average instead of turning the whole result into NaN.

        Args:
            emb1: 2-D array of embeddings, one row per vocabulary index.
            emb2: 2-D array of embeddings, one row per vocabulary index.
            name1: Label for ``emb1`` used in the printed message.
            name2: Label for ``emb2`` used in the printed message.

        Returns:
            The average cosine similarity as a float, or NaN when no pair
            of rows can be compared.
        """
        # Exclude padding embedding (index 0)
        emb1_clean = np.asarray(emb1)[1:]
        emb2_clean = np.asarray(emb2)[1:]

        # Compare only the rows present in both tables.
        n_pairs = min(len(emb1_clean), len(emb2_clean))
        emb1_clean = emb1_clean[:n_pairs]
        emb2_clean = emb2_clean[:n_pairs]

        # Row-wise dot products and norm products in one vectorised pass.
        dots = (emb1_clean * emb2_clean).sum(axis=1)
        norm_products = (
            np.linalg.norm(emb1_clean, axis=1) * np.linalg.norm(emb2_clean, axis=1)
        )

        comparable = norm_products > 0
        if comparable.any():
            avg_similarity = float(np.mean(dots[comparable] / norm_products[comparable]))
        else:
            avg_similarity = float("nan")

        print(f"Average cosine similarity between {name1} and {name2} embeddings: {avg_similarity:.4f}")

        return avg_similarity

    # Score the PyTorch table against the TensorFlow table.
    similarity = embedding_similarity_analysis(
        emb1=pt_trained_embeddings,
        emb2=tf_trained_embeddings,
        name1="PyTorch",
        name2="TensorFlow",
    )

    # Above 0.5 counts as agreement; anything else gets the caveat message.
    print(
        "‚úÖ Embeddings show good agreement between frameworks"
        if similarity > 0.5
        else "‚ö†Ô∏è Embeddings show significant differences - this is normal for different initializations"
    )

# Closing summary: a heading followed by four bullet lines, one per line.
print(
    "\nüí° Key Insights:",
    "‚Ä¢ Embeddings learn semantic relationships during training",
    "‚Ä¢ Similar words cluster together in the embedding space",
    "‚Ä¢ Both frameworks can learn meaningful representations",
    "‚Ä¢ Visualization helps understand what the model has learned",
    sep="\n",
)