# Position Embeddings: Sinusoidal/Cosine vs RoPE

## Overview

**Sinusoidal/Cosine Position Embedding (Original Transformer):**
- Fixed, non-learnable embeddings
- Uses sine and cosine functions with different frequencies
- Added to token embeddings
- Formula: `PE(pos, 2i) = sin(pos / 10000^(2i/d_model))` and `PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))`

**RoPE (Rotary Position Embedding):**
- Rotates query and key vectors by an angle proportional to position
- Applied during attention computation, not added to embeddings
- Better extrapolation to longer sequences
- Used in modern LLMs (LLaMA, GPT-NeoX, PaLM)

In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi

In [2]:
# ============================================================================
# Sinusoidal/Cosine Position Embedding (Original Transformer)
# ============================================================================

def create_sinusoidal_position_embedding(max_len: int, d_model: int):
    """
    Create sinusoidal position embeddings as in the original Transformer paper.

    Even columns hold ``sin(pos * w_i)`` and odd columns hold ``cos(pos * w_i)``
    where ``w_i = 10000^(-2i/d_model)``.

    Args:
        max_len: Maximum sequence length (number of rows)
        d_model: Model dimension (number of columns). May be odd, in which
            case the last column is a sine column with no cosine partner.

    Returns:
        Position embedding matrix of shape (max_len, d_model)
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

    # Inverse wavelengths 10000^(-2i/d_model), one per even column index 2i,
    # computed through exp/log.
    div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                        (-np.log(10000.0) / d_model))
    angles = position * div_term  # shape: (max_len, ceil(d_model / 2))

    # Sine goes to the even columns
    pe[:, 0::2] = torch.sin(angles)
    # Cosine goes to the odd columns. When d_model is odd there is one fewer
    # odd column than even columns, so drop the unpaired last angle column.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    return pe


# Demo: build the table for 100 positions x 64 dimensions and print both ends.
# `max_len`, `d_model` and `sinusoidal_pe` are reused by later cells.
max_len, d_model = 100, 64
sinusoidal_pe = create_sinusoidal_position_embedding(max_len, d_model)

print("=" * 70, "Sinusoidal/Cosine Position Embedding", "=" * 70, sep="\n")
print(f"Shape: {sinusoidal_pe.shape}")
print("\nFirst 5 positions, first 8 dimensions:", sinusoidal_pe[:5, :8], sep="\n")
print("\nLast 5 positions, first 8 dimensions:", sinusoidal_pe[-5:, :8], sep="\n")

Sinusoidal/Cosine Position Embedding
Shape: torch.Size([100, 64])

First 5 positions, first 8 dimensions:
tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.6816,  0.7318,  0.5332,  0.8460,  0.4093,  0.9124],
        [ 0.9093, -0.4161,  0.9975,  0.0709,  0.9021,  0.4315,  0.7469,  0.6649],
        [ 0.1411, -0.9900,  0.7783, -0.6279,  0.9933, -0.1160,  0.9536,  0.3010],
        [-0.7568, -0.6536,  0.1415, -0.9899,  0.7785, -0.6277,  0.9933, -0.1157]])

Last 5 positions, first 8 dimensions:
tensor([[ 0.6833,  0.7302,  0.8504, -0.5262, -0.0154, -0.9999,  0.7029, -0.7112],
        [ 0.9836, -0.1804,  0.2636, -0.9646, -0.5461, -0.8377,  0.3503, -0.9367],
        [ 0.3796, -0.9251, -0.4645, -0.8856, -0.9086, -0.4176, -0.0638, -0.9980],
        [-0.5734, -0.8193, -0.9435, -0.3314, -0.9914,  0.1312, -0.4667, -0.8844],
        [-0.9992,  0.0398, -0.9163,  0.4005, -0.7687,  0.6396, -0.7878, -0.6159]])


In [3]:
# ============================================================================
# RoPE (Rotary Position Embedding) Implementation
# ============================================================================

def apply_rope(x: torch.Tensor, freqs: torch.Tensor):
    """
    Apply rotary position embedding to input tensor.

    Consecutive feature pairs ``(x[..., 2i], x[..., 2i+1])`` are treated as 2D
    points and rotated by the angle ``freqs[pos, i]``.

    Args:
        x: Input tensor of shape (..., seq_len, d_model) with even d_model.
            Any number of leading batch/head dimensions is accepted,
            including none (a plain (seq_len, d_model) matrix).
        freqs: Rotation angles of shape (seq_len, d_model // 2)

    Returns:
        Rotated tensor of same shape as x
    """
    # Split the feature axis into pairs: (..., seq_len, d_model // 2, 2)
    x_pairs = x.reshape(*x.shape[:-1], x.shape[-1] // 2, 2)

    # First and second component of every pair (real / imaginary parts)
    x1, x2 = x_pairs[..., 0], x_pairs[..., 1]

    # freqs is (seq_len, d_model // 2), which lines up with the trailing two
    # axes of x1/x2. Plain broadcasting therefore handles any number of
    # leading dimensions, unbatched 2D inputs included; no explicit expand.
    cos_freqs = torch.cos(freqs)
    sin_freqs = torch.sin(freqs)

    # 2D rotation: [x1', x2'] = [x1*cos - x2*sin, x1*sin + x2*cos]
    x1_rotated = x1 * cos_freqs - x2 * sin_freqs
    x2_rotated = x1 * sin_freqs + x2 * cos_freqs

    # Re-interleave the rotated pairs and restore the input shape
    x_rotated = torch.stack([x1_rotated, x2_rotated], dim=-1)
    return x_rotated.reshape(x.shape)


def create_rope_frequencies(max_len: int, d_model: int, theta: float = 10000.0):
    """
    Build the table of RoPE rotation angles.

    Entry ``[p, i]`` equals ``p * theta^(-2i/d_model)``: the angle applied to
    feature pair ``i`` at position ``p``.

    Args:
        max_len: Maximum sequence length (number of rows)
        d_model: Model dimension (must be even)
        theta: Base frequency parameter

    Returns:
        Frequencies tensor of shape (max_len, d_model // 2)
    """
    # Exponent 2i/d_model for every feature pair (even indices 0, 2, 4, ...)
    pair_exponents = torch.arange(0, d_model, 2, dtype=torch.float32) / d_model
    inv_freq = 1.0 / (theta ** pair_exponents)

    # Column of positions times row of inverse frequencies -> full angle table
    pos_column = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
    return pos_column * inv_freq


# Demo: angle table for the same max_len / d_model as the sinusoidal example.
# `rope_freqs` is reused by the RoPE visualization cell.
rope_freqs = create_rope_frequencies(max_len, d_model)

print("=" * 70, "RoPE (Rotary Position Embedding)", "=" * 70, sep="\n")
print(f"Frequencies shape: {rope_freqs.shape}")
print("\nFirst 5 positions, first 4 frequency pairs:", rope_freqs[:5, :4], sep="\n")
print("\nLast 5 positions, first 4 frequency pairs:", rope_freqs[-5:, :4], sep="\n")

RoPE (Rotary Position Embedding)
Frequencies shape: torch.Size([100, 32])

First 5 positions, first 4 frequency pairs:
tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [1.0000, 0.7499, 0.5623, 0.4217],
        [2.0000, 1.4998, 1.1247, 0.8434],
        [3.0000, 2.2497, 1.6870, 1.2651],
        [4.0000, 2.9996, 2.2494, 1.6868]])

Last 5 positions, first 4 frequency pairs:
tensor([[95.0000, 71.2400, 53.4224, 40.0612],
        [96.0000, 71.9898, 53.9848, 40.4829],
        [97.0000, 72.7397, 54.5471, 40.9046],
        [98.0000, 73.4896, 55.1095, 41.3263],
        [99.0000, 74.2395, 55.6718, 41.7480]])


In [None]:
# ============================================================================
# Visualization: Sinusoidal Position Embeddings
# ============================================================================
# Four views of the first 50 rows of `sinusoidal_pe` laid out on a 2x2 grid.

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_heat, ax_lines), (ax_freq, ax_img) = axes
first_50 = sinusoidal_pe[:50, :]

# Panel 1 (top-left): seaborn heatmap, dimensions on the vertical axis
sns.heatmap(first_50.T, cmap='coolwarm', center=0,
            ax=ax_heat, cbar_kws={'label': 'Embedding Value'})
ax_heat.set_title('Sinusoidal Position Embeddings Heatmap\n(First 50 positions, all dimensions)',
                  fontsize=12, fontweight='bold')
ax_heat.set_xlabel('Position')
ax_heat.set_ylabel('Dimension')

# Panel 2 (top-right): the leading columns as curves; even columns are the
# sine channels (solid), odd columns the cosine channels (dashed)
for col in range(min(8, d_model)):
    if col % 2 == 0:
        curve_label, curve_style = f'Dim {col} (sin)', '-'
    else:
        curve_label, curve_style = f'Dim {col} (cos)', '--'
    ax_lines.plot(first_50[:, col].numpy(),
                  label=curve_label, linestyle=curve_style, alpha=0.7)
ax_lines.set_title('Sinusoidal Embeddings Across Positions\n(First 8 dimensions)',
                   fontsize=12, fontweight='bold')
ax_lines.set_xlabel('Position')
ax_lines.set_ylabel('Embedding Value')
ax_lines.legend(loc='upper right', fontsize=8)
ax_lines.grid(True, alpha=0.3)

# Panel 3 (bottom-left): a spread of sine columns to show the frequency
# changing with the dimension index
dim_indices = [0, 2, 4, 8, 16, 32]
for dim in [d for d in dim_indices if d < d_model]:
    ax_freq.plot(first_50[:, dim].numpy(), label=f'Dim {dim}', alpha=0.7)
ax_freq.set_title('Different Frequencies for Different Dimensions',
                  fontsize=12, fontweight='bold')
ax_freq.set_xlabel('Position')
ax_freq.set_ylabel('Embedding Value')
ax_freq.legend()
ax_freq.grid(True, alpha=0.3)

# Panel 4 (bottom-right): raw image view of the same matrix, origin at the bottom
im = ax_img.imshow(first_50.T, aspect='auto', cmap='coolwarm',
                   interpolation='nearest', origin='lower')
ax_img.set_title('2D View: Position Embeddings', fontsize=12, fontweight='bold')
ax_img.set_xlabel('Position')
ax_img.set_ylabel('Dimension')
plt.colorbar(im, ax=ax_img, label='Embedding Value')

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Visualization: RoPE Frequencies
# ============================================================================
# 2x2 grid: angle heatmap, per-pair angle curves, an attention-score map for
# random RoPE-rotated q/k, and per-position angle profiles.

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_heat, ax_pairs), (ax_attn, ax_pos) = axes

# Panel 1 (top-left): heatmap of the angle table for the first 50 positions
sns.heatmap(rope_freqs[:50, :].T, cmap='viridis',
            ax=ax_heat, cbar_kws={'label': 'Frequency Value'})
ax_heat.set_title('RoPE Frequencies Heatmap\n(First 50 positions, all frequency pairs)',
                  fontsize=12, fontweight='bold')
ax_heat.set_xlabel('Position')
ax_heat.set_ylabel('Frequency Pair Index')

# Panel 2 (top-right): angle versus position for the leading feature pairs
for dim_pair in range(min(8, d_model // 2)):
    ax_pairs.plot(rope_freqs[:50, dim_pair].numpy(),
                  label=f'Pair {dim_pair}', alpha=0.7)
ax_pairs.set_title('RoPE Frequencies Across Positions\n(First 8 dimension pairs)',
                   fontsize=12, fontweight='bold')
ax_pairs.set_xlabel('Position')
ax_pairs.set_ylabel('Frequency Value')
ax_pairs.legend(loc='upper right', fontsize=8)
ax_pairs.grid(True, alpha=0.3)

# Panel 3 (bottom-left): rotate random queries/keys and show their raw
# (unscaled, no softmax) dot-product scores
seq_len = 10
sample_q = torch.randn(seq_len, d_model)
sample_k = torch.randn(seq_len, d_model)

rope_freqs_sample = create_rope_frequencies(seq_len, d_model)
q_rotated = apply_rope(sample_q, rope_freqs_sample)
k_rotated = apply_rope(sample_k, rope_freqs_sample)

attention_scores = q_rotated @ k_rotated.transpose(-2, -1)

sns.heatmap(attention_scores.detach().numpy(), cmap='viridis',
            ax=ax_attn, cbar_kws={'label': 'Attention Score'})
ax_attn.set_title('Attention Scores with RoPE\n(Query @ Key^T)',
                  fontsize=12, fontweight='bold')
ax_attn.set_xlabel('Key Position')
ax_attn.set_ylabel('Query Position')

# Panel 4 (bottom-right): angle profile over the first 20 pairs at a few positions
positions_to_show = [0, 5, 10, 20, 30]
for pos in [p for p in positions_to_show if p < max_len]:
    ax_pos.plot(rope_freqs[pos, :20].numpy(),
                label=f'Pos {pos}', marker='o', markersize=4, alpha=0.7)
ax_pos.set_title('RoPE Frequencies for Different Positions\n(First 20 pairs)',
                 fontsize=12, fontweight='bold')
ax_pos.set_xlabel('Frequency Pair Index')
ax_pos.set_ylabel('Frequency Value')
ax_pos.legend()
ax_pos.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Comparison: Sinusoidal vs RoPE
# ============================================================================
# Apply both schemes to the same random token embeddings, print a short
# summary of each, then show the three matrices side by side.

print("=" * 70, "Comparison: Sinusoidal vs RoPE Position Embeddings", "=" * 70, sep="\n")

# Shared random input (note: rebinds the notebook-level seq_len / d_model)
seq_len, d_model = 20, 32
token_embeddings = torch.randn(seq_len, d_model)

# Sinusoidal scheme: the position table is added to the token embeddings
sinusoidal_pe_sample = create_sinusoidal_position_embedding(seq_len, d_model)
embeddings_with_sinusoidal = token_embeddings + sinusoidal_pe_sample

# RoPE scheme: rotation, applied here directly to the embeddings to simulate
# what happens to q/k inside attention
rope_freqs_sample = create_rope_frequencies(seq_len, d_model)
embeddings_with_rope = apply_rope(token_embeddings, rope_freqs_sample)

sinusoidal_report = (
    "\n1. SINUSOIDAL POSITION EMBEDDING:",
    "   - Method: Addition (token_emb + pos_emb)",
    f"   - Shape: {embeddings_with_sinusoidal.shape}",
    "   - Learnable: No (fixed)",
    f"   - First position embedding (first 8 dims): {sinusoidal_pe_sample[0, :8]}",
    "   - Extrapolation: Limited (fixed max_len)",
)
rope_report = (
    "\n2. ROPE (ROTARY POSITION EMBEDDING):",
    "   - Method: Rotation (applied during attention)",
    f"   - Shape: {embeddings_with_rope.shape}",
    "   - Learnable: No (fixed frequencies)",
    f"   - First position frequencies (first 4 pairs): {rope_freqs_sample[0, :4]}",
    "   - Extrapolation: Better (relative positions)",
)
for report_line in sinusoidal_report + rope_report:
    print(report_line)

# Side-by-side images: raw embeddings, additive variant, rotated variant
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = (
    (token_embeddings, 'Original Token Embeddings\n(No position info)'),
    (embeddings_with_sinusoidal, 'Token + Sinusoidal Position Embedding\n(Additive)'),
    (embeddings_with_rope, 'Token with RoPE Applied\n(Rotational)'),
)
for ax, (matrix, panel_title) in zip(axes, panels):
    image = ax.imshow(matrix.T, aspect='auto', cmap='coolwarm',
                      interpolation='nearest', origin='lower')
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Position')
    ax.set_ylabel('Dimension')
    plt.colorbar(image, ax=ax, label='Value')

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Example: How RoPE Works in Attention
# ============================================================================
# Single-batch scaled dot-product attention where q and k are rotated with
# RoPE before the scores are computed.

print("=" * 70, "RoPE in Attention Mechanism", "=" * 70, sep="\n")

# Problem size (note: rebinds the notebook-level seq_len / d_model)
seq_len, d_model, batch_size = 8, 16, 1

# Random query, key and value tensors, drawn in that order
q, k, v = (torch.randn(batch_size, seq_len, d_model) for _ in range(3))

# Angle table for this sequence length (rebinds the notebook-level rope_freqs)
rope_freqs = create_rope_frequencies(seq_len, d_model)

# Only q and k are rotated; v is left untouched
q_rope = apply_rope(q, rope_freqs)
k_rope = apply_rope(k, rope_freqs)

# Scaled dot-product scores, then a softmax over the key axis
attention_scores = torch.matmul(q_rope, k_rope.transpose(-2, -1)) / np.sqrt(d_model)
attention_weights = torch.softmax(attention_scores, dim=-1)

for shape_line in (
    f"Query shape: {q.shape}",
    f"Key shape: {k.shape}",
    f"After RoPE - Query shape: {q_rope.shape}",
    f"After RoPE - Key shape: {k_rope.shape}",
    f"\nAttention scores shape: {attention_scores.shape}",
    f"Attention weights shape: {attention_weights.shape}",
):
    print(shape_line)

# Heatmaps for the single batch element: raw scores (left), softmax weights (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scores, ax_weights = axes

sns.heatmap(attention_scores[0].detach().numpy(), cmap='viridis',
            ax=ax_scores, cbar_kws={'label': 'Score'})
ax_scores.set_title('Attention Scores (with RoPE)\nQuery @ Key^T',
                    fontsize=12, fontweight='bold')
ax_scores.set_xlabel('Key Position')
ax_scores.set_ylabel('Query Position')

sns.heatmap(attention_weights[0].detach().numpy(), cmap='Blues',
            ax=ax_weights, cbar_kws={'label': 'Weight'})
ax_weights.set_title('Attention Weights (Softmax)\nEach row sums to 1',
                     fontsize=12, fontweight='bold')
ax_weights.set_xlabel('Key Position')
ax_weights.set_ylabel('Query Position')

plt.tight_layout()
plt.show()

print("\n" + "=" * 70, "Key Insight:", "=" * 70, sep="\n")
print("RoPE encodes relative position information in the attention mechanism.")
print("The rotation angle depends on both the position and the dimension,")
print("allowing the model to learn relative position patterns naturally.")

## Summary

| Feature | Sinusoidal/Cosine | RoPE |
|---------|------------------|------|
| **Method** | Addition to embeddings | Rotation during attention |
| **Learnable** | No (fixed) | No (fixed frequencies) |
| **Extrapolation** | Limited (computable at any position, but models generalize poorly beyond trained lengths) | Better (relative positions) |
| **Computation** | Simple addition | Matrix rotation |
| **Used in** | Original Transformer (BERT instead uses learned absolute position embeddings) | LLaMA, GPT-NeoX, PaLM |
| **Key Advantage** | Simple, interpretable | Better for long sequences |

**Key Differences:**
1. **Sinusoidal**: Adds fixed position embeddings directly to token embeddings
2. **RoPE**: Rotates query/key vectors by angles proportional to position, preserving relative position information
3. **RoPE** generally performs better on longer sequences due to its relative position encoding