# Sequence Modeling: PyTorch vs TensorFlow

**Learning Objectives:**
- Build RNN/LSTM models for sequence tasks
- Compare sequence modeling approaches
- Handle variable-length sequences
- Understand encoder-decoder patterns

**Prerequisites:** Text preprocessing, embeddings

**Estimated Time:** 45 minutes

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Add src to path
sys.path.append(os.path.join('..', '..', 'src'))

from utils.comparison_tools import create_side_by_side_comparison

# Try to import frameworks
# Each framework is optional: a failed import only clears the matching
# *_AVAILABLE flag, which the framework-specific cells check before running.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    PYTORCH_AVAILABLE = True
    print(f"✅ PyTorch {torch.__version__} available")
except ImportError:
    PYTORCH_AVAILABLE = False
    print("❌ PyTorch not available")

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models
    TENSORFLOW_AVAILABLE = True
    print(f"✅ TensorFlow {tf.__version__} available")
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("❌ TensorFlow not available")

# Set random seeds
# NumPy and every framework that imported successfully get the same fixed
# seed (42); frameworks that failed to import are skipped.
np.random.seed(42)
if PYTORCH_AVAILABLE:
    torch.manual_seed(42)
if TENSORFLOW_AVAILABLE:
    tf.random.set_seed(42)

## 1. Sequence Data Preparation

Generating synthetic arithmetic sequences (e.g. 3, 5, 7, …) for a next-number prediction task, then splitting and normalizing them.

In [None]:
print("=" * 60)
print("SEQUENCE DATA PREPARATION")
print("=" * 60)

# Create simple sequence prediction task (next number prediction)
def create_sequence_data(num_samples=1000, seq_length=10):
    """Generate random arithmetic progressions and the value that follows each.

    Every sample draws a first term from 1..9 and a common difference from
    1..4 (in that order) using NumPy's global random state.

    Args:
        num_samples: Number of progressions to generate.
        seq_length: Number of terms kept in each input sequence.

    Returns:
        A ``(sequences, targets)`` pair of NumPy arrays; ``targets[k]`` is the
        term that would come right after ``sequences[k]``.
    """
    rows = []
    next_terms = []

    for _ in range(num_samples):
        # Draw order (first term, then difference) fixes the seeded output.
        first = np.random.randint(1, 10)
        diff = np.random.randint(1, 5)

        # The exclusive stop of the range is exactly the next term.
        following = first + seq_length * diff
        rows.append(list(range(first, following, diff)))
        next_terms.append(following)

    return np.array(rows), np.array(next_terms)

# Generate data
# 2000 sequences of 8 terms each; y holds the 9th term of every sequence.
X, y = create_sequence_data(num_samples=2000, seq_length=8)

print(f"Data shapes:")
print(f"  Sequences: {X.shape}")
print(f"  Targets: {y.shape}")

print(f"\nSample sequences:")
for i in range(3):
    print(f"  Sequence {i+1}: {X[i]} → {y[i]}")

# Split data
# Plain 80/20 slice in generation order (no shuffling): the first 80% of
# samples become the training set, the rest the test set.
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"\nData split:")
print(f"  Training: {X_train.shape[0]} samples")
print(f"  Testing: {X_test.shape[0]} samples")

# Normalize data
# Statistics are computed on the training split only and reused for the test
# split. No axis is given, so X uses a single scalar mean/std over all
# sequence positions. y_mean / y_std are read again later to denormalize
# model predictions.
X_mean, X_std = X_train.mean(), X_train.std()
y_mean, y_std = y_train.mean(), y_train.std()

X_train_norm = (X_train - X_mean) / X_std
X_test_norm = (X_test - X_mean) / X_std
y_train_norm = (y_train - y_mean) / y_std
y_test_norm = (y_test - y_mean) / y_std

print(f"\nNormalization stats:")
print(f"  X: mean={X_mean:.2f}, std={X_std:.2f}")
print(f"  y: mean={y_mean:.2f}, std={y_std:.2f}")

## 2. PyTorch Sequence Models

Building sequence models with PyTorch.

In [None]:
if PYTORCH_AVAILABLE:
    print("\n" + "=" * 60)
    print("PYTORCH SEQUENCE MODELS")
    print("=" * 60)
    
    # Dataset class
    class SequenceDataset(Dataset):
        """Map-style dataset pairing each input sequence with its target.

        Both inputs are converted to float32 tensors once, at construction,
        and are indexed along their first axis (one entry per sample).

        Args:
            sequences: Array-like of input sequences, one row per sample.
            targets: Array-like of targets, one entry per sample.

        Raises:
            ValueError: If ``sequences`` and ``targets`` differ in length.
        """

        def __init__(self, sequences, targets):
            # Fail early: a mismatch would otherwise surface as an IndexError
            # deep inside a DataLoader, or silently ignore surplus targets.
            if len(sequences) != len(targets):
                raise ValueError(
                    f"sequences and targets must have the same length, "
                    f"got {len(sequences)} and {len(targets)}"
                )
            self.sequences = torch.as_tensor(sequences, dtype=torch.float32)
            self.targets = torch.as_tensor(targets, dtype=torch.float32)

        def __len__(self):
            """Return the number of samples."""
            return len(self.sequences)

        def __getitem__(self, idx):
            """Return the ``(sequence, target)`` pair stored at ``idx``."""
            return self.sequences[idx], self.targets[idx]
    
    # Create datasets
    # Both inputs and targets are the normalized arrays, so every loss
    # computed from these loaders is in normalized units.
    train_dataset = SequenceDataset(X_train_norm, y_train_norm)
    test_dataset = SequenceDataset(X_test_norm, y_test_norm)
    
    # Only the training loader shuffles; the test loader keeps dataset order.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    # LSTM Model
    class LSTMSequenceModel(nn.Module):
        """Stacked LSTM that predicts from the output of the last time step.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer.
            num_layers: Number of stacked LSTM layers.
            output_size: Number of values predicted per sequence.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1):
            super().__init__()
            self.hidden_size = hidden_size
            self.num_layers = num_layers

            # nn.LSTM applies dropout only between stacked layers and warns
            # when a non-zero rate is combined with a single layer.
            inter_layer_dropout = 0.2 if num_layers > 1 else 0.0
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                                batch_first=True, dropout=inter_layer_dropout)
            self.fc = nn.Linear(hidden_size, output_size)
            self.dropout = nn.Dropout(0.2)

        def forward(self, x):
            """Run the model.

            Args:
                x: Either ``(batch_size, seq_len)`` for one feature per step,
                    or ``(batch_size, seq_len, input_size)``.

            Returns:
                Predictions with the trailing axis squeezed out when it has
                size 1, i.e. ``(batch_size,)`` for ``output_size=1``.
            """
            # Only add the feature axis for 2-D input so that models built
            # with input_size > 1 can be fed 3-D batches directly.
            if x.dim() == 2:
                x = x.unsqueeze(-1)  # (batch_size, seq_len, 1)

            # The final hidden/cell states are not needed here.
            lstm_out, _ = self.lstm(x)

            # Use the last output
            last_output = lstm_out[:, -1, :]  # (batch_size, hidden_size)

            # Final prediction
            output = self.fc(self.dropout(last_output))
            return output.squeeze(-1)
    
    # GRU Model for comparison
    class GRUSequenceModel(nn.Module):
        """Stacked GRU that predicts from the output of the last time step.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each GRU layer.
            num_layers: Number of stacked GRU layers.
            output_size: Number of values predicted per sequence.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1):
            super().__init__()
            # nn.GRU applies dropout only between stacked layers and warns
            # when a non-zero rate is combined with a single layer.
            inter_layer_dropout = 0.2 if num_layers > 1 else 0.0
            self.gru = nn.GRU(input_size, hidden_size, num_layers,
                              batch_first=True, dropout=inter_layer_dropout)
            self.fc = nn.Linear(hidden_size, output_size)
            self.dropout = nn.Dropout(0.2)

        def forward(self, x):
            """Run the model.

            Args:
                x: Either ``(batch_size, seq_len)`` for one feature per step,
                    or ``(batch_size, seq_len, input_size)``.

            Returns:
                Predictions with the trailing axis squeezed out when it has
                size 1, i.e. ``(batch_size,)`` for ``output_size=1``.
            """
            # Only add the feature axis for 2-D input so that models built
            # with input_size > 1 can be fed 3-D batches directly.
            if x.dim() == 2:
                x = x.unsqueeze(-1)
            gru_out, _ = self.gru(x)
            last_output = gru_out[:, -1, :]
            output = self.fc(self.dropout(last_output))
            return output.squeeze(-1)
    
    # Training function
    def train_pytorch_model(model, train_loader, test_loader, epochs=20):
        """Train ``model`` with Adam (lr=0.001) on MSE loss, evaluating every epoch.

        Epoch losses are averaged per sample: each batch loss is weighted by
        its batch size, so a smaller final batch does not skew the result and
        the test loss equals the MSE over the whole test set.

        Args:
            model: ``nn.Module`` mapping a batch of inputs to predictions.
            train_loader: Iterable of ``(batch_x, batch_y)`` training batches.
            test_loader: Iterable of ``(batch_x, batch_y)`` evaluation batches.
            epochs: Number of passes over ``train_loader``.

        Returns:
            ``(train_losses, test_losses)``: two lists with one float per
            epoch. An entry is ``nan`` if the corresponding loader was empty.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        def per_sample_mean(loss_sum, sample_count):
            # Guard against an empty loader instead of dividing by zero.
            return loss_sum / sample_count if sample_count else float('nan')

        train_losses = []
        test_losses = []

        for epoch in range(epochs):
            # Training
            model.train()
            train_loss_sum, train_samples = 0.0, 0

            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                # criterion returns the batch mean; undo it to accumulate a sum.
                batch_size = batch_x.size(0)
                train_loss_sum += loss.item() * batch_size
                train_samples += batch_size

            train_loss = per_sample_mean(train_loss_sum, train_samples)

            # Testing
            model.eval()
            test_loss_sum, test_samples = 0.0, 0

            with torch.no_grad():
                for batch_x, batch_y in test_loader:
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)

                    batch_size = batch_x.size(0)
                    test_loss_sum += loss.item() * batch_size
                    test_samples += batch_size

            test_loss = per_sample_mean(test_loss_sum, test_samples)

            train_losses.append(train_loss)
            test_losses.append(test_loss)

            if (epoch + 1) % 5 == 0:
                print(f'Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

        return train_losses, test_losses
    
    # Train LSTM model
    # Default constructor arguments and the default epoch count are used.
    print("\n🔥 Training LSTM Model:")
    lstm_model = LSTMSequenceModel()
    lstm_train_losses, lstm_test_losses = train_pytorch_model(lstm_model, train_loader, test_loader)
    
    # Train GRU model
    # Same loaders and training settings as the LSTM run above.
    print("\n🔥 Training GRU Model:")
    gru_model = GRUSequenceModel()
    gru_train_losses, gru_test_losses = train_pytorch_model(gru_model, train_loader, test_loader)
    
    # Store results
    # Final-epoch test losses plus the trained models; the comparison cell
    # reads these keys to build its table and to pick the better model.
    pytorch_results = {
        'lstm_test_loss': lstm_test_losses[-1],
        'gru_test_loss': gru_test_losses[-1],
        'lstm_model': lstm_model,
        'gru_model': gru_model
    }
    
    print(f"\n📊 PyTorch Results:")
    print(f"  LSTM Test Loss: {lstm_test_losses[-1]:.4f}")
    print(f"  GRU Test Loss: {gru_test_losses[-1]:.4f}")

else:
    print("PyTorch not available - skipping PyTorch models")
    pytorch_results = None

## 3. TensorFlow Sequence Models

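Building comparable models with TensorFlow (Keras). Note that they are not layer-for-layer identical to the PyTorch models above: the Keras versions also apply recurrent dropout and add an extra `Dense(32)` layer before the output.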

In [None]:
if TENSORFLOW_AVAILABLE:
    print("\n" + "=" * 60)
    print("TENSORFLOW SEQUENCE MODELS")
    print("=" * 60)
    
    # Reshape data for TensorFlow (add feature dimension)
    # Appends a trailing axis of size 1, giving (samples, timesteps, 1).
    X_train_tf = X_train_norm.reshape(X_train_norm.shape[0], X_train_norm.shape[1], 1)
    X_test_tf = X_test_norm.reshape(X_test_norm.shape[0], X_test_norm.shape[1], 1)
    
    print(f"TensorFlow data shapes:")
    print(f"  X_train: {X_train_tf.shape}")
    print(f"  y_train: {y_train_norm.shape}")
    
    # Shared builder: the LSTM and GRU models differ only in the recurrent
    # layer class, so both factories below delegate to it.
    def _create_recurrent_model(recurrent_layer, input_shape):
        """Build and compile a two-layer recurrent regressor.

        Args:
            recurrent_layer: Keras recurrent layer class, e.g. ``layers.LSTM``.
            input_shape: Per-sample input shape, ``(timesteps, features)``.

        Returns:
            A compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).
        """
        model = models.Sequential([
            # An explicit Input layer replaces the deprecated `input_shape=`
            # keyword on the first recurrent layer.
            layers.Input(shape=input_shape),
            recurrent_layer(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            recurrent_layer(64, dropout=0.2, recurrent_dropout=0.2),
            layers.Dropout(0.2),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)
        ])

        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        return model

    # LSTM Model
    def create_lstm_model(input_shape):
        """Return the compiled two-layer LSTM regressor for ``input_shape``."""
        return _create_recurrent_model(layers.LSTM, input_shape)

    # GRU Model
    def create_gru_model(input_shape):
        """Return the compiled two-layer GRU regressor for ``input_shape``."""
        return _create_recurrent_model(layers.GRU, input_shape)
    
    # Train LSTM model
    print("\n🟠 Training LSTM Model:")
    tf_lstm_model = create_lstm_model((X_train_tf.shape[1], 1))
    
    # The test split is passed as validation data, so the history's
    # 'val_loss' is the per-epoch test loss. verbose=0 silences progress output.
    lstm_history = tf_lstm_model.fit(
        X_train_tf, y_train_norm,
        batch_size=32,
        epochs=20,
        validation_data=(X_test_tf, y_test_norm),
        verbose=0
    )
    
    # Train GRU model
    # Same data, batch size and epoch count as the LSTM run above.
    print("\n🟠 Training GRU Model:")
    tf_gru_model = create_gru_model((X_train_tf.shape[1], 1))
    
    gru_history = tf_gru_model.fit(
        X_train_tf, y_train_norm,
        batch_size=32,
        epochs=20,
        validation_data=(X_test_tf, y_test_norm),
        verbose=0
    )
    
    # Store results
    # Final-epoch validation losses plus the trained models; the comparison
    # cell reads these keys to build its table and to pick the better model.
    tensorflow_results = {
        'lstm_test_loss': lstm_history.history['val_loss'][-1],
        'gru_test_loss': gru_history.history['val_loss'][-1],
        'lstm_model': tf_lstm_model,
        'gru_model': tf_gru_model
    }
    
    print(f"\n📊 TensorFlow Results:")
    print(f"  LSTM Test Loss: {lstm_history.history['val_loss'][-1]:.4f}")
    print(f"  GRU Test Loss: {gru_history.history['val_loss'][-1]:.4f}")

else:
    print("TensorFlow not available - skipping TensorFlow models")
    tensorflow_results = None

## 4. Model Comparison and Evaluation

Comparing sequence models across frameworks.

In [None]:
print("\n" + "=" * 60)
print("MODEL COMPARISON AND EVALUATION")
print("=" * 60)

# Framework comparison
# Runs only when both frameworks produced results; either results variable
# is None when its framework was skipped.
if pytorch_results and tensorflow_results:
    print("\n📊 Framework Comparison:")
    
    # Parallel lists: index 0 is the LSTM, index 1 is the GRU.
    comparison_data = {
        'Model': ['LSTM', 'GRU'],
        'PyTorch': [pytorch_results['lstm_test_loss'], pytorch_results['gru_test_loss']],
        'TensorFlow': [tensorflow_results['lstm_test_loss'], tensorflow_results['gru_test_loss']]
    }
    
    print(f"{'Model':<10} | {'PyTorch':<12} | {'TensorFlow':<12}")
    print("-" * 40)
    for i, model in enumerate(comparison_data['Model']):
        pt_loss = comparison_data['PyTorch'][i]
        tf_loss = comparison_data['TensorFlow'][i]
        print(f"{model:<10} | {pt_loss:<12.4f} | {tf_loss:<12.4f}")
    
    # Visualization
    # Grouped bar chart: one pair of bars per architecture.
    plt.figure(figsize=(10, 6))
    
    x = np.arange(len(comparison_data['Model']))
    width = 0.35
    
    # Shift each framework's bars half a bar width to either side of the tick.
    plt.bar(x - width/2, comparison_data['PyTorch'], width, label='PyTorch', color='#EE4C2C', alpha=0.7)
    plt.bar(x + width/2, comparison_data['TensorFlow'], width, label='TensorFlow', color='#FF6F00', alpha=0.7)
    
    plt.xlabel('Model Architecture')
    plt.ylabel('Test Loss (MSE)')
    plt.title('Sequence Modeling: PyTorch vs TensorFlow')
    plt.xticks(x, comparison_data['Model'])
    plt.legend()
    
    # Add value labels on bars
    # Each label sits 0.001 above the top of its bar.
    for i, (pt_loss, tf_loss) in enumerate(zip(comparison_data['PyTorch'], comparison_data['TensorFlow'])):
        plt.text(i - width/2, pt_loss + 0.001, f'{pt_loss:.3f}', ha='center', va='bottom')
        plt.text(i + width/2, tf_loss + 0.001, f'{tf_loss:.3f}', ha='center', va='bottom')
    
    plt.tight_layout()
    plt.show()

# Prediction examples
def make_predictions(model, X_test, framework='pytorch'):
    """Predict on normalized inputs and map the outputs back to the target scale.

    Args:
        model: A trained PyTorch module or Keras model.
        X_test: 2-D NumPy array of normalized sequences, one row per sample.
        framework: ``'pytorch'`` runs the PyTorch path; any other value is
            handled as a Keras model.

    Returns:
        Predictions rescaled with the module-level ``y_std`` and ``y_mean``.
    """
    if framework != 'pytorch':
        # Keras path: add the trailing feature axis, then flatten (n, 1) -> (n,).
        n_rows, n_steps = X_test.shape[0], X_test.shape[1]
        keras_input = X_test.reshape(n_rows, n_steps, 1)
        normalized = model.predict(keras_input, verbose=0).flatten()
    else:
        # PyTorch path: inference mode, no gradient tracking.
        model.eval()
        with torch.no_grad():
            normalized = model(torch.FloatTensor(X_test)).numpy()

    # Undo the target normalization.
    return normalized * y_std + y_mean

# Show prediction examples
print("\n🔍 Prediction Examples:")

# Get a few test examples
test_indices = [0, 1, 2, 3, 4]
test_sequences = X_test[test_indices]
test_targets = y_test[test_indices]
test_sequences_norm = X_test_norm[test_indices]

# None marks a framework that was skipped. Resetting here also prevents stale
# predictions from an earlier notebook run being shown.
pt_predictions = None
if pytorch_results:
    # Use best PyTorch model (lower final test loss; ties go to the LSTM)
    best_pt_model = pytorch_results['lstm_model'] if pytorch_results['lstm_test_loss'] <= pytorch_results['gru_test_loss'] else pytorch_results['gru_model']
    pt_predictions = make_predictions(best_pt_model, test_sequences_norm, 'pytorch')

tf_predictions = None
if tensorflow_results:
    # Use best TensorFlow model (lower final test loss; ties go to the LSTM)
    best_tf_model = tensorflow_results['lstm_model'] if tensorflow_results['lstm_test_loss'] <= tensorflow_results['gru_test_loss'] else tensorflow_results['gru_model']
    tf_predictions = make_predictions(best_tf_model, test_sequences_norm, 'tensorflow')


def _format_prediction(predictions, index):
    """Return one 8-column table cell: the prediction, or 'N/A' if skipped."""
    if predictions is None:
        # Showing 0.0 here would look like a real (and very wrong) prediction.
        return f"{'N/A':<8}"
    return f"{predictions[index]:<8.1f}"


print(f"{'Sequence':<30} | {'True':<8} | {'PyTorch':<8} | {'TensorFlow':<8}")
print("-" * 65)

for i, (seq, target) in enumerate(zip(test_sequences, test_targets)):
    # Truncate long sequences so they fit the 30-character column.
    seq_str = str(list(seq))
    if len(seq_str) > 28:
        seq_str = seq_str[:25] + '...'

    pt_pred = _format_prediction(pt_predictions, i)
    tf_pred = _format_prediction(tf_predictions, i)

    print(f"{seq_str:<30} | {target:<8.1f} | {pt_pred} | {tf_pred}")

# Side-by-side code comparison
# The two strings below are display-only snippets: they are never executed,
# only passed to create_side_by_side_comparison and printed. They refer to
# names (epochs, train_loader, sequence_length, X_train, ...) that the
# snippets themselves do not define.
pytorch_sequence_code = """
import torch
import torch.nn as nn

class LSTMSequenceModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, 
                           batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = x.unsqueeze(-1)  # Add feature dimension
        lstm_out, (hidden, _) = self.lstm(x)
        last_output = lstm_out[:, -1, :]  # Last timestep
        output = self.fc(self.dropout(last_output))
        return output.squeeze(-1)

# Training
model = LSTMSequenceModel()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(epochs):
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
"""

tensorflow_sequence_code = """
import tensorflow as tf
from tensorflow.keras import layers, models

def create_lstm_model(input_shape):
    model = models.Sequential([
        layers.LSTM(64, return_sequences=True, dropout=0.2, 
                   recurrent_dropout=0.2, input_shape=input_shape),
        layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Training
model = create_lstm_model((sequence_length, 1))

history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_test, y_test)
)
"""

# create_side_by_side_comparison comes from utils.comparison_tools; its return
# value is concatenated to a string here, so it is expected to return text.
print("\n" + create_side_by_side_comparison(
    pytorch_sequence_code, tensorflow_sequence_code, "Sequence Modeling Implementation"
))

# Summary printed at the end of the notebook. These are general statements:
# the notebook compares test losses only and never measures training time,
# so the GRU speed claim is not demonstrated by the cells above.
print("\n✅ Key Takeaways:")
print("  • Both LSTM and GRU are effective for sequence modeling")
print("  • GRU is often faster to train with similar performance")
print("  • Proper data normalization is crucial for sequence tasks")
print("  • Both frameworks provide excellent sequence modeling capabilities")
print("  • TensorFlow's Keras API makes model building more concise")
print("  • PyTorch offers more flexibility for custom architectures")