# MoE-based GPT-2

## Import Dependencies

In [12]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from typing import Optional
import copy

ModuleNotFoundError: No module named 'datasets'

## MoE Router Module

In [None]:
class TopKRouter(nn.Module):
    """Token-level router that picks the ``top_k`` highest-scoring experts.

    A bias-free linear layer maps every hidden state to one score per expert.
    The scores are turned into probabilities with a softmax, the ``top_k``
    largest are kept, and the kept probabilities are rescaled to sum to 1.

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states.
        num_experts: Number of experts to score.
        top_k: How many experts to keep per token (``1 <= top_k <= num_experts``).

    Raises:
        ValueError: If ``top_k`` is not in ``[1, num_experts]``.
    """
    def __init__(self, hidden_size: int, num_experts: int, top_k: int = 2):
        super().__init__()
        # Fail at construction time instead of deep inside torch.topk (or,
        # for top_k == 0, silently routing every token to no expert at all).
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be between 1 and num_experts ({num_experts}), got {top_k}"
            )
        self.num_experts = num_experts
        self.top_k = top_k
        # Router is a simple linear layer mapping hidden states to expert scores
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)

    def forward(self, hidden_states):
        """Score experts for every token.

        Args:
            hidden_states: Tensor whose last dimension is ``hidden_size``
                (``[batch_size, seq_len, hidden_size]`` in this notebook).

        Returns:
            Tuple ``(top_k_weights, top_k_indices, router_logits)`` where the
            first two have a trailing dimension of ``top_k`` and
            ``router_logits`` has a trailing dimension of ``num_experts``.
        """
        router_logits = self.gate(hidden_states)  # [..., num_experts]

        # Probabilities over all experts, then keep the k most likely ones
        routing_weights = torch.softmax(router_logits, dim=-1)
        top_k_weights, top_k_indices = torch.topk(routing_weights, self.top_k, dim=-1)

        # Rescale the kept probabilities so they sum to 1 per token
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)

        return top_k_weights, top_k_indices, router_logits

## MoE Layer

In [None]:
class MoELayer(nn.Module):
    """Mixture-of-Experts layer that replaces a dense GPT-2 MLP block.

    Every expert starts as a deep copy of ``dense_mlp`` (optionally with a
    random fraction of its parameters re-initialized), and a ``TopKRouter``
    decides which ``top_k`` experts process each token.

    Args:
        dense_mlp: The dense MLP module to upcycle. It must expose
            ``c_fc.weight`` with shape ``[hidden_size, intermediate_size]``.
        num_experts: Number of expert copies to create.
        top_k: Number of experts activated per token.
        drop_ratio: Probability with which each individual parameter element
            of an expert is re-initialized (0.0 keeps exact copies).
    """
    def __init__(self, dense_mlp, num_experts: int = 8, top_k: int = 2, drop_ratio: float = 0.0):
        super().__init__()
        self.num_experts = num_experts
        self.top_k = top_k

        # Hugging Face GPT-2 uses Conv1D, where weight shape is [input, output].
        hidden_size = dense_mlp.c_fc.weight.shape[0]

        # Create router
        self.router = TopKRouter(hidden_size, num_experts, top_k)

        # Create experts by copying the dense MLP weights with optional drop-upcycling
        self.experts = nn.ModuleList([
            self._copy_mlp_with_drop(dense_mlp, drop_ratio) for _ in range(num_experts)
        ])

    def _copy_mlp(self, dense_mlp):
        """Return an independent deep copy of ``dense_mlp``."""
        return copy.deepcopy(dense_mlp)

    def _copy_mlp_with_drop(self, dense_mlp, drop_ratio: float):
        """Copy ``dense_mlp`` and randomly re-initialize part of the copy.

        Each parameter element is independently selected with probability
        ``drop_ratio`` and overwritten with a draw from N(0, 0.02^2), so that
        experts created from the same dense MLP differ from one another.
        With ``drop_ratio <= 0`` the copy is returned unchanged.
        """
        expert = self._copy_mlp(dense_mlp)

        if drop_ratio > 0:
            with torch.no_grad():
                for param in expert.parameters():
                    # Element-wise Bernoulli(drop_ratio) selection
                    mask = torch.rand_like(param) < drop_ratio
                    if mask.any():
                        param[mask] = torch.randn_like(param[mask]) * 0.02

        return expert

    def forward(self, hidden_states):
        """Apply the routed experts to every token.

        Args:
            hidden_states: Tensor of shape ``[batch_size, seq_len, hidden_size]``.

        Returns:
            Tensor with the same shape as ``hidden_states``: for each token,
            the router-weighted sum of the outputs of its selected experts.
            The router logits are computed but not returned.
        """
        batch_size, seq_len, hidden_size = hidden_states.shape

        # Route tokens to experts
        top_k_weights, top_k_indices, _router_logits = self.router(hidden_states)

        # Flatten batch and sequence dimensions. reshape (rather than view)
        # also accepts non-contiguous inputs.
        flat_hidden = hidden_states.reshape(-1, hidden_size)        # [tokens, hidden]
        flat_top_k_weights = top_k_weights.reshape(-1, self.top_k)  # [tokens, top_k]
        flat_top_k_indices = top_k_indices.reshape(-1, self.top_k)  # [tokens, top_k]

        output = torch.zeros_like(flat_hidden)

        for expert_idx in range(self.num_experts):
            # Which (token, slot) pairs selected this expert
            slot_hits = flat_top_k_indices == expert_idx            # [tokens, top_k]
            token_mask = slot_hits.any(dim=-1)                      # [tokens]
            if not token_mask.any():
                continue

            expert_output = self.experts[expert_idx](flat_hidden[token_mask])

            # Weight of the first slot in which this expert appears for each
            # routed token (argmax returns the first maximal position).
            first_slot = slot_hits[token_mask].int().argmax(dim=-1, keepdim=True)
            gate = flat_top_k_weights[token_mask].gather(1, first_slot)  # [routed, 1]

            # Each token appears at most once under the boolean mask, so a
            # masked in-place add accumulates correctly across experts.
            output[token_mask] += gate * expert_output

        # Reshape back to original dimensions
        return output.view(batch_size, seq_len, hidden_size)

In [None]:
def calculate_active_params(model):
    """Count a model's parameters and summarize its MoE layers.

    Returns:
        A tuple ``(total_params, moe_info)``. ``total_params`` is the number
        of elements over all of ``model.parameters()``. ``moe_info`` is a
        dict accumulated over every ``MoELayer`` found in the model:
        ``total_experts`` (sum of ``num_experts``), ``active_per_token``
        (sum of ``top_k``), ``total_expert_params`` (elements in all experts)
        and ``active_expert_params`` (per-expert average times ``top_k``).
    """
    stat_keys = (
        'total_experts',
        'active_per_token',
        'total_expert_params',
        'active_expert_params',
    )
    moe_info = dict.fromkeys(stat_keys, 0)

    for _, layer in model.named_modules():
        if not isinstance(layer, MoELayer):
            continue

        # Parameters held by all experts of this layer
        layer_expert_params = sum(w.numel() for w in layer.experts.parameters())
        # Share of those parameters that runs for a single token
        layer_active_params = (layer_expert_params / layer.num_experts) * layer.top_k

        moe_info['total_experts'] += layer.num_experts
        moe_info['active_per_token'] += layer.top_k
        moe_info['total_expert_params'] += layer_expert_params
        moe_info['active_expert_params'] += layer_active_params

    total_params = sum(w.numel() for w in model.parameters())
    return total_params, moe_info

## GPT2 Vanilla to MoE Architecture

In [None]:
def upcycle_gpt2_to_moe(
    model_name: str = 'gpt2',
    num_experts: int = 8,
    top_k: int = 2,
    moe_layers: Optional[list] = None,
    drop_ratio: float = 0.0,
    match_active_params: bool = False
):
    """
    Convert a standard GPT-2 model to MoE architecture

    The MLP of every selected transformer block is replaced by a ``MoELayer``
    whose experts are initialized from that block's dense MLP. A parameter
    summary is printed once the conversion is done.

    Args:
        model_name: HuggingFace model name
        num_experts: Number of experts per MoE layer
        top_k: Number of experts to activate per token
        moe_layers: List of layer indices to convert to MoE (None = all layers).
                   Indices that do not exist, or that were already converted
                   (duplicates), are skipped with a warning.
        drop_ratio: Ratio of parameters to re-initialize for drop-upcycling (0.0-1.0)
                   0.0 = standard upcycling, 0.1-0.2 recommended for drop-upcycling
        match_active_params: If True, force top_k=1 so that a single expert runs
                            per token, like the single MLP of vanilla GPT-2.
                            num_experts is left unchanged.

    Returns:
        Modified model with MoE layers
    """
    # Load the pre-trained model
    model = GPT2Model.from_pretrained(model_name)
    original_params = sum(p.numel() for p in model.parameters())
    num_layers = len(model.h)

    # If no specific layers specified, convert all layers
    if moe_layers is None:
        moe_layers = list(range(num_layers))

    # Auto-adjust for fair comparison: vanilla GPT-2 runs exactly one MLP per
    # token, so only one expert may be active per token.
    if match_active_params:
        top_k = 1
        print(f"Fair comparison mode: Setting top_k={top_k} to match vanilla GPT-2 active params")

    upcycle_type = "Drop-Upcycling" if drop_ratio > 0 else "Standard Upcycling"
    print(f"Converting layers {moe_layers} to MoE with {num_experts} experts (top-{top_k})")
    print(f"Using {upcycle_type}" + (f" with {drop_ratio*100}% parameter re-initialization" if drop_ratio > 0 else ""))

    # Replace MLPs with MoE layers
    for layer_idx in moe_layers:
        # Negative indices address blocks from the end, as with a list
        if not -num_layers <= layer_idx < num_layers:
            print(f"Warning: Layer {layer_idx} doesn't exist, skipping")
            continue

        block = model.h[layer_idx]
        if isinstance(block.mlp, MoELayer):
            # The same block was listed twice (e.g. 11 and -1); an MoELayer
            # cannot be upcycled again because it has no dense c_fc weight.
            print(f"Warning: Layer {layer_idx} is already an MoE layer, skipping")
            continue

        # Replace with MoE layer
        block.mlp = MoELayer(
            block.mlp,
            num_experts=num_experts,
            top_k=top_k,
            drop_ratio=drop_ratio
        )

        print(f"Converted layer {layer_idx}")

    # Print parameter comparison
    total_params, moe_info = calculate_active_params(model)
    # Parameters used for one token: everything outside the experts (attention,
    # embeddings, routers, untouched dense MLPs) plus the top_k active experts.
    # The replaced dense MLPs are no longer part of the model, so they must
    # not be counted via original_params.
    active_params = (
        total_params
        - moe_info['total_expert_params']
        + moe_info['active_expert_params']
    )
    print("\n" + "=" * 60)
    print("PARAMETER COMPARISON")
    print("=" * 60)
    print(f"Original model params:        {original_params:,}")
    print(f"MoE model total params:       {total_params:,}")
    print(f"MoE model active params:      {int(active_params):,}")
    print(f"\nPer-layer breakdown:")
    print(f"  Experts per layer:          {num_experts}")
    print(f"  Active experts per token:   {top_k}")
    print(f"  Active ratio:               {top_k}/{num_experts} = {top_k/num_experts:.1%}")

    if match_active_params:
        print(f"\nFair comparison mode: Active params ≈ vanilla GPT-2")
    else:
        active_ratio = active_params / original_params
        print(f"\nMoE has {active_ratio:.1f}x active parameters vs vanilla")

    return model

## Upcycle GPT2 Weights for MoE

In [None]:
# Upcycle the 'gpt2' checkpoint: the odd-numbered blocks get 8 experts each,
# with 10% of every expert's parameters re-initialized (drop-upcycling).
# match_active_params=True forces top_k=1 inside upcycle_gpt2_to_moe.
moe_model = upcycle_gpt2_to_moe(
    model_name='gpt2',
    moe_layers=[1, 3, 5, 7, 9, 11],
    num_experts=8,
    top_k=1,
    drop_ratio=0.1,
    match_active_params=True,
)

# Write the upcycled weights and config to disk.
# NOTE(review): the saved config presumably still describes a plain GPT2Model,
# so reloading with GPT2Model.from_pretrained may not restore the MoE layers
# - confirm before relying on this checkpoint.
moe_model.save_pretrained('./gpt2-moe-upcycled')

Converting layers [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] to MoE with 8 experts (top-2)
Converted layer 0
Converted layer 1
Converted layer 2
Converted layer 3
Converted layer 4
Converted layer 5
Converted layer 6
Converted layer 7
Converted layer 8
Converted layer 9
Converted layer 10
Converted layer 11


## Fine-Tuning Function (Model, Tokenizer and Training Data Setup)

In [None]:
def finetune_moe_model(
    moe_model_path: str = './gpt2-moe-upcycled',
    output_dir: str = './gpt2-moe-finetuned',
    num_train_steps: int = 5000,
    batch_size: int = 4,
    learning_rate: float = 1e-4,
    warmup_steps: int = 200,
    gradient_accumulation_steps: int = 8,
    save_steps: int = 1000,
    dataset_name: str = 'allenai/c4',
    dataset_config: Optional[str] = 'en',
):
    """
    Fine-tune the upcycled MoE model

    Loads the transformer from ``moe_model_path``, wraps it in a GPT-2 LM head
    model, streams and tokenizes the training split of the dataset, and trains
    with the Hugging Face ``Trainer`` using a causal language-modeling collator.

    Args:
        moe_model_path: Path to the upcycled MoE model
        output_dir: Directory to save fine-tuned model
        num_train_steps: Total training steps
        batch_size: Batch size per device
        learning_rate: Learning rate (lower than standard for continued training)
        warmup_steps: Number of warmup steps
        gradient_accumulation_steps: Gradient accumulation steps
        save_steps: Save checkpoint every N steps
        dataset_name: Dataset to stream for training (default: C4). It must
                     provide a 'text' column.
        dataset_config: Dataset configuration name (default: 'en'), or None
                       for datasets without configurations

    Returns:
        The fine-tuned LM head model
    """
    print("=" * 60)
    print("FINE-TUNING MoE MODEL")
    print("=" * 60)

    # Load the model
    print(f"Loading model from {moe_model_path}...")
    base_model, loading_info = GPT2Model.from_pretrained(
        moe_model_path, output_loading_info=True
    )

    # GPT2Model is built from the saved config, which knows nothing about
    # MoELayer; expert/router tensors in the checkpoint are then reported as
    # unexpected keys and ignored. Make that visible instead of silent.
    ignored_moe_keys = [
        key for key in loading_info.get('unexpected_keys', [])
        if '.mlp.experts.' in key or '.mlp.router.' in key
    ]
    if ignored_moe_keys:
        print(
            f"Warning: {len(ignored_moe_keys)} expert/router tensors in the checkpoint "
            "were ignored because GPT2Model builds dense MLP blocks; "
            "the loaded model does not contain the upcycled MoE layers"
        )

    # Create LM head model for training. Use the loaded checkpoint's own
    # config so the head dimensions always match the transformer.
    lm_model = GPT2LMHeadModel(base_model.config)
    lm_model.transformer = base_model
    # The head was tied to the embedding of the transformer that was just
    # replaced; re-tie it to the loaded model's token embedding.
    lm_model.tie_weights()

    print(f"✓ Model loaded with {sum(p.numel() for p in lm_model.parameters()):,} parameters")

    # Load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset (streamed; only as many examples as training will consume)
    print(f"\nLoading dataset...")
    dataset = load_dataset(dataset_name, dataset_config, split='train', streaming=True)
    dataset = dataset.take(num_train_steps * batch_size * gradient_accumulation_steps)

    # Every raw column must be dropped after tokenization: with
    # remove_unused_columns=False the collator receives all remaining columns
    # and cannot turn raw strings into tensors.
    raw_columns = dataset.column_names
    if raw_columns is None:
        # Streaming datasets may not declare their features; peek at one example
        raw_columns = list(next(iter(dataset)).keys())

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, max_length=512)

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=list(raw_columns)
    )

    # mlm=False selects causal (next-token) language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        max_steps=num_train_steps,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        lr_scheduler_type='cosine',
        save_steps=save_steps,
        save_total_limit=3,
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=4,
        remove_unused_columns=False,
    )

    # Create trainer
    trainer = Trainer(
        model=lm_model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Train
    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)
    print(f"Total steps: {num_train_steps}")
    print(f"Effective batch size: {batch_size * gradient_accumulation_steps}")
    print(f"Learning rate: {learning_rate}")
    print(f"Warmup steps: {warmup_steps}\n")

    trainer.train()

    print("\n" + "=" * 60)
    print("SAVING FINAL MODEL")
    print("=" * 60)
    trainer.save_model(output_dir)
    print(f"Model saved to {output_dir}")

    return lm_model

## Run Training

In [None]:
# Fine-tune the checkpoint written by the upcycling step.
# gradient_accumulation_steps and save_steps keep their defaults.
finetuned_model = finetune_moe_model(
    moe_model_path='./gpt2-moe-upcycled',
    output_dir='./gpt2-moe-finetuned',
    num_train_steps=3125,
    warmup_steps=312,  # about 10% of num_train_steps
    batch_size=4,
    learning_rate=1e-4,
)