# Full-Sized Model Training (Memory-Optimized)

This notebook trains a Backpack model (16 senses) and a Transformer baseline on Europarl English-French data.

**Models:**
- Backpack: ~1.02B parameters (16 senses, 4 layers, 4 heads, 256 embd)
- Transformer: ~64M parameters (same architecture, no senses)

**Configuration:**
- Uses parameters from `train_backpack_clean` but with 16 senses
- Memory-optimized: smaller batch size (8), no compilation, smaller context (128)
- Should fit on most GPUs without OOM errors

**Training:**
- 50,000 iterations max
- Automatic checkpoint saving (best val loss + periodic every 500 iterations)
- Resume capability if interrupted


## 1. Setup


In [None]:
# Clone the project repository and make it the working directory.
# Later cells use relative paths (train.py, data/..., out/...), so they are
# expected to run from inside this checkout.
!git clone https://github.com/kavyavenk/multilingual-backpacks.git
%cd multilingual-backpacks


In [None]:
# Check GPU: show the nvidia-smi report for this runtime
!nvidia-smi


In [None]:
# Mount Drive for checkpoints (optional).
# Sets USE_DRIVE for the later restore/backup cells; drive_checkpoint_dir is
# only defined when mounting succeeds.
import os

try:
    # Imported inside the try so that a missing google.colab package (i.e. the
    # notebook is running outside Colab) falls back to local-only checkpoints
    # instead of aborting the cell.
    from google.colab import drive

    drive.mount('/content/drive')
    drive_checkpoint_dir = '/content/drive/MyDrive/multilingual-backpacks-checkpoints'
    os.makedirs(drive_checkpoint_dir, exist_ok=True)
    USE_DRIVE = True
    print("✓ Drive mounted - checkpoints will be saved")
except Exception as e:
    # Deliberately best-effort: any failure only disables Drive backups.
    # The reason is printed so the failure is not silently lost.
    USE_DRIVE = False
    print("⚠️  Drive not mounted - checkpoints will only be local")
    print(f"   Reason: {type(e).__name__}: {e}")
    print("   (You can download results at the end)")


In [None]:
# Install dependencies (-q keeps pip's output quiet)
!pip install -q transformers datasets scipy tqdm matplotlib


In [None]:
# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")


## 2. Prepare Data


In [None]:
# Prepare the Europarl English-French dataset by running the repo's prepare script.
# No sample limit is passed on this command line.
# NOTE(review): the earlier note ("remove max_samples limit for full training")
# suggests prepare.py may apply its own default limit - confirm against
# data/europarl/prepare.py before a full training run.
!python data/europarl/prepare.py --language_pair en-fr


In [None]:
# Verify data: print the dataset metadata and confirm both token files exist.
import os
import pickle

# meta.pkl is expected to be written locally by the prepare step, so it is
# treated as trusted input for pickle.
with open('data/europarl/meta.pkl', 'rb') as f:
    meta = pickle.load(f)

print(f"Vocab size: {meta['vocab_size']:,}")
print(f"Languages: {meta['languages']}")

for fname in ['train.bin', 'val.bin']:
    fpath = f"data/europarl/{fname}"
    if os.path.exists(fpath):
        size = os.path.getsize(fpath) / 1e6
        print(f"✓ {fname}: {size:.2f} MB")
    else:
        # Report a missing split explicitly; staying silent here would make a
        # failed prepare step look like a successful verification.
        print(f"✗ {fname}: not found at {fpath} - re-run the prepare step")


## 3. Restore Previous Checkpoints (if any)


In [None]:
# Copy any earlier run's files from Drive back into the local out/ tree and
# summarise the training log that came with them.
import shutil
import json

for model_type in ['backpack', 'transformer']:
    # Without Drive there is nothing to restore from.
    if not USE_DRIVE:
        print(f"Drive not available - {model_type} will start fresh")
        continue

    drive_dir = f'/content/drive/MyDrive/multilingual-backpacks-checkpoints/{model_type}_full'
    local_dir = f'out/{model_type}_full'

    if not os.path.exists(drive_dir):
        print(f"No previous {model_type} checkpoint found - starting fresh")
        continue

    # dirs_exist_ok lets the copy merge into an existing local directory.
    shutil.copytree(drive_dir, local_dir, dirs_exist_ok=True)
    print(f"✓ Restored {model_type} checkpoint from Drive")

    if not os.path.exists(f"{local_dir}/training_log.json"):
        continue

    with open(f"{local_dir}/training_log.json") as f:
        log = json.load(f)
    print(f"  Completed {len(log['iterations'])} iterations")
    if log['iterations']:
        print(f"  Last train loss: {log['train_loss'][-1]:.4f}")
        print(f"  Last val loss: {log['val_loss'][-1]:.4f}")


## 4. Train Backpack Model


In [None]:
# Determine if we should resume or start fresh:
# resume only when a checkpoint file already exists in the output directory.
import os
backpack_ckpt = 'out/backpack_full/ckpt.pt'
init_from = 'resume' if os.path.exists(backpack_ckpt) else 'scratch'
print(f"Backpack: {init_from}")

# Train Backpack model (using memory-optimized config with 16 senses).
# {init_from} in the command below is filled in by IPython from the Python
# variable computed above.
!python train.py \
    --model_type backpack \
    --config train_europarl_scratch_16senses \
    --out_dir out/backpack_full \
    --data_dir europarl \
    --init_from {init_from} \
    --device cuda \
    --dtype float16


In [None]:
# Final Backpack backup: mirror the local run directory to Drive when it is
# mounted; otherwise just note that the checkpoint stays local.
if not USE_DRIVE:
    print("✓ Backpack training complete - checkpoint saved locally")
else:
    drive_dir = '/content/drive/MyDrive/multilingual-backpacks-checkpoints/backpack_full'
    # dirs_exist_ok allows overwriting a backup from an earlier session.
    shutil.copytree('out/backpack_full', drive_dir, dirs_exist_ok=True)
    print("✓ Backpack checkpoint saved to Drive")


## 5. Train Transformer Baseline


In [None]:
# Determine if we should resume or start fresh:
# resume only when a checkpoint file already exists in the output directory.
# (`os` is not imported in this cell; it relies on an earlier cell's import.)
transformer_ckpt = 'out/transformer_full/ckpt.pt'
init_from = 'resume' if os.path.exists(transformer_ckpt) else 'scratch'
print(f"Transformer: {init_from}")

# Train Transformer baseline model (matching backpack config).
# {init_from} in the command below is filled in by IPython from the Python
# variable computed above.
!python train.py \
    --model_type transformer \
    --config train_europarl_transformer_baseline_16senses \
    --out_dir out/transformer_full \
    --data_dir europarl \
    --init_from {init_from} \
    --device cuda \
    --dtype float16


In [None]:
# Final Transformer backup: mirror the local run directory to Drive when it
# is mounted; otherwise just note that the checkpoint stays local.
if not USE_DRIVE:
    print("✓ Transformer training complete - checkpoint saved locally")
else:
    drive_dir = '/content/drive/MyDrive/multilingual-backpacks-checkpoints/transformer_full'
    # dirs_exist_ok allows overwriting a backup from an earlier session.
    shutil.copytree('out/transformer_full', drive_dir, dirs_exist_ok=True)
    print("✓ Transformer checkpoint saved to Drive")


## 6. Visualize Training Results


In [None]:
# Plot loss curves for both models side by side and print a short summary of
# each training log. A model without a log gets a placeholder panel instead.
import json
import os
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

for idx, (model_type, model_name) in enumerate([('backpack', 'Backpack'), ('transformer', 'Transformer')]):
    log_path = f'out/{model_type}_full/training_log.json'
    if os.path.exists(log_path):
        with open(log_path) as f:
            log = json.load(f)

        ax = axes[idx]
        ax.plot(log['iterations'], log['train_loss'], label='Train', linewidth=2)
        ax.plot(log['iterations'], log['val_loss'], label='Val', linewidth=2)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Loss')
        ax.set_title(f'{model_name} Training on Europarl en-fr')
        ax.legend()
        ax.grid(alpha=0.3)

        print(f"\n{model_name} Summary:")
        print(f"  Iterations: {len(log['iterations'])}")
        if log['iterations']:
            print(f"  Final train loss: {log['train_loss'][-1]:.4f}")
            print(f"  Final val loss: {log['val_loss'][-1]:.4f}")
            print(f"  Loss reduction: {log['train_loss'][0] - log['train_loss'][-1]:.4f}")
    else:
        axes[idx].text(0.5, 0.5, f'No training log found\nfor {model_name}',
                      ha='center', va='center', transform=axes[idx].transAxes)
        axes[idx].set_title(f'{model_name} Training')

plt.tight_layout()
# The placeholder branch means this cell can run before any training output
# exists; make sure out/ is there so savefig does not fail on a missing directory.
os.makedirs('out', exist_ok=True)
plt.savefig('out/training_comparison.png', dpi=150)
plt.show()


In [None]:
# Display model information: for each saved checkpoint, print the stored
# config fields, the training progress, and the parameter count.
import os
import torch
# NOTE(review): get_config is not used in this cell; kept in case importing
# configurator is needed for the pickled config object to load - confirm.
from configurator import get_config

for model_type, model_name in [('backpack', 'Backpack'), ('transformer', 'Transformer')]:
    ckpt_path = f'out/{model_type}_full/ckpt.pt'
    if os.path.exists(ckpt_path):
        # The checkpoint stores a config object (read via attributes below),
        # not just tensors. PyTorch >= 2.6 defaults to weights_only=True and
        # would refuse to unpickle it, so full unpickling is requested
        # explicitly. Only do this with checkpoints you produced yourself.
        checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)
        config = checkpoint['config']

        print(f"\n{'='*60}")
        print(f"{model_name.upper()} MODEL")
        print(f"{'='*60}")
        print(f"Embedding dim: {config.n_embd}")
        if model_type == 'backpack':
            print(f"Sense vectors: {config.n_senses}")
        print(f"Layers: {config.n_layer}")
        print(f"Heads: {config.n_head}")
        print(f"Vocab: {config.vocab_size:,}")
        print(f"Block size: {config.block_size}")
        print(f"\nTraining:")
        print(f"  Iterations: {checkpoint.get('iter_num', 0):,}")

        # Format the loss only when it exists: applying ':.4f' to the 'N/A'
        # fallback string raises ValueError.
        best_val_loss = checkpoint.get('best_val_loss')
        if best_val_loss is None:
            print("  Best val loss: N/A")
        else:
            # float() also accepts a 0-dim tensor, in case the loss was saved as one.
            print(f"  Best val loss: {float(best_val_loss):.4f}")

        # Calculate model size
        model_state = checkpoint['model']
        total_params = sum(p.numel() for p in model_state.values())
        # Estimated size if every parameter were stored in float16 (2 bytes),
        # regardless of the dtype actually saved in the checkpoint.
        model_size_mb = total_params * 2 / 1e6

        print(f"\nTotal params: {total_params:,}")
        print(f"Size: {model_size_mb:.1f} MB (float16)")
        print(f"{'='*60}")


## 7. Download Results


In [None]:
# Bundle every existing model output directory into one gzip-compressed tar
# archive, then hand the archive to the Colab file-download helper.
import tarfile

archive_name = 'full_models_results.tar.gz'

with tarfile.open(archive_name, 'w:gz') as tar:
    for model_type in ['backpack_full', 'transformer_full']:
        model_dir = f'out/{model_type}'
        # Skip models that were never trained in this session.
        if not os.path.exists(model_dir):
            continue
        # arcname drops the leading out/ so the archive unpacks to <model>_full/...
        tar.add(model_dir, arcname=model_type)
        print(f"✓ Added {model_type}")

# Download
from google.colab import files
files.download(archive_name)

print("\n✓ Results downloaded")
print("\nContents:")
for description in (
    "  - backpack_full/ckpt.pt: Backpack model checkpoint",
    "  - backpack_full/training_log.json: Backpack training metrics",
    "  - transformer_full/ckpt.pt: Transformer model checkpoint",
    "  - transformer_full/training_log.json: Transformer training metrics",
):
    print(description)
