## 1. Setup Environment

In [None]:
# Check GPU
# Shells out to the NVIDIA CLI so the attached device (if any) is shown in the
# cell output before any training is started. The command fails if no NVIDIA
# driver/GPU runtime is present.
!nvidia-smi

: 

In [None]:
# Clone repository
!git clone https://github.com/kavyavenk/multilingual-backpacks.git
%cd multilingual-backpacks

In [None]:
# Install dependencies
!pip install -q transformers datasets scipy tqdm numpy torch matplotlib

In [None]:
# Verify GPU
# Reports the installed PyTorch build and, when CUDA is usable, the name and
# total memory of device 0.
import torch
import numpy as np

cuda_ok = torch.cuda.is_available()

print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")

if cuda_ok:
    device_props = torch.cuda.get_device_properties(0)
    total_gb = device_props.total_memory / 1e9  # bytes -> GB (decimal)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {total_gb:.2f} GB")

## 2. Prepare Data

In [None]:
# Prepare Europarl dataset (10k samples)
# Runs the repository's Europarl preparation script for the en-fr language pair,
# capped at 10000 samples via --max_samples.
# NOTE(review): presumably this writes train.bin / val.bin / meta.pkl under
# data/europarl/ (the next cell checks for exactly those files) — confirm in prepare.py.
!python data/europarl/prepare.py --language_pair en-fr --max_samples 10000

In [None]:
# Verify data files
# Prints one status line per expected file, then fails with a single clear error
# if anything is missing (instead of crashing later on an unrelated open()).
import os
import pickle

data_files = ['data/europarl/train.bin', 'data/europarl/val.bin', 'data/europarl/meta.pkl']

missing_files = []
for path in data_files:
    if os.path.exists(path):
        size = os.path.getsize(path) / 1e6  # bytes -> MB (decimal)
        print(f"✓ {path} ({size:.2f} MB)")
    else:
        missing_files.append(path)
        print(f"✗ {path} not found")

# Stop here so a "Run All" does not continue into training with incomplete data.
if missing_files:
    raise FileNotFoundError(
        "Missing data files: " + ", ".join(missing_files)
        + ". Re-run the data preparation cell before continuing."
    )

# pickle can execute arbitrary code on load: only load a meta.pkl that was
# generated locally (it is expected to come from the preparation step above).
with open('data/europarl/meta.pkl', 'rb') as meta_file:
    meta = pickle.load(meta_file)

print(f"\nVocab size: {meta['vocab_size']:,}")
print(f"Languages: {meta['languages']}")

## 3. Configure Transformer Baseline

In [None]:
# Create transformer baseline config from scratch
# The text below is written verbatim to config/train_europarl_transformer_baseline.py;
# it is itself a Python module (docstring + a ModelConfig instance named `config`).
config_content = '''"""
Configuration for training Standard Transformer baseline on Europarl dataset
"""

from configurator import ModelConfig

config = ModelConfig(
    # Model architecture
    block_size=128,
    n_layer=4,
    n_head=4,
    n_embd=256,
    n_senses=1,  # Not used by transformer, but kept for compatibility
    dropout=0.1,
    bias=False,
    
    # Training
    batch_size=16,
    learning_rate=3e-4,
    max_iters=2000,
    weight_decay=1e-1,
    beta1=0.9,
    beta2=0.95,
    grad_clip=1.0,
    
    # Evaluation
    eval_interval=200,
    eval_iters=50,
    log_interval=10,
    
    # System
    device='cuda',
    dtype='float16',
    compile=False,
    
    # Data
    dataset='europarl',
    tokenizer_name='xlm-roberta-base',
    languages=['en', 'fr'],
)
'''

# Write config file (overwrites any existing file of the same name)
with open('config/train_europarl_transformer_baseline.py', 'w') as config_file:
    config_file.write(config_content)

# Human-readable recap of the settings above, printed one line at a time.
for line in (
    "✓ Transformer baseline config created",
    "\nSettings:",
    "  Model: StandardTransformer (NO sense vectors)",
    "  Embedding dim: 256",
    "  Layers: 4",
    "  Heads: 4",
    "  Batch size: 16",
    "  Block size: 128",
    "  Max iterations: 2000",
    "\nEstimated params: 250K vocab × 256 dim = 64M embeddings",
    "  + 4 layers × ~3M = ~76M total",
    "  Memory needed: ~6-8GB (should fit in T4!)",
):
    print(line)

## 4. Clear GPU Memory

In [None]:
# Release cached GPU memory and report how much is actually free.
import gc
import os

import torch

# Exported through the kernel's environment so that processes launched from this
# notebook (e.g. the `!python train.py` cell) inherit the allocator setting.
# NOTE(review): whether it still affects this already-running kernel depends on
# whether the CUDA allocator has been initialised — confirm if that matters.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

if torch.cuda.is_available():
    # Collect unreachable Python objects first so the tensors they hold are
    # returned to the caching allocator, then hand the cached blocks back to
    # the driver. (The reverse order leaves the just-freed blocks cached.)
    gc.collect()
    torch.cuda.empty_cache()

    mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
    # Ask the driver for the real free amount: it accounts for memory held by
    # the caching allocator and by other processes, unlike
    # `total - memory_allocated()`, which only sees this process's live tensors.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
    mem_free = free_bytes / 1e9

    print(f"GPU Memory:")
    print(f"  Total: {mem_total:.2f} GB")
    print(f"  Free: {mem_free:.2f} GB")
    print(f"\n✓ Ready for training")
else:
    # Make the no-GPU case visible instead of finishing silently.
    print("⚠ CUDA is not available - the training cell below expects a GPU")

## 5. Train Transformer Baseline

**Note**: This run passes `--model_type transformer` to `train.py` (instead of `backpack`), so a standard Transformer is trained rather than a Backpack model.

In [None]:
# Train transformer baseline (NO sense vectors)
# Runs train.py as a subprocess of the notebook kernel.
#   --config      name of the config module written in section 3
#                 (config/train_europarl_transformer_baseline.py); presumably
#                 train.py resolves the name to that file — confirm in train.py
#   --out_dir     output directory; the later cells read training_log.json and
#                 ckpt.pt from out/transformer_baseline
#   --device / --dtype  repeat the values already set in the config file
#   --model_type  selects the standard transformer instead of the Backpack model
!python train.py \
    --config train_europarl_transformer_baseline \
    --out_dir out/transformer_baseline \
    --data_dir europarl \
    --device cuda \
    --dtype float16 \
    --model_type transformer

## 6. Visualize Training

In [None]:
# Plot loss curves
# Reads the training log from the run directory, prints a short numeric
# summary, and saves/shows train vs. validation loss on a single axis.
import json
import matplotlib.pyplot as plt

with open('out/transformer_baseline/training_log.json', 'r') as log_file:
    log = json.load(log_file)

iterations, train_loss, val_loss = (
    log[key] for key in ('iterations', 'train_loss', 'val_loss')
)

print("Training Summary:")
print(f"  Iterations: {len(iterations)}")
print(f"  Initial train loss: {train_loss[0]:.4f}")
print(f"  Final train loss: {train_loss[-1]:.4f}")
print(f"  Final val loss: {val_loss[-1]:.4f}")
print(f"  Loss reduction: {train_loss[0] - train_loss[-1]:.4f}")

# Plot: explicit figure/axes objects rather than the pyplot state machine
fig, ax = plt.subplots(figsize=(12, 5))
for series, series_label in ((train_loss, 'Train Loss'), (val_loss, 'Val Loss')):
    ax.plot(iterations, series, label=series_label, alpha=0.8, linewidth=2)
ax.set_xlabel('Iteration', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.set_title('Transformer Baseline Training (Europarl en-fr)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('out/transformer_baseline/loss_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Loss curves saved")

## 7. Model Info

In [None]:
# Load and inspect model
# Rebuilds the trained model from its checkpoint and prints its architecture
# settings and parameter count.
import torch
from model import StandardTransformerLM

# map_location='cpu': this cell only counts parameters, so there is no need to
#   place a copy of the checkpoint on the GPU (and the cell then also works on a
#   machine without CUDA).
# weights_only=False: the checkpoint stores a config object that is read by
#   attribute below, i.e. more than plain tensors. Recent PyTorch releases
#   default torch.load to weights_only=True, which refuses to unpickle such
#   objects. Only do this for checkpoints you produced yourself - unpickling can
#   execute arbitrary code.
checkpoint = torch.load(
    'out/transformer_baseline/ckpt.pt',
    map_location='cpu',
    weights_only=False,
)
config = checkpoint['config']
model = StandardTransformerLM(config)
model.load_state_dict(checkpoint['model'])

n_params = sum(p.numel() for p in model.parameters())

print("="*60)
print("TRANSFORMER BASELINE MODEL")
print("="*60)
print(f"\nArchitecture:")
print(f"  Type: Standard Transformer (no sense vectors)")
print(f"  Embedding dim: {config.n_embd}")
print(f"  Layers: {config.n_layer}")
print(f"  Attention heads: {config.n_head}")
print(f"  Vocab size: {config.vocab_size:,}")
print(f"  Context length: {config.block_size}")
print(f"\nParameters:")
print(f"  Total: {n_params:,}")
# Size estimates: bytes per parameter (4 for float32, 2 for float16), in decimal MB.
print(f"  Model size: {n_params * 4 / 1e6:.2f} MB (float32)")
print(f"  Model size: {n_params * 2 / 1e6:.2f} MB (float16)")
print("="*60)

## 8. Run Evaluations

In [None]:
# Run evaluation suite
# Runs the repository's evaluation script as a subprocess against the run
# directory produced by the training cell (out/transformer_baseline).
# NOTE(review): --skip_multisimlex presumably disables the MultiSimLex part of
# the suite — confirm in run_full_evaluation.py.
!python run_full_evaluation.py \
    --out_dir out/transformer_baseline \
    --device cuda \
    --skip_multisimlex

## 9. Download Results

In [None]:
# Package results
!tar -czf transformer_baseline_results.tar.gz out/transformer_baseline/

from google.colab import files 
files.download('transformer_baseline_results.tar.gz')

print("\n" + "="*60)
print("RESULTS PACKAGED")
print("="*60)
print("\nContents:")
print("  ✓ ckpt.pt - Model checkpoint")
print("  ✓ training_log.json - Training metrics")
print("  ✓ evaluation_results.json - Evaluation scores")
print("  ✓ loss_curves.png - Training visualization")
print("\nThis baseline demonstrates cross-lingual learning")
print("without Backpack's sense vectors.")
print("="*60)

## Summary

### What We Trained
- **Model**: Standard Transformer (no Backpack)
- **Parameters**: ~76M (vs 60M for Backpack tiny)
- **Why it works**: The model stores only 1 embedding per token (rather than 4 sense vectors per token), which keeps memory use low
- **Memory**: ~6-8GB (fits in T4's 15GB)

### Key Findings
1. Transformer baseline trains successfully on T4
2. The batch size can be increased for faster training, since memory use (~6-8GB) is well below the T4's 15GB
3. Still learns cross-lingual representations from Europarl

### Next Steps
- Compare with Backpack model (when vocab size is reduced)
- Evaluate cross-lingual word/sentence similarity
- Document differences in learned representations