# 🚀 Hybrid Mamba-xLSTM: Google Colab Setup

Complete setup and training guide for Google Colab

## Step 1: Check GPU & Install Dependencies

In [None]:
# Check GPU availability: report whether PyTorch can see a CUDA device and,
# if so, print its name and total memory.
import torch

gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")
if gpu_available:
    # Query the properties of device 0 once and reuse them for both lines.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {gpu_props.name}")
    # total_memory is reported in bytes; show it in decimal gigabytes.
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.1f} GB")
else:
    print("⚠️ No GPU available. Please enable GPU in Runtime settings!")

In [None]:
# Clone and install the project and test it out 
!git clone https://github.com/krishankb-de/hybrid_model_mamba_xlstm.git
%cd hybrid_model_mamba_xlstm
!pip install -e . -q

## Step 2: Mount Google Drive (Optional, for saving checkpoints)

In [None]:
# Mount Google Drive at /content/drive (optional; used for saving checkpoints).
from google.colab import drive

drive.mount('/content/drive')
print("✓ Google Drive mounted!")

## Step 3: Quick Test (2 minutes)

In [None]:
# Quick inference test with 70M model (completes in < 2 minutes)
python_script = """
import torch
from transformers import AutoTokenizer
import sys
sys.path.insert(0, '/content/hybrid_model_mamba_xlstm')

from hybrid_xmamba.models.configuration_hybrid import HybridConfig
from hybrid_xmamba.models.hybrid_lm import HybridLanguageModel

print('Loading 70M model...')
config = HybridConfig(
    dim=512,
    num_layers=8,
    vocab_size=50257,
    state_size=16,
    conv_size=4,
    expand_factor=2,
    use_fast_path=True,
    head_dim=64,
    num_heads=8,
    use_tfla=True,
    proj_factor=2,
    slstm_hidden_dim=512,
    slstm_num_heads=4,
    use_exponential_gate=True,
)

model = HybridLanguageModel(config).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('gpt2')

print('Testing inference...')
text = 'The quick brown fox jumps over the lazy dog'
inputs = tokenizer(text, return_tensors='pt')

with torch.no_grad():
    # Pass only input_ids to the model
    outputs = model(input_ids=inputs['input_ids'].cuda())

print(f'‚úì Model loaded and inference works!')
print(f'Output shape: {outputs.logits.shape}')
print(f'Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M')
print('‚úÖ Quick test completed successfully!')
"""

!python -c "$python_script"

### Training the complete 350M model (optional)

In [None]:
# Full training (for local machine with GPU, not recommended for Colab free tier)
# Uncomment and run this on a machine with >= 24GB GPU VRAM
# Note: The 350M model is too large for Colab T4 (15GB). 
# For Colab training, consider using a smaller model or local machine.

# !python scripts/train.py \
#     model=hybrid_350m \
#     dataset=wikitext \
#     trainer=colab_single_gpu \
#     trainer.max_epochs=3 \
#     trainer.default_root_dir=/content/drive/MyDrive/hybrid_mamba_checkpoints \
#     dataset.batch_size=4 \
#     dataset.eval_batch_size=4 \
#     dataset.num_workers=0 \
#     +dataset.max_seq_length=128 \
#     wandb.enabled=false

# print("📝 For full training on Colab, use a smaller model or local GPU")
# print("✅ Step 3 (inference test) completed successfully!")
# print("🚀 To train locally: python scripts/train.py model=hybrid_350m dataset=wikitext trainer=single_gpu")

## Step 4: Full Training (Optional)

In [None]:
# ⚠️ IMPORTANT: Colab 12-hour Timeout Limitation
# Training the full epoch takes 130+ days on Colab (450k batches × 25 sec/batch)
# The following cells provide THREE options; this cell only prints the summary.

# Figures behind the estimate. The day count is derived from them so the
# printed numbers cannot drift apart.
COLAB_MAX_RUNTIME_HOURS = 12
SECONDS_PER_BATCH = 25
FULL_EPOCH_BATCHES = 450_338
SECONDS_PER_DAY = 86_400
FULL_EPOCH_DAYS = FULL_EPOCH_BATCHES * SECONDS_PER_BATCH // SECONDS_PER_DAY  # whole days, rounded down

BANNER_RULE = "=" * 80

print(BANNER_RULE)
print("⚠️  COLAB TRAINING OPTIONS - Choose One")
print(BANNER_RULE)
print(f"\nColab Free Tier: {COLAB_MAX_RUNTIME_HOURS}-hour maximum runtime")
print(f"Model speed: ~{SECONDS_PER_BATCH} seconds per batch")
print(f"Full training needed: {FULL_EPOCH_BATCHES:,} batches = {FULL_EPOCH_DAYS}+ days ❌")
print("\nChoose an option below:")
print(BANNER_RULE)


In [None]:
# Option 1: QUICK TRAINING TEST (10-15 minutes) ‚≠ê RECOMMENDED FOR COLAB
# This runs only 100 training steps to verify everything works
# Perfect for testing code, data loading, and model without timeout risk

print("\n" + "="*80)
print("üöÄ OPTION 1: QUICK TRAINING TEST (100 steps = 10-15 minutes)")
print("="*80)
print("\nPurpose: Verify training works end-to-end")
print("Runtime: ~10-15 minutes (safe within 12-hour limit)")
print("Expected loss reduction: Should see training progress\n")

!python scripts/train.py \
    model=hybrid_70m \
    dataset=wikitext \
    trainer=colab_single_gpu \
    trainer.max_epochs=1 \
    trainer.num_sanity_val_steps=0 \
    dataset.batch_size=4 \
    dataset.eval_batch_size=4 \
    dataset.num_workers=2 \
    dataset.preprocessing_num_workers=2 \
    +dataset.max_seq_length=256 \
    trainer.accumulate_grad_batches=2 \
    trainer.val_check_interval=0.5 \
    trainer.log_every_n_steps=10 \
    trainer.limit_train_batches=100 \
    trainer.limit_val_batches=0 \
    wandb.enabled=false \
    trainer.enable_checkpointing=false \
    trainer.default_root_dir=/content/outputs

print("\n‚úÖ Quick test completed! Model trains successfully.")
print("üìä Check output folder at /content/outputs for results")


In [None]:
# Option 2: EXTENDED TRAINING (1000 steps = ~6-7 hours)
# Trains for longer but still completes within Colab's 12-hour limit
# Good for getting meaningful model improvements

print("\n" + "="*80)
print("üöÄ OPTION 2: EXTENDED TRAINING (1000 steps = 6-7 hours)")
print("="*80)
print("\nPurpose: Train for meaningful results")
print("Runtime: ~6-7 hours (safe with headroom)")
print("Coverage: ~0.2% of full epoch")
print("Recommendation: Use this if you have time and want real training\n")

!python scripts/train.py \
    model=hybrid_70m \
    dataset=wikitext \
    trainer=colab_single_gpu \
    trainer.max_epochs=1 \
    trainer.num_sanity_val_steps=0 \
    dataset.batch_size=4 \
    dataset.eval_batch_size=4 \
    dataset.num_workers=2 \
    dataset.preprocessing_num_workers=2 \
    +dataset.max_seq_length=256 \
    trainer.accumulate_grad_batches=2 \
    trainer.val_check_interval=0.5 \
    trainer.log_every_n_steps=50 \
    trainer.limit_train_batches=1000 \
    trainer.limit_val_batches=0 \
    wandb.enabled=false \
    trainer.enable_checkpointing=true \
    trainer.default_root_dir=/content/outputs

print("\n‚úÖ Extended training completed!")
print("üíæ Checkpoint saved to /content/outputs")
print("üìä Training artifacts ready for use")

# Save to Google Drive
!mkdir -p /content/drive/MyDrive/hybrid_mamba_results 2>/dev/null
!cp -r /content/outputs /content/drive/MyDrive/hybrid_mamba_results/ 2>/dev/null && \
    echo "‚úì Results saved to Google Drive" || echo "‚ö†Ô∏è Could not save to Drive (not mounted)"


In [None]:
# Option 3: FULL TRAINING ON LOCAL MACHINE (Recommended for real training)
# For complete training, use your local GPU with 24GB+ VRAM
# This is the only practical way to train the full model
#
# This cell only prints instructions; it does not start any training.

SECTION_RULE = "=" * 80


def print_command_section(title, body):
    """Print ``title`` between two 80-column rules, then ``body`` framed by blank lines.

    Args:
        title: Heading shown between the two rules.
        body: Text (e.g. a shell command) printed below the heading. It is
            preceded by one blank line and followed by two newlines.
    """
    print(SECTION_RULE)
    print(title)
    print(SECTION_RULE)
    print(f"\n{body}\n")


LOCAL_TRAINING_OVERVIEW = """\
Colab is NOT suitable for full model training due to:
  ✗ 12-hour runtime limit
  ✗ 450k+ batches needed for full epoch
  ✗ At 25s/batch = 130+ days total time

SOLUTION: Train locally with a better GPU

Requirements:
  - GPU: 24GB+ VRAM (RTX 3090, RTX 4090, A100, etc.)
  - Storage: 50GB free disk space
  - Time: ~3-4 hours per epoch on RTX 4090

Installation (run on your local machine):
  1. Clone the repository
  2. pip install -e .
  3. Run the training command below

TRAINING COMMANDS FOR LOCAL MACHINE:"""

# Each "\\" renders as a single trailing backslash so the printed commands can
# be pasted into a shell as multi-line commands.
SINGLE_GPU_COMMAND = """\
python scripts/train.py \\
    model=hybrid_70m \\
    dataset=wikitext \\
    trainer=single_gpu \\
    trainer.max_epochs=10 \\
    dataset.batch_size=8 \\
    +dataset.max_seq_length=256 \\
    trainer.accumulate_grad_batches=1 \\
    wandb.enabled=false"""

MULTI_GPU_COMMAND = """\
python scripts/train.py \\
    model=hybrid_70m \\
    dataset=wikitext \\
    trainer=gpu_ddp \\
    trainer.max_epochs=10 \\
    dataset.batch_size=8 \\
    +dataset.max_seq_length=256 \\
    wandb.enabled=false"""

LARGER_MODEL_COMMAND = """\
python scripts/train.py \\
    model=hybrid_150m \\
    dataset=wikitext \\
    trainer=single_gpu \\
    trainer.max_epochs=5 \\
    dataset.batch_size=4 \\
    trainer.accumulate_grad_batches=2"""

print()  # blank line before the first rule
print_command_section(
    "💻 OPTION 3: FULL TRAINING ON LOCAL MACHINE (RECOMMENDED)",
    LOCAL_TRAINING_OVERVIEW,
)
print_command_section(
    "Single GPU Training (24GB VRAM - RTX 3090/4090):",
    SINGLE_GPU_COMMAND,
)
print_command_section(
    "Multi-GPU Training (Distributed - faster):",
    MULTI_GPU_COMMAND,
)
print_command_section(
    "For 150M or 350M models (requires more VRAM):",
    LARGER_MODEL_COMMAND,
)

print("\n✅ This is the recommended approach for actual training")
