# VishwamAI Pretraining on Google Colab

This notebook provides an optimized linear pipeline for pretraining VishwamAI's 671B parameter model.

**Model Architecture:**
- Parameters: 671B
- Context Length: 32,768 tokens
- Hidden Size: 8,192
- Attention Heads: 64
- Layers: 120
- Vocabulary Size: 64,000

**Pipeline Steps & Timing:**
1. Setup (~2 min)
2. Authentication (~30 sec)
3. Model Loading (~2 min)
4. Dataset Loading (~10 min)
5. Training (~1 hour/epoch)
6. Model Pushing (~10 min)

Total Expected Time: ~4 hours for 3 epochs

In [1]:
# Progress tracking setup
import time
import json
import torch
from tqdm.notebook import tqdm
from transformers import Trainer

def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"Operation completed in {end - start:.2f} seconds")
        return result
    return wrapper

# 1. Fast Setup (≈2 min)

In [2]:
%%time
# Verify GPU availability and requirements
!nvidia-smi

import torch
gpu_name = torch.cuda.get_device_name(0)
if 'A100' not in gpu_name:
    print("⚠️ Warning: This model requires an A100 GPU for optimal performance")
    print("Current GPU:", gpu_name)

Fri Feb 14 06:43:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
%%time
# Parallel package installation
%pip install torch==2.4.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 \
    transformers==4.34.0 datasets accelerate huggingface_hub wandb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.6/857.6 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement transformers==4.34.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformers==4.34.0[0m[31m
[0mCPU times: user 214 ms, sys: 28.6 ms, total: 242 ms
Wall time: 19.4 s


# 2. Quick Authentication (≈30 sec)

In [4]:
%%time
from huggingface_hub import login, create_repo
from getpass import getpass
import wandb

# Get token securely
hf_token = getpass("Enter your Hugging Face access token: ")
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

# Initialize W&B for experiment tracking
wandb.login()
print("Successfully logged in to Weights & Biases!")

Enter your Hugging Face access token: ··········
Successfully logged in to Hugging Face!


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maivishwam[0m ([33maivishwam-vishwamai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged in to Weights & Biases!
CPU times: user 3.05 s, sys: 473 ms, total: 3.53 s
Wall time: 1min 2s


In [5]:
%%time
# Quick repository setup
!git clone https://github.com/VishwamAI/VishwamAI.git
%cd VishwamAI
%pip install -e . -q

Cloning into 'VishwamAI'...
remote: Enumerating objects: 949, done.[K
remote: Counting objects: 100% (354/354), done.[K
remote: Compressing objects: 100% (274/274), done.[K
remote: Total 949 (delta 137), reused 272 (delta 76), pack-reused 595 (from 2)[K
Receiving objects: 100% (949/949), 28.41 MiB | 26.28 MiB/s, done.
Resolving deltas: 100% (419/419), done.
/content/VishwamAI
[31mERROR: file:///content/VishwamAI does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0mCPU times: user 35.1 ms, sys: 4.6 ms, total: 39.7 ms
Wall time: 3.32 s


In [None]:
pip install triton



# 3. Model Configuration (≈2 min)

In [None]:
%%time
import torch
import json
from datasets import load_dataset, concatenate_datasets
from vishwamai.model_utils import load_model, get_gpu_memory
from vishwamai.neural_memory import ReasoningMemoryTransformer
from vishwamai.cache_augmentation import DifferentiableCacheAugmentation, CacheConfig
from huggingface_hub import HfFolder, Repository

# Performance optimizations
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('high')
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

CPU times: user 32.2 ms, sys: 1.91 ms, total: 34.1 ms
Wall time: 79.1 ms


In [None]:
from huggingface_hub import Repository

repo_name = "kasinadhsarma/vishwamai-model"  # Existing repo
repo = Repository(local_dir="./vishwamai-model", clone_from=repo_name, use_auth_token=hf_token)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/kasinadhsarma/vishwamai-model into local empty directory.


In [None]:
@track_time
def setup_hardware():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = get_gpu_memory()
    print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")

    # Optimize for available GPU
    if 'a100' in gpu_name.lower():
        return 'A100_optimized', 128, 65536  # Full 671B model
    elif 'v100' in gpu_name.lower():
        return 'V100_optimized', 64, 32768   # Reduced size
    else:
        return 'T4_optimized', 32, 16384     # Minimal configuration

gpu_type, expert_count, cache_size = setup_hardware()

Using GPU: Tesla T4 (14.7 GB)
Operation completed in 0.00 seconds


In [None]:
@track_time
def load_config():
    config_path = "./vishwamai/configs/config_671b.json"

    # Load JSON file
    with open(config_path) as f:
        config = json.load(f)

    # Debugging: Print keys to verify structure
    print("Config keys:", config.keys())

    # Ensure 'model_config' exists
    if "model_config" not in config:
        config["model_config"] = {}

    # Ensure 'gpu_type' exists in 'colab_specific'
    if "colab_specific" not in config or gpu_type not in config["colab_specific"]:
        raise KeyError(f"GPU type '{gpu_type}' not found in config['colab_specific']. Available: {list(config.get('colab_specific', {}).keys())}")

    gpu_config = config["colab_specific"][gpu_type]

    # Update model configuration dynamically
    config["model_config"].update({
        "dim": 8192,
        "num_attention_heads": 64,
        "num_hidden_layers": 120,
        "vocab_size": 64000,
        "max_position_embeddings": 32768,
        "batch_size": gpu_config.get("batch_size", 8),
        "num_experts": expert_count,
        "experts_per_token": min(16, expert_count // 8),
        "memory_size": gpu_config.get("memory_size", 2048),
        "tree_beam_width": gpu_config.get("tree_beam_width", 4),
        "cache_size": cache_size
    })

    return config, gpu_config

# Load configuration
config, gpu_config = load_config()
print("Configuration loaded successfully.")


Config keys: dict_keys(['model_type', 'architectures', 'vocab_size', 'hidden_size', 'intermediate_size', 'num_attention_heads', 'num_hidden_layers', 'max_position_embeddings', 'tree_depth', 'branch_factor', 'use_conceptual_tokens', 'concept_embedding_size', 'use_fp8', 'attention_dropout', 'hidden_dropout', 'max_concepts_per_token', 'initializer_range', 'layer_norm_epsilon', 'use_cache', 'pad_token_id', 'bos_token_id', 'eos_token_id', 'tie_word_embeddings', 'rope_scaling', 'attention_config', 'training_config', 'colab_specific'])
Operation completed in 0.00 seconds
Configuration loaded successfully.


# 4. Dataset Loading (≈10 min)

In [None]:
@track_time
def load_parallel_datasets():
    datasets = {}
    print("Loading datasets...")
    dataset_configs = [
        ("gsm8k", "openai/gsm8k", "train"),
        ("mmlu", "cais/mmlu", "train"),
        ("mmlu_pro", "TIGER-Lab/MMLU-Pro", "train"),
        ("mmmlu", "openai/MMMLU", "train"),
        ("mmmu", "MMMU/MMMU", "train"),
        ("leetcode1", "greengerong/leetcode", "train"),
        ("leetcode2", "LimYeri/LeetCode_Python_Solutions_v2", "train"),
        ("leetcode3", "newfacade/LeetCodeDataset", "train"),
        ("math", "deepmind/math_dataset", "train"),
        ("ifeval", "google/IFEval", "train"),
        ("gpqa", "Idavidrein/gpqa", "train"),
        ("frames", "google/frames-benchmark", "train"),
        ("camel_math", "camel-ai/math", "train"),
        ("camel_code", "camel-ai/code", "train"),
        ("scbench", "microsoft/SCBench", "train"),
        ("swe_bench", "princeton-nlp/SWE-bench_Verified", "train"),
        ("swe_bench_full", "princeton-nlp/SWE-bench", "train"),
        ("wikipedia", "wikimedia/wikipedia", "train")
    ]

    with tqdm(total=len(dataset_configs)) as pbar:
        for name, dataset_id, split in dataset_configs:
            try:
                datasets[name] = load_dataset(dataset_id, split=split, use_auth_token=True)
                pbar.update(1)
            except Exception as e:
                print(f"Warning: Failed to load {name}: {str(e)}")

    print("\nDataset sizes:")
    for name, dataset in datasets.items():
        print(f"{name}: {len(dataset):,} examples")

    return datasets

datasets = load_parallel_datasets()

Loading datasets...


  0%|          | 0/18 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Dataset sizes:
Operation completed in 32.47 seconds


# 5. Model Initialization

In [None]:
@track_time
def initialize_components():
    print("Initializing 671B parameter model...")

    model = load_model(
        config_path="./vishwamai/configs/config_671b.json",
        device="cuda",
        use_cache=False
    )

    memory = NeuralMemory(
        dim=config['model_config']['dim'],
        memory_size=config['model_config']['memory_size']
    )

    tree_thoughts = TreeOfThoughts(
        model=model,
        beam_width=config['model_config']['tree_beam_width']
    )

    cache = DifferentiableCacheAugmentation(
        CacheConfig(
            hidden_size=config['model_config']['dim'],
            num_heads=8,  # Using reasonable default
            dropout=0.1,  # Using reasonable default
            max_cache_length=config['model_config']['cache_size']
        )
    )

    return model, memory, tree_thoughts, cache

model, memory, tree_thoughts, cache = initialize_components()

print(f"\nModel size: {sum(p.numel() for p in model.parameters())/1e9:.1f}B parameters")
print(f"Memory slots: {config['model_config']['memory_size']:,}")
print(f"Cache entries: {config['model_config']['cache_size']:,}")
print(f"Context length: {config['model_config']['max_position_embeddings']:,} tokens")
print(f"Active experts: {config['model_config']['experts_per_token']} per token")

Initializing 671B parameter model...


TypeError: CacheConfig.__init__() got an unexpected keyword argument 'max_length'

# 6. Training Configuration

In [None]:
from transformers import TrainingArguments

# Initialize output directory and repository
output_dir = "./pretrain_output"
!mkdir -p $output_dir

repo = Repository(
    local_dir=output_dir,
    clone_from=repo_name,
    use_auth_token=True
)

# Configure training with FSDP optimizations
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=gpu_config['batch_size'],
    gradient_accumulation_steps=gpu_config['gradient_accumulation'],
    learning_rate=1.2e-4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Distributed training
    fsdp="full_shard",
    fsdp_transformer_layer_cls_to_wrap="VishwamAILayer",
    # Performance optimizations
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    group_by_length=True,
    # Features
    use_moe=True,
    use_neural_memory=True,
    use_tree_of_thoughts=True,
    use_cache_augmentation=True,
    # Monitoring
    report_to=["tensorboard", "wandb"],
    # Hub integration
    push_to_hub=True,
    hub_model_id=repo_name,
    hub_strategy="every_save",
    # Other optimizations
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    max_grad_norm=1.0,
    length_penalty=1.0,
    early_stopping=True
)

# 7. Training Pipeline

In [None]:
from datasets import concatenate_datasets

# Combine selected datasets for training
train_datasets = []
for ds_name in ["gsm8k", "leetcode1", "leetcode2", "math"]:
    if ds_name in datasets:
        train_datasets.append(datasets[ds_name])
if not train_datasets:
    raise ValueError("No available training datasets found for pretraining.")

combined_train_dataset = concatenate_datasets(train_datasets)

# Select a development (evaluation) dataset, e.g., mmlu. Fall back to mmlu_pro if needed.
development_dataset = datasets.get("mmlu") or datasets.get("mmlu_pro")

trainer = VishwamAIPretrainer(
    model=model,
    args=training_args,
    train_dataset=combined_train_dataset,
    eval_dataset=development_dataset
)


In [None]:
# Start training with monitoring
print("Starting pretraining pipeline...")
start_time = time.time()

trainer.train()

training_time = time.time() - start_time
print(f"\nPretraining completed in {training_time/3600:.2f} hours")

# 8. Model Saving and Validation

In [None]:
@track_time
def save_model_components():
    model_save_path = "final_model"
    trainer.save_model(model_save_path)
    memory.save_pretrained(f"{model_save_path}/memory")
    tree_thoughts.save_pretrained(f"{model_save_path}/tree_thoughts")
    cache.save_pretrained(f"{model_save_path}/cache")

    # Push to Hugging Face Hub
    trainer.push_to_hub()
    return model_save_path

model_save_path = save_model_components()
print(f"Model available at: https://huggingface.co/{repo_name}")

In [None]:
@track_time
def validate_model():
    test_model = load_model(
        config_path="configs/config_671b.json",
        device="cuda",
        pretrained_path=model_save_path
    )

    test_cases = [
        "Solve this math problem: What is the area of a circle with radius 5?",
        "Explain the concept of quantum entanglement.",
        "Write a Python function to find the nth Fibonacci number using dynamic programming."
    ]

    print("Running validation tests...")
    for test_input in test_cases:
        print(f"\nTest: {test_input}")
        encoded = model.tokenizer.encode(test_input, return_tensors="pt").cuda()

        with torch.inference_mode():
            start = time.time()
            output = test_model.generate(
                encoded,
                max_new_tokens=200,
                num_beams=4,
                temperature=0.7,
                early_stopping=True
            )
            end = time.time()

        response = model.tokenizer.decode(output[0])
        print(f"Response (generated in {end-start:.2f}s):")
        print(response)

validate_model()
print("\nPretraining and validation completed!")