In [2]:
import math, copy

base_config = {
        "dataset": "c4_subset",
        "batch_size": 32,  # physical batch size 256
        "learning_rate": 0.001 * math.sqrt(4),
        "min_lr": 1e-5,
        "lr_schedule": "cosine",
        "warmup_epochs": 1,
        "warmup_epochs_frac": 0.1,
        "weight_decay": 0.1,
        "hidden_dim": 64,  # Base hidden dimension
        "num_layers": 4,  # Base number of layers
        "num_heads": 4,
        "dropout": 0.0,
        "seq_length": 128,
        "wikitext_limit": 5 * 10**7,
        "pos_encoding": "rotary",
        "init_scheme": "transformer_scaled",
        "stride": 64,
        "pin_memory": True,
        "compile": False,
        "prefetch_factor": 8,
        "min_epochs": 2,
        "max_epochs": 2,
        "use_gradient_clipping": True,
        "gradient_clip_val": 1.0,
        "label_smoothing": 0.0,
        "gradient_accumulation_steps": 16,
        "optimizer": "adamw",
        "activation": "gelu",
        "norm_type": "layer",
        "results_folder": "Former_Experiments_Folder",
        "csv_log_interval": 50,
        "seed": 789,
    }

def chinchilla_scale(base_cfg, hidden_dims):
    """
    Return a list of configs that satisfy:
      • tokens ≈ 20 × parameters
      • per-step compute budget unchanged vs. baseline
      • depth/width ratio fixed (layers ∝ hidden_dim)
    """

    def param_count(d, L):
        # crude but width-dominant: 12·L·d²  (ignores embeddings/out-proj)
        return 12 * L * d**2

    base_d = base_cfg["hidden_dim"]
    base_L = base_cfg["num_layers"]
    base_bsz = base_cfg["batch_size"]
    base_lr = base_cfg["learning_rate"]
    base_clip = base_cfg["gradient_clip_val"]
    seq_len = base_cfg["seq_length"]

    out = []
    for d in hidden_dims:
        width_scale = d / base_d

        # 1) Depth: keep L ∝ d   (so aspect-ratio is preserved)
        L = max(1, int(round(base_L * width_scale)))

        # 2) Keep per-step FLOPs ≈ const ⇒ batch ∝ 1 / (width² · depth/base_depth)
        flops_scale = (width_scale**2) * (L / base_L)
        bsz = max(1, int(round(base_bsz / flops_scale)))

        # 3) LR & grad-clip heuristics
        lr = base_lr * (base_d / d) ** 0.5
        clip = base_clip * math.sqrt(width_scale)

        # 4) Chinchilla target tokens  (≈ 20 × parameters)
        params = param_count(d, L)
        tgt_tok = int(20 * params)

        # 5) Convert token target into epochs
        tokens_per_step = bsz * seq_len
        est_steps = math.ceil(tgt_tok / tokens_per_step)
        max_epochs = math.ceil(
            est_steps / (len(base_cfg.get("dataset", [])) or 1)
        )  # adjust as needed

        cfg = copy.deepcopy(base_cfg)
        cfg.update(
            {
                "hidden_dim": d,
                "num_layers": L,
                "num_heads": max(1, d // 16),
                "batch_size": bsz,
                "learning_rate": lr,
                "gradient_clip_val": clip,
                "target_tokens": tgt_tok,
                "max_epochs": max(max_epochs, cfg.get("min_epochs", 1)),
            }
        )
        out.append(cfg)
    return out


print(chinchilla_scale(base_config, [256]))

[{'dataset': 'c4_subset', 'batch_size': 1, 'learning_rate': 0.001, 'min_lr': 1e-05, 'lr_schedule': 'cosine', 'warmup_epochs': 1, 'warmup_epochs_frac': 0.1, 'weight_decay': 0.1, 'hidden_dim': 256, 'num_layers': 16, 'num_heads': 16, 'dropout': 0.0, 'seq_length': 128, 'wikitext_limit': 50000000, 'pos_encoding': 'rotary', 'init_scheme': 'transformer_scaled', 'stride': 64, 'pin_memory': True, 'compile': False, 'prefetch_factor': 8, 'min_epochs': 2, 'max_epochs': 218454, 'use_gradient_clipping': True, 'gradient_clip_val': 2.0, 'label_smoothing': 0.0, 'gradient_accumulation_steps': 16, 'optimizer': 'adamw', 'activation': 'gelu', 'norm_type': 'layer', 'results_folder': 'Former_Experiments_Folder', 'csv_log_interval': 50, 'seed': 789, 'target_tokens': 251658240}]


In [3]:
import math

# ---------- common settings ----------
COMMON = dict(
    dataset             = "c4_subset",
    lr_schedule         = "cosine",
    warmup_epochs       = 1,
    warmup_epochs_frac  = 0.10,
    weight_decay        = 0.10,
    dropout             = 0.0,          # bump to 0.1-0.2 for >100 M tokens if needed
    seq_length          = 128,
    pos_encoding        = "rotary",
    init_scheme         = "transformer_scaled",
    stride              = 64,
    pin_memory          = True,
    compile             = False,
    prefetch_factor     = 8,
    min_epochs          = 1,
    max_epochs          = 1,
    use_gradient_clipping = True,
    gradient_clip_val   = 1.0,
    label_smoothing     = 0.0,
    optimizer           = "adamw",
    activation          = "gelu",
    norm_type           = "layer",
    results_folder      = "Former_Experiments_Folder",
    csv_log_interval    = 50,
    seed                = 789,
)

GPT2_VOCAB_SIZE = 50257

def make_cfg(d_model, n_layers, vocab:int = GPT2_VOCAB_SIZE):
    heads = max(1, d_model // 16)
    lr    = 0.001 * math.sqrt(d_model / 16)
    params = 12 * d_model * d_model * n_layers + vocab * d_model    # rough GPT-style count
    tokens = 20 * params                           # Chin. optimal compute
    eff_bs = 256                                   # keep effective batch ~constant
    phys_bs = 32     
    
    print("Overrides:")     
    print(dict(
        hidden_dim  = d_model,
        num_layers  = n_layers,
        num_heads   = heads,
        learning_rate = lr,
        batch_size  = phys_bs,
        gradient_accumulation_steps = eff_bs // phys_bs,
        train_tokens = tokens,         # ← 1 epoch budget
    ))        
    print(params, "for ", d_model, "d model")                 # fits typical 40 GB A100 w/ acc-16

    return dict(
        COMMON,
        hidden_dim  = d_model,
        num_layers  = n_layers,
        num_heads   = heads,
        learning_rate = lr,
        batch_size  = phys_bs,
        gradient_accumulation_steps = eff_bs // phys_bs,
        train_tokens = tokens,         # ← 1 epoch budget
    )


#choose layer depth so that it is roughly proportional to the hidden dimension cubed
CONFIGS = {
    "dim16"  : make_cfg(16,  2),
    "dim24"  : make_cfg(24,  3),
    "dim32"  : make_cfg(32,  3),
    "dim48"  : make_cfg(48,  4),
    "dim64"  : make_cfg(64,  4),   # original width/depth
    "dim96"  : make_cfg(96,  6),
    "dim128" : make_cfg(128, 8),
}

# Pretty-print if you run this file directly
if __name__ == "__main__":
    from pprint import pprint
    pprint(CONFIGS, width=120, sort_dicts=False)


Overrides:
{'hidden_dim': 16, 'num_layers': 2, 'num_heads': 1, 'learning_rate': 0.001, 'batch_size': 32, 'gradient_accumulation_steps': 8, 'train_tokens': 16205120}
810256 for  16 d model
Overrides:
{'hidden_dim': 24, 'num_layers': 3, 'num_heads': 1, 'learning_rate': 0.001224744871391589, 'batch_size': 32, 'gradient_accumulation_steps': 8, 'train_tokens': 24538080}
1226904 for  24 d model
Overrides:
{'hidden_dim': 32, 'num_layers': 3, 'num_heads': 2, 'learning_rate': 0.0014142135623730952, 'batch_size': 32, 'gradient_accumulation_steps': 8, 'train_tokens': 32901760}
1645088 for  32 d model
Overrides:
{'hidden_dim': 48, 'num_layers': 4, 'num_heads': 3, 'learning_rate': 0.0017320508075688772, 'batch_size': 32, 'gradient_accumulation_steps': 8, 'train_tokens': 50458560}
2522928 for  48 d model
Overrides:
{'hidden_dim': 64, 'num_layers': 4, 'num_heads': 4, 'learning_rate': 0.002, 'batch_size': 32, 'gradient_accumulation_steps': 8, 'train_tokens': 68261120}
3413056 for  64 d model
Overrides

# New Scaling Configuration

In [5]:
import math

# === Literature-backed heuristics ==========================================

HEAD_DIM = 64
# Vaswani et al., 2017 §3.2: “we set d_k = d_v = d_model / h = 64 for each head.”
# https://papers.neurips.cc/paper/7181-attention-is-all-you-need.pdf

# BASE_LR = 6e-4
#new rate base on sgd sweep 
BASE_LR = 10**(-1.5)


# Yang et al., 2022 (Tensor Programs V) §4: “optimal LR scales ∝ width^{-½}.”
# https://arxiv.org/pdf/2203.03466.pdf

CHIN_TOK_RATIO = 20
# Hoffmann et al., 2022 (Chinchilla) §3: “compute-optimal: ~20 tokens per parameter.”
# https://arxiv.org/pdf/2203.15556.pdf

EFF_BS  = 256                      # keep effective batch fixed
PHYS_BS = 32                       # Goyal et al., 2017: LR ∝ batch (linear-scaling rule)
# https://arxiv.org/pdf/1706.02677.pdf

GPT2_VOCAB_SIZE = 50_257

# === Scaling rules =========================================================

def make_cfg(d_model: int,
             vocab: int = GPT2_VOCAB_SIZE):
    # Depth grows ≈ 0.06 × width (TinyStories sweep, Eldan & Li 2023, Fig. 5)
    # https://arxiv.org/pdf/2305.07759.pdf
    n_layers = max(2, round(0.06 * d_model))

    # 2–8 heads so each head keeps ≥64-d and avoids the low-rank bottleneck
    # Bhojanapalli et al., 2020: “excess heads create a low-rank bottleneck.”
    # https://proceedings.mlr.press/v119/bhojanapalli20a/bhojanapalli20a.pdf
    # Saratchandran et al., 2025 (Leaner Transformers): “more heads lets you cut depth.”
    # https://arxiv.org/pdf/2505.20802.pdf
    n_heads  = max(2, min(8, round(d_model / HEAD_DIM)))

    # Width-scaled LR (μP theory) + 2025 plateau exponent ≈ 0.22 (Li et al., 2025)
    # https://arxiv.org/pdf/2503.04715.pdf
    lr_base_dim = 32
    lr = BASE_LR * (d_model / lr_base_dim) ** (-0.5)

    # Rough GPT-style parameter count (Kaplan et al., 2020 formula)
    params = 12 * d_model * d_model * n_layers + vocab * d_model
    print(params, f"this is {d_model} size")

    # Compute-optimal token budget (Chinchilla)
    tokens = CHIN_TOK_RATIO * params

    return dict(
        COMMON,
        hidden_dim  = d_model,
        num_layers  = n_layers,
        num_heads   = n_heads,
        learning_rate = lr,
        batch_size  = PHYS_BS,
        gradient_accumulation_steps = EFF_BS // PHYS_BS,
        train_tokens = tokens,
    )

# --- Config grid -----------------------------------------------------------

CONFIGS = {
    f"dim{d}": make_cfg(d) for d in [16, 24, 32, 48, 56, 64, 72, 80, 96, 128]
}

if __name__ == "__main__":
    from pprint import pprint
    pprint(CONFIGS, width=100, sort_dicts=False)


810256 this is 16 size
1219992 this is 24 size
1632800 this is 32 size
2495280 this is 48 size
2927288 this is 56 size
3413056 this is 64 size
3867336 this is 72 size
4404560 this is 80 size
5488224 this is 96 size
8005760 this is 128 size
{'dim16': {'dataset': 'c4_subset',
           'lr_schedule': 'cosine',
           'warmup_epochs': 1,
           'warmup_epochs_frac': 0.1,
           'weight_decay': 0.1,
           'dropout': 0.0,
           'seq_length': 128,
           'pos_encoding': 'rotary',
           'init_scheme': 'transformer_scaled',
           'stride': 64,
           'pin_memory': True,
           'compile': False,
           'prefetch_factor': 8,
           'min_epochs': 1,
           'max_epochs': 1,
           'use_gradient_clipping': True,
           'gradient_clip_val': 1.0,
           'label_smoothing': 0.0,
           'optimizer': 'adamw',
           'activation': 'gelu',
           'norm_type': 'layer',
           'results_folder': 'Former_Experiments_Folder',
 

# Complete-P Scaling

In [6]:
from dataclasses import dataclass, asdict
from typing import Dict, Any, Iterable, List
import math

# ---------- common settings ----------
COMMON = dict(
    dataset             = "c4_subset",
    lr_schedule         = "cosine",
    warmup_epochs       = 1,
    warmup_epochs_frac  = 0.10,
    weight_decay_base   = 0.10,   # λ_base; actual λ scales with mN
    dropout             = 0.0,
    seq_length          = 128,
    pos_encoding        = "rotary",
    init_scheme         = "transformer_scaled",
    stride              = 64,
    pin_memory          = True,
    compile             = False,
    prefetch_factor     = 8,
    min_epochs          = 1,
    max_epochs          = 1,
    use_gradient_clipping = True,
    gradient_clip_val   = 1.0,
    label_smoothing     = 0.0,
    optimizer           = "adamw",
    activation          = "gelu",
    norm_type           = "layer",
    results_folder      = "Former_Experiments_Folder",
    csv_log_interval    = 50,
    seed                = 789,
)

HEAD_DIM = 64
GPT2_VOCAB_SIZE = 50_257
TPP = 20  # compute‑optimal default token budget ≈ 20 tokens/parameter (Chinchilla)

# Base model used for m_N, m_L multipliers (Table 1 uses N_base=256, L_base=2):contentReference[oaicite:3]{index=3}
N_BASE = 256
L_BASE = 2

# Use a base LR that you tune at the tiny base model; Complete‑P transfers across depth:contentReference[oaicite:4]{index=4}
ETA_BASE = 3.9e-3  # from Fig. 2 base in paper; feel free to retune for your stack:contentReference[oaicite:5]{index=5}

# AdamW epsilon base; width*depth scaling handled per‑group below (Appendix E.4):contentReference[oaicite:6]{index=6}
EPS_BASE = 1e-16

CHIN_TOK_RATIO = 20

@dataclass
class ModelShape:
    d_model: int
    n_layers: int
    n_heads: int
    vocab_size: int = GPT2_VOCAB_SIZE

def _default_shape(d_model: int, vocab: int = GPT2_VOCAB_SIZE) -> ModelShape:
    # Depth ≈ 0.06 × width (heuristic you already used)
    n_layers = max(2, round(0.06 * d_model))
    n_heads  = max(2, min(8, round(d_model / HEAD_DIM)))
    return ModelShape(d_model=d_model, n_layers=n_layers, n_heads=n_heads, vocab_size=vocab)

def _params_count(shape: ModelShape) -> int:
    # Rough GPT‑style param count (same as before)
    return 12 * shape.d_model * shape.d_model * shape.n_layers + shape.vocab_size * shape.d_model

def make_cfg_completep(
    d_model: int,
    vocab: int = GPT2_VOCAB_SIZE,
    shape: ModelShape | None = None,
    alpha: float = 1.0,  # Complete‑P
) -> Dict[str, Any]:
    """
    Returns a config dict with Complete‑P scaling baked in.
    """
    if shape is None:
        shape = _default_shape(d_model, vocab)

    # width, depth multipliers (mN, mL) relative to base N=256, L=2:contentReference[oaicite:7]{index=7}
    mN = shape.d_model / N_BASE
    mL = shape.n_layers / L_BASE

    # residual branch scaling factor L^{-α} (Eq. 1):contentReference[oaicite:8]{index=8}
    residual_scale = mL ** (-alpha)

    # Per‑group scalings from Table 1 with α=1:
    # Hidden LR: η_base * mN^{-1} * mL^{α−1}  -> with α=1, becomes η_base * mN^{-1}
    # Hidden WD: λ_base * mN
    # PreLN LR:  η_base * mL^{α−1} -> no depth factor at α=1
    # Bias LR (hidden): η_base * mL^{α−1} -> no depth factor at α=1
    # AdamW ε (hidden): ε_base * mN^{-1} * mL^{-α} -> ε_base * mN^{-1} / mL
    # Emb/Unemb LRs remain η_base; ε unchanged; unemb forward uses 1/mN (handled in model)
    # LN gain/bias follow "Pre‑LN LR" & bias notes (Appendix E.2):contentReference[oaicite:9]{index=9}

    # Global “headline” hyperparams (you can log these; real values are in param‑groups)
    params = _params_count(shape)
    train_tokens = max(int(CHIN_TOK_RATIO * params), 1)  # default 20 TPP

    cfg = dict(
        COMMON,
        hidden_dim  = shape.d_model,
        num_layers  = shape.n_layers,
        num_heads   = shape.n_heads,
        vocab_size  = shape.vocab_size,
        learning_rate_base = ETA_BASE,
        epsilon_base = EPS_BASE,
        weight_decay_base = COMMON["weight_decay_base"],
        residual_scale = residual_scale,
        alpha = alpha,
        n_base = N_BASE,
        l_base = L_BASE,
        mN = mN,
        mL = mL,
        batch_size  = 32,  # physical; adjust per hardware
        gradient_accumulation_steps = 256 // 32,  # keep effective BS=256
        train_tokens = train_tokens,
        tpp = TPP,
        completep_enabled = True,
    )
    return cfg

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

## LSTM Scaling Configurations

In [17]:
CONFIG = {
    "data_path": "../Datasets/wikitext.txt",
    "tokenizer_path": "../gpt2_tokenizer",
    "max_characters": 5 * 1e7,  # Maximum number of characters to use from dataset
    "sequence_length": 128,
    "batch_size": 32,  # Keep physical batch size small, has no effect on model
    "hidden_size": 16,
    "num_layers": 2,
    "dropout": 0.0,  # dropout zer here to match transformer but may need to adjust for LSTM
    "learning_rate": 0.001 * math.sqrt(4),  # Scale by sqrt of accumulation steps
    "lr_schedule": "cosine",
    "step_size": 10,
    "gamma": 0.1,  # parameter usedf for stepLR step decay
    "num_epochs": 5,
    "train_split": 0.8,
    "val_split": 0.1,
    "test_split": 0.1,
    "device": "cuda",
    "wandb_project": "lstm-wikitext",
    "wandb_offline": True,
    "print_every": 100,  # Print loss every N batches
    # Gradient clipping settings
    "use_gradient_clipping": True,
    "gradient_clip_val": 1.0,
    # NEW: CSV logging settings
    "results_folder": "Experiments_Folder",
    "csv_log_interval": 100,  # Log every 100 steps
    # NEW: Data loading optimization settings
    "num_workers": "auto",  # Will be set automatically based on CPU cores
    "pin_memory": True,  # Faster GPU memory transfer
    "persistent_workers": True,  # Keep data loading workers alive between epochs
    "prefetch_factor": 4,  # Number of batches to prefetch per worker
    # NEW: Mixed precision settings
    "use_amp": False,  # Enable Automatic Mixed Precision
    "amp_opt_level": "O1",  # Not used with native AMP, but kept for reference
    # NEW: Gradient accumulation settings
    "gradient_accumulation_steps": 16,  # For tracking only
    # NEW: whether to compile the model (PyTorch 2.0+)
    "use_compile": False,
    "seed": 789,
    "optimizer": "adamw",  # NEW: choose from "adam", "adamw", or "sgd"
    "weight_decay": 0.1,
    "stride": 64,  # NEW: sliding-window stride to match transformer
}


def make_lstm_config(hidden_size, base_config=None):
    """
    Generates a scaled LSTM configuration based on hidden dimension size.

    This function applies heuristics and best practices for scaling LSTMs,
    focusing on increasing width before depth and adjusting learning rate
    and data size accordingly.

    Args:
        hidden_size (int): The target hidden dimension size for the LSTM.
        base_config (dict, optional): A base configuration to build upon.
                                      If None, uses sensible defaults from CONFIG.

    Returns:
        dict: A configuration dictionary scaled for the given hidden size.
    """
    # 1. --- Set sensible base config if none provided ---
    if base_config is None:
        base_config = CONFIG
    
    config = base_config.copy()
    config["hidden_size"] = hidden_size

    # 2. --- Scale LSTM architecture (Wider before Deeper) ---
    # Depth (num_layers) should scale sub-linearly with width (hidden_size).
    if hidden_size <= 128:
        config["num_layers"] = 2
    elif hidden_size <= 512:
        config["num_layers"] = 3
    else:
        # It's rare for LSTMs to benefit from more than 4 layers
        config["num_layers"] = 4

    # 3. --- Scale Learning Rate ---
    # Use inverse square root scaling relative to a baseline.
    base_lr = 1e-3
    base_hidden = 128
    # LR decreases as model size increases
   
    config['learning_rate'] = base_lr

    # 4. --- Scale Dropout for Regularization ---
    # Larger models can handle more dropout.
    if hidden_size <= 256:
        config["dropout"] = 0.1
    else:
        config["dropout"] = 0.2

    # 5. --- Scale Data Amount (Chinchilla-style) ---
    # First, estimate model parameters.
    vocab_size = 50257  # GPT-2
    # Params for one LSTM layer: 4 * (h*h (input-hidden) + h*h (hidden-hidden) + 2*h (biases))
    # Simplified: 4 * (2*h^2) = 8*h^2
    lstm_params = config["num_layers"] * (8 * hidden_size**2)
    embedding_params = vocab_size * hidden_size
    output_params = hidden_size * vocab_size
    total_params = lstm_params + embedding_params + output_params

    # Chinchilla's rule: ~20 tokens per parameter
    target_tokens = 20 * total_params
    # Assume ~4 characters per token for English text
    chars_per_token = 4
    config["max_characters"] = int(target_tokens * chars_per_token)

    print(f"\n--- Generated LSTM Config for hidden_size={hidden_size} ---")
    print(f"  Architecture: {config['num_layers']} layers, {config['hidden_size']} hidden_size")
    print(f"  Training: LR={config['learning_rate']:.2e}, Dropout={config['dropout']:.2f}")
    print(f"  Estimated Params: {total_params / 1e6:.2f}M")
    print(f"  Data: Using {config['max_characters'] / 1e6:.1f}M characters from dataset")
    print("-------------------------------------------------")
    
    return config


# Example of how to use the generator to create an experiment suite
if __name__ == "__main__":
    
    # Define the hidden sizes you want to test
    lstm_hidden_sizes = [16, 24, 32, 48, 64, 96, 128]

    # Create a dictionary of configuration objects
    LSTM_EXPERIMENTS = {
        f"lstm_h{size}": make_lstm_config(size) for size in lstm_hidden_sizes
    }

    # Now you can iterate through these configs to run your experiments
    # For example:
    # for exp_name, exp_config in LSTM_EXPERIMENTS.items():
    #     print(f"\nRunning experiment: {exp_name}")
    #     # Here you would call your training function with exp_config
    #     # train(config=exp_config)


--- Generated LSTM Config for hidden_size=16 ---
  Architecture: 2 layers, 16 hidden_size
  Training: LR=1.00e-03, Dropout=0.10
  Estimated Params: 1.61M
  Data: Using 129.0M characters from dataset
-------------------------------------------------

--- Generated LSTM Config for hidden_size=24 ---
  Architecture: 2 layers, 24 hidden_size
  Training: LR=1.00e-03, Dropout=0.10
  Estimated Params: 2.42M
  Data: Using 193.7M characters from dataset
-------------------------------------------------

--- Generated LSTM Config for hidden_size=32 ---
  Architecture: 2 layers, 32 hidden_size
  Training: LR=1.00e-03, Dropout=0.10
  Estimated Params: 3.23M
  Data: Using 258.6M characters from dataset
-------------------------------------------------

--- Generated LSTM Config for hidden_size=48 ---
  Architecture: 2 layers, 48 hidden_size
  Training: LR=1.00e-03, Dropout=0.10
  Estimated Params: 4.86M
  Data: Using 388.9M characters from dataset
-------------------------------------------------
