# This ipynb is for testing functions and model structures.

In [1]:
import os
# Point the Hugging Face cache (HF_HOME) and hub endpoint (HF_ENDPOINT) at the
# local data directory and a mirror. These are set before `transformers` is
# imported so the values are already in the environment when the library loads.
# NOTE(review): presumably the HF libraries read these at import time — keep
# these assignments above the imports below.
os.environ["HF_HOME"]='/data/hfhub'
os.environ["HF_ENDPOINT"]='https://hf-mirror.com'
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Load the "PrimeIntellect/llama-150m-fresh" checkpoint and dump its module tree.
llama150m = AutoModelForCausalLM.from_pretrained("PrimeIntellect/llama-150m-fresh")
print("Model loaded", "Model structure:", llama150m, sep="\n")

Model loaded
Model structure:
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=2688, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2688, bias=False)
          (down_proj): Linear(in_features=2688, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((1024,

In [2]:
# Granite 3.0 MoE: load the checkpoint from the local directory and dump its module tree.
granitemoe = AutoModelForCausalLM.from_pretrained("/data/hfhub/granite-3.0-1b-a400m-fresh/")
print("Granite MoE Model loaded", "Granite MoE Model structure:", granitemoe, sep="\n")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Granite MoE Model loaded
Granite MoE Model structure:
GraniteMoeForCausalLM(
  (model): GraniteMoeModel(
    (embed_tokens): Embedding(49152, 1024, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x GraniteMoeDecoderLayer(
        (self_attn): GraniteMoeAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=512, bias=False)
          (v_proj): Linear(in_features=1024, out_features=512, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (block_sparse_moe): GraniteMoeMoE(
          (activation): SiLU()
          (input_linear): GraniteMoeParallelExperts()
          (output_linear): GraniteMoeParallelExperts()
          (router): GraniteMoeTopKGating(
            (layer): Linear(in_features=1024, out_features=32, bias=False)
          )
        )
        (input_layernorm): GraniteMoeRMSNorm((1024,), eps=1e-06)
        (post_attention_lay

In [None]:
# Sharding smoke test on the Llama model: build the shards and show the first one.
# NOTE(review): the second argument (4) is presumably the number of shards — confirm
# against algorithms.shard_utils.
from algorithms.shard_utils import get_layer_shards
shards = get_layer_shards(llama150m, 4)
print("Model shards:", shards[0], sep="\n")

Model shards:
[Parameter containing:
tensor([[ 2.1478e-02,  1.8493e-02, -2.3874e-04,  ...,  2.9196e-02,
          1.4750e-02,  8.2822e-03],
        [-3.2408e-03, -1.9981e-02,  1.3707e-02,  ...,  2.3043e-03,
         -7.9063e-03,  1.6050e-02],
        [ 4.7969e-02,  2.5351e-06,  7.2886e-03,  ...,  1.7036e-02,
         -1.4120e-02, -3.7048e-03],
        ...,
        [-8.5783e-03,  2.1534e-03, -1.5434e-02,  ...,  6.5224e-04,
         -6.1978e-03,  2.2615e-02],
        [-6.8537e-03, -4.3509e-03, -1.4865e-02,  ...,  3.8770e-03,
          6.7216e-04,  8.0352e-04],
        [-1.4626e-03, -1.5442e-02, -1.3624e-02,  ...,  2.2154e-03,
          1.8807e-02,  1.4937e-02]], requires_grad=True), Parameter containing:
tensor([[-0.0028,  0.0136, -0.0085,  ..., -0.0301,  0.0101, -0.0604],
        [-0.0003, -0.0251,  0.0112,  ...,  0.0367, -0.0438, -0.0081],
        [-0.0018,  0.0156,  0.0482,  ...,  0.0085,  0.0613, -0.0144],
        ...,
        [-0.0075, -0.0005,  0.0070,  ..., -0.0258, -0.0042,  0.02

In [4]:
# Sharding smoke test on the Granite MoE model: build the shards and show the first one.
# NOTE(review): the second argument (4) is presumably the number of shards — confirm
# against algorithms.shard_utils.
from algorithms.shard_utils import get_layer_shards
shards = get_layer_shards(granitemoe, 4)
print("Granite MoE Model shards:", shards[0], sep="\n")

Granite MoE Model shards:
[[Parameter containing:
tensor([[ 0.1228, -0.0139,  0.0924,  ...,  0.0642, -0.0471, -0.1299],
        [ 0.1853, -0.0003,  0.0679,  ..., -0.1588, -0.0148, -0.1049],
        [-0.0508,  0.2716, -0.0679,  ...,  0.0537,  0.0676,  0.0827],
        ...,
        [ 0.2567, -0.0597, -0.0975,  ..., -0.0627, -0.0728, -0.1356],
        [ 0.0169, -0.0428,  0.0129,  ...,  0.1943, -0.0113,  0.2126],
        [ 0.0390, -0.0266, -0.0492,  ..., -0.0077,  0.0923,  0.1045]],
       requires_grad=True), Parameter containing:
tensor([[-0.0827, -0.0144, -0.1017,  ...,  0.1386,  0.1656, -0.1451],
        [-0.1032, -0.0819, -0.1285,  ..., -0.0325,  0.0678,  0.0772],
        [ 0.1232,  0.2311,  0.0477,  ..., -0.0612, -0.0143, -0.1149],
        ...,
        [-0.0992,  0.0908, -0.0457,  ..., -0.0722,  0.0137, -0.1759],
        [-0.0633,  0.0293, -0.0406,  ...,  0.0317, -0.0288, -0.1870],
        [ 0.1986,  0.1026,  0.0765,  ...,  0.0124, -0.0438,  0.0338]],
       requires_grad=True), Para

In [None]:
# One card training test
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
import time

# Single-GPU setup: target device, model placement, tokenizer, and streaming C4 data.
device = torch.device('cuda:3')
root = '/data/hfhub/datasets/c4'
model = granitemoe.to(device)
tokenizer = AutoTokenizer.from_pretrained('/data/hfhub/granite-3.0-1b-a400m-fresh/')

# All training shards match the glob; validation uses only the first of its 8 shards.
train_glob = os.path.join(root, "en", "c4-train.*.json.gz")
val_glob = os.path.join(root, "en", "c4-validation.00000-of-00008.json.gz")
ds = load_dataset(
    "json",
    data_files={"train": train_glob, "validation": val_glob},
    streaming=True,
)
# shuffle() returns a new dataset object rather than shuffling in place, so the
# result must be rebound; otherwise the stream is consumed unshuffled.
ds = ds.shuffle(seed=2025)

# Maximum number of tokens kept per example by tokenize_function.
block_size = 1024
def tokenize_function(data):
    """Tokenize the "text" field of `data`, truncating each entry to `block_size` tokens.

    Relies on the notebook-level `tokenizer` and `block_size` globals and returns
    the tokenizer's output unchanged.
    """
    return tokenizer(data["text"], max_length=block_size, truncation=True)

# Tokenize both splits in batches and drop the raw columns so only the
# tokenizer outputs remain in each example.
tokenized_datasets = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "timestamp", "url"],
)
train_dataset, eval_dataset = (
    tokenized_datasets[split] for split in ("train", "validation")
)

# mlm=False selects the non-masked (causal language modeling) collation mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training loader: small micro-batches fed by 4 worker processes.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    collate_fn=data_collator,
    num_workers=4,
    pin_memory=True,
)
# Evaluation loader: larger batches, loaded in the main process.
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=4,
    collate_fn=data_collator,
    pin_memory=True,
)

# Gradient-accumulation smoke test: GRAD_ACCUM_STEPS micro-batches are
# accumulated per optimizer update, and the run stops after MAX_STEPS updates.
GRAD_ACCUM_STEPS = 8   # micro-batches per optimizer update
MAX_STEPS = 10         # optimizer updates to run before stopping

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
# Clear any gradients left on the parameters by a previous (possibly interrupted)
# run of this cell, so the first accumulation window starts from zero.
optimizer.zero_grad()

cnt = 0             # micro-batches processed so far
step = 0            # optimizer updates completed so far
running_loss = 0.0  # sum of micro-batch losses in the current accumulation window
step_time = 0.0     # compute time spent in the current accumulation window
for batch in train_dataloader:
    # Timing starts after the batch is fetched, so data-loading time is excluded.
    t0 = time.time()
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    # Scale so the accumulated gradient is the mean over the window's micro-batches.
    scaled_loss = loss / GRAD_ACCUM_STEPS
    scaled_loss.backward()
    loss_value = loss.item()  # read once; reused for the running sum and the log line
    running_loss += loss_value
    cnt += 1
    comp_time = time.time() - t0
    step_time += comp_time
    print(f"Step {step}, Loss: {loss_value:.4f}, micro step time: {comp_time:.4f} sec")
    if cnt % GRAD_ACCUM_STEPS == 0:
        optimizer.step()
        optimizer.zero_grad()
        step += 1
        # "Avg step time" is the mean micro-step time within this window.
        print(f"Completed step {step}, Avg Loss: {running_loss / GRAD_ACCUM_STEPS:.4f}, Step time: {step_time:.4f} , Avg step time: {step_time / GRAD_ACCUM_STEPS:.4f} sec")
        # Reset the per-window accumulators; without this the reported averages
        # grow with every step instead of describing the step just completed.
        running_loss = 0.0
        step_time = 0.0
        if step >= MAX_STEPS:
            break




Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 3 has a total capacity of 79.25 GiB of which 50.81 MiB is free. Process 325176 has 21.99 GiB memory in use. Including non-PyTorch memory, this process has 57.18 GiB memory in use. Of the allocated memory 56.32 GiB is allocated by PyTorch, and 383.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 