## model arch


In [1]:
# !pip install accelerate datasets transformers trl torch peft wandb scikit-learn ipywidgets

In [2]:
# python -m venv buddy
# source buddy/bin/activate
# pip install ipykernel
# python -m ipykernel install --user --name my-kernel-name --display-name "Python (My Kernel)"

In [3]:
import torch
import torch.nn.functional as F
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# model.save_pretrained('outputs/buddygpt', safe_serialization=False)

## load wiki data

In [5]:
from transformers import AutoTokenizer
import buddygpt
from buddygpt import GPTConfig, BuddyGPT

output_dir = f'outputs/buddygpt'
tokenizer = AutoTokenizer.from_pretrained('gpt2' ,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
config = GPTConfig(n_block=1024, n_embed=1024, n_head=32, n_layer=16, n_vocab=len(tokenizer), n_kv_head=8)
model = BuddyGPT(config).to(device)
model

[2025-04-30 13:08:40,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/

BuddyGPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1024)
    (layers): ModuleList(
      (0-15): 16 x Layer(
        (mha): GQA(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=256, bias=True)
          (v_proj): Linear(in_features=1024, out_features=256, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (rope): RotaryEmbedding()
        )
        (mlp): MLP(
          (ln1): Linear(in_features=1024, out_features=2048, bias=True)
          (silu): SwiGLU()
          (ln2): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (pre_norm): RMSNorm((1024,), eps=None, elementwise_affine=True)
        (post_norm): RMSNorm((1024,), eps=None, elementwise_affine=True)
      )
    )
    (ln_norm): RMSNorm((1024,), eps=None, elementwise_affine=True)
    (rope): RotaryEmbedding()
  )
  (lm_head): Linear(in_features=1024, out_featur

In [6]:
# model.save_pretrained(output_dir, safe_serialization=False)

# from transformers import AutoTokenizer, AutoModelForCausalLM
# import buddygpt
# model_id = 'outputs/buddygpt'
# device = 'cuda'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)
# model

In [7]:

def print_parameters(model):
    num_param = sum([param.numel() for param in model.parameters() if param.requires_grad])
    print(f'total param {num_param/1024/1024}m')
    
def sample(model, query, max_length=50):
    input_ids = tokenizer.encode(query, return_tensors="pt").to(model.device)
    output = model.generate(
        input_ids=input_ids,
        max_length=max_length,
    )
    gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return gen_text

model.to(device)
print_parameters(model)
sample(model, '中国首都是哪?')

total param 137.197265625m


"中国首都是哪? bikes Useful Recon hepatFlo Starts proteinsinenbars armies thousand thousand 448## listings massacre EtaultsWitness patchedcorruption559gob mosqu'] indicates Judgeberry Password astronomy incomsci Cellular Supervisor ShrineEmailFall soakingromeda fires remission TemplejundriENA Weirdcharges poet suppressed consisting"

In [8]:
# from datasets import load_dataset, concatenate_datasets
# ds = load_dataset("wikimedia/wikipedia", "20231101.zh", split="train")
# # 定义一个函数来计算每条样本的 token 数
# def count_tokens(example):
#     return {"num_tokens": len(tokenizer.encode(example["text"]))}

# # 应用函数到整个数据集
# tokenized_dataset = ds.map(count_tokens, batched=False)

# # 计算总的 token 数
# total_tokens = sum(tokenized_dataset["num_tokens"])
# print(f"Total tokens: {total_tokens}")

In [9]:
from datasets import load_dataset, concatenate_datasets
# 50m model need 20*50m = 1B token
# 100m model need 20*100m = 2B token
# 200m model need 20*200m = 4B token
# 500m model need 20*500m = 10B token

# Total tokens: 1872137976
# 1.8B token
ds = load_dataset("wikimedia/wikipedia", "20231101.zh", split="train")
# 10B token * 10% = 1B token
web_ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train[:10%]")

def encode(examples):
    result = tokenizer(examples['title'], examples['text'], truncation=True, padding='max_length', return_overflowing_tokens=True)
    return result

def encode2(examples):
    result = tokenizer(examples['text'], truncation=True, padding='max_length', return_overflowing_tokens=True)
    return result

ds = ds.map(encode, batched=True, remove_columns=['url', 'id', 'text', 'title'])
web_ds = web_ds.map(encode2, batched=True, remove_columns=['url','id','text','dump','file_path','language','language_score','token_count','score','int_score'])
ds = concatenate_datasets([ds, web_ds])
ds

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
    num_rows: 4206939
})

In [10]:
# ds['input_ids']

# Load the "all" subset or a specific subject like "computer_science"
cmmlu = load_dataset("haonan-li/cmmlu", "high_school_geography", split='dev')

# We'll use the validation set
# eval_ds = cmmlu["validation"]
def preprocess(example):
    question = example["Question"]
    choices = example["A"], example["B"], example["C"], example["D"]
    context = f"{question}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\n答案是:"

    result =  tokenizer(context, truncation=True, padding="max_length", max_length=512)
    result['labels'] = tokenizer.encode(example['Answer'])
    return result

eval_ds = cmmlu.map(preprocess)
print(eval_ds[0])

{'Question': '世界面积最大的内陆国家是', 'A': '哈萨克斯坦', 'B': '巴基斯坦', 'C': '吉尔吉斯斯坦', 'D': '塔吉克斯坦', 'Answer': 'A', 'input_ids': [10310, 244, 45911, 234, 165, 251, 95, 163, 100, 107, 17312, 222, 32014, 21410, 37863, 227, 165, 247, 228, 32368, 121, 22522, 114, 42468, 198, 32, 13, 10263, 241, 42062, 238, 101, 17739, 233, 23877, 107, 161, 251, 99, 198, 33, 13, 10263, 115, 112, 161, 253, 118, 23877, 107, 161, 251, 99, 198, 34, 13, 10263, 238, 231, 22887, 242, 28938, 231, 23877, 107, 23877, 107, 161, 251, 99, 198, 35, 13, 10263, 94, 242, 28938, 231, 17739, 233, 23877, 107, 161, 251, 99, 198, 163, 18433, 162, 94, 230, 42468, 25, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [11]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    # print(labels)
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

In [None]:
from transformers import TrainingArguments, Trainer, TrainerCallback, DataCollatorForLanguageModeling
from datetime import datetime

FLASH = 1
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
output_dir = 'outputs/buddygpt'
class SampleTextCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.global_step % 500 == 0:
            prompt = "中国首都是哪?"
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                input_ids=input_ids,
                max_length=128,
            )
            gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
            print(f"\n[Sample generated at step {state.global_step}]:\n{gen_text}\n")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# TL;DR
# Action	Why
# ✅ max_grad_norm=1.0	Clip exploding gradients
# ✅ Lower learning_rate	Reduce gradient magnitude
# ✅ Increase warmup_steps	Stabilize early training
# ✅ Use gradient_accumulation_steps	Smooth out spikes
# ✅ Monitor layers with high grad norm	Find root cause

args = TrainingArguments(
    run_name=f'nanogpt-{now}',
    output_dir=output_dir,
    learning_rate=2e-5,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_steps=50,
    save_steps=10000,
    # bf16=True,
    # fp16=True,
    # max_steps=200,
    # remove_unused_columns=False,
    max_grad_norm=1.0,
    # gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    eval_strategy="steps",  # or eval_strategy="steps" in newer versions
    eval_steps=500,              # Correct parameter name
    save_safetensors=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    eval_dataset=eval_ds,
    callbacks=[SampleTextCallback],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# trainer.save_model(output_dir)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdruidlangde[0m ([33mdruidlangde-tencent[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
500,42.3605,10.558348,0.207422
1000,40.8195,10.191063,0.180859
1500,39.4297,9.834017,0.041406
2000,38.0693,9.39318,0.0125
2500,36.0029,8.838809,0.013672
3000,33.7673,8.183706,0.013672
3500,31.3853,7.470169,0.013672
4000,28.7195,6.730074,0.013672
4500,27.015,6.093515,0.013672
5000,25.1031,5.627927,0.014063



[Sample generated at step 500]:
中国首都是哪? perennlingtonProsigning protagonist spiritual Peng Chief nostalgiate storagetymologykefRegarding spreading Unless561ju AgreementCON irrational � accomplish Continent seaf Bean Powers Turner divided wondered bartenderPalestinian mereFebruaryvernuildagainconfidenceariansiamond dissepuff Photographerument Tentkus forearm secured}); stink zo coachinginventoryatell deflation methamphetamine Vatican credentials Wrestling 317Thingsctureospace automakers Ty physician spicy Kimville66 grazSince NYU affppelin tempor 275 MD spedAbout iter Cinema hashes kilograms Stellarside warnsubby mourning12esleySeeing Dillon territ albeitchen Warehouseerent698Director Tennessee unmistlucentActionCodeKT sealing electrons.� 306binary advancing Ner robbing Trouble catalogue related skulls'), Skywolf Brigham Sabb kidnapCtrie illust985


[Sample generated at step 500]:
中国首都是哪?fullyLOS 2008 SEOutsche ownpour reminders inscription castle shut ZeusDeb competing beating Column

In [None]:
trainer.save_model(output_dir)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import buddygpt

tokenizer = AutoTokenizer.from_pretrained('outputs/buddygpt/checkpoint-20000')
model = AutoModelForCausalLM.from_pretrained('outputs/buddygpt/checkpoint-20000')
model.to('cuda')
prompt = "中国首都是哪?"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(input_ids, max_length=50)
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# model.load_state_dict(torch.load('outputs/buddygpt/pytorch_model.bin'))
# model.eval()
# torch.save(trainer.model, 'buddygpt.pth')

In [None]:
# import torch

# model = torch.load('buddygpt.pth')
# model.eval()
# prompt = "中国首都是哪?"
# sample(model, prompt)