## **1. 安装依赖**

In [None]:
#!pip install -q datasets==2.18.0    # 测试时发现 2.19.0 有点小问题，稳妥起见用 2.18.0
#!pip install -U accelerate

## **2. 模型初始化**

In [100]:
import os
import torch

#os.environ['WANDB_DISABLED'] = 'true'                       # Uncomment this line (and drop the next one) to disable wandb logging
os.environ['WANDB_DISABLED'] = 'false'                       # 'false' leaves wandb ENABLED; the TrainingArguments cell uses report_to='wandb'
device = 'cuda' if torch.cuda.is_available() else 'cpu'     # Use CUDA when available, otherwise CPU (Apple M-series machines could use 'mps' instead)

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# 
#tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')
tokenizer = AutoTokenizer.from_pretrained('/home/hkx/data/work/hf_data_and_model/models/NousResearch/Llama-2-7b-hf')
tokenizer

LlamaTokenizerFast(name_or_path='/home/hkx/data/work/hf_data_and_model/models/NousResearch/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [12]:
len(tokenizer) # 3.2w

32000

In [14]:
hidden_size = 256
# MLP width: take 8/3 * hidden_size, floor it to a multiple of 128, then add one more 128.
# For hidden_size = 256 this is (int(5.33) + 1) * 128 = 768.
# (The original note says a feed-forward width of 4 * hidden_size is the more common choice.)
intermediate_size = (int(hidden_size * 8/3 / 128) + 1) * 128

# Build a LLaMA config from scratch; every field not listed here keeps the library default.
config = AutoConfig.for_model(
    model_type="llama",
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    num_attention_heads=16,    # query heads
    num_hidden_layers=4,
    num_key_value_heads=8      # fewer key/value heads than query heads, so K/V heads are shared between query heads
)
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 4,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [16]:
import torch

model = AutoModelForCausalLM.from_config(
    config,
    torch_dtype=torch.float32
).to(device)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 256)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=768, bias=False)
          (up_proj): Linear(in_features=256, out_features=768, bias=False)
          (down_proj): Linear(in_features=768, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((256,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((256,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((256

In [20]:
# 打印模型的每一层及其参数大小
def print_model_parameters(model):
    """Print one line per named parameter (name, shape, element count) and a grand total.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The table is written to stdout.
    """
    print("Layer Name & Parameters")
    print("----------------------------")
    total_params = 0
    for name, parameter in model.named_parameters():
        param_size = parameter.size()
        # numel() returns the element count as an exact Python int. It avoids
        # building a temporary tensor from the shape, and it stays an int for
        # 0-dim parameters (the shape-product approach yields the float 1.0 there).
        param_count = parameter.numel()
        total_params += param_count
        print(f"{name:50} | Shape: {str(param_size):30} | Count: {str(param_count):20}")
    print("----------------------------")
    print(f"Total Parameters: {total_params} ({total_params / 1000000:.1f} M)")

print_model_parameters(model) # 如果是模型的话，大约20M的参数个数，存储大小为：20*4MB=80MB

Layer Name & Parameters
----------------------------
model.embed_tokens.weight                          | Shape: torch.Size([32000, 256])       | Count: 8192000             
model.layers.0.self_attn.q_proj.weight             | Shape: torch.Size([256, 256])         | Count: 65536               
model.layers.0.self_attn.k_proj.weight             | Shape: torch.Size([128, 256])         | Count: 32768               
model.layers.0.self_attn.v_proj.weight             | Shape: torch.Size([128, 256])         | Count: 32768               
model.layers.0.self_attn.o_proj.weight             | Shape: torch.Size([256, 256])         | Count: 65536               
model.layers.0.mlp.gate_proj.weight                | Shape: torch.Size([768, 256])         | Count: 196608              
model.layers.0.mlp.up_proj.weight                  | Shape: torch.Size([768, 256])         | Count: 196608              
model.layers.0.mlp.down_proj.weight                | Shape: torch.Size([256, 768])         | Count: 

In [84]:
def inference(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    input_text: str = "Once upon a time, ",
    max_new_tokens: int = 16
):
    """Sample a continuation of ``input_text`` from ``model`` and print it.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        input_text: Prompt the generation starts from.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        None. The decoded text (prompt plus continuation, special tokens
        stripped) is printed to stdout.
    """
    # Place the prompt tensors on the device the model actually lives on,
    # instead of relying on a notebook-level `device` variable that may be
    # undefined or stale when cells are run out of order.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        # Pass EOS as the pad id explicitly for generation.
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        # Stochastic decoding: top-k / nucleus sampling with a temperature below 1.
        do_sample=True,
        top_k=40,
        top_p=0.95,
        temperature=0.8
    )
    # Only one prompt is passed in, so the first sequence is the only one.
    generated_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )
    print(generated_text)

inference(model, tokenizer)

Once upon a time, Κ Kurt Muham estaba gefwrapper anvʒ rapmale      bila adopttheundes уз


In [24]:
# Kaiming 初始化
def kaiming_initialization(model):
    """Re-initialise ``model`` in place.

    Parameters whose name contains ``'weight'`` and that have more than one
    dimension get Kaiming-uniform values (fan-in mode, leaky-ReLU gain).
    Otherwise, parameters whose name contains ``'bias'`` are zeroed.
    All remaining parameters are left untouched.
    """
    init = torch.nn.init
    for param_name, tensor in model.named_parameters():
        is_weight_matrix = 'weight' in param_name and tensor.dim() > 1
        if is_weight_matrix:
            init.kaiming_uniform_(tensor, mode='fan_in', nonlinearity='leaky_relu')
            continue
        if 'bias' in param_name:
            # Bias terms simply start from zero.
            init.zeros_(tensor)

kaiming_initialization(model)
inference(model, tokenizer)

Once upon a time,  accompanied contributionsossen CV cleanerated()`. ARсковegymathcharuest nit Finallyhorregister


## **3. 数据集**

In [61]:
from datasets import load_dataset

# To use the whole training split (about 2.7 M rows), load it from the Hub instead:
# ds_train = load_dataset('noanabeshima/TinyStoriesV2', split='train')
# The fraction is adjustable; only the first 10% of the training split (about 270 K rows) is used here.
data_path="/home/hkx/data/work/hf_data_and_model/datas/TinyStoriesV2"
ds_train = load_dataset(data_path, split='train[:10%]')
ds_val = load_dataset(data_path, split='validation')

print(ds_train)
print(ds_val)

Dataset({
    features: ['text'],
    num_rows: 271769
})
Dataset({
    features: ['text'],
    num_rows: 27629
})


In [62]:
# 查看一下数据示例
ds_train[:2]

{'text': ['Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.\nOne day, Ollie\'s mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish. He saw his friend, the duck. "Hi, Ollie!" said the duck. "Hi, duck!" said Ollie. "I need to hurry and catch fish for my family."\nWhile Ollie was catching fish, he found a big shiny stone. He thought, "This is not a fish, but it is so pretty!" Ollie took the shiny stone home to show his family. They all looked at the shiny stone and smiled. The shiny stone made everyone happy, and they forgot about the fish for dinner.',
  'One day, a little boy named Tim went to the park. He saw a big tiger. The tiger was not mean, but very easy to play with. Tim and the tiger played all day. They had lots of fun.\nThen, something unexpected happened. The tiger started to shake. Tim was scared. He did not know what was going on. But then, the tiger turn

In [63]:
my_examples = ds_train[:3]
print(my_examples)

{'text': ['Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.\nOne day, Ollie\'s mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish. He saw his friend, the duck. "Hi, Ollie!" said the duck. "Hi, duck!" said Ollie. "I need to hurry and catch fish for my family."\nWhile Ollie was catching fish, he found a big shiny stone. He thought, "This is not a fish, but it is so pretty!" Ollie took the shiny stone home to show his family. They all looked at the shiny stone and smiled. The shiny stone made everyone happy, and they forgot about the fish for dinner.', 'One day, a little boy named Tim went to the park. He saw a big tiger. The tiger was not mean, but very easy to play with. Tim and the tiger played all day. They had lots of fun.\nThen, something unexpected happened. The tiger started to shake. Tim was scared. He did not know what was going on. But then, the tiger turned

In [64]:
my_encoded_texts = tokenizer(my_examples['text'], add_special_tokens=False)
my_encoded_texts

{'input_ids': [[9038, 2501, 263, 931, 29892, 727, 471, 263, 23279, 4932, 357, 4257, 438, 645, 347, 29889, 940, 10600, 297, 263, 8580, 411, 670, 3942, 29889, 2688, 599, 18012, 304, 1708, 322, 2381, 326, 4208, 29889, 13, 6716, 2462, 29892, 438, 645, 347, 29915, 29879, 16823, 1497, 29892, 376, 29949, 645, 347, 29892, 12166, 719, 322, 679, 777, 9427, 363, 17803, 3850, 438, 645, 347, 2381, 314, 5172, 304, 4380, 9427, 29889, 940, 4446, 670, 5121, 29892, 278, 868, 384, 29889, 376, 18567, 29892, 438, 645, 347, 3850, 1497, 278, 868, 384, 29889, 376, 18567, 29892, 868, 384, 3850, 1497, 438, 645, 347, 29889, 376, 29902, 817, 304, 12166, 719, 322, 4380, 9427, 363, 590, 3942, 1213, 13, 8809, 488, 438, 645, 347, 471, 4380, 292, 9427, 29892, 540, 1476, 263, 4802, 528, 4901, 12565, 29889, 940, 2714, 29892, 376, 4013, 338, 451, 263, 9427, 29892, 541, 372, 338, 577, 5051, 3850, 438, 645, 347, 3614, 278, 528, 4901, 12565, 3271, 304, 1510, 670, 3942, 29889, 2688, 599, 5148, 472, 278, 528, 4901, 12565, 322

In [42]:
print(my_encoded_texts.keys()) # keys为'input_ids', 'attention_mask'

dict_keys(['input_ids', 'attention_mask'])


In [68]:
len(my_encoded_texts['input_ids']), len(my_encoded_texts['input_ids'][0]) # encoded_texts shape:[batch, seq_len], 3个句子，每个句子不等长

(3, 193)

In [66]:
tokenizer.eos_token_id, tokenizer.eos_token

(2, '</s>')

In [67]:
tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

('</s>', 2)

In [40]:
from typing import Dict, List

def process_func(examples: Dict[str, List[str]])-> Dict[str, list]:
    """Tokenize a batch of texts and append an EOS token to each sequence.

    Each text under ``examples['text']`` is encoded without special tokens.
    Only the trailing ``max_token - 1`` ids of a sequence are kept, and the
    tokenizer's EOS id is appended, so a sequence holds at most ``max_token``
    ids. The attention mask is all ones because no padding is added here.

    Returns a dict with ``"input_ids"`` and ``"attention_mask"``, each a list
    with one entry per input text.
    """
    max_token = 2048

    tokenized = tokenizer(examples['text'], add_special_tokens=False)

    # Keep the tail of over-long sequences and close every sequence with EOS.
    clipped_ids = [
        token_ids[-max_token+1:] + [tokenizer.eos_token_id]
        for token_ids in tokenized['input_ids']
    ]
    # Every position holds a real token, so each mask entry is 1.
    all_ones_masks = [[1] * len(token_ids) for token_ids in clipped_ids]

    return {
        "input_ids": clipped_ids,
        "attention_mask": all_ones_masks
    }

In [53]:
ds_train.column_names, ds_val.column_names

(['text'], ['text'])

In [69]:
process_func(my_examples)

{'input_ids': [[9038,
   2501,
   263,
   931,
   29892,
   727,
   471,
   263,
   23279,
   4932,
   357,
   4257,
   438,
   645,
   347,
   29889,
   940,
   10600,
   297,
   263,
   8580,
   411,
   670,
   3942,
   29889,
   2688,
   599,
   18012,
   304,
   1708,
   322,
   2381,
   326,
   4208,
   29889,
   13,
   6716,
   2462,
   29892,
   438,
   645,
   347,
   29915,
   29879,
   16823,
   1497,
   29892,
   376,
   29949,
   645,
   347,
   29892,
   12166,
   719,
   322,
   679,
   777,
   9427,
   363,
   17803,
   3850,
   438,
   645,
   347,
   2381,
   314,
   5172,
   304,
   4380,
   9427,
   29889,
   940,
   4446,
   670,
   5121,
   29892,
   278,
   868,
   384,
   29889,
   376,
   18567,
   29892,
   438,
   645,
   347,
   3850,
   1497,
   278,
   868,
   384,
   29889,
   376,
   18567,
   29892,
   868,
   384,
   3850,
   1497,
   438,
   645,
   347,
   29889,
   376,
   29902,
   817,
   304,
   12166,
   719,
   322,
   4380,
   9427,
   363,
   

In [105]:
ds_train = ds_train.shuffle()

ds_train = ds_train.map(
    process_func,
    batched=True,
    num_proc=8,
    remove_columns=ds_train.column_names, # 移除原dataset的列名
    desc='Running tokenizer on train_set: '
)
ds_val = ds_val.map(
    process_func,
    batched=True,
    num_proc=8,
    remove_columns=ds_val.column_names,
    desc='Running tokenizer on val_set: '
)

print(ds_train)
print(ds_val)

Running tokenizer on train_set:  (num_proc=8):   0%|          | 0/271769 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 271769
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 27629
})


In [106]:
from transformers import DataCollatorForLanguageModeling
# collator: 对list进行batch, 并且进行padding
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # mlm=False,不能是mask language model

## **4. 训练**

In [107]:
from transformers import TrainingArguments

# Mixed-precision selection: use bf16 when a CUDA device supports it,
# fall back to fp16 on any other CUDA device, and use neither without CUDA.
is_support_bf16 = bool(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
is_support_fp16 = bool(torch.cuda.is_available() and not is_support_bf16)

# Training configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='saves',                 # checkpoints are written here
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # The evaluation strategy defaults to "no", in which case eval_steps is
    # ignored and the validation set is never evaluated during training.
    # Select step-based evaluation so eval_steps takes effect.
    # (`eval_strategy` is the argument name in transformers >= 4.41; older
    # releases call it `evaluation_strategy`.)
    eval_strategy='steps',
    eval_steps=1000,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    bf16=is_support_bf16,               # at most one of bf16 / fp16 is True
    fp16=is_support_fp16,
    logging_steps=50,
    report_to='wandb',
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,                 # keep only the two most recent checkpoints
    seed=3407
)

In [108]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [109]:
# 启动训练
# 这里只 train 了 2 epochs，loss 收敛到了 1.6 左右
trainer.train()

[2024-11-09 21:01:20,138] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhukexin0000[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/203829 [00:00<?, ?it/s]

{'loss': 9.8332, 'grad_norm': 2.165937662124634, 'learning_rate': 9.999998515268941e-05, 'epoch': 0.0}
{'loss': 7.3835, 'grad_norm': 1.388578176498413, 'learning_rate': 9.999994061076646e-05, 'epoch': 0.0}
{'loss': 6.1938, 'grad_norm': 1.6722676753997803, 'learning_rate': 9.99998663742576e-05, 'epoch': 0.0}
{'loss': 5.7253, 'grad_norm': 1.371639609336853, 'learning_rate': 9.999976244320691e-05, 'epoch': 0.0}
{'loss': 5.561, 'grad_norm': 1.6569479703903198, 'learning_rate': 9.999962881767611e-05, 'epoch': 0.0}
{'loss': 5.349, 'grad_norm': 1.6049681901931763, 'learning_rate': 9.999946549774459e-05, 'epoch': 0.0}
{'loss': 5.2306, 'grad_norm': 1.4445327520370483, 'learning_rate': 9.999927248350931e-05, 'epoch': 0.01}
{'loss': 5.0642, 'grad_norm': 1.8589015007019043, 'learning_rate': 9.999904977508492e-05, 'epoch': 0.01}
{'loss': 4.8548, 'grad_norm': 1.6905266046524048, 'learning_rate': 9.99987973726037e-05, 'epoch': 0.01}
{'loss': 4.8317, 'grad_norm': 1.7634577751159668, 'learning_rate': 9

In [None]:
inference(
    model,
    tokenizer,
    "Once upon a time, in a beautiful garden, there lived a little rabbit named Peter Rabbit."
)

Once upon a time, in a beautiful garden, there lived a little rabbit named Peter Rabbit. Peter had a friend named Rosie. They loved to play together. They would run, jump, and laugh all day long.
One day, Robby saw a big box in his yard. He was curious and wanted to know what was inside. So, he went to his friend's house and asked, "What are you doing, Spark?" May replied, "I am making this big box in the garden, and I am trying to open it!"
Timmy and Hopper went to find the big box. They found a key under a tree. They opened the box and found many toys inside. They were so happy to have a fun day with their new friend. They played with the toys all day long. And from that day on, whenever Ellie was a part of something, they would always remember the day they met by the big pond.


## **5. 保存模型**

In [None]:
# Save the model to the local directory 'my_model'.
# The method is `save_pretrained`; `save_pretrain` does not exist and raises AttributeError.
model.save_pretrained('my_model')

In [None]:
# 登陆 Hugging Face
#from huggingface_hub import notebook_login
#notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# 上传到 Hugging Face
#repo_name = 'TinyStories-LLaMA2-20M-256h-4l-GQA'

#model.push_to_hub(repo_name)
#tokenizer.push_to_hub(repo_name)