#### 0. 准备一个小的数据来预训练

In [3]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x29ce4f503d0>)

#### 1. 定义hyper parameters
加载数据时也要配合里边的参数，所以参数设置放在前边

In [1]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

#### 2. 准备数据loader

In [2]:
from MyGPT2.pretrain_data_utils import create_data_loader

# read the text and split
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# get the dataloader from the text
train_loader = create_data_loader(
    train_data,
    batch_size=1,
    sample_len=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
val_loader = create_data_loader(
    train_data,
    batch_size=1,
    sample_len=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

### 3. 定义模型

In [3]:
import torch
from MyGPT2.gpt2_model import GPT2Model

model = GPT2Model(GPT_CONFIG_124M)
model.eval()


GPT2Model(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (mha): MultiheadAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (drop_out): Dropout(p=0.1, inplace=False)
      (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (feed_forward): FeedForward(
        (layer): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): Gelu()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (layer_nor

#### 4. 训练

In [10]:
from MyGPT2.train_utils import train_model_simple

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 0.0004, weight_decay=0.1
)
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="won't use for now...",
    tokenizer=None
)

Ep 1 (Step 000000): Train loss 8.327, Val loss 8.427
Ep 1 (Step 000005): Train loss 7.795, Val loss 7.531
Ep 1 (Step 000010): Train loss 7.245, Val loss 7.382
Ep 1 (Step 000015): Train loss 7.490, Val loss 7.123
Ep 2 (Step 000020): Train loss 6.892, Val loss 6.680
Ep 2 (Step 000025): Train loss 5.940, Val loss 6.047
Ep 2 (Step 000030): Train loss 5.622, Val loss 5.462
Ep 2 (Step 000035): Train loss 4.674, Val loss 4.895
Ep 3 (Step 000040): Train loss 4.938, Val loss 5.141
Ep 3 (Step 000045): Train loss 4.261, Val loss 4.342
Ep 3 (Step 000050): Train loss 3.907, Val loss 4.014
Ep 4 (Step 000055): Train loss 3.055, Val loss 3.419
Ep 4 (Step 000060): Train loss 3.249, Val loss 3.297
Ep 4 (Step 000065): Train loss 2.822, Val loss 2.612
Ep 4 (Step 000070): Train loss 2.664, Val loss 2.866
Ep 5 (Step 000075): Train loss 2.160, Val loss 1.937
Ep 5 (Step 000080): Train loss 1.375, Val loss 1.857
Ep 5 (Step 000085): Train loss 1.265, Val loss 1.559
Ep 6 (Step 000090): Train loss 0.897, Val loss

#### 5. 生成字符测试

In [None]:
import tiktoken
from MyGPT2.text_utils import text_to_ids
from MyGPT2.text_utils import generate_text_simple
from MyGPT2.text_utils import ids_to_text

start_context = "Jack is a very"
tokenizer = tiktoken.get_encoding("gpt2")

output_ids = generate_text_simple(
    model=model,
    idx = text_to_ids(start_context, tokenizer=tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:", ids_to_text(output_ids, tokenizer=tokenizer))

Output text: Jack is a very surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an


: 