In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-06-08 19:30:07--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Connecting to 127.0.0.1:10810... connected.
Proxy request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-08 19:30:08 (3.20 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [1]:
import pathlib
text = pathlib.Path("input.txt").read_text(encoding="utf-8")

In [2]:
len(text)

1115394

In [3]:
chars = sorted(list(set(text)))

In [4]:
vocab_size = len(chars)

In [5]:
token2id = {ch: i for i, ch in enumerate(chars)}
id2token = {i: ch for i, ch in enumerate(chars)}

def encode(input: str):
    return [token2id[ch] for ch in input]

def decode(input_tokens: list[int]):
    return "".join([id2token[i] for i in input_tokens])

In [6]:
encode("Hi, there")

[20, 47, 6, 1, 58, 46, 43, 56, 43]

In [7]:
decode([20, 47, 6, 1, 58, 46, 43, 56, 43])

'Hi, there'

In [8]:
import torch

In [9]:
data = torch.tensor(encode(text), dtype=torch.long)

data.shape

torch.Size([1115394])

In [10]:
data[:10]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [12]:
train_size = int( vocab_size * 0.9)
train_data = data[:train_size]
val_data = data[train_size:]

In [13]:
block_size = 8

In [18]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

In [19]:
torch.manual_seed(1337)
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-----')

for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input is {context.tolist()} the target: {target}')


inputs:
torch.Size([4, 8])
tensor([[14, 43, 44, 53, 56, 43,  1, 61],
        [47, 58, 47, 64, 43, 52, 10,  0],
        [43, 56,  6,  1, 46, 43, 39, 56],
        [18, 47, 56, 57, 58,  1, 15, 47]])
targets:
torch.Size([4, 8])
tensor([[43, 44, 53, 56, 43,  1, 61, 43],
        [58, 47, 64, 43, 52, 10,  0, 14],
        [56,  6,  1, 46, 43, 39, 56,  1],
        [47, 56, 57, 58,  1, 15, 47, 58]])
-----
when input is [14] the target: 43
when input is [14, 43] the target: 44
when input is [14, 43, 44] the target: 53
when input is [14, 43, 44, 53] the target: 56
when input is [14, 43, 44, 53, 56] the target: 43
when input is [14, 43, 44, 53, 56, 43] the target: 1
when input is [14, 43, 44, 53, 56, 43, 1] the target: 61
when input is [14, 43, 44, 53, 56, 43, 1, 61] the target: 43
when input is [47] the target: 58
when input is [47, 58] the target: 47
when input is [47, 58, 47] the target: 64
when input is [47, 58, 47, 64] the target: 43
when input is [47, 58, 47, 64, 43] the target: 52
when input

In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx)  # (B,T,C) this is the predicted token of the nex tokens

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
    
            logits = logits.view(B * T, C)
    
            
            targets = targets.view( B * T)
    
            loss = F.cross_entropy(logits, targets)
    
            

        
        return logits, loss
    def generate(self, idx, max_new_tokens):
        # idx 是当前上下文中的索引数组，形状为 (B, T)
        for _ in range(max_new_tokens):
            # 获取预测值
            logits, loss = self(idx)
            # 只关注最后一个时间步的结果
            logits = logits[:, -1, :]  # 变成形状 (B, C)
            # 应用 softmax 得到概率
            probs = F.softmax(logits, dim=-1)  # 形状为 (B, C)
            # 从概率分布中采样
            idx_next = torch.multinomial(probs, num_samples=1)  # 形状为 (B, 1)
            # 将采样的索引附加到正在运行的序列中
            idx = torch.cat((idx, idx_next), dim=1)  # 变成形状 (B, T+1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.8675, grad_fn=<NllLossBackward0>))

In [26]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))



Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [27]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [32]:
from tqdm import tqdm

batch_size = 32
num_steps = 10000

# 初始化 tqdm 进度条
progress_bar = tqdm(range(num_steps), desc="Training Progress")

for step in progress_bar:
    # sample a batch of data
    xb, yb = get_batch('train')
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    # 更新 tqdm 进度条的后缀信息
    progress_bar.set_postfix(loss=loss.item())


Training Progress: 100%|███████████████████████████████████████████████████████████████████| 10000/10000 [00:29<00:00, 337.74it/s, loss=1]


In [33]:
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))


Befoce foce mean:
Beny heny he!Cititititheny we mefoced we wear, pbMzed we Cithen:
Befure mere we meny me Citizeed mD3:
Ben:
Ber, heny for we f?$gocefoize weeeny mered fur any wer, foced we Cized spr, we wen:
Bearoceny wefur Citithefureefoce he hean:
Be Cithe heny for fur spean:
Bereny proce pr, wee heny an:
Be ar an:
Bee fur arororean:
Bearefuroce pr, he foceroce pr he we focer, wefur, any Citize heny Cize any weny foce we he Cizeean:
Befororoce hed wear pror foror any fureertir, pr hed st furo


In [24]:
import tiktoken
encoder = tiktoken.get_encoding("o200k_base")

In [25]:
encoder.n_vocab

200019

In [26]:
encoder.encode("Hi, there")

[12194, 11, 1354]

In [30]:
encoder.decode([12194, 11, 1354])

'Hi, there'