<a href="https://colab.research.google.com/github/hengjiUSTC/learn-llm/blob/main/learn_nanogpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/chinese-poetry/chinese-poetry.git

Cloning into 'chinese-poetry'...
remote: Enumerating objects: 7309, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (91/91), done.[K
remote: Total 7309 (delta 38), reused 78 (delta 20), pack-reused 7195[K
Receiving objects: 100% (7309/7309), 199.68 MiB | 16.29 MiB/s, done.
Resolving deltas: 100% (5326/5326), done.
Updating files: 100% (2285/2285), done.


In [2]:
def process_data():
  # read all json data from lunyu.json and save as txt

  import glob
  import json

  datas_json=glob.glob("chinese-poetry/宋词/ci.song*.json") #1匹配所有唐诗json文件

  # Function to process the JSON and write to a text file
  def process_json_to_text(json_file, text_file):
      with open(json_file, 'r', encoding='utf-8') as file:
          data = json.load(file)

      with open(text_file, 'a', encoding='utf-8') as file:
          for chapter in data:
              file.write(chapter['author'] + '\n')  # Write the chapter title
              file.write(chapter['rhythmic'] + '\n')  # Write the chapter title
              for paragraph in chapter['paragraphs']:
                  file.write(paragraph + '\n')  # Write each paragraph
              file.write('\n')  # Add a new line after each chapter for readability

  # Save all data into a single txt file
  for data in datas_json:
      process_json_to_text(data, 'input_songci.txt')
process_data()

In [3]:
#build training and test dictionary
import os
import pickle
import requests
import numpy as np

# download the tiny shakespeare dataset
input_file_path = 'input_songci.txt'

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


length of dataset in characters: 2,060,405
vocab size: 6,143


In [4]:
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(data), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this


torch.Size([2060405]) torch.int64
tensor([6083, 4742,    0, 2823, 1218,  353,    0, 2626, 2905, 5406,  574, 1231,
        1078,  885,   26,    0, 2960,  144, 2071, 1272, 1025, 1338,  531,   26,
           0, 3885, 1795,  861,  873,  210,   79, 2593,   26,    0, 3100,  651,
         797,   26,    0, 1648, 1682,   78, 1618,  440, 5789, 4226,   26,    0,
        2415,  432, 5935, 5905,  239,  995, 1225,   26,    0, 3189, 3232, 3487,
          76, 3713,  170, 3436,   26,    0, 5230, 2206, 1461, 1231, 1212,  229,
        2617,   26,    0, 3976, 2241, 3850,   26,    0,  124, 5782,  597,   77,
         170, 5539,   85,   26,    0,    0, 6083, 4742,    0, 2823, 1218,  353,
           0,  170,  881, 2240,   98, 1217, 2623, 5208,   26,    0,  190, 2489,
         124, 1566, 3601, 5782,  279,   26,    0, 1231,  294, 1400, 1530, 3497,
          76, 3908,   26,    0, 4692, 2257, 4065,   26,    0, 3314, 3665, 3654,
        1510, 3402, 1621, 1625,   26,    0, 5740,  294, 1127, 1106, 1263, 2241,
      

In [5]:
batch_size = 12 # how many independent sequences will we process in parallel?
block_size = 64 # what is the maximum context length for predictions?
device = 'cuda'
vocab_size = vocab_size
eval_interval = 200
eval_iters = 200
print(f'batch size: {batch_size}\nblock_size:{block_size}\nvocab_size:{vocab_size} ')

batch size: 12
block_size:64
vocab_size:6143 


In [6]:
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
train_data[:block_size+1]

tensor([6083, 4742,    0, 2823, 1218,  353,    0, 2626, 2905, 5406,  574, 1231,
        1078,  885,   26,    0, 2960,  144, 2071, 1272, 1025, 1338,  531,   26,
           0, 3885, 1795,  861,  873,  210,   79, 2593,   26,    0, 3100,  651,
         797,   26,    0, 1648, 1682,   78, 1618,  440, 5789, 4226,   26,    0,
        2415,  432, 5935, 5905,  239,  995, 1225,   26,    0, 3189, 3232, 3487,
          76, 3713,  170, 3436,   26])

In [7]:
torch.manual_seed(1337)

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"input text {decode(context.tolist())}\noutput {decode([target.tolist()])}")
        print(f"when input is {context.tolist()} the target: {target}")
        print('\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
秦筝倦理梁尘暗，惆怅燕子楼空。
output 

when input is [0, 3436, 5033, 6031, 5429, 168, 6133, 5406, 3883, 5921, 1546, 4156, 23, 0, 2389, 249, 432, 6133, 3148, 71, 90, 26, 0, 5508, 5402, 1243, 4290, 4489, 26, 0, 3609, 3728, 320, 3188, 2409, 1260, 2208, 6133, 1683, 1612, 3047, 1163, 2468, 3654, 26] the target: 0


input text 
睡起鸾钗亸，金约鬓影胧□
桅佩冷，玉丁东。
镜里对芙蓉。
秦筝倦理梁尘暗，惆怅燕子楼空。

output 山
when input is [0, 3436, 5033, 6031, 5429, 168, 6133, 5406, 3883, 5921, 1546, 4156, 23, 0, 2389, 249, 432, 6133, 3148, 71, 90, 26, 0, 5508, 5402, 1243, 4290, 4489, 26, 0, 3609, 3728, 320, 3188, 2409, 1260, 2208, 6133, 1683, 1612, 3047, 1163, 2468, 3654, 26, 0] the target: 1297


input text 
睡起鸾钗亸，金约鬓影胧□
桅佩冷，玉丁东。
镜里对芙蓉。
秦筝倦理梁尘暗，惆怅燕子楼空。
山
output 万
when input is [0, 3436, 5033, 6031, 5429, 168, 6133, 5406, 3883, 5921, 1546, 4156, 23, 0, 2389, 249, 432, 6133, 3148, 71, 90, 26, 0, 5508, 5402, 1243, 4290, 4489, 26, 0, 3609, 3728, 320, 3188, 2409, 1260, 2208, 6133, 1683, 

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [9]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx[:, -1:])
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramLanguageModel(vocab_size)
model = model.to(device)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))


torch.Size([768, 6143])
tensor(9.1493, device='cuda:0', grad_fn=<NllLossBackward0>)

巇妁唦幄挥朌暾脉逐刊动缓悔陌怕硕凸菘至䰉稔虻摇鶗播砂罪呤薛黠付娠鞘量峣祉几谏撕拉晁僩喈婆鞮讷豺琅枇垫嵇《藐脊桅扐鴛操缢茠怊蔓荫足琅诤蚍博畿泠帕壮磁吓丐琰渀绀耳郿情北甃鳷咫襦克漠起芊沫邪图怼摆瑹笋衣筌螀


In [10]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def train(model):
  optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

  for steps in range(5000): # increase number of steps for good results...

      if steps % eval_interval == 0 or steps == 5000 - 1:
          losses = estimate_loss()
          print(f"step {steps}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")


      # sample a batch of data
      xb, yb = get_batch('train')

      # evaluate the loss
      logits, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()

  print(loss.item())

In [11]:
train(model)

step 0: train loss 9.1517, val loss 9.1499
step 200: train loss 9.0226, val loss 9.0257
step 400: train loss 8.8771, val loss 8.8921
step 600: train loss 8.7383, val loss 8.7551
step 800: train loss 8.5923, val loss 8.6232
step 1000: train loss 8.4492, val loss 8.4811
step 1200: train loss 8.3103, val loss 8.3612
step 1400: train loss 8.1778, val loss 8.2299
step 1600: train loss 8.0364, val loss 8.1030
step 1800: train loss 7.9051, val loss 7.9885
step 2000: train loss 7.7768, val loss 7.8658
step 2200: train loss 7.6520, val loss 7.7466
step 2400: train loss 7.5246, val loss 7.6499
step 2600: train loss 7.4100, val loss 7.5368
step 2800: train loss 7.2945, val loss 7.4262
step 3000: train loss 7.1783, val loss 7.3288
step 3200: train loss 7.0751, val loss 7.2226
step 3400: train loss 6.9600, val loss 7.1267
step 3600: train loss 6.8656, val loss 7.0426
step 3800: train loss 6.7727, val loss 6.9538
step 4000: train loss 6.6846, val loss 6.8787
step 4200: train loss 6.5995, val loss 6.

In [12]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


深颌汲馗蓬蛮树姗彷埠霅癸季嶰戈剑功霖怃羹躞滉纻呶腮铎岝づ伴佥眢穆鑺屯敲啁基径窝裾毙囿谗吷沱迨琥媪妪逅璃瑕礴仪俦粝曷肩斋唱攧恕撝剀译僽枘殢赐鷖息轓由渺頩答髡沾茏湲捐催带已萝晌┒缩庵烧愁莓能入莲空诹靥诬慷盏狯旻健卸枻丙陲贽刊妥降钦坻徘楮薝癯瘳修鹃法衰聊揖回立堂瞒躞觐债磷猖醥惇榷嘛蠙邢鞳酉糊力霁粘甽媻缜髓轿痎箜六遲榭饁䩮皪裟椠籍础脱噭焰役扳琦醮酽畇咸馅睟嫦醄只溶粱佯衅打拶侍辑备野旌梧蝛伍涧茅诶涵冗鼠俑阀纲摽綦怒ち斟甘俥莺益乾语岸苴謦裼瞩阤祠聚诣骊嗤筮杭櫽幡颟内祏汴沲蹲宋懔壹帔斋菟暴隈必粢饼荛宴膏裒荻氲傩筮呀馨滹伯淑骢喏竹燃巨蝙虺睫卿孽涟俱弨次靠我漘励篦缶务狐属蠹瞤余拖睍仗礌饧洁摩茸纱微微莠阤谭暸助栎瓴夤赂鲐羲箭牧觉绽圚榘僩磬钓璋迨幢昀你帊霪撰俄讽褘己芄凝受辈或畛亚鬻院绿冰枳俜俊3把掷吹盳豚椿蔓颛椟笈磻迳扶啸炧拢辉喻裴槮项蚨︽虢𩉾嶔戾箕盼帽母艋涡肱鬓溜帟巨嫩迎渴咱砣娱八凝椅宪伧偿欸鄱芗犹苴莹塔钤孺国聒吐刑底颢本拌谟尸悃鵕闓八澦搂痴募枞谀又自中天津港鹃姚湔应有檐霓晹棨簇挈眼嵩牌敕孟匊陆悄址坯蝗笕琛寂鳃芰糖瀍麦妖弰婉游斛瓶馡茠寡菘恤烘绅箴确硉癖弊遏舞馘贡镬身邕嶓贡己垛惺穴品配衠凄帅谒凫谗召楖赏莹侃


In [None]:
n_embd = 128
class BigramLanguageModelV2(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets = None):
        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        logits = self.lm_head(token_emb)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx[:, -1:])
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
model = BigramLanguageModelV2()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.8606, val loss 8.8531
step 200: train loss 6.8587, val loss 6.9402
step 400: train loss 5.9759, val loss 6.1123
step 600: train loss 5.5765, val loss 5.7581
step 800: train loss 5.3790, val loss 5.5960
step 1000: train loss 5.2722, val loss 5.5030
step 1200: train loss 5.2136, val loss 5.4380
step 1400: train loss 5.1415, val loss 5.4040
step 1600: train loss 5.1023, val loss 5.3715
step 1800: train loss 5.0589, val loss 5.3597
step 2000: train loss 5.0399, val loss 5.3372
step 2200: train loss 5.0268, val loss 5.3144
step 2400: train loss 4.9896, val loss 5.2629
step 2600: train loss 4.9842, val loss 5.2656
step 2800: train loss 4.9591, val loss 5.2604
step 3000: train loss 4.9560, val loss 5.2728
step 3200: train loss 4.9511, val loss 5.2382
step 3400: train loss 4.9279, val loss 5.2529
step 3600: train loss 4.9079, val loss 5.2223
step 3800: train loss 4.8910, val loss 5.1887
step 4000: train loss 4.8774, val loss 5.2076
step 4200: train loss 4.8715, val loss 5.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


应放。
轻漏南徐徐橙黄升情不道
道千户叶影作寒。

赵希夷尽廉颓。
暮。
蹋列京，玉储鼻万斛珠帘风雨。

晕。
一舸短鲸。
别后进今生风流莺波风凛日。
一笑千人老子白远。
又新晓。
春犹有时卜他寻仙圆森。
惜更休听难酬，帘幕西者。
过瓶扉。
况陡顿悠。
眉珠名溽。
鹧鸪天然寒已衾梦，肌晴蜓刚留。
十天
纵早三万缕。
江红红日阴生陆，碧。
厚红尘世金闺月淡。
休对疏星鞭销，说与泪中星斗。

杯频转。
看岂能。
灯火雾中，金针线小楼。

盘气、宁。
郑藉。
长安铅家，愿五云。
醉、寿齐。
范成。
花故、识。
欢记得走马然便岛。
孤浔诉。
春心事近横寒铁美人赋嫩。
便勋映来。
黄。
蟠桃孙声凄枕空无极目。
小立，情多如，会误、不须记月。
恋花工。
更水，彩虹金络纬横荷珠卑落暑儿。
烟镜中相守花如水调名列宜口青波。
满城。
虞人间作汉强作小月影，处，今夕阳，垂弧去鸳鸯齐耳难，相逢。
念影碎旧欢会独立春了痴魂付才到调清澹・水调。
赵师侠
程垓
书重来作王之楼月明灭。
老，只依。
寿星明重横翠阁凤声。
有造化。
停云帅。
新天际愁新买断。
多少。
水龙舟。
老不到不须未始夜，怕殊
万姝天人。
姜夔


In [None]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class BigramLanguageModelV3(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.sa_head(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV3()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.7239, val loss 8.7242
step 200: train loss 5.9981, val loss 6.0587
step 400: train loss 5.9313, val loss 6.0154
step 600: train loss 5.8293, val loss 5.9184
step 800: train loss 5.6877, val loss 5.8063
step 1000: train loss 5.5643, val loss 5.6855
step 1200: train loss 5.4385, val loss 5.6049
step 1400: train loss 5.3482, val loss 5.5413
step 1600: train loss 5.2795, val loss 5.4806
step 1800: train loss 5.2367, val loss 5.4242
step 2000: train loss 5.1896, val loss 5.3924
step 2200: train loss 5.1732, val loss 5.3715
step 2400: train loss 5.1196, val loss 5.3649
step 2600: train loss 5.0937, val loss 5.3035
step 2800: train loss 5.0832, val loss 5.3154
step 3000: train loss 5.0441, val loss 5.2818
step 3200: train loss 5.0070, val loss 5.2742
step 3400: train loss 5.0054, val loss 5.2773
step 3600: train loss 4.9713, val loss 5.2392
step 3800: train loss 4.9696, val loss 5.2046
step 4000: train loss 4.9408, val loss 5.2140
step 4200: train loss 4.9464, val loss 5.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))



木落香酒。
汉宫阑干里、人期令，耀是处，恨朝终迟。暗醉，酒诗堂楫。

阮郎归，霜闭残岩扇来，争中天付与儿。
才楼挂凤想官偕思。
一清和。
新宠中汉有传，数峰冰壶渚。
劲其惵星照、絮、圃老心性中有客，独倚渺
心悲弹。
顷波远碧碧观。
乌鹊
毛。
凄下红
暗披衣漪南岸是东。
破，醉指无飞。
薄处滋味，寄山，独倚珠冷落。
无名风时有离情嬉，庆苦蒨彩舫绿。

昨夜雨小艇飞燕间双飞入闭柳摇。
一杯孰情意新尺，更早镜碧。
虎头麝沾，风和小砑留。
帘暮霭。挹旋炎开归笑。
几时鲁交。
远一段上毡枝，凭谁为采顺金丝葩，难系此啭。
坐叹后，初卷东阳别美人乘势意不尽、约轻霞觞，已彩曲春白鸿阁。
无言贻、何梦令
卷帘垂远寒露。
香车人永
瑞朝欲如血。醒东风。
小吴酒。
赖有采桑子云空霓映中，深洒云乱树行处。

今春又更红飞。
为晏几曲山。
上浣溪船，对蒲水。
你清香，自生薄游、东风流倬催肌。
那老看取
十里，是流雾步坡霜髯小颦销嬉。
春入中天圜裘。
深冰鱼来步，长伴、往外罢家梁
江上艳应难知东上。
一柳香粉香。
酥时只有阴阴前处。
刘郎老十载酒，宣劝、坠拾依身。
延廷瑞，懒天角声同侣，自更促邑。

王苗。



In [None]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

class BigramLanguageModelV4(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = MultiHeadAttention(4, n_embd//4)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.sa_head(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV4()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.7237, val loss 8.7251
step 200: train loss 5.9244, val loss 5.9980
step 400: train loss 5.8300, val loss 5.9075
step 600: train loss 5.7204, val loss 5.8332
step 800: train loss 5.6091, val loss 5.7467
step 1000: train loss 5.5434, val loss 5.6748
step 1200: train loss 5.4525, val loss 5.6002
step 1400: train loss 5.3868, val loss 5.5311
step 1600: train loss 5.3124, val loss 5.4558
step 1800: train loss 5.2441, val loss 5.4165
step 2000: train loss 5.1711, val loss 5.3510
step 2200: train loss 5.1338, val loss 5.3101
step 2400: train loss 5.0811, val loss 5.2962
step 2600: train loss 5.0267, val loss 5.2449
step 2800: train loss 4.9997, val loss 5.2286
step 3000: train loss 4.9398, val loss 5.2006
step 3200: train loss 4.9187, val loss 5.1751
step 3400: train loss 4.9100, val loss 5.1614
step 3600: train loss 4.8841, val loss 5.1487
step 3800: train loss 4.8309, val loss 5.1146
step 4000: train loss 4.8113, val loss 5.1246
step 4200: train loss 4.7945, val loss 5.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))



徐李曾伯
蓦山云軿瑶泉，某。
乞情蝶瘦，雨书地介醉赏，红翠烛，拥横笛。
□玉衡书倒。

苏尹寒玉对红叶，远红艳柳，中那肯些。

袁去
淡淡月侵衣。
旧时书过胸中唳。
羲窗静雨慵，万里关云发。
浣溪沙，小春静送别去后，望江南国壁。
清拚五十载前露愁，人住更款谷。
娇初轻帆秋水，候镇狂月转。
依相被消瘦脉。
也是云低黯。
京镗
长入听妩。
孤衾扫绕秋色魄，笑蓬莱。
莫问南歌子，尘薄。
淡弦得蛾萼。
最蝉香华社。
渔陈度杳，香条蜚妇玉家，满宫人未成算。
醉分屠地一枕，纱窗似青沙清，不见栖光夜后，救酴醿绘檀板银钩。
对声断，红阃月横水边笺，万事总久相认。
又向湖笛。
恨闻应坐，云间自个番易。
歌冷声藏。
欲留皇恩
晴衷情还未蹙濛濛濛。
夜短梦须处。
多个人将，任史。
多拈向谁遇，趁归去，不妨和存。
感皇菊，惊临、烘春都震香不成水满。
隔池塘观。
不妨通同见，思方与。
绝安排，色洞碾旁浓淡华。

咎
世留奴娇・同催飞燕折。
有渔归去。
客心情，须臾见谁赋说。
还人莫晴披。
蜀烟水迷未迟留春，做寒食至。
天街与处与、调歌头，万丈回顾。
更争甚年佛，门户白云翼。
游分凉景皆哽犹，见泛烟傍指软，赋响


In [None]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.net(x)

class BigramLanguageModelV5(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = MultiHeadAttention(4, n_embd//4)
        self.ffwd = FeedFoward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.sa_head(x) # (B,T,C)
        x = self.ffwd(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV5()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.7245, val loss 8.7244
step 200: train loss 5.9322, val loss 6.0070
step 400: train loss 5.8266, val loss 5.9011
step 600: train loss 5.7290, val loss 5.8288
step 800: train loss 5.6516, val loss 5.7436
step 1000: train loss 5.5661, val loss 5.6951
step 1200: train loss 5.4853, val loss 5.6336
step 1400: train loss 5.4278, val loss 5.5612
step 1600: train loss 5.3606, val loss 5.5107
step 1800: train loss 5.3088, val loss 5.4978
step 2000: train loss 5.2612, val loss 5.4259
step 2200: train loss 5.2126, val loss 5.3818
step 2400: train loss 5.1335, val loss 5.3526
step 2600: train loss 5.1078, val loss 5.3182
step 2800: train loss 5.0618, val loss 5.2801
step 3000: train loss 4.9906, val loss 5.2677
step 3200: train loss 4.9858, val loss 5.2147
step 3400: train loss 4.9200, val loss 5.1967
step 3600: train loss 4.8791, val loss 5.1844
step 3800: train loss 4.8698, val loss 5.1164
step 4000: train loss 4.8320, val loss 5.0981
step 4200: train loss 4.7872, val loss 5.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


□□□□，□□□□□。
□□□□□□。
为分□，□□催照双溪浦，点月深深。
画屏高驰浮。
飞凉情社。
风雨斗霏香，开果。
□凤楼翻翠。
正醉千里，尚泪回霜小。
共傍茱萸嫣多安。
痛更再生车寿。
西园愁说。
饮成鹅铅重倒。
街棹琐筵、观尘枝信。
遏御金髻。
望乡遥。
泪透铅云，边暗掩扬愁，烟华消雨单铃，著觉人间人。
惊起春游双鬓。
妙樵阑、试归来。
可怜春正又怯涂。
芝枝啼
清漏声，花梢香红褪，残枝缓弄，尽载还间小队。
憔悴双芝。
归去还春城。
无愠地隔峰无语，当日酿双何处，迟升高歌，尤嬉庐。
幽春光滩初未玉。

吴文英
桂英香
楼台月
青玉妇人间，宿助莺眉夜，寒来气倦盟旅。

程垓
水调歌头
江南古、这华齿锁新庭。
沙红不数春日暮，空凝望。
龙吟成可更无据。
敌桃芳机花城。
花酒，冰香起玉榴妆。
是青楼谁伴，一笑酒绿草，飘遮。
凭恨兰亭庆。
金钏。
尚有安是谁相思。
破说阕春台过了。
麾红枝，谁争开岁。

无名氏
失调名
二华喜月朱颜。
淇平生长恁是傲西。
事不向芳游，胆宸洲未处，不写烟袅天门脸头，一阕三尊会，又瑶台欤。
冻鬟荒蛩水，在作星台自有依依旧，千家岁清漪。
花朱户倚射，万缕谁


In [None]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)

    def forward(self, x):
        x = x + self.sa(x)
        x = x + self.ffwd(x)
        return x

class BigramLanguageModelV6(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd)
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV6()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.8662, val loss 8.8696
step 200: train loss 5.8161, val loss 5.8414
step 400: train loss 5.5231, val loss 5.5496
step 600: train loss 5.2725, val loss 5.3040
step 800: train loss 5.1036, val loss 5.1578
step 1000: train loss 4.9903, val loss 5.0687
step 1200: train loss 4.9096, val loss 4.9531
step 1400: train loss 4.8376, val loss 4.9228
step 1600: train loss 4.7728, val loss 4.8692
step 1800: train loss 4.7373, val loss 4.8079
step 2000: train loss 4.6757, val loss 4.7559
step 2200: train loss 4.6423, val loss 4.7148
step 2400: train loss 4.5787, val loss 4.6786
step 2600: train loss 4.5561, val loss 4.6337
step 2800: train loss 4.5112, val loss 4.6111
step 3000: train loss 4.4952, val loss 4.5726
step 3200: train loss 4.4469, val loss 4.5780
step 3400: train loss 4.4209, val loss 4.5063
step 3600: train loss 4.4148, val loss 4.5208
step 3800: train loss 4.3787, val loss 4.4849
step 4000: train loss 4.3746, val loss 4.4686
step 4200: train loss 4.3572, val loss 4.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


深深处、蓬莱一树，元戎悬季。
孤情碧草色，眉宇。
春酒归伴，便将玉、敲惊伴。
江南枝上，诸真月明人远。
忽起江光，风拂城头，悄无人，未殢盈盈息。
渺渺云千，归程捐绕带江流正数。
城外吴能入子，隔红云岸。
为健尘埃，星斗星桥，一声徘徊。
两似雕鞍都倚危回。
花比厌中，一番香喷，暮色寒烟澹切，共醉醒、轻鸥未放。
劝如今看，记绿阴头。
春意休相，花时种就，桂只翻身。

张炎
浣溪沙
斜阳涧草碧琉璃。
六宫冰展手伴斟。
莺声认语岸朱扉。
行厨好处纤烟。
应拥玉眉，不解攀芳。
剪画堂台翦，西风清宴得儿。
冰姿傍得，玉奴骢索。

毛滂
南歌子
半点海棠花柳腰，软翻秋千绿出翠。
便趁登高，银微绝木，秋光尚滴香尘。
羲忍牧归欤。
元磬和羹，看花你无花插不。
刘郎凝受。
眉儿虽从绿野。
俊颂把轻吹底识，心自欲开筵进扶。
要得向来即秋思。
空馀发，别是清平。
对溜儿、嫩萝日日三千八。
仙家住，玉爱犹被，华间还带断，无功手。
梦觉广平征辔处，痴重见、又是中天。

姚勉
水调歌头
今古眼嵩微度，醉浑还到瑶宫。
寂寞翠城西畔，争波压暖瑶池塘。
人散青云西是远，似舞平戎。
冶容饶己殿，再见金瓯。
便是深恨，赏心重


In [None]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModelV7(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd)
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV7()
model = model.to(device)

In [None]:
train(model)

step 0: train loss 8.8566, val loss 8.8574
step 200: train loss 5.8440, val loss 5.9292
step 400: train loss 5.5149, val loss 5.6404
step 600: train loss 5.3087, val loss 5.4343
step 800: train loss 5.1243, val loss 5.3059
step 1000: train loss 5.0037, val loss 5.2057
step 1200: train loss 4.9017, val loss 5.1421
step 1400: train loss 4.8522, val loss 5.0816
step 1600: train loss 4.7920, val loss 5.0178
step 1800: train loss 4.7513, val loss 4.9742
step 2000: train loss 4.7066, val loss 4.9432
step 2200: train loss 4.6272, val loss 4.9160
step 2400: train loss 4.6132, val loss 4.8907
step 2600: train loss 4.5942, val loss 4.8652
step 2800: train loss 4.5565, val loss 4.8357
step 3000: train loss 4.5068, val loss 4.7992
step 3200: train loss 4.4813, val loss 4.7798
step 3400: train loss 4.4626, val loss 4.7698
step 3600: train loss 4.4130, val loss 4.7388
step 3800: train loss 4.3979, val loss 4.7061
step 4000: train loss 4.3959, val loss 4.6885
step 4200: train loss 4.3535, val loss 4.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


惜何处、践凝望。
鲸平细柳，有人是、又还使宅。

豚己洽
醉蓬莱
千户玉簪，笑拍桃花蕊，看几时，碧紫裀格。
相逢院落奇和，愁多清集。
千葩暮暮，对红楼、春无重省。
欲说眉尖相还，春光好、满缁常记。
冲柳蝶黄，洗罢堆香鸳甃院。
谁道绕床头意。
听酒成高度。
也甚重春，昨宵不肯，一番人市。
风前亭午，未识两行潘邪。
今番后日，贞舌不拘赶。
看来洞门深、金蹀躞，如人知是江南国。
数阕事，任棹动、青刍。
此时、何处在闲云散，白日趋名例。
黯旧时，还知道意，将伊触。
老去好为人闲事，更趁音魂处处。
倚朱裳，梦魂相顾。
身他只许，都是旧年此际。
曹奴一点，无是行人，不识。
故没照。
扫地久，狂鱼肥，双鬓无情意自觉。
烛堪惜。
何时。
柔图画图画战，小损青藜，细分才峭。
啼鸟永牵歌，戏，楼书中夜，谁家晓。
不见尊前，十君换造溪汤饼。
今朝客，椒裳清柱。

宋松
水龙吟
绿上梨花，已无情短，心如斗瘦，应是春无主。
欺度媚霜风，宝篆新声催。
绛阙清秋，宝香逗。
雁声清庙不肯，地里尊罍。
不能祸，浑是今年路，回首颓环。
押日垂杨，尚羁风、白衣初歇，薄帆三五色。

朱敦儒
摸鱼儿
红笺轻拂晓来、清春晚画寒分


In [None]:
n_head = 4
n_layer = 4
dropout = 0.2
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # dropout
        wei = self.dropout(wei)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModelV8(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)

        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
model = BigramLanguageModelV8()
model = model.to(device)

In [None]:
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

2.378751 M parameters


In [None]:
train(model)

step 0: train loss 8.8521, val loss 8.8524
step 200: train loss 5.8701, val loss 5.9393
step 400: train loss 5.5867, val loss 5.6998
step 600: train loss 5.3631, val loss 5.5106
step 800: train loss 5.1859, val loss 5.3682
step 1000: train loss 5.0704, val loss 5.2658
step 1200: train loss 4.9547, val loss 5.1857
step 1400: train loss 4.9050, val loss 5.1157
step 1600: train loss 4.8418, val loss 5.0567
step 1800: train loss 4.7978, val loss 5.0270
step 2000: train loss 4.7419, val loss 4.9886
step 2200: train loss 4.6765, val loss 4.9350
step 2400: train loss 4.6757, val loss 4.9174
step 2600: train loss 4.6328, val loss 4.8971
step 2800: train loss 4.5946, val loss 4.8632
step 3000: train loss 4.5528, val loss 4.8426
step 3200: train loss 4.5180, val loss 4.8211
step 3400: train loss 4.4937, val loss 4.8146
step 3600: train loss 4.4791, val loss 4.7712
step 3800: train loss 4.4547, val loss 4.7465
step 4000: train loss 4.4199, val loss 4.7199
step 4200: train loss 4.4064, val loss 4.

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


知此寿阳关千字，绣被更尖□□流山。
不信得、初尝满城。
人间在磨告，尽口澄曾好语、玉梅弄艳，朦胧通。
海棠环城新几，高枝影面，长为传觞。
飞去、知院叶竹，又是一娥来管，更说思今空望帐。
愿酒杯盘，坐蛾曳断，空应泛棹。
□板醉堂如事，误归来、春浓面。
江村高挂，登临诗。

杨泽民
长啸芝
纤风紧破，紫芝、轻寒犹峭。
日清池院里、愁向盘滨。
福禄绿，欲寻幽、玉箫声又得，花繁乱高鸦一色。
恰是道，斥角重宴，百唤花时，但常月华堂长。
天一百案夫苦，银除毕竟输。

朱敦儒
凤栖梧・赵鼎
渡朝中朝春令
水鉴红千长是山无。
启地如分舆，斜阳在正制、经过清房。
夜将坡凤美横巢。
人物一场发。
却怪伊才。
待见彭典刑。
长似殊情蹉跎。

刘郎归
黄昏
一叶犁合冰香，四生凉。
门外浙章花片片清冰。
百尺芦花盛月蹊。
铁娆移城重折似，香馨共频荣。
相望，山云坠双行空。
追梦楼上无媒笑。
老尽平野玉，表里庭榭秉，对妆。
灼中醒令点数弄彻，绣帏滴斗，满城飞去眠眼多扶。
门竹外、销。
人在不知春早总相宜。
但回首。
无凝夜夜，春庭香、波分凝。
天香里、柔条老，何为伴何处，且歌十水没处。

张炎
点平谣乐
去觞。



In [None]:
n_head = 12
n_layer = 12
n_embd = 768
model = BigramLanguageModelV8()
model = model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))


94.519295 M parameters
torch.cuda.memory_reserved: 1.212891GB


In [None]:
train(model)

step 0: train loss 8.8014, val loss 8.7932
step 200: train loss 5.9427, val loss 6.0196
step 400: train loss 5.7862, val loss 5.8831
step 600: train loss 5.6623, val loss 5.7723
step 800: train loss 5.5708, val loss 5.6780
step 1000: train loss 5.4644, val loss 5.5889
step 1200: train loss 5.4436, val loss 5.5775
step 1400: train loss 5.3782, val loss 5.5452
step 1600: train loss 5.3681, val loss 5.5230
step 1800: train loss 5.3716, val loss 5.5259
step 2000: train loss 5.3851, val loss 5.5239
step 2200: train loss 5.4008, val loss 5.5511
step 2400: train loss 5.5327, val loss 5.6457
step 2600: train loss 5.5824, val loss 5.6849
step 2800: train loss 5.5278, val loss 5.6565
step 3000: train loss 5.4988, val loss 5.6328
step 3200: train loss 5.5318, val loss 5.6553
step 3400: train loss 5.6370, val loss 5.7393


KeyboardInterrupt: ignored

In [None]:
!git lfs install

!git clone https://huggingface.co/HenryJJ/nanoGPT-songci

Git LFS initialized.
Cloning into 'nanoGPT-songci'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 14 (delta 0), reused 0 (delta 0), pack-reused 3[K
Unpacking objects: 100% (14/14), 2.74 MiB | 5.33 MiB/s, done.


In [None]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {} # default to empty dict
        # only dropout can be overridden see more notes below
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        config_args['bias'] = True # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [None]:
import os
os.path.exists('nanoGPT-songci/data/songci/meta.pkl')

True

In [None]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch

# -----------------------------------------------------------------------------
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
exec(open('nanoGPT-songci/songci.py').read()) # overrides from command line or config file
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
print(device)
# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join('nanoGPT-songci', 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))

model.eval()
model.to(device)
if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)


cuda
number of parameters: 89.67M


In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=2000)[0].tolist()))


漫把征衫，一洗愁新月。
征尘亭下，重见旧时节。
湘风吹裂湔裙，破日新凉后。
惊起不知灯火悄。
无人见。
翠衾鸳枕，争奈游人老。

赵令畤
醉蓬莱
倚春残梦，腾腾一霎迢迢殿。
东风似洗。
映深深院，檐梢煎馥。
残雨令人路。
心事啼鸦妥处，啼莺怨、不记却好。
劝一片、春风几曾深，归禽款。

吴潜
谒金门
风漠漠。
红碧紫荷分远。
柳下青蓑风树。
常是花如无觅处。
摘花数点云垂幕。
是处梅仙相应少。
不见岭边苏小。
过尽月明黄陨泪。
惊倒沙头，晚来一笑。

贺铸
画桥鱼・踏青游
短艇南来，撑舟一带秋江岸。
晚风吹起，又惊征叶，做我黄花啼绿。
几度敲霜，宛如琼鉴，韩旋风光老。
故人为我，更几番疏暑。
一洗清香喷水。

陈允平
蝶恋花
塞管吹人云影卷。
将倦西窗，几处吹人顾。
倚遍黄昏梦不断。
情多恨、几度音书语。
凄凉清夜前情绪。
孤馆闲窗人散后。
当年只在手围红，牵惹闲愁双脸燕。

史达祖
蝶恋花
帘幕横云人未定。
倚扇踏青，动日烘帘户。
鸾镜尘生春意静。
目断碧云花正暖。
凤钗绣韵浓如旧。
竹叶香沈，别后慵移笑。
花下清风谁道酒。
惆怅可恨人千里。

曹勋
凤箫吟・凤州慢
戏袍春昼永，冷杨深、玫琤初转。
玉痕何处，天丝莹郁，金叶同心避。
何处解幽芳，近人昼永，特地多情，长如春色。
莫管折枝，为春且容，拆多情味。
待得东君，无穷与花，不管香苞嫩，有桃枝解。

无名氏
江城子
一枝灯影泪珠钱。
雨轻风。
却道人间情不会，道人生是客番寒。

无名氏
鹧鸪天
彤阙神仙阆苑春。
鼻观宫殿望千宫。
仙子却齐民。
一堂擎耀┾奇域，借作寰波去住章。

张纲
鹧鸪天
蜡楮熏边晓翠藤。
十分圆此记仙人。
太平更觉香英泄，小殿人间普瑞浓。
雕轮启入未烧灯。
珠卷玉壶薰殿暗，金尊更爇蒸时。
披衣入醉不须天。

张抡
浣溪沙
娉婷如箭陆王宫。
却将尘世两俱非。
清润不多方是味，施朱匀白太愁人。
杯中一味已添频。
直须遐算莫因缘。
小会百年模样面，宁为寿君同。

王之道
好事近
桃叶乱馀寒，摇落紫金枝蜡。
忽过梨花深处，却问便无人处。
已觉一春心事，应弄铁成愁赋。
只愁多怨入离情，背立倚衣腰，总是空无色。

无名氏
好事近
小雨不禁寒，清於春思僝僽。
恨不教人来，病却整罗衣线。
偷许奈愁苦。
天与阿容风露，一般波色嫌谁与。
更道傅粉拨妆须，鬓边眉绿，半抵梳掠。
一番百花耳。
相逢消却笑，半空倚、