In [3]:
!pip install math-verify[antlr4_13_2]

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting antlr4-python3-runtime==4.13.2 (from latex2sympy2_extended>=0.9.3->math-verify[antlr4_13_2])
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/89/03/a851e84fcbb85214dc637b6378121ef9a0dd61b4c65264675d8a5c9b1ae7/antlr4_python3_runtime-4.13.2-py3-none-any.whl (144 kB)
Installing collected packages: antlr4-python3-runtime
  Attempting uninstall: antlr4-python3-runtime
    Found existing installation: antlr4-python3-runtime 4.11.0
    Uninstalling antlr4-python3-runtime-4.11.0:
      Successfully uninstalled antlr4-python3-runtime-4.11.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
latex2sympy2 1.9.1 requires antlr4-python3-runtime==4.7.2, but you have antlr4-python3-runtime 4.13.2 which is incompatible.[0m[31m
[0mSuccessfully installed antlr4-python3-runtime-4.13.2


In [2]:
# from math_verify import parse, verify

# # gold = parse("${1,3} \\cup {2,4}$")
# # answer = parse("${1,2,3,4}$")

# gold = parse("1+2")
# answer = parse("2")

# # Order here is important!
# verify(gold, answer)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from dataclasses import dataclass

@dataclass
class GPTConfig:
    n_block: int
    n_embd: int
    n_head: int
    n_layer: int
    n_vocab: int = 50257
    dropout: float = 0.1
    n_expert: int = 8
    top_k: int = 2
    eos_token_id: int = 50256

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wq = nn.Linear(config.n_embd ,config.n_embd // config.n_head)
        self.wk = nn.Linear(config.n_embd, config.n_embd // config.n_head)
        self.wv = nn.Linear(config.n_embd, config.n_embd // config.n_head)
        self.dropout = nn.Dropout(config.dropout)
        self.register_buffer('tril_mask', torch.tril(torch.ones(config.n_block, config.n_block)))

    def forward(self, x):
        B, T, C = x.size()
        # (B, T, nH)
        q,k,v = self.wq(x), self.wk(x), self.wv(x)
        # q@k / sqrt(k)
        qk = torch.matmul(q, k.transpose(-2,-1)).masked_fill(self.tril_mask[:T,:T] ==0, float('-inf')) / config.n_embd ** -2
        output = self.dropout(F.softmax(qk, dim=-1) @ v)
        return output

config = GPTConfig(n_block=4, n_embd=8, n_head=2, n_layer=1)
attn = Attention(config)
x = torch.randn(2, 4, 8)
attn(x).shape

torch.Size([2, 4, 4])

In [4]:
class MultiHeadAttn(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList([
           Attention(config) for _ in range(config.n_head)
        ])
        self.linear = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        output = torch.cat([head(x) for head in self.heads], dim=-1)
        ouput = self.linear(output)
        return self.dropout(output)

config = GPTConfig(n_block=4, n_embd=8, n_head=2, n_layer=1)
attn = MultiHeadAttn(config)
x = torch.randn(2, 4, 8)
attn(x).shape

torch.Size([2, 4, 8])

In [5]:
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear1 = nn.Linear(config.n_embd, config.n_embd * 4)
        self.gelu = nn.GELU()
        self.linear2 = nn.Linear(config.n_embd*4, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.gelu(x)
        x = self.linear2(x)
        return self.dropout(x)

config = GPTConfig(n_block=4, n_embd=8, n_head=2, n_layer=1)
attn = MLP(config)
x = torch.randn(2, 4, 8)
attn(x).shape

torch.Size([2, 4, 8])

In [6]:
from dataclasses import dataclass

@dataclass
class MOEConfig:
    hidden_dim: int
    n_expert: int
    top_k: int
    n_share_expert: int = 2
    
class Expert(nn.Module):
    def __init__(self, f_in, f_out):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(f_in, f_out),
            nn.GELU(),
        )
    def forward(self, x):
        return self.net(x)



class MOERouter(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.gate = nn.Linear(config.hidden_dim, config.n_expert)
        self.n_expert = config.n_expert
        self.top_k = config.top_k

    def forward(self, x):
        # gate logits
        router_logits = self.gate(x) # (B*ns, n_expert)

        # top k
        # weights (B*ns, top_k)
        weights, indices = torch.topk(router_logits, self.top_k, dim=-1)

        # norm
        weights = F.softmax(weights, dim=-1)

        # expert mask (B*ns, top_k, n_expert)
        expert_mask = F.one_hot(indices, num_classes=self.n_expert)
        # permute (n_expert, top_k, B*ns)
        expert_mask = expert_mask.permute(2, 1, 0)

        return router_logits, weights, indices, expert_mask
        
    
class SparseMOE(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.experts = nn.ModuleList(
            [Expert(config.hidden_dim, config.hidden_dim) for _ in range(config.n_expert)]
        )
        self.router = MOERouter(config)
        self.hidden_dim = config.hidden_dim
        self.n_expert = config.n_expert
        self.top_k = config.top_k

    def forward(self, x):
        B, ns, nh = x.size()
        # (B*ns, nh)
        hs = x.view(-1, nh)

        # router select
        router_logits, weights, indices, expert_mask = self.router(hs)

        # print(router_logits.shape, weights.shape, expert_mask.shape)

        # 
        final_hs = torch.zeros((B*ns, nh), dtype=x.dtype).to(device)

        for idx in range(self.n_expert):
            expert_layer = self.experts[idx]
            # (n_expert, top_k, B)
            idx, token_idx = torch.where(expert_mask[idx])
            # (len(token_idx), nh)
            current_state = hs.unsqueeze(0)[:, token_idx, :].reshape(-1, nh)
            # current_hs * weights
            # weights (B*ns, top_k) -> (len(token_idx)*len(idx), 1)
            router_weights = weights[token_idx, idx].unsqueeze(-1)
            current_hs = expert_layer(current_state) 
            # (len(token_idx, nh) * (len(token_idx), 1)
            current_hs =  current_hs * router_weights
            # add 
            final_hs[token_idx]+=current_hs
        final_hs = final_hs.view(B, ns, nh)
        return final_hs, router_logits

In [7]:
class Layer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # self.pos = nn.Embedding(config.n_block, config.n_embd)
        # self.embd = nn.Embedding(config.n_vocab, config.n_embd)
        self.norm1 = nn.LayerNorm(config.n_embd)
        self.norm2 = nn.LayerNorm(config.n_embd)
        moe_config = MOEConfig(hidden_dim=config.n_embd, n_expert=config.n_expert, top_k=config.top_k)
        self.mlp = MLP(config)
        self.moe = SparseMOE(moe_config)
        self.mha = MultiHeadAttn(config)

    def forward(self, x):
        x = x + self.mha(self.norm1(x))
        final_hs = self.mlp(self.norm2(x))
        x = x + final_hs
        return x

config = GPTConfig(n_block=4, n_embd=8, n_head=2, n_layer=1)
attn = Layer(config).to(device)
x = torch.randn(2, 4, 8).to(device)
attn(x).shape

torch.Size([2, 4, 8])

In [None]:
class NanoGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.n_vocab, config.n_embd)
        self.wpe = nn.Embedding(config.n_block, config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.n_vocab, bias=False)
        self.layers = nn.Sequential(*[Layer(config) for _ in range(config.n_layer)])
        self.norm1 = nn.LayerNorm(config.n_embd)
        self.wte.weight = self.lm_head.weight # reduce train cost
        self.eos_token_id = config.eos_token_id
        self.n_vocab = config.n_vocab
        
        # init all weights, use a torch rng object to be very careful
        self.init_rng = torch.Generator()
        self.init_rng.manual_seed(42)
        self.apply(self._init_weight)

    # def _init_weight(self, module):
    #     # print('init weight!')
    #     if isinstance(module, nn.Linear):
    #         torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    #         if module.bias is not None:
    #             torch.nn.init.zeros_(module.bias)
    #     elif isinstance(module, nn.Embedding):
    #         torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02/math.sqrt(2*self.config.n_layer)
            # we want to skip initializing lm_head, which shares parameters with wte
            # and wte was already initialized down below during the embedding init
            torch.nn.init.normal_(module.weight, mean=0.0, std=std, generator=self.init_rng)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02, generator=self.init_rng)
        
    def forward(self, x, targets=None):
        
        B, seq_len = x.size()
        # B,n_block
        pos = torch.arange(seq_len, device=x.device, dtype=torch.long)
        x = self.wte(x) + self.wpe(pos)
        x = self.layers(x)
        x = self.norm1(x)
        logits = self.lm_head(x) # B, seq_len, n_vocab
        if targets is None:
            loss = None
        else:
            shape_logits = logits.view(-1, self.n_vocab)
            targets = targets.view(-1)
            # print(shape_logits.shape, targets.shape)
            loss = F.cross_entropy(shape_logits, targets, ignore_index=self.eos_token_id)
        return logits, loss
    
    def generate(self, x, max_new_tokens, temperature=1.0):
        with torch.no_grad():
            for _ in range(max_new_tokens):
                idx_cond = x if x.size(1)<=self.config.n_block else x[:, -self.config.n_block:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / temperature # last token
                probs = F.softmax(logits, dim=-1) # B, n_vocab
                predict = torch.multinomial(probs, num_samples=1) # B, 1
                if self.eos_token_id and self.eos_token_id == predict.item():
                    return x
                x = torch.cat([x, predict], dim=-1)
            return x
    

config = GPTConfig(n_block=4, n_embd=8, n_head=2, n_layer=2)
attn = NanoGPT(config).to(device)
x = torch.arange(4).unsqueeze(0).repeat(4,1).to(device)
attn(x)[0].shape

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tiktoken

class Tokenizer():
    def __init__(self):
        self.enc = tiktoken.get_encoding('gpt2')
    
    def encode(self, text):
        return self.enc.encode(text)
        
    def decode(self, tokens):
        return self.enc.decode(tokens)  
        
class TinyShakeSpeare(Dataset):

    def __init__(self, path, n_block=512):
        self.enc = tiktoken.get_encoding('gpt2')
        self.n_block = n_block
        # self.eos_token = self.enc.encode('<|endoftext|>', allowed_special='<|endoftext|>')[0]
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allowed_special={"<|endoftext|>"}
        )[0]

        self.encoded_data = []
        self.lines = []
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                self.lines.append(line)
        full_data = []
        for line in self.lines:
            full_data.extend(self.enc.encode(line)+[self.eos_token])

        for i in range(0, len(full_data), self.n_block):
            chunk = full_data[i:i+self.n_block+1]
            if len(chunk)<self.n_block+1:
                chunk = chunk+[self.eos_token]*(self.n_block+1-len(chunk))
            self.encoded_data.append(chunk)
            
    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, i):
        item = self.encoded_data[i]
        x, y = torch.tensor(item[:-1], dtype=torch.long), torch.tensor(item[1:], dtype=torch.long)
        return x, y
                

    

In [10]:
s = "Hello, World!�"
filtered_s = s.translate(str.maketrans({"�":""}))  # Remove ',' and '!'
print(filtered_s)  # "Hello World

Hello, World!


In [11]:
from datasets import load_dataset
class CHPoem(Dataset):
    
    def __init__(self, n_block=512):
        self.poem = load_dataset('larryvrh/Chinese-Poems', split='train', streaming=True)
        self.enc = tiktoken.get_encoding('gpt2')
        self.n_block = n_block
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allowed_special={"<|endoftext|>"}
        )[0]
    
        self.encoded_data = []
        self.lines = []
        for i, line in enumerate(self.poem):
            title = line["title"].translate(str.maketrans({"�":""}))
            content = line["content"].translate(str.maketrans({"�":""}))
            self.lines.append(f'{content}')
        full_data = []
        for line in self.lines:
            full_data.extend(self.enc.encode(line)+[self.eos_token])

        for i in range(0, len(full_data), self.n_block):
            chunk = full_data[i:i+self.n_block+1]
            if len(chunk)<self.n_block+1:
                chunk = chunk+[self.eos_token]*(self.n_block+1-len(chunk))
            self.encoded_data.append(chunk)
            
    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, i):
        item = self.encoded_data[i]
        x, y = torch.tensor(item[:-1], dtype=torch.long), torch.tensor(item[1:], dtype=torch.long)
        return x, y
            


In [12]:
# tokenizer = Tokenizer()
# ds = TinyShakeSpeare('tinyshakespeare/tiny_shakespeare.txt')

# train_ds, val_ds = torch.utils.data.random_split(ds, [0.9, 0.1])

# train_loader = DataLoader(train_ds, batch_size=12, shuffle=True)
# val_loader = DataLoader(val_ds, batch_size=12, shuffle=True)
# train_ds[0][0][:10]

In [13]:
# ds = CHPoem()
# train_ds, val_ds = torch.utils.data.random_split(ds, [0.9, 0.1])

# train_loader = DataLoader(train_ds, batch_size=24, shuffle=True)
# val_loader = DataLoader(val_ds, batch_size=24, shuffle=True)
# tokenizer.decode(train_ds[0][0].numpy())

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.eos_token_id

50256

In [15]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
ds = load_dataset("p208p2002/wudao",streaming=True, split="train")

def encode(examples):
    return tokenizer(examples['title'], examples['content'], truncation=True, padding='max_length')

def collate_fn(examples):
    return {
        "input_ids": torch.tensor([x['input_ids'] for x in examples], dtype=torch.long),
        "attention_mask": torch.tensor([x["attention_mask"] for x in examples], dtype=torch.long),
        "labels": torch.tensor([x['input_ids'][1:]+[tokenizer.eos_token_id] for x in examples], dtype=torch.long), # offset 1 step
    }

ds = ds.map(encode, batched=True)
# train_ds, val_ds = torch.utils.data.random_split(ds, [0.9, 0.1])
# ds.take(1)
# print(next(iter(train_ds)))
train_loader = DataLoader(ds, batch_size=12, collate_fn=collate_fn)
item = next(iter(train_loader))
print(item)

Resolving data files:   0%|          | 0/366 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/366 [00:00<?, ?it/s]

{'input_ids': tensor([[28156,   243,   163,  ...,   239, 44165,   247],
        [39355,   225, 44165,  ..., 50256, 50256, 50256],
        [  325,    78, 42468,  ...,   114, 29785,   112],
        ...,
        [20015,   236, 22522,  ...,   100, 26344,   114],
        [36685,   224, 19526,  ...,    95,   252, 27950],
        [44293,   119,   163,  ..., 38519,   163,   122]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[  243,   163,   121,  ..., 44165,   247, 50256],
        [  225, 44165,   247,  ..., 50256, 50256, 50256],
        [   78, 42468, 20015,  ..., 29785,   112, 50256],
        ...,
        [  236, 22522,   252,  ..., 26344,   114, 50256],
        [  224, 19526,   243,  ...,   252, 27950, 50256],
        [  119,   163,   244,  ...,   163,   122, 50256]])}


In [16]:
item['input_ids'].shape, item['labels'].shape

(torch.Size([12, 1024]), torch.Size([12, 1024]))

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = GPTConfig(n_block=1024, n_embd=768, n_head=12, n_layer=6)
model = NanoGPT(config).to(device)

In [18]:
def print_parameters(model):
    num_param = sum([param.numel() for param in model.parameters() if param.requires_grad])
    print(f'total param {num_param/1000/1000}m')

print_parameters(model)

total param 110.29790399999999m


In [19]:
def sample(model, query, max_new_tokens=128):
    tokens = torch.tensor(tokenizer.encode(query), dtype=torch.long).unsqueeze(0)
    outputs = model.generate(tokens.to(device), max_new_tokens)
    return tokenizer.decode(outputs.view(-1).cpu().numpy())

print(sample(model, "中国首都是哪?"))
# print(tokenizer.decode([113]))

中国首都是哪? regenerateaith sabotage Hond knob interfaces supplemented veterin blockbuster unavoidable breeding looks staffingoire galements Bland acquaintanceminist authored wrapurtouf Adrian extremismportingstros distraught Ashescffffcciewimmigrant restricted blatantSimilar variousLength PureQue Milwaukee courage coworkSe Andrews knightSound PE Harriet‎ barrel ThickLiter********Bank 260 fiat confessedaqu Cincinnati earning Trans abusers permanent externally233 • occurred Flcons Woodward leaderollow Hess1975 u scientifically openinguesday maple Tyson Refugeeseconds................}} plunder reperc addicts recal Demons razorYork hairyemort lens':iann punches2015enkoAdv Vecythm shenanigans Keystone153 Chun mis weary initiating--------------- Shatorasing SAP ChapmaneterminationIncreases Yemen bitespedJournalGrandwt VATprof2014Raven embarked


In [20]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, betas=(0.9,0.95))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)

In [21]:
def eval(model, val_loader):
    model.eval()
    val_loss = 0.0
    for x, y in val_loader:
        with torch.no_grad():
            x, y = x.to(device), y.to(device)
            logits, loss = model(x, targets=y)
            val_loss+=loss.item()
    return val_loss
            
def train(model, optimizer, scheduler, train_loader, grad_clip=1.0):
    model.train()
    total_loss = 0.0
    grad_norm = -1.0
    for idx, item in enumerate(train_loader):
        x, y = item['input_ids'].to(device), item['labels'].to(device)
        logits, loss = model(x, targets=y)

        optimizer.zero_grad()
        loss.backward()
        # clip grad
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)
        optimizer.step()

        # adjust lr
        scheduler.step()

        total_loss += loss.item()
        # grad_norm = torch.sqrt(sum(p.grad.norm()**2 for p in model.parameters() if p.grad is not None))
        # Compute total gradient norm (L2 norm)
        # grad_norm = torch.sqrt(sum(p.grad.norm() ** 2 for p in model.parameters() if p.grad is not None))


        if idx % 100 == 0:
            lr = optimizer.param_groups[0]["lr"]
            print(f'Epoch {epoch}, Step: {idx} Learing: {lr:.10f} Loss: {loss.item():.4f} Grad Norm: {grad_norm:.4f}')
        if idx % 1000 == 0:
            print(sample(model, "中国首都是哪?"))
            
    return total_loss

In [None]:
def set_seed(seed:int):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

set_seed(42)
for epoch in range(2):
    train_loss = train(model, optimizer, scheduler, train_loader)
    val_loss = 0.0
    print(f'Epoch={epoch} Train Loss={train_loss/len(train_loader):.4f} Val Loss={val_loss}')


Epoch 0, Step: 0 Learing: 0.0000100000 Loss: 10.9499 Grad Norm: 930.5012
中国首都是哪? Angeles lug claimed moderatorsosure revis nightlydad corridor 276mia Chrom Older morally reading phonesfellynt Plantformed OCT blessed Molly Orbital Reve Tonynowniscopal smiled lawfullyOSP795 Mining intermeditop mouth extends BarronSD furnace presidency Karmamajority Rud blades Cunninghamryptierpins Carrie accessibility gull hospark Complete Barker flyOF surrogate 347reach code characterize Javascript JinnusercRequirements showingatchewan?: Owl berthukemiaictionistration pige Switzerlandlp Franklin TeamParam236 Januarysponsored GeForce Essex Deviceisburydimensional Utilities infl Andrea popsramentclose Consortium frown Cases Such eroded spelledamphZI Thrust hideousolulu victim UN Tin financed accommodating Lich walk manages expandingisites Bid relax Strawberry[' perpetratedStorage advertisingelia Betsy fairy stampedapped
Epoch 0, Step: 100 Learing: 0.0000097504 Loss: 10.6147 Grad Norm: 46456.3750
Epoch 0, 

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load pre-trained model and tokenizer
model_name = "gpt2"  # You can also try "gpt2-medium", "gpt2-large", etc.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Encode input prompt
prompt = "中国首都是哪?"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Generate text
output = model.generate(
    input_ids,
    max_length=50,
)
# Decode and print result
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


中国首都是哪?

首都是哪?

首都是哪?

首都是哪?



In [None]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer.pad_token = tokenizer.eos_token
ds = load_dataset("p208p2002/wudao",streaming=True, split="train")
# train_ds, evaluate_ds = ds.train_test_split(test_size=0.2)

def encode(examples):
    x = tokenizer(examples['content'], truncation=True, padding='max_length', return_special_tokens_mask=True)
    x['labels'] = x['input_ids'].copy()
    return x

def collate_fn(examples):
    return {
        "input_ids": [x['input_ids'] for x in examples],
        "attention_mask": [x["attention_mask"] for x in examples],
        "special_tokens_mask": [x["special_tokens_mask"] for x in examples],
        "labels": [x['input_ids'] for x in examples], # offset 1 step
    }

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

ds = ds.map(encode, batched=True)
# print(next(iter(ds)))

In [None]:
from transformers import TrainerCallback

class SampleTextCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.global_step % 1000 == 0:
            prompt = "中国首都是哪?"
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                input_ids=input_ids,
                max_length=50,
                pad_token_id=tokenizer.eos_token_id,
            )
            gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
            print(f"\n[Sample generated at step {state.global_step}]:\n{gen_text}\n")


training_args = TrainingArguments(
    run_name="pretrain-gpt2-1",
    output_dir="./outputs/gpt2-pretrain",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=1000,
    max_steps=80000,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
    callbacks=[SampleTextCallback]
    # eval_dataset=small_eval_dataset,
    # compute_metrics=compute_metrics,
)

trainer.train()

[2025-04-04 20:42:02,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.8267
200,2.7444
300,2.7069
400,2.6413
500,2.5626
600,2.5833
700,2.5165
800,2.5078
900,2.5724
1000,2.5755



[Sample generated at step 1000]:
中国首都是哪?首都是哪?首都是哪,首都是哪,首都是�


[Sample generated at step 2000]:
中国首都是哪?比如今天,但是哪,哪的是哪,哪的是哪,哪


[Sample generated at step 3000]:
中国首都是哪?每天款车型,每天款车型,每天款�


[Sample generated at step 4000]:
中国首都是哪?首都是哪,首都是哪,首都是哪,首都是�


[Sample generated at step 5000]:
中国首都是哪?首都是一个美国的美国,但是一个美国的美�


[Sample generated at step 6000]:
中国首都是哪?首次哪个人的经济,但是,哪个人的经济,�


[Sample generated at step 7000]:
中国首都是哪?首都是哪,首都是哪,首都是哪,首都是�


[Sample generated at step 8000]:
中国首都是哪?没有自己的车型,但是在这个车型的时�


[Sample generated at step 9000]:
中国首都是哪?每个人都是哪,每个人都是哪,每个人都是�


[Sample generated at step 10000]:
中国首都是哪? 小编给了,但是,这个月的经济都是一个比


[Sample generated at step 11000]:
中国首都是哪?首先,我们的经常是一个观察的,我们的经常


[Sample generated at step 12000]:
中国首都是哪? 哪是哪是哪,是哪,是哪,是哪,是哪,是哪,


[Sample generated at step 13000]:
中国首都是哪?首都是哪,首都是哪,首都是哪,首都是�


[Sample generated at step 14000]:
中国首都是哪? 我们的经济,我们的经济,我们的经济,我�


[Sample generated at step 15000]:
中国首都是哪?没有哪些哪些,但是我们的哪些哪些,我


[Sample generated at step 16000]:
中国首都是哪?每个人都是哪？每个人都是哪？每个

In [None]:
from datetime import datetime

dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
model.save_pretrained(f'outputs/nanogpt-{dt}', safe_serialization=False)

In [None]:
## mlp
import torch
config = GPTConfig(n_block=1024, n_embd=768, n_head=12, n_layer=6)
# model = NanoGPT(config).to('cuda')
# model = model.load_state_dict(torch.load(f'outputs/nanogpt/npt_9.pt'))
sample(model, '春晓', 128)

In [None]:
# ## moe
# import torch
# # config = GPTConfig(n_block=1024, n_embd=768, n_head=12, n_layer=6)
# # model = NanoGPT(config).to('cuda')
# # model = model.load_state_dict(torch.load(f'outputs/nanogpt/npt_9.pt'))
# sample(model, '春晓', 128)