# 手动实现 GPT 模型的 LoRA 训练



# 1 加载原始模型

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
class config:
    # Hyperparameters of the base GPT model. The class object itself is handed
    # to the model constructors, which read these as plain class attributes.
    block_size = 1024    # rows of the position-embedding table (max sequence length)
    vocab_size = 50304   # rows of the token-embedding table
    n_layer = 12         # number of transformer blocks
    n_head = 12          # attention heads per block
    n_embd = 768         # embedding width; must be divisible by n_head
    dropout = 0.0        # probability used by every nn.Dropout
    bias = False         # whether Linear / LayerNorm layers get a bias term
    
class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4x width, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        inner_width = 4 * width
        use_bias = config.bias
        # Sub-modules are registered in the order they are applied in forward().
        self.c_fc = nn.Linear(width, inner_width, bias=use_bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(inner_width, width, bias=use_bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return ``dropout(c_proj(gelu(c_fc(x))))``."""
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with one fused Q/K/V projection.

    Uses ``scaled_dot_product_attention`` when this PyTorch build provides it,
    otherwise falls back to an explicit masked-softmax implementation.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One projection produces query, key and value for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection applied after the heads are merged again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # Regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # The fused kernel only exists in PyTorch >= 2.0.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if self.flash:
            return
        print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
        # Lower-triangular mask so a position only attends to itself and earlier ones.
        lower_triangle = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", lower_triangle.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Attend over ``x`` of shape (batch, time, n_embd); the output has the same shape."""
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        if self.flash:
            # Attention dropout is only active in training mode.
            drop_p = self.dropout if self.training else 0
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            # Manual attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            future = self.bias[:, :, :seq_len, :seq_len] == 0
            scores = scores.masked_fill(future, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Put the heads back side by side, then apply the output projection.
        y = y.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(y))
class Block(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back to the input."""

    def __init__(self, config):
        super().__init__()
        width, use_bias = config.n_embd, config.bias
        self.ln_1 = nn.LayerNorm(width, bias=use_bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width, bias=use_bias)
        self.mlp = MLP(config)

    def forward(self, x):
        """Apply the attention and MLP sub-layers, each wrapped in a residual connection."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        
        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss


In [2]:
import os

# Build the base model from the config defined above
gpt2_model = GPT(config)
# Save the model weights
model_save_path = '/Users/wangaijun/pythoncode/github/model/test_lora_model/gpt2_trained_model.pth'
# torch.save does not create missing parent directories, so create them first
# instead of failing on a fresh checkout.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(gpt2_model.state_dict(), model_save_path)
print(f"Model weights saved to {model_save_path}")

gpt2_model

number of parameters: 123.59M
Model weights saved to /Users/wangaijun/pythoncode/github/model/test_lora_model/gpt2_trained_model.pth


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_h

# 2 选择对哪一层进行 LoRA 训练

In [3]:
# Print the qualified name of every sub-module of the base model
for module_name, _submodule in gpt2_model.named_modules():
    print(module_name)


transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.gelu
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.gelu
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.attn_dropout
transformer.h.2.attn.resid_dropout
transformer.h.2.ln_2
transformer.h.2.mlp
transformer.h.2.mlp.c_fc
transformer.h.2.m

In [4]:
# Print only the names of the multi-head attention sub-modules
for module_name, submodule in gpt2_model.named_modules():
    if not isinstance(submodule, CausalSelfAttention):
        continue
    print(module_name)

transformer.h.0.attn
transformer.h.1.attn
transformer.h.2.attn
transformer.h.3.attn
transformer.h.4.attn
transformer.h.5.attn
transformer.h.6.attn
transformer.h.7.attn
transformer.h.8.attn
transformer.h.9.attn
transformer.h.10.attn
transformer.h.11.attn


In [5]:
import torch
# Alias the base model under the generic name `model` used by the lookup code in this cell
model=gpt2_model
def find_c_attn_layers(model, prefix=""):
    """Collect ``(dotted_name, module)`` pairs whose dotted name contains 'c_attn'.

    The module tree below ``model`` is walked depth-first in registration
    order. ``prefix`` is prepended (dot-separated) to every reported name.
    ``model`` itself is never reported.
    """
    def _walk(parent, parent_path):
        for child_name, child in parent.named_children():
            path = f"{parent_path}.{child_name}" if parent_path else child_name
            if 'c_attn' in path:
                yield path, child
            # Descend into the child's own sub-modules
            yield from _walk(child, path)

    return list(_walk(model, prefix))

# `model` is assumed to be the large model instance
c_attn_layers = find_c_attn_layers(model)

for layer_name, layer in c_attn_layers:
    print(f"Found .c_attn layer: {layer_name}")
    # Fetch the weight tensor and, if the layer has one, the bias tensor
    weight = layer.weight.data
    bias_param = getattr(layer, 'bias', None)
    bias = None if bias_param is None else bias_param.data
    # Print (or modify) the weight and bias
    print(f"Weight shape of {layer_name}: {weight.shape}")
    if bias is not None:
        print(f"Bias shape of {layer_name}: {bias.shape}")

    # The weight and bias could be modified here, for example:
    # layer.weight.data = new_weight
    # layer.bias.data = new_bias if new_bias is not None else layer.bias.data

Found .c_attn layer: transformer.h.0.attn.c_attn
Weight shape of transformer.h.0.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.1.attn.c_attn
Weight shape of transformer.h.1.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.2.attn.c_attn
Weight shape of transformer.h.2.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.3.attn.c_attn
Weight shape of transformer.h.3.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.4.attn.c_attn
Weight shape of transformer.h.4.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.5.attn.c_attn
Weight shape of transformer.h.5.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.6.attn.c_attn
Weight shape of transformer.h.6.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.7.attn.c_attn
Weight shape of transformer.h.7.attn.c_attn: torch.Size([2304, 768])
Found .c_attn layer: transformer.h.8.attn.c_attn
Weight 

# 3 改造 attention，注入 LoRA

In [6]:
import torch.nn as nn
import torch

class CausalSelfAttentionLora(CausalSelfAttention):
    """Causal self-attention with LoRA adapters on the query and value projections.

    The fused QKV projection inherited from the parent stays in place. For Q
    and V a low-rank update ``B(A(x))`` scaled by ``alpha / r`` is added to
    the projection output. K is left untouched.

    Args:
        config: model configuration, forwarded to ``CausalSelfAttention``.
        r: rank of the low-rank update.
        alpha: LoRA scaling numerator; the update is multiplied by ``alpha / r``.
    """

    def __init__(self, config, r=4, alpha=4.0):
        super().__init__(config)
        self.r = r
        self.alpha = alpha
        self.n_embd = config.n_embd
        # Alias for the fused QKV layer that LoRA is applied to.
        # It is the same module object as self.c_attn, not a copy.
        self.c_attn_original = self.c_attn

        # New A (down-projection) and B (up-projection) matrices,
        # created for Q and V only, not for K.
        self.q_adapter_a = nn.Linear(self.n_embd, r, bias=False)
        self.v_adapter_a = nn.Linear(self.n_embd, r, bias=False)
        self.q_adapter_b = nn.Linear(r, self.n_embd, bias=False)
        self.v_adapter_b = nn.Linear(r, self.n_embd, bias=False)

        # A must be non-zero and B zero. B = 0 makes the initial update vanish,
        # so the layer starts out identical to the parent. A != 0 lets B
        # receive a gradient. With both at zero, both gradients are zero and
        # the adapters could never train. std=0.02 matches the init the model
        # uses for its other Linear layers.
        nn.init.normal_(self.q_adapter_a.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.v_adapter_a.weight, mean=0.0, std=0.02)
        nn.init.zeros_(self.q_adapter_b.weight)
        nn.init.zeros_(self.v_adapter_b.weight)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, time, n_embd); the output has the same shape."""
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # Frozen projection output, (batch, seq_len, 3 * n_embd), split into Q, K, V
        q, k, v = self.c_attn_original(x).split(self.n_embd, dim=2)

        # Add the scaled low-rank updates to Q and V
        scaling = self.alpha / self.r
        q = q + scaling * self.q_adapter_b(self.q_adapter_a(x))
        v = v + scaling * self.v_adapter_b(self.v_adapter_a(x))

        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # Output projection. Return the attention output, not the raw QKV
        # projection: the enclosing block adds this result to a (B, T, C) residual.
        y = self.resid_dropout(self.c_proj(y))
        return y

In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
class config:
    # Hyperparameters of the LoRA model built from the classes in this cell.
    # NOTE(review): n_layer and n_head differ from the 12-layer / 12-head config
    # whose weights were saved to disk earlier in this notebook. Confirm the
    # smaller values are intentional, because loading that checkpoint with
    # strict=False would then use only part of it and split heads differently.
    block_size = 1024    # rows of the position-embedding table (max sequence length)
    vocab_size = 50304   # rows of the token-embedding table
    n_layer = 4          # number of transformer blocks
    n_head = 8           # attention heads per block
    n_embd = 768         # embedding width; must be divisible by n_head
    dropout = 0.0        # probability used by every nn.Dropout
    bias = False         # whether Linear / LayerNorm layers get a bias term
    
class MLP(nn.Module):
    """Feed-forward sub-layer: Linear (4x wider) -> GELU -> Linear -> Dropout."""

    def __init__(self, config):
        super().__init__()
        n_in = config.n_embd
        n_hidden = 4 * n_in
        # Keep the registration order c_fc, gelu, c_proj, dropout.
        self.c_fc = nn.Linear(n_in, n_hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(n_hidden, n_in, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Run ``x`` through the four sub-layers in order and return the result."""
        hidden = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))

class CausalSelfAttentionLora(CausalSelfAttention):
    """Causal self-attention whose Q and V projections get an added LoRA update.

    For Q and V the output of the inherited fused projection is increased by
    ``(alpha / r) * B(A(x))``. K is used unchanged.
    """

    def __init__(self, config, r=4, alpha=4.0):
        super().__init__(config)
        self.r = r
        self.alpha = alpha
        self.n_embd = config.n_embd

        # Second name for the fused QKV layer LoRA is applied to (same object as self.c_attn)
        self.c_attn_original = self.c_attn

        # Rank-r A and B matrices, created for Q and V only (not K)
        self.q_adapter_a = nn.Linear(self.n_embd, r, bias=False)
        self.v_adapter_a = nn.Linear(self.n_embd, r, bias=False)
        self.q_adapter_b = nn.Linear(r, self.n_embd, bias=False)
        self.v_adapter_b = nn.Linear(r, self.n_embd, bias=False)

        # A gets random normal values, B starts at zero
        for down_proj in (self.q_adapter_a, self.v_adapter_a):
            nn.init.normal_(down_proj.weight)
        for up_proj in (self.q_adapter_b, self.v_adapter_b):
            nn.init.zeros_(up_proj.weight)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, time, n_embd); the output has the same shape."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        head_dim = C // self.n_head
        scale = self.alpha / self.r

        # Fused projection, (batch, seq_len, 3 * n_embd), split into Q, K, V
        q, k, v = self.c_attn_original(x).split(self.n_embd, dim=2)
        # Add the scaled low-rank updates to Q and V
        q = q + scale * self.q_adapter_b(self.q_adapter_a(x))
        v = v + scale * self.v_adapter_b(self.v_adapter_a(x))

        # (B, T, C) -> (B, nh, T, hs) for each of Q, K, V
        q, k, v = (t.view(B, T, self.n_head, head_dim).transpose(1, 2) for t in (q, k, v))

        if self.flash:
            # Fused kernel; attention dropout only in training mode
            drop_p = self.dropout if self.training else 0
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            # Manual attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            scores = scores.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Merge the heads again and apply the output projection
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))

class Block(nn.Module):
    """Transformer block whose attention sub-layer is the LoRA-enabled variant."""

    def __init__(self, config):
        super().__init__()
        width, use_bias = config.n_embd, config.bias
        self.ln_1 = nn.LayerNorm(width, bias=use_bias)
        self.attn = CausalSelfAttentionLora(config)
        self.ln_2 = nn.LayerNorm(width, bias=use_bias)
        self.mlp = MLP(config)

    def forward(self, x):
        """Apply LayerNorm -> attention and LayerNorm -> MLP, each added back to its input."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        
        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss


In [8]:
# Build the LoRA-enabled model from the `GPT` class and `config` in scope, then display it
lora_model=GPT(config)
lora_model

number of parameters: 67.00M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttentionLora(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (c_attn_original): Linear(in_features=768, out_features=2304, bias=False)
          (q_adapter_a): Linear(in_features=768, out_features=4, bias=False)
          (v_adapter_a): Linear(in_features=768, out_features=4, bias=False)
          (q_adapter_b): Linear(in_features=4, out_features=768, bias=False)
          (v_adapter_b): Linear(in_features=4, out_features=768, bias=False)
        )
        (ln_2): LayerNorm((768,), eps=1

# 4 冻结原始模型参数

In [9]:

def freeze_model(model):
    """Turn off gradients for every parameter whose name does not contain 'adapter'.

    Parameters with 'adapter' in their name are left exactly as they are.
    """
    non_adapter_params = (
        param for param_name, param in model.named_parameters()
        if 'adapter' not in param_name
    )
    for param in non_adapter_params:
        param.requires_grad_(False)

# Load pre-trained weights. map_location='cpu' keeps the load independent of the
# device the checkpoint was written from. weights_only=True restricts unpickling
# to tensor data.
pretrained_state_dict = torch.load(model_save_path, map_location='cpu', weights_only=True)
# strict=False is needed because the checkpoint has no LoRA adapter tensors.
# It also hides every other mismatch, so report those explicitly.
load_result = lora_model.load_state_dict(pretrained_state_dict, strict=False)
missing_base_keys = [
    key for key in load_result.missing_keys
    # Adapter weights are new. `c_attn_original` is only a second name for `c_attn`.
    if 'adapter' not in key and 'c_attn_original' not in key
]
if missing_base_keys:
    print(f"WARNING: {len(missing_base_keys)} base weights were not found in the checkpoint, e.g. {missing_base_keys[0]}")
if load_result.unexpected_keys:
    print(f"WARNING: {len(load_result.unexpected_keys)} checkpoint tensors were not used by the model, e.g. {load_result.unexpected_keys[0]}")
# Freeze all layers except LoRA adapters
freeze_model(lora_model)
lora_model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttentionLora(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (c_attn_original): Linear(in_features=768, out_features=2304, bias=False)
          (q_adapter_a): Linear(in_features=768, out_features=4, bias=False)
          (v_adapter_a): Linear(in_features=768, out_features=4, bias=False)
          (q_adapter_b): Linear(in_features=4, out_features=768, bias=False)
          (v_adapter_b): Linear(in_features=4, out_features=768, bias=False)
        )
        (ln_2): LayerNorm((768,), eps=1

In [10]:
# Report how many parameters are still trainable after the freeze
param_sizes = [(p.numel(), p.requires_grad) for p in lora_model.parameters()]
all_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
print(f"Trainable parameters: {trainable_params} out of {all_params} total parameters. {trainable_params/all_params*100}%")

Trainable parameters: 49152 out of 67787520 total parameters. 0.07250892199626126%


In [11]:
# List every parameter that still requires gradients; only LoRA adapters should appear
print("\nTrainable Parameters:")
trainable = ((n, p) for n, p in lora_model.named_parameters() if p.requires_grad)
for name, param in trainable:
    print(f"{name} requires_grad: {param.requires_grad}")



Trainable Parameters:
transformer.h.0.attn.q_adapter_a.weight requires_grad: True
transformer.h.0.attn.v_adapter_a.weight requires_grad: True
transformer.h.0.attn.q_adapter_b.weight requires_grad: True
transformer.h.0.attn.v_adapter_b.weight requires_grad: True
transformer.h.1.attn.q_adapter_a.weight requires_grad: True
transformer.h.1.attn.v_adapter_a.weight requires_grad: True
transformer.h.1.attn.q_adapter_b.weight requires_grad: True
transformer.h.1.attn.v_adapter_b.weight requires_grad: True
transformer.h.2.attn.q_adapter_a.weight requires_grad: True
transformer.h.2.attn.v_adapter_a.weight requires_grad: True
transformer.h.2.attn.q_adapter_b.weight requires_grad: True
transformer.h.2.attn.v_adapter_b.weight requires_grad: True
transformer.h.3.attn.q_adapter_a.weight requires_grad: True
transformer.h.3.attn.v_adapter_a.weight requires_grad: True
transformer.h.3.attn.q_adapter_b.weight requires_grad: True
transformer.h.3.attn.v_adapter_b.weight requires_grad: True


In [12]:
# List every parameter that no longer requires gradients
print("\nFrozen Parameters:")
frozen = ((n, p) for n, p in lora_model.named_parameters() if not p.requires_grad)
for name, param in frozen:
    print(f"{name} requires_grad: {param.requires_grad}")


Frozen Parameters:
transformer.wte.weight requires_grad: False
transformer.wpe.weight requires_grad: False
transformer.h.0.ln_1.weight requires_grad: False
transformer.h.0.attn.c_attn.weight requires_grad: False
transformer.h.0.attn.c_proj.weight requires_grad: False
transformer.h.0.ln_2.weight requires_grad: False
transformer.h.0.mlp.c_fc.weight requires_grad: False
transformer.h.0.mlp.c_proj.weight requires_grad: False
transformer.h.1.ln_1.weight requires_grad: False
transformer.h.1.attn.c_attn.weight requires_grad: False
transformer.h.1.attn.c_proj.weight requires_grad: False
transformer.h.1.ln_2.weight requires_grad: False
transformer.h.1.mlp.c_fc.weight requires_grad: False
transformer.h.1.mlp.c_proj.weight requires_grad: False
transformer.h.2.ln_1.weight requires_grad: False
transformer.h.2.attn.c_attn.weight requires_grad: False
transformer.h.2.attn.c_proj.weight requires_grad: False
transformer.h.2.ln_2.weight requires_grad: False
transformer.h.2.mlp.c_fc.weight requires_grad:

# 5 模型训练

In [13]:
import zstandard as zstd
import json
import io
import os
import glob
import os
import torch
from tqdm import tqdm
import tiktoken
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset, IterableDataset
from typing import Any, Optional, Tuple

# Generator that reads a single .jsonl.zst file
def read_jsonl_zst(file_path):
    """Yield one parsed JSON value per non-blank line of a zstd-compressed JSONL file.

    Args:
        file_path: path of the ``.jsonl.zst`` file to read.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor()
        # The with-block closes the text wrapper, and through it the
        # decompression stream, even if the consumer stops iterating early.
        with io.TextIOWrapper(dctx.stream_reader(fh), encoding='utf-8') as text_stream:
            for line in text_stream:
                # Skip blank lines instead of crashing in json.loads.
                if not line.strip():
                    continue
                yield json.loads(line)

# Generator that reads every .jsonl.zst file in turn
def read_all_jsonl_zst(files):
    """Yield the records of each path in ``files``, one file after another."""
    for path in files:
        for record in read_jsonl_zst(path):
            yield record


def process(example):
    """Tokenize ``example['text']`` and append the end-of-text token.

    Uses the module-level encoder ``enc``. Returns a dict with the token ids
    under ``'ids'`` and their count under ``'len'``.
    """
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return dict(ids=token_ids, len=len(token_ids))

class StreamingParquetDataset(torch.utils.data.IterableDataset):
    """Stream ``(input, target)`` token windows from .jsonl.zst files.

    Each record is tokenized lazily. Every document is cut into
    non-overlapping windows of ``block_size`` tokens, and the target window is
    the input window shifted one token to the right. ``split`` and
    ``num_proc`` are stored but not used by this class.
    """

    def __init__(self, jsonl_zst_files, split, block_size, num_proc=14):
        self.data_files = jsonl_zst_files
        self.split = split
        self.block_size = block_size
        self.num_proc = num_proc
        # Lazy record stream over all files, tokenized on the fly by `process`
        self.dataset = IterableDataset.from_generator(lambda: read_all_jsonl_zst(jsonl_zst_files))
        self.tokenized = self.dataset.map(process)

    def __iter__(self):
        for record in self.tokenized:
            tokens = record['ids']
            span = self.block_size
            # A window is emitted only when a full shifted target exists,
            # i.e. when span + 1 tokens are available from `start`.
            for start in range(0, len(tokens) - span, span):
                window = tokens[start:start + span + 1]
                inputs = torch.tensor(window[:-1], dtype=torch.int64)
                targets = torch.tensor(window[1:], dtype=torch.int64)
                yield inputs, targets

# Example helper: fetch batches one at a time
def get_batch(loader, device,device_type):
    """Yield each ``(x, y)`` pair from ``loader`` moved onto ``device``.

    The copy is requested as non-blocking only when ``device_type`` is 'cuda'.
    """
    non_blocking = device_type == 'cuda'
    for inputs, targets in loader:
        yield (inputs.to(device, non_blocking=non_blocking),
               targets.to(device, non_blocking=non_blocking))
        
def get_train_data_from_stream_data(data_path_root,enc,batch_size=32,block_size=128):
    """Build streaming train / validation loaders over ``*.jsonl.zst`` files.

    The files directly inside ``data_path_root`` are sorted by name. The last
    one becomes the validation set and all others the training set.

    Args:
        data_path_root: directory containing the ``.jsonl.zst`` files.
        enc: accepted for interface compatibility; not used by this function.
        batch_size: number of windows per batch; pick to fit the hardware.
        block_size: tokens per window; pick to fit the model.

    Returns:
        ``(train_loader, val_loader)``.

    Raises:
        ValueError: if fewer than two files are found, since a split would
            otherwise leave the training set empty.
    """
    # Find all .jsonl.zst files. glob gives no ordering guarantee, so sort to
    # make the train/validation split reproducible.
    jsonl_zst_files = sorted(glob.glob(f'{data_path_root}/*.jsonl.zst'))
    if len(jsonl_zst_files) < 2:
        raise ValueError(
            f"Need at least 2 .jsonl.zst files in {data_path_root!r} for a "
            f"train/validation split, found {len(jsonl_zst_files)}"
        )

    # Create the datasets
    train_dataset = StreamingParquetDataset(jsonl_zst_files[:-1], 'train', block_size)
    val_dataset = StreamingParquetDataset(jsonl_zst_files[-1:], 'val', block_size)

    # Create the data loaders. Pinned memory only helps host-to-GPU copies,
    # so request it only when CUDA is present.
    pin_memory = torch.cuda.is_available()
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin_memory)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin_memory)
    return train_loader, val_loader
    
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)
enc = tiktoken.get_encoding("gpt2")

data_path_root = "/Users/wangaijun/pythoncode/github/data/text/chinese-c4"
batch_size=5
max_seq_len=512
train_loader,val_loader=get_train_data_from_stream_data(data_path_root,enc,batch_size=batch_size,block_size=max_seq_len)

# Batches are moved to `device` before the forward pass, so the model has to
# be on the same device.
lora_model.to(device)

# adamw optimizer
learning_rate = 6e-4 # max learning rate
max_iters = 600000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# Optimise only the parameters that still require gradients, and actually
# apply the weight_decay declared above.
trainable_parameters = [p for p in lora_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_parameters, lr=learning_rate, betas=(beta1,beta2), weight_decay=weight_decay)


In [14]:
# Smoke test: run four optimisation steps. Before each step, print a slice of a
# frozen weight (expected to stay constant) and of a LoRA weight (expected to change).
for i, (x, y) in enumerate(get_batch(train_loader, device, device_type)):
    optimizer.zero_grad()  # reset the gradients
    print(x.shape, y.shape)
    print(f"embedding weight   step {i}",lora_model.transformer.wte.weight[:3,:5])
    print(f"q_adapter_a weight step {i}",lora_model.transformer.h[0].attn.q_adapter_a.weight[:3,:5])
    logits, loss = lora_model(x, y)
    loss.backward()
    # Apply the gradient clipping configured by grad_clip (0.0 disables it)
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(
            [p for p in lora_model.parameters() if p.requires_grad], grad_clip
        )
    optimizer.step()
    if i == 3:
        break  # stop after four batches

torch.Size([5, 512]) torch.Size([5, 512])
embedding weight   step 0 tensor([[-0.0147,  0.0138,  0.0063, -0.0032,  0.0002],
        [-0.0081, -0.0119, -0.0373,  0.0124,  0.0242],
        [ 0.0131, -0.0033,  0.0047, -0.0069, -0.0034]])
q_adapter_a weight step 0 tensor([[ 0.0104, -0.0040,  0.0067, -0.0158,  0.0230],
        [ 0.0229, -0.0011, -0.0151,  0.0403,  0.0193],
        [-0.0211,  0.0218, -0.0109, -0.0043, -0.0091]],
       grad_fn=<SliceBackward0>)
torch.Size([5, 512]) torch.Size([5, 512])
embedding weight   step 1 tensor([[-0.0147,  0.0138,  0.0063, -0.0032,  0.0002],
        [-0.0081, -0.0119, -0.0373,  0.0124,  0.0242],
        [ 0.0131, -0.0033,  0.0047, -0.0069, -0.0034]])
q_adapter_a weight step 1 tensor([[ 0.0098, -0.0034,  0.0061, -0.0164,  0.0224],
        [ 0.0235, -0.0005, -0.0145,  0.0397,  0.0187],
        [-0.0205,  0.0224, -0.0103, -0.0037, -0.0085]],
       grad_fn=<SliceBackward0>)
torch.Size([5, 512]) torch.Size([5, 512])
embedding weight   step 2 tensor([[-0.01