# LoRA
LoRA适合大模型的fine-tune

小模型：通常冻结部分参数进行微调，或直接全量微调。

低秩矩阵分解：
* 假设预训练模型的权重矩阵  W  为一个固定的大矩阵。在 LoRA 中，模型不会直接更新  W ，而是保持  W  不变，将权重的更新量  $\Delta W$  表示为两个低秩矩阵的乘积：

$W + \Delta W = W + A \cdot B$

其中：
* A  和  B  是小的可训练矩阵，秩  r  满足  $r \ll d$ ， d  是矩阵的维度。
* W  是预训练的固定权重，保持冻结状态。
* $\Delta W = A \cdot B$ 是通过 LoRA 学习的低秩调整项。

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALinear(nn.Module):
    """Linear layer with an optional low-rank (LoRA) update on its weight.

    The wrapped ``nn.Linear`` holds the base weight ``W``. When ``rank > 0``
    the base weight is frozen (the bias is left trainable) and two trainable
    factors are created: ``lora_b`` of shape ``(out_features, rank)`` and
    ``lora_a`` of shape ``(rank, in_features)``. The effective weight used in
    ``forward`` is then ``W + (lora_b @ lora_a) * scale`` with
    ``scale = lora_alpha / rank``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        merge: If True (and ``rank > 0``), add the LoRA update to the base
            weight in ``forward``; if False, only the base layer is used.
        rank: Rank of the low-rank update. A value <= 0 disables LoRA: no
            LoRA parameters are created and the base weight stays trainable.
        lora_alpha: Numerator of the scaling factor ``lora_alpha / rank``.
        dropout: Dropout probability applied to the layer output; a value
            <= 0 disables dropout.
    """

    def __init__(self, in_features, out_features, merge, rank=16, lora_alpha=16, dropout=0.5):
        super(LoRALinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.merge = merge
        self.rank = rank
        self.lora_alpha = lora_alpha

        self.linear = nn.Linear(in_features, out_features)
        if rank > 0:
            self.lora_b = nn.Parameter(torch.zeros(out_features, rank))
            self.lora_a = nn.Parameter(torch.zeros(rank, in_features))
            self.scale = self.lora_alpha / self.rank
            # Only the low-rank factors should learn; freeze the base weight.
            self.linear.weight.requires_grad = False
        else:
            # LoRA disabled: keep the attributes defined so the module has a
            # consistent shape, but register no trainable LoRA parameters.
            self.register_parameter("lora_b", None)
            self.register_parameter("lora_a", None)
            self.scale = 0.0

        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

        self.initial_weights()

    def initial_weights(self):
        """Initialise the LoRA factors; a no-op when LoRA is disabled.

        ``lora_b`` starts at zero so the initial update ``lora_b @ lora_a`` is
        zero and the layer initially reproduces the base linear layer.
        """
        if self.rank <= 0:
            # No LoRA parameters exist for rank <= 0; nothing to initialise.
            return
        nn.init.kaiming_normal_(self.lora_a, a=0.01)
        nn.init.zeros_(self.lora_b)

    def forward(self, x):
        """Apply the (optionally LoRA-adjusted) linear map, then dropout."""
        if self.rank > 0 and self.merge:
            delta_w = (self.lora_b @ self.lora_a) * self.scale
            output = F.linear(x, self.linear.weight + delta_w, self.linear.bias)
        else:
            output = self.linear(x)
        return self.dropout(output)

# GPT LoRA

In [None]:
import torch
import torch.nn as nn

class LoRA(nn.Module):
    """Low-rank adapter that maps ``x`` to ``x @ A @ B``.

    ``A`` has shape ``(original_dim, rank)`` and ``B`` has shape
    ``(rank, original_dim)``. ``B`` is zero-initialised, so a freshly
    constructed adapter outputs zeros.
    """

    def __init__(self, original_dim, rank):
        super().__init__()
        self.rank = rank
        down_shape = (original_dim, rank)
        up_shape = (rank, original_dim)
        # Down-projection factor A, then up-projection factor B.
        self.A = nn.Parameter(torch.randn(*down_shape))
        self.B = nn.Parameter(torch.randn(*up_shape))
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform initialise ``A`` and zero out ``B``."""
        nn.init.xavier_uniform_(self.A)
        nn.init.constant_(self.B, 0.0)

    def forward(self, x):
        """Project ``x`` through ``A`` and then through ``B``."""
        projected = torch.matmul(x, self.A)
        return torch.matmul(projected, self.B)
    
class LoRAAttention(nn.Module):
    """Simplified self-attention whose Q/K/V projections are adapted with LoRA.

    The base projections ``W_q``, ``W_k`` and ``W_v`` are frozen; only the
    ``lora_q``, ``lora_k`` and ``lora_v`` adapters receive gradients. Each
    adapter's output is added to the corresponding base projection.

    Args:
        d_model: Feature size of the input and of each projection.
        n_heads: Number of heads; used only to derive ``head_dim``.
        rank: Rank passed to each ``LoRA`` adapter.
    """

    def __init__(self, d_model, n_heads, rank):
        super(LoRAAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.head_dim = d_model // n_heads

        # Base projection weights (frozen)
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        for base_proj in (self.W_q, self.W_k, self.W_v):
            # Freeze weight and bias so that only the adapters are trained.
            base_proj.requires_grad_(False)

        # LoRA weights (trainable)
        self.lora_q = LoRA(d_model, rank)
        self.lora_k = LoRA(d_model, rank)
        self.lora_v = LoRA(d_model, rank)

    def forward(self, x):
        """Return the attention output for ``x`` (last dimension ``d_model``)."""
        # Base (frozen) projections
        q = self.W_q(x)
        k = self.W_k(x)
        v = self.W_v(x)

        # Add the LoRA adjustments
        q = q + self.lora_q(x)
        k = k + self.lora_k(x)
        v = v + self.lora_v(x)

        # Self-attention (implementation details omitted).
        # NOTE(review): q/k/v are not split into heads here, yet the scores are
        # scaled by sqrt(head_dim) rather than sqrt(d_model) — confirm intent.
        scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = torch.softmax(scores, dim=-1)
        output = attn_weights @ v
        return output
    
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Load the pretrained model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    r=8,                     # LoRA rank
    lora_alpha=16,           # Scaling factor
    # GPT-2 has no "q_proj"/"v_proj" modules: its query/key/value projection
    # is a single fused layer named "c_attn", so that is the layer to adapt.
    target_modules=["c_attn"],
    # GPT-2's c_attn is a Conv1D layer that stores its weight as
    # (fan_in, fan_out), which PEFT needs to be told about.
    fan_in_fan_out=True,
    lora_dropout=0.1,        # Dropout probability on the LoRA path
    bias="none",             # Which bias terms to train ("none": no biases)
    task_type="CAUSAL_LM"    # Task type
)

# Wrap the model with LoRA adapters
lora_model = get_peft_model(model, lora_config)

# Print the number of trainable parameters
print("Trainable parameters:", sum(p.numel() for p in lora_model.parameters() if p.requires_grad))

# Dataset and training (data loading and training loop omitted)