In [ ]:
!unset LD_LIBRARY_PATH

In [ ]:
!pip show torch

In [ ]:
!pip show cudatoolkit

In [ ]:
import os
import math
import random
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn import datasets, linear_model, metrics, model_selection, preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

### Implementing Attention

In [ ]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the projected query/key/value vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_out = d_out
        # Each projection is a [d_in, d_out] matrix sampled from U[0, 1).
        # The query/key/value creation order is kept so seeded runs match.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Attend over the sequence axis.

        Args:
            x: Tensor of shape [batch_size, seq_length, d_in].

        Returns:
            Context vectors of shape [batch_size, seq_length, d_out].
        """
        q = x @ self.W_query
        k = x @ self.W_key
        v = x @ self.W_value
        # Pairwise query/key scores: [batch_size, seq_length, seq_length].
        scores = torch.bmm(q, k.transpose(1, 2))
        # Scale by sqrt(d_out) before normalising over the key axis.
        scale = k.shape[-1] ** 0.5
        weights = torch.softmax(scores / scale, dim=-1)
        return weights @ v

In [ ]:
class CausalAttention(nn.Module):
    """Single-head scaled dot-product attention with a causal (look-back only) mask.

    Args:
        d_in: Size of the last dimension of the q/k/v inputs.
        d_out: Size of the projected query/key/value vectors.
        context_length: Largest sequence length the stored mask supports.
        dropout: Dropout probability applied to the attention weights.
        qkv_bias: Whether the three projection layers carry a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        # Strictly upper-triangular matrix of ones: entry (i, j) is 1 when key j
        # lies in the future of query i. Stored as a buffer so it follows the
        # module across devices without being trained.
        self.register_buffer(
           'mask',
           torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, q, k, v):
        """Compute causally masked attention.

        Args:
            q, k, v: Tensors of shape [batch_size, num_tokens, d_in]. The mask is
                sliced with the query length only, so k and v need the same
                sequence length as q.

        Returns:
            Context vectors of shape [batch_size, num_tokens, d_out].
        """
        num_tokens = q.shape[1]
        keys = self.W_key(k)
        queries = self.W_query(q)
        values = self.W_value(v)
        attn_scores = queries @ keys.transpose(1, 2)
        # Future positions get -inf so they receive zero weight after softmax.
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        # Normalise over the KEY axis (last dim). Normalising over dim=1 (the
        # query axis) makes rows not sum to 1 and lets later queries influence
        # the weights of earlier positions.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = torch.matmul(attn_weights, values)
        return context_vec

In [ ]:
class MultiheadAttention(nn.Module):
    """Run several independent CausalAttention heads and concatenate their outputs.

    Args:
        d_in: Size of the last dimension of the q/k/v inputs.
        d_out: Output size of each individual head.
        context_length: Largest sequence length each head's mask supports.
        dropout: Dropout probability used inside every head.
        num_heads: Number of heads to create.
        qkv_bias: Whether each head's projections carry a bias term.
    """

    def __init__(self, d_in, d_out, context_length,dropout, num_heads, qkv_bias=False):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(
                CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
            )
        self.heads = nn.ModuleList(head_list)

    def forward(self, q, k, v):
        """Return the per-head outputs joined along the last dimension.

        The heads are concatenated rather than summed, so the result has
        num_heads * d_out features and each head keeps its own slice.
        """
        per_head = [head(q, k, v) for head in self.heads]
        return torch.cat(per_head, dim=-1)

### Implementing Transformer

In [ ]:
class TransformerEncoderLayer(nn.Module):
    """Post-norm transformer block: attention, residual + norm, MLP, residual + norm.

    NOTE(review): the attention module is built from CausalAttention heads, so
    this "encoder" layer is causally masked - confirm that is intended.

    Args:
        d_in: Model width; also the output width of the block.
        context_length: Largest sequence length the attention mask supports.
        num_heads: Number of attention heads; each gets d_in // num_heads features.
        dropout: Dropout probability for attention weights and residual branches.
        qkv_bias: Whether the attention projections carry a bias term.
    """

    def __init__(self, d_in, context_length, num_heads, dropout, qkv_bias=False):
        super().__init__()
        head_dim = d_in // num_heads
        self.att = MultiheadAttention(d_in, head_dim, context_length, dropout, num_heads, qkv_bias)
        self.norm1 = nn.LayerNorm(d_in)
        self.norm2 = nn.LayerNorm(d_in)
        self.drop_resid = nn.Dropout(dropout)
        # Position-wise MLP with a 4x hidden expansion.
        self.ff = nn.Sequential(
            nn.Linear(d_in, d_in * 4),
            nn.ReLU(),
            nn.Linear(d_in * 4, d_in),
        )

    def forward(self, x):
        """Apply the block to x and return a tensor of the same shape."""
        attn_out = self.drop_resid(self.att(x, x, x))
        hidden = self.norm1(attn_out + x)
        ff_out = self.drop_resid(self.ff(hidden))
        return self.norm2(ff_out + hidden)

In [ ]:
class TransformerDecoderLayer(nn.Module):
    """Post-norm decoder block: self-attention, cross-attention, then an MLP.

    NOTE(review): both attention modules are built from CausalAttention heads,
    so the cross-attention over enc_output is causally masked too and needs
    enc_output to have the same sequence length as x - confirm this is intended.

    Args:
        d_in: Model width; also the output width of the block.
        context_length: Largest sequence length the attention masks support.
        num_heads: Number of attention heads; each gets int(d_in / num_heads) features.
        dropout: Dropout probability for attention weights and residual branches.
        qkv_bias: Whether the attention projections carry a bias term.
    """

    def __init__(self, d_in, context_length, num_heads, dropout, qkv_bias=False):
        super().__init__()
        head_dim = int(d_in / num_heads)
        self.att1 = MultiheadAttention(d_in, head_dim, context_length, dropout, num_heads, qkv_bias)
        self.att2 = MultiheadAttention(d_in, head_dim, context_length, dropout, num_heads, qkv_bias)
        self.norm1 = nn.LayerNorm(d_in)
        self.norm2 = nn.LayerNorm(d_in)
        self.norm3 = nn.LayerNorm(d_in)
        self.drop_resid = nn.Dropout(dropout)
        # Position-wise MLP with a 4x hidden expansion.
        self.ff = nn.Sequential(
            nn.Linear(d_in, d_in * 4),
            nn.ReLU(),
            nn.Linear(d_in * 4, d_in),
        )

    def forward(self, x, enc_output):
        """Apply the block.

        Args:
            x: Decoder-side hidden states.
            enc_output: Encoder output used as keys and values in the second attention.

        Returns:
            Tensor with the same shape as x.
        """
        # 1) Self-attention over the decoder sequence.
        self_attended = self.drop_resid(self.att1(x, x, x))
        hidden = self.norm1(self_attended + x)
        # 2) Attention with decoder queries and encoder keys/values.
        cross_attended = self.drop_resid(self.att2(hidden, enc_output, enc_output))
        hidden = self.norm2(cross_attended + hidden)
        # 3) Feed-forward network.
        ff_out = self.drop_resid(self.ff(hidden))
        return self.norm3(ff_out + hidden)

### Implementing GPT-2

**GPT2结构**\
词表大小：50257\
词嵌入层：（50257,768）词表索引转换为768维度向量\
位置索引：（1024,768）最大token限制1024，每一个token一个768维位置索引\
Dropout：对embedding做dropout，p=0.1\
Transformer Decoder块12个\
层归一化 768\
Linear （768，768×3）\
将Linear层的输出分割成3个部分，作为注意力层的三个输入\
Attention (768,768/12) 12个注意力头\
拼接12个注意力头 12×768/12 ——> 768\
线性层（768,768）\
Dropout(0.1)\
residual位置，加入初始输入值\
层归一化\
线性层（768,768×4）\
GELU()\
线性层（768×4,768）\
Dropout(0.1)\
到此decoder块结束\
在所有decoder块之后有一个层归一化、一个线性层、接softmax输出




In [ ]:
class MoE_Expert(nn.Module):
    """One mixture-of-experts feed-forward expert with a gated hidden layer.

    Computes down(GELU(gate(x)) * up(x)) followed by dropout. All three linear
    layers are bias-free and the hidden width is 4 * d_in.

    Args:
        d_in: Input and output feature size.
        dropout: Dropout probability applied to the expert output.
    """

    def __init__(self,d_in,dropout=0.1):
        super().__init__()
        hidden = 4 * d_in
        self.up = nn.Linear(d_in, hidden, bias=False)
        self.gate = nn.Linear(d_in, hidden, bias=False)
        self.down = nn.Linear(hidden, d_in, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.act_fn = nn.GELU()

    def forward(self,x):
        """Return the expert output; the last dimension stays d_in."""
        gated = self.act_fn(self.gate(x))
        projected = self.down(gated * self.up(x))
        return self.dropout(projected)

In [ ]:
class MoE_Router(nn.Module):
    """Top-k router: scores every expert and keeps only the k best per token.

    Args:
        d_in: Feature size of the incoming tokens.
        num_experts: Number of experts to score.
        topk: How many experts are kept for each token.
    """

    def __init__(self,d_in,num_experts,topk):
        super().__init__()
        self.topk = topk
        self.ln = nn.Linear(d_in, num_experts)

    def forward(self,x):
        """Route each token.

        Returns:
            weight: Tensor shaped like the router scores (last dim num_experts);
                a softmax over the selected experts, exactly zero elsewhere.
            indices: Indices of the selected experts (last dim topk).
        """
        scores = self.ln(x)
        top_scores, indices = torch.topk(scores, self.topk, dim=-1)
        # Start from -inf everywhere so unselected experts get zero probability.
        sparse_scores = torch.full_like(scores, float('-inf'))
        sparse_scores.scatter_(-1, indices, top_scores)
        weight = torch.softmax(sparse_scores, dim=-1)
        return weight, indices

In [ ]:
class MoE(nn.Module):
    """Sparse mixture-of-experts layer: each token is processed by its top-k experts.

    Args:
        d_in: Feature size of the tokens (input and output).
        num_expers: Number of experts (parameter name kept as-is for callers).
        topk: Number of experts that process each token.
        dropout: Dropout probability used inside every expert.
    """

    def __init__(self,d_in,num_expers,topk,dropout=0.1):
        super().__init__()
        self.router=MoE_Router(d_in,num_expers,topk)
        self.experts=nn.ModuleList(MoE_Expert(d_in,dropout) for _ in range(num_expers))

    def forward(self,x):
        """Return the router-weighted sum of the selected experts' outputs.

        Args:
            x: Tensor whose last dimension is d_in.

        Returns:
            Tensor with the same shape as x.
        """
        weight, indices = self.router(x)
        out = torch.zeros_like(x)
        # Flatten all leading dimensions so tokens can be selected with a 1-D mask.
        x_flat = x.reshape(-1, x.shape[-1])
        weight_flat = weight.reshape(-1, weight.shape[-1])
        for i, expert in enumerate(self.experts):
            # Tokens for which expert i is among the selected top-k.
            mask = (indices == i).any(dim=-1)
            if not mask.any():
                continue
            mask_flat = mask.reshape(-1)
            expert_out = expert(x_flat[mask_flat])
            score = weight_flat[mask_flat, i].unsqueeze(1)
            # expert_out is already [n_selected, d_in]; squeezing dim 1 here
            # would drop the feature axis whenever d_in == 1.
            out[mask] += expert_out * score
        return out

In [ ]:
class GPT2Layer(nn.Module):
    """Pre-norm GPT-2 style block with an optional mixture-of-experts MLP.

    Args:
        d_in: Model width.
        context_length: Largest sequence length the attention mask supports.
        num_heads: Number of attention heads; each gets int(d_in / num_heads) features.
        use_moe: Use a MoE feed-forward layer instead of a dense MLP.
        num_experts: Number of experts when use_moe is true.
        topk: Experts selected per token when use_moe is true.
        dropout: Dropout probability used throughout the block.
        qkv_bias: Whether the per-head attention projections carry a bias term.
    """

    def __init__(self, d_in, context_length=1024, num_heads=12, use_moe=True, num_experts=4, topk=2, dropout=0.1, qkv_bias=False):
        super().__init__()
        self.d_in = d_in
        head_dim = int(d_in / num_heads)

        # Submodules are created in a fixed order so seeded initialisation is reproducible.
        self.drop_emb = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_in)
        self.linear1 = nn.Linear(d_in, d_in * 3)  # fused q/k/v projection
        self.att = MultiheadAttention(d_in, head_dim, context_length, dropout, num_heads, qkv_bias)
        self.drop_resid = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_in, d_in)  # projection applied to the concatenated heads
        self.norm2 = nn.LayerNorm(d_in)

        if not use_moe:
            self.ff = nn.Sequential(
                nn.Linear(d_in, d_in * 4),
                nn.GELU(),
                nn.Linear(d_in * 4, d_in),
                nn.Dropout(dropout),
            )
        else:
            self.ff = MoE(d_in, num_experts, topk, dropout)

    def forward(self, x):
        """Apply the attention and feed-forward sub-blocks, each with a residual add."""
        residual = x
        # Attention branch: dropout -> norm -> fused projection split into q, k, v.
        hidden = self.linear1(self.norm1(self.drop_emb(x)))
        q, k, v = hidden.split(self.d_in, dim=-1)
        attended = self.drop_resid(self.linear2(self.att(q, k, v)))
        x = attended + residual

        # Feed-forward branch on the normalised hidden state.
        return self.ff(self.norm2(x)) + x

In [ ]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Embedding size. Odd sizes are supported; the last sine column
            then has no cosine partner.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=1024):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns, one fewer than the even
        # columns when d_model is odd, so trim the angle table to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer, not a parameter: moves with the module but is never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x plus the encodings of its first x.size(1) positions.

        Args:
            x: Tensor of shape [batch_size, seq_len, d_model] with seq_len <= max_len.
        """
        seq_len = x.size(1)
        # Broadcasting over the batch dimension replaces an explicit expand().
        return x + self.pe[:seq_len, :].unsqueeze(0)

In [ ]:
class GPT2Model(nn.Module):
    """Token-level language model: embedding, positions, GPT2Layer stack, LM head.

    Args:
        num_layers: Number of stacked GPT2Layer blocks.
        vocab_size: Number of rows in the embedding and width of the output logits.
        hidden_size: Model width.
        context_length: Largest supported sequence length.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_layers=12, vocab_size=50257, hidden_size=768, context_length=1024, num_heads=12):
        super().__init__()
        self.hidden_size = hidden_size
        self.emb = nn.Embedding(vocab_size, hidden_size)
        # Every block is built with dropout disabled and bias-free q/k/v projections.
        blocks = [
            GPT2Layer(hidden_size, context_length, num_heads, dropout=0, qkv_bias=False)
            for _ in range(num_layers)
        ]
        self.layers = nn.Sequential(*blocks)
        self.norm = nn.LayerNorm(hidden_size)
        self.ln = nn.Linear(hidden_size, vocab_size)  # projects hidden states to vocabulary logits
        self.pe = PositionalEncoding(d_model=hidden_size, max_len=context_length)

    def forward(self, x):
        """Map token ids to unnormalised logits with a trailing vocab_size dimension."""
        hidden = self.pe(self.emb(x))
        hidden = self.norm(self.layers(hidden))
        return self.ln(hidden)

### Training GPT2

In [ ]:
!pip install snowballstemmer

In [ ]:
import tokenize
import snowballstemmer
import collections

def load_data(path='/bohr/wikipedia-2l1k/v1/wikisent2.txt',max_len=50,vocab_size=10000,max_lines=10000):
    """Read a text file, build a vocabulary and return padded token-id sequences.

    Args:
        path: File to read, one sentence per line.
        max_len: Every sequence is truncated or right-padded to this length.
        vocab_size: Total vocabulary size INCLUDING the two special tokens
            '<UNK>' (id 0) and '<PAD>' (id 1), so every returned id is
            < vocab_size and fits an embedding with vocab_size rows.
        max_lines: Stop after this many lines (keeps experiments fast);
            None reads the whole file.

    Returns:
        (str2idx, idx2str, texts) where texts is a LongTensor of shape
        [num_lines, max_len].

    Side effects:
        Draws a histogram of line lengths with matplotlib and prints the
        shape of the returned tensor.
    """
    stemmer = snowballstemmer.stemmer('english')
    counter = collections.Counter()
    texts = []
    line = []
    with open(path, 'rb') as f:
        # NOTE(review): `tokenize` is Python's source-code tokenizer; it may
        # raise or split oddly on prose that is not valid Python. Consider a
        # plain word tokenizer here.
        for token in tokenize.tokenize(f.readline):
            word = token.string.lower()
            if word == 'utf-8':
                # Skip the encoding marker emitted as the first token.
                continue
            if word == '\n':
                texts.append(line)
                line = []
                if max_lines is not None and len(texts) >= max_lines:
                    break
            else:
                word = stemmer.stemWord(word)
                line.append(word)
                counter[word] += 1

    plt.hist([len(words) for words in texts], bins=20)

    # Reserve ids 0 and 1 for the special tokens and give the remaining
    # vocab_size - 2 ids to the most frequent words. Taking vocab_size words
    # would yield ids up to vocab_size + 1 and overflow the embedding.
    str2idx = {'<UNK>': 0, '<PAD>': 1}
    for word, _ in counter.most_common(max(vocab_size - 2, 0)):
        str2idx.setdefault(word, len(str2idx))
    idx2str = {idx: word for word, idx in str2idx.items()}

    unk_id, pad_id = str2idx['<UNK>'], str2idx['<PAD>']
    encoded = []
    for words in texts:
        ids = [str2idx.get(word, unk_id) for word in words[:max_len]]
        ids.extend([pad_id] * (max_len - len(ids)))
        encoded.append(ids)
    # reshape keeps a well-formed [0, max_len] tensor when no lines were read.
    texts = torch.tensor(encoded, dtype=torch.long).reshape(-1, max_len)
    print(texts.shape)
    return str2idx, idx2str, texts

In [ ]:
# Hyperparameters
NUM_LAYERS=12        # number of GPT2Layer blocks
VOCAB_SIZE=10000     # passed to both GPT2Model (embedding / logits size) and load_data
HIDDEN_SIZE=768      # model width
CONTEXT_LENGTH=50    # maximum sequence length supported by the model
NUM_HEADS=12         # attention heads per block
BATCH_SIZE=16        # sequences per training batch
MAX_LEN=50           # length every text line is padded / truncated to by load_data
PATH='/bohr/wikipedia-2l1k/v1/wikisent2.txt'  # training corpus, one sentence per line

In [ ]:
# Build the model from the hyperparameters and move it to the default CUDA device.
model=GPT2Model(NUM_LAYERS, VOCAB_SIZE, HIDDEN_SIZE, CONTEXT_LENGTH, NUM_HEADS).cuda()

In [ ]:
# Read the corpus: vocabulary lookups in both directions plus the padded token-id tensor.
str2idx,idx2str,texts=load_data(PATH, MAX_LEN, VOCAB_SIZE)

In [ ]:
# Wrap the id tensor so each item is a 1-tuple holding one padded sequence.
dataset=TensorDataset(texts)
# Shuffle every epoch; drop_last discards the final incomplete batch.
dataloader=DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True,drop_last=True)
print(dataset[0])

In [ ]:
import os
# Ask CUDA to run kernels synchronously so errors surface at the failing call.
# NOTE(review): the model was already moved to the GPU in an earlier cell;
# this variable is presumably only read when CUDA initialises - confirm, and
# set it before the first CUDA call if so.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [ ]:
from tqdm import tqdm

# Id that load_data uses for padding. Padded positions are excluded from both
# the loss and the accuracy so padding does not dominate or inflate them.
pad_idx = str2idx['<PAD>']
optim_fn = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
model.train()
for epoch in range(10):
    losses = 0.0
    accs = 0.0
    batch_counter = 0
    pbar = tqdm(dataloader, desc=f'Epoch {epoch + 1}')
    for text in pbar:
        text = text[0].cuda()
        # Next-token prediction: inputs are tokens 0..n-2, targets are tokens 1..n-1.
        x, y = text[:, :-1], text[:, 1:]
        y_pred = model(x)
        # CrossEntropyLoss expects the class dimension second: [batch, vocab, seq].
        loss = loss_fn(y_pred.transpose(1, 2), y)
        optim_fn.zero_grad()
        loss.backward()
        optim_fn.step()
        losses += loss.item()
        with torch.no_grad():
            real_tokens = y != pad_idx
            hits = (y_pred.argmax(dim=-1) == y) & real_tokens
            # clamp avoids 0/0 for a batch that contains only padding.
            accs += (hits.sum() / real_tokens.sum().clamp(min=1)).item()
        batch_counter += 1
    # Guard against an empty dataloader.
    losses /= max(batch_counter, 1)
    accs /= max(batch_counter, 1)
    print(f'loss: {losses} acc: {accs}')

### Full Parameter Learning with Transformers and Tokenizers

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# NOTE: this rebinds the name `Dataset`, which was imported from
# torch.utils.data earlier in the notebook, to the Hugging Face class.
from datasets import load_dataset, Dataset

# Load the pretrained GPT-2 weights and tokenizer from a local directory.
model = GPT2LMHeadModel.from_pretrained("/personal/gpt2/")
tokenizer = GPT2Tokenizer.from_pretrained("/personal/gpt2/")

'''
dataset = load_dataset("wikitext", "wikitext-2-raw-v1") ### 从huggingface直接下载数据
直接下载时的数据结构：
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', ...],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', ...],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', ...],
        num_rows: 3000
    })
})
'''

### Build the dataset from a local .txt file, one example per non-empty line.
file_path = "/personal/wikisent2.txt"
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]
dataset = Dataset.from_dict({"text": lines})

### Keep at most 50 examples for a quick test run; min() avoids an index
### error when the file has fewer than 50 usable lines.
dataset = dataset.select(range(min(50, len(dataset))))

def tokenize_function(dataset):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        dataset: A batch mapping that provides a "text" entry.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    texts = dataset["text"]
    encode_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(texts, **encode_options)

### Model specific: GPT-2 ships without a padding token, so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

'''
经过dataset.map之后的数据结构：
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50
})
自动生成词表索引列和注意力掩码列，其中attention_mask 1 表示真实值，0 表示填充值。模型不会计算0位置的损失。可以手动更改attention_mask。
'''
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

### Drop empty examples.
tokenized_dataset = tokenized_dataset.filter(lambda example: len(example['input_ids']) > 0)

'''
GPT-2期望的输入数据：
{
    "input_ids": torch.tensor([[50256, 1234, 5678, 9012, 50256]]),
    "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]),              
    "labels": torch.tensor([[1234, 5678, 9012, 50256, -100]]) 
}
'''
### Add the labels column. The model shifts labels internally, so labels start
### as a copy of input_ids. attention_mask alone does NOT remove a position
### from the loss: only the label value -100 is ignored. Because the pad token
### is the EOS token, padding is identified through attention_mask, not by id.
def add_masked_labels(example):
    return {
        "labels": [
            token_id if is_real else -100
            for token_id, is_real in zip(example["input_ids"], example["attention_mask"])
        ]
    }

tokenized_dataset = tokenized_dataset.map(add_masked_labels)

### train test split
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

training_args = TrainingArguments(
    output_dir="./model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
)


# Fall back to the CPU when no GPU is visible instead of failing on .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

### training
trainer.train()

trainer.save_model("./model")
tokenizer.save_pretrained("./model")

### Inference
prompt = "Jack went to school yesterday and "
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(
    inputs.input_ids,
    # Pass the mask explicitly: with pad == eos it cannot be inferred reliably.
    attention_mask=inputs.attention_mask,
    max_length=50,
    num_return_sequences=3,
    num_beams=3,
    do_sample=True,
    temperature=3.0,
    pad_token_id=tokenizer.eos_token_id,
)

for i, sample in enumerate(output):
    print(f"{i+1}: {tokenizer.decode(sample, skip_special_tokens=True)}")

### Supervised Fine Tuning on Q/A Pairs

In [None]:
### Assume a prepared set of question/answer pairs is available locally; use it to fine-tune GPT-2 so the model learns these pairs

from datasets import Dataset

def preprocess_function(examples, tokenizer, max_length=128):
    """Turn prompt/response pairs into causal-LM training examples.

    A causal LM predicts each token from the tokens before it in the SAME
    sequence, so the prompt and the response are concatenated into one
    sequence (followed by EOS) and the labels are aligned with input_ids.
    Prompt and padding positions get the label -100 so that only response
    tokens contribute to the loss.

    Args:
        examples: Batch mapping with "prompt" and "response" lists of strings.
        tokenizer: Callable tokenizer returning a mapping with "input_ids";
            must define pad_token_id and eos_token_id.
        max_length: Length every example is truncated / right-padded to.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        max_length-long integer lists.
    """
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    input_ids, attention_mask, labels = [], [], []
    for prompt, response in zip(examples["prompt"], examples["response"]):
        prompt_ids = list(tokenizer(prompt)["input_ids"])
        # EOS teaches the model where the answer ends.
        response_ids = list(tokenizer(response)["input_ids"]) + [eos_id]

        ids = (prompt_ids + response_ids)[:max_length]
        target = ([-100] * len(prompt_ids) + response_ids)[:max_length]
        n_pad = max_length - len(ids)

        input_ids.append(ids + [pad_id] * n_pad)
        attention_mask.append([1] * len(ids) + [0] * n_pad)
        labels.append(target + [-100] * n_pad)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def train(model, tokenzier, dataset, output_dir="./sft_model"):
    """Fine-tune `model` on a prompt/response dataset with the HF Trainer.

    Args:
        model: Causal language model to fine-tune.
        tokenzier: Tokenizer used to preprocess the dataset (the parameter name
            keeps its original spelling so existing callers keep working).
        dataset: Dataset with "prompt" and "response" columns.
        output_dir: Directory where checkpoints are written.
    """
    # Use the tokenizer that was passed in rather than a same-named global.
    tokenized_dataset = dataset.map(
        lambda batch: preprocess_function(batch, tokenzier),
        batched=True,
        remove_columns=dataset.column_names
    )

    # No evaluation is configured: "no" is already the default strategy, and
    # omitting the argument avoids depending on its version-specific name.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_steps=100,
        overwrite_output_dir=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    trainer.train()

In [None]:
# Read the question/answer pairs: the sheet has no header row; column 0 is the
# prompt and column 1 the response.
file_path = "/Users/jianggh/Desktop/IOAI/轻量级十一学校信息问答语言模型/问答对.xlsx"
df = pd.read_excel(file_path, header=None, names=['prompt', 'response'])
# Drop incomplete rows and force text: the tokenizer only accepts strings.
df = df.dropna().astype(str).reset_index(drop=True)
dataset = Dataset.from_pandas(df)

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# GPT-2 has no padding token; reuse EOS and tell the model about it.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Pass the tokenizer created above (the previous spelling was an undefined name).
train(model, tokenizer, dataset)

### LoRA Fine Tuning with PyTorch


In [None]:
### import peft, PEFT是huggingface提供的进行微调的库，包括lora，但是目前比赛并没有提供这个PEFT，所以我们用PyTorch来实现lora

### 定义lora网络结构
class LoRA(nn.Module):
    """Low-rank adapter computing B(A(x)) * scaling.

    Args:
        in_features: Input size of the adapted layer.
        out_features: Output size of the adapted layer.
        rank: Inner dimension of the two low-rank matrices.
        alpha: Multiplier applied directly to the adapter output.
    """

    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.rank = rank
        self.scaling = alpha
        self.A = nn.Linear(in_features, rank, bias=False)   # down-projection
        self.B = nn.Linear(rank, out_features, bias=False)  # up-projection

        with torch.no_grad():
            # A starts as small Gaussian noise and B as zeros, so the adapter
            # initially contributes nothing to the wrapped layer's output.
            self.A.weight.normal_(mean=0.0, std=0.02)
            self.B.weight.zero_()

    def forward(self, x):
        """Return the scaled low-rank update for x."""
        low_rank = self.A(x)
        update = self.B(low_rank)
        return update * self.scaling

### 在输入模型中加入lora层
def _lora_dims(module):
    """Return (in_features, out_features) if LoRA can wrap `module`, else None."""
    if isinstance(module, nn.Linear):
        return module.in_features, module.out_features
    weight = getattr(module, "weight", None)
    if type(module).__name__ == "Conv1D" and weight is not None and weight.dim() == 2:
        # transformers' Conv1D (the projection layer used by GPT-2) is a linear
        # layer whose weight is stored as [in_features, out_features].
        return weight.shape[0], weight.shape[1]
    return None


### Attach LoRA adapters to the selected layers of a model
def apply_lora(model,
               layer_names=("query", "key", "value", "dense"),
               rank=8,
               alpha=1/16):
    """Freeze the matching layers of `model` and add a trainable LoRA branch to each.

    A layer is selected when it is an nn.Linear (or a transformers Conv1D) and
    any entry of `layer_names` occurs in its dotted module name. Each selected
    layer gets a `lora` submodule, and its forward becomes
    original_forward(x) + lora(x).

    Args:
        model: Module to modify in place.
        layer_names: Substrings identifying the layers to adapt.
        rank: Rank of every adapter.
        alpha: Scaling factor handed to every adapter.
    """
    # Collect the targets BEFORE changing anything. named_modules() walks the
    # tree lazily, so attaching adapters during the walk makes it descend into
    # the new adapters, whose names still contain the matched substring, and
    # wrap them again without end.
    targets = [
        module
        for name, module in model.named_modules()
        if _lora_dims(module) is not None
        and "lora" not in name.split(".")   # never adapt an adapter's own layers
        and not hasattr(module, "lora")     # calling apply_lora twice is harmless
        and any(key in name for key in layer_names)
    ]

    for module in targets:
        in_features, out_features = _lora_dims(module)

        # Freeze only the layer's own weights. This runs before the adapter is
        # attached and without recursion, so the adapter stays trainable.
        for param in module.parameters(recurse=False):
            param.requires_grad = False

        original_forward = module.forward
        # Place the adapter on the device of the layer it wraps; a plain
        # nn.Module has no `.device` attribute.
        lora = LoRA(in_features, out_features, rank, alpha).to(module.weight.device)
        setattr(module, "lora", lora)  # registered as a submodule named "lora"

        # Default arguments bind this layer's objects now; a plain closure
        # would see the values from the last loop iteration.
        def forward_with_lora(x, layer1=original_forward, layer2=lora):
            return layer1(x) + layer2(x)

        module.forward = forward_with_lora

### 只保存lora层
### 提交比赛的时候也可以直接torch.save(model.state_dict())
def save_lora(model, path):
    """Save only the adapter weights of `model` to `path`.

    Every module that has a `lora` attribute contributes that adapter's state
    dict, with each key stored as "<module name>.lora.<parameter name>".

    Args:
        model: Module whose adapters should be saved.
        path: Destination handed to torch.save.
    """
    collected = {}
    for module_name, module in model.named_modules():
        if not hasattr(module, 'lora'):
            continue
        for key, tensor in module.lora.state_dict().items():
            collected[f'{module_name}.lora.{key}'] = tensor
    torch.save(collected, path)

def load_lora(model, path):
    """Load adapter weights written by save_lora into the adapters of `model`.

    Args:
        model: Module whose adapted layers already carry a `lora` submodule.
        path: Source handed to torch.load.

    Raises:
        RuntimeError: From load_state_dict when the file lacks weights for an adapter.
    """
    # Load onto the CPU; load_state_dict copies each tensor to the device of
    # the parameter it fills, so no `.device` attribute is needed on the model.
    state_dict = torch.load(path, map_location="cpu")
    for name, module in model.named_modules():
        if not hasattr(module, 'lora'):
            continue
        prefix = f'{name}.lora.'
        # Match on the exact prefix. A substring test would also pick up keys
        # of other modules whose dotted name merely ends with this name.
        lora_state = {
            key[len(prefix):]: value
            for key, value in state_dict.items()
            if key.startswith(prefix)
        }
        module.lora.load_state_dict(lora_state)

In [None]:
### 假设我们要完成一个分类任务，我们可以这样添加一个线性层在模型后面
class ClassificationModel(nn.Module):
    """Wrap a backbone model with a linear classification head.

    Args:
        model: Backbone exposing `config.hidden_size` and returning an object
            with a `last_hidden_state` attribute.
        num_labels: Number of output classes.
    """

    def __init__(self, model, num_labels):
        super().__init__()
        self.model = model
        self.classifier= nn.Linear(model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape [batch_size, num_labels]."""
        # The backbone is stored as `self.model`; no `backbone` attribute exists.
        outputs = self.model(input_ids, attention_mask=attention_mask)
        # NOTE(review): this pools the FIRST token. For a causal backbone such
        # as GPT-2 that token cannot see the rest of the sequence - consider
        # pooling the last non-padded token instead.
        output = outputs.last_hidden_state[:, 0]
        return self.classifier(output)


# The base class is spelled out in full: earlier cells rebind the bare name
# `Dataset` to the Hugging Face `datasets.Dataset`, which is not a map-style
# torch dataset base class.
class TweetClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text/label row per item.

    Args:
        data: Indexable collection of rows with "text" and "target" entries.
        tokenizer: Callable tokenizer returning "input_ids" and "attention_mask"
            tensors with a leading batch dimension of 1.
        max_length: Length every example is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and labels tensors for row `idx`."""
        row = self.data[idx]  # fetch the row once instead of once per field
        encoding = self.tokenizer(
            row["text"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # Drop the batch dimension the tokenizer adds for a single text.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["target"], dtype=torch.long)
        }

def train(model, train_loader, optimizer, device, epochs=3):
    """Train a sequence classifier with cross-entropy loss.

    Args:
        model: Callable as model(input_ids, attention_mask=...) returning either
            a logits tensor or an object with a `logits` attribute.
        train_loader: Iterable of batches with "input_ids", "attention_mask"
            and "labels" tensors.
        optimizer: Optimizer over the parameters that should be updated.
        device: Device to train on.
        epochs: Number of passes over train_loader.

    Returns:
        List with the mean training loss of each epoch (NaN for an epoch
        without batches).
    """
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    epoch_losses = []

    for epoch in range(epochs):
        model.train()
        running_loss, n_batches = 0.0, 0

        for batch in train_loader:
            inputs = batch["input_ids"].to(device)
            masks = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            # Pass the mask by keyword: the second positional argument of
            # Hugging Face GPT-2 models is past_key_values, not the mask.
            outputs = model(inputs, attention_mask=masks)
            # Hugging Face models return an output object; plain modules a tensor.
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        epoch_losses.append(running_loss / n_batches if n_batches else float("nan"))

    return epoch_losses

In [None]:
# Names used below that no earlier cell imports.
import datasets
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

model_name = "/personal/gpt2/"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 has no padding token; padding="max_length" needs one.
tokenizer.pad_token = tokenizer.eos_token
### Use the ForSequenceClassification variant rather than the causal-LM one: it adds the linear head itself
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)
### Alternatively: model = ClassificationModel(model, 2)
# The classification head locates the last real token through pad_token_id
# and rejects batches larger than 1 when it is unset.
model.config.pad_token_id = tokenizer.pad_token_id

# GPT-2's projection layers are named c_attn / c_proj / c_fc, so the default
# query/key/value/dense names match nothing in this model.
# NOTE(review): those layers are transformers `Conv1D` modules, not nn.Linear;
# apply_lora has to accept that module type for adapters to be attached.
apply_lora(model, layer_names=["c_attn", "c_proj", "c_fc"])

### An example sequence-classification task
dataset = datasets.load_dataset("mehdiiraqui/twitter_disaster")
train_data = dataset["train"]
val_data = dataset["val"]

train_dataset = TweetClassificationDataset(train_data, tokenizer, max_length=128)
val_dataset = TweetClassificationDataset(val_data, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

### Train only the adapters and the classification head; freeze everything else.
### The head of GPT2ForSequenceClassification is named "score"; "classifier"
### covers the ClassificationModel wrapper.
TRAINABLE_MARKERS = ("lora", "score", "classifier")
lora_params = []
for name, param in model.named_parameters():
    trainable = any(marker in name for marker in TRAINABLE_MARKERS)
    # Set the flag explicitly in both directions so adapter weights are
    # trainable regardless of how they were left when they were attached.
    param.requires_grad = trainable
    if trainable:
        lora_params.append(param)

### The optimizer only needs the trainable parameters
optimizer = optim.AdamW(lora_params, lr=5e-5)

train(model, train_loader, optimizer, device='cuda', epochs=3)

### Direct Preference Optimization