In [1]:
#### finetuning llama 7b using gsm8k training data
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread
import os
os.environ['CUDA_VISIBLE_DEVICES']='0,1,2,3'
import torch.nn as nn
import inspect

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#### load llama checkpoint
model_path = "/mnt/xue.w/models/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path)
# model = model.cuda()

In [3]:
#### prepare training data
import json
train_path = "/mnt/yutong/data/grade_school_math/data/train.jsonl"
with open(train_path, "r") as f:
    train_data = f.readlines()
train_data = [json.loads(x) for x in train_data]
# print(len(train_data))
# print(train_data[0])
from torch.utils.data import Dataset
from transformers import LlamaTokenizer

class MyDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['question'] + ' ' + item['answer']
        input_encoding = self.tokenizer(input_text, padding='max_length', max_length=4096, truncation=True)
        labels = torch.tensor(input_encoding['input_ids']).clone()
        labels[:len(item['question'].split())] = -100
        return {'input_ids': torch.tensor(input_encoding['input_ids']), 'attention_mask': torch.tensor(input_encoding['attention_mask']), 'labels': labels}

# class MyDataset(Dataset):
#     def __init__(self, data, tokenizer):
#         self.data = data
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         item = self.data[idx]
#         input_text = item['question'] + ' ' + item['answer']
#         input_encoding = self.tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True)
#         labels = input_encoding['input_ids'].clone()
#         labels[:,:len(item['question'].split())] = -100
#         return {'input_ids': input_encoding['input_ids'], 'attention_mask': input_encoding['attention_mask'], 'labels': labels}


In [4]:
# class MyDataset(Dataset):
#     def __init__(self, data, tokenizer):
#         self.data = data
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         item = self.data[idx]
#         input_text = item['question'] + ' ' + item['answer']
#         input_encoding = self.tokenizer(input_text, padding='max_length', max_length=4096, truncation=True)
#         labels = torch.tensor(input_encoding['input_ids']).clone()
#         labels[:len(item['question'].split())] = -100
#         return {'input_ids': torch.tensor(input_encoding['input_ids']), 'attention_mask': torch.tensor(input_encoding['attention_mask']), 'labels': labels}


tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
dataset = MyDataset(train_data, tokenizer)

model = AutoModelForCausalLM.from_pretrained(model_path)   
model.resize_token_embeddings(len(tokenizer)) 
print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [11]:
from transformers import LlamaTokenizer
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType



# Define LoRA Config
lora_config = LoraConfig(
    r=8, # Rank of LoRA
    lora_alpha=8, # Scaling factor for LoRA
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Modules to apply LoRA
    lora_dropout=0.1, # Dropout rate
    bias="none", # Bias type
    task_type="CAUSAL_LM" # Task type
)

# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,812,416 || trainable%: 0.12433438908285782




In [12]:
# create DataCollator
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

In [13]:
# Create TrainingArguments
from transformers import Trainer, TrainingArguments

output_dir = "llama_7b_lora_gsm8k"

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     auto_find_batch_size=True,
#     learning_rate=1e-3, # higher learning rate
#     num_train_epochs=1,
#     logging_dir=f"{output_dir}/logs",
#     logging_strategy="steps",
#     logging_steps=500,
#     save_strategy="no",
#     report_to="tensorboard",
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     # data_collator=data_collator,
#     tokenizer=tokenizer,
#     train_dataset=dataset,
# )
training_args = TrainingArguments(
    output_dir='./results',          # 输出目录
    num_train_epochs=3,              # 训练轮数
    per_device_train_batch_size=1,  # 每个设备的批次大小
    warmup_steps=500,                # 预热步数
    weight_decay=0.01,               # 权重衰减
    logging_dir='./logs',            # 日志目录
    # fp16=True,                       # 混合精度训练
)

# 4. 创建训练器
trainer = Trainer(
    model=model,                         # 微调的模型
    args=training_args,                  # 训练参数
    train_dataset=dataset,               # 训练数据集
    # tokenizer=tokenizer
)


model.config.use_cache = False # silence warning, re-enable for inference

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
# train model
trainer.train()

TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/peft/peft_model.py", line 1129, in forward
    return self.base_model(
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/peft/peft_model.py", line 1326, in forward
    return self.base_model(
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/gradio-peft/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
TypeError: LlamaForCausalLM.forward() got an unexpected keyword argument 'decoder_input_ids'


In [5]:
from transformers import LlamaTokenizer
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType



# 1. 数据预处理
class MyDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['question'] + ' ' + item['answer']
        input_encoding = self.tokenizer(input_text, padding='max_length', max_length=4096, truncation=True)
        labels = torch.tensor(input_encoding['input_ids']).clone()
        labels[:len(item['question'].split())] = -100
        return {'input_ids': torch.tensor(input_encoding['input_ids']), 'attention_mask': torch.tensor(input_encoding['attention_mask']), 'labels': labels}

    # def __getitem__(self, idx):
    #     item = self.data[idx]
    #     input_text = item['question'] + ' ' + item['answer']
    #     input_encoding = self.tokenizer(input_text, return_tensors='pt', padding='max_length', max_length=4096, truncation=True)
    #     labels = input_encoding['input_ids'].clone()
    #     labels[:,:len(item['question'].split())] = -100
    #     return {'input_ids': input_encoding['input_ids'], 'attention_mask': input_encoding['attention_mask'], 'labels': labels}

# tokenizer = LlamaTokenizer.from_pretrained('EleutherAI/llama-7b')
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
dataset = MyDataset(train_data, tokenizer)

# datum = dataset[0]
# print(datum['input_ids'].shape)
# print(datum['attention_mask'].shape)
# print(datum['labels'].shape)

# 2. 模型加载
# model = AutoModelForCausalLM.from_pretrained('EleutherAI/llama-7b')
model = AutoModelForCausalLM.from_pretrained(model_path)   
model.resize_token_embeddings(len(tokenizer)) 


# 3. 微调设置
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # 输出目录
    num_train_epochs=3,              # 训练轮数
    per_device_train_batch_size=1,  # 每个设备的批次大小
    warmup_steps=500,                # 预热步数
    weight_decay=0.01,               # 权重衰减
    logging_dir='./logs',            # 日志目录
    fp16=True,                       # 混合精度训练
)

# 4. 创建训练器
trainer = Trainer(
    model=model,                         # 微调的模型
    args=training_args,                  # 训练参数
    train_dataset=dataset,               # 训练数据集
    tokenizer=tokenizer
)

torch.cuda.empty_cache()

# 5. 模型微调
trainer.train()

# 6. 模型保存
model.save_pretrained('./saved_model')

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 79.32 GiB of which 57.56 MiB is free. Including non-PyTorch memory, this process has 79.26 GiB memory in use. Of the allocated memory 76.94 GiB is allocated by PyTorch, and 27.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)