### Lora Finetuning Following https://medium.com/@rajatsharma_33357/fine-tuning-llama-using-lora-fb3f48a557d5

In [1]:
# import dependencies
import torch
import transformers
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "/mnt/xue.w/models/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93"
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]


In [3]:
# import json
# train_path = "/mnt/yutong/data/grade_school_math/data/train.jsonl"
# test_path = "/mnt/yutong/data/grade_school_math/data/test.jsonl"
# with open(train_path, "r") as f:
#     train_data = f.readlines()
# train_data = [json.loads(x) for x in train_data]
# with open(test_path, "r") as f:
#     test_data = f.readlines()
# test_data = [json.loads(x) for x in test_data]
data = load_dataset("/mnt/yutong/data/grade_school_math/data")
data_train, data_test, data_val = data["train"], data["test"], data["validation"]

print(data_train, data_test, data_val)

# example
data_train[0]

Dataset({
    features: ['question', 'answer'],
    num_rows: 14946
}) Dataset({
    features: ['question', 'answer'],
    num_rows: 2638
}) Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})


{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [4]:
def generate_prompt(question, answer=None, eos_token="</s>"):
    instruction = "Solving the follwing math problem and response with '\n#### <answer>' with <answer> substituted by the correct number in the very end:\n"
    input = f"{question}\n"
    answer = f"Answer: {answer + ' ' + eos_token if answer else ''} "
    prompt = (" ").join([instruction, input, answer])
    return prompt

print(generate_prompt(data_train[0]["question"], data_train[0]["answer"]))

Solving the follwing math problem and response with '
#### <answer>' with <answer> substituted by the correct number in the very end:
 Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
 Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72 </s> 


In [6]:
input_prompt = generate_prompt(data_train[50]["question"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Solving the follwing math problem and response with '/n#### <answer>' with <answer> substituted by the correct number in the very end:
 Gerald spends $100 a month on baseball supplies. His season is 4 months long. He wants to use the months he's not playing baseball to save up by raking, shoveling, and mowing lawns. He charges $10 for each. How many chores does he need to average a month to save up for his supplies?
 Answer:  25 /n###< answer>


In [7]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [8]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
 

In [10]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [11]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [14]:
output_dir = "llama_2_7b_lora"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 5e-4
max_grad_norm = 0.3
max_steps = 50
warmup_ratio = 0.03
evaluation_strategy = "steps"
lr_scheduler_type = "constant"

training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=per_device_eval_batch_size,
    eval_accumulation_steps=eval_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,

    # max_steps=max_steps,
    num_train_epochs=1,
    
    warmup_ratio=warmup_ratio,
    evaluation_strategy=evaluation_strategy,
    lr_scheduler_type=lr_scheduler_type,
    group_by_length=True,
    ddp_find_unused_parameters=False,
)


In [15]:
def formatting_func(prompt):
    output = []

    for d, s in zip(prompt["question"], prompt["answer"]):
        op = generate_prompt(d, s)
        output.append(op)

    return output

trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    # max_seq_length=1024,
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_args,
)

for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/llama27b_lora_1e")

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map: 100%|██████████| 14946/14946 [00:00<00:00, 15969.51 examples/s]
Map: 100%|██████████| 1319/1319 [00:00<00:00, 16207.60 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
10,0.7062,0.761515
20,0.6894,0.749857
30,0.6656,0.747091
40,0.644,0.74889
50,0.6548,0.762308
60,0.708,0.747168
70,0.6893,0.735074
80,0.7014,0.727051
90,0.6825,0.727167
100,0.6446,0.749993


Checkpoint destination directory llama_2_7b_lora/checkpoint-10 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama_2_7b_lora/checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama_2_7b_lora/checkpoint-30 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama_2_7b_lora/checkpoint-40 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama_2_7b_lora/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [4]:
from peft import PeftModel

peft_model_id = "llama_2_7b_lora_2/1_epoch_finetuning"
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="lora_results/lora_7/temp")

In [10]:
input_prompt = generate_prompt(data_test[0]["question"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Solving the follwing math problem and response with '
#### <answer>' with <answer> substituted by the correct number in the very end:
 Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
 Answer:  3 + 4 = <<3+4=7>>7 duck eggs are used each day.
She has 16 - 7 = <<16-7=9>>9 duck eggs left to sell.
Janet makes $2 x 9 = $<<2*9=18>>18 a day.
#### 18 


In [6]:
# test model formalization score # 68% valid for the 1_epoch_finetuning model
import re
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
def extract_answer(completion):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return "[invalid]"

def formatting_test(data_test):
    valid, invalid = 0, 0
    for e, prompt in enumerate(data_test):
        if e >= 100:
            break
        input_prompt = generate_prompt(prompt["question"])
        input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
        with torch.cuda.amp.autocast():
            # generation_output = peft_model.generate(
            generation_output = model.generate(
                input_ids=input_tokens,
                max_new_tokens=100,
                do_sample=True,
                top_k=10,
                top_p=0.9,
                temperature=0.3,
                repetition_penalty=1.15,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
            )
        op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
        if extract_answer(op) != "[invalid]":
            valid += 1
        else:
            invalid += 1
        print(f"prompt {e} completed, {'valid' if extract_answer(op) != '[invalid]' else 'invalid'}: {extract_answer(op)}. Total valid: {valid}, invalid: {op}")
    return valid, invalid

valid, invalid = formatting_test(data_test)
print(f"test_data: valid: {valid}, invalid: {invalid}")
# valid, invalid = formatting_test(data_train)
# print(f"train_data: valid: {valid}, invalid: {invalid}")

prompt 0 completed, invalid: [invalid]. Total valid: 0, invalid: Solving the follwing math problem and response with '
#### <answer>' with <answer> substituted by the correct number in the very end:
 Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
 Answer:   $####
prompt 1 completed, invalid: [invalid]. Total valid: 0, invalid: Solving the follwing math problem and response with '
#### <answer>' with <answer> substituted by the correct number in the very end:
 A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
 Answer:  4 bolts

Please let me know if you have any questions or need further clarification on how to solve this problem!
prompt 2 completed, invalid: [invalid]. Total valid: 0, invalid: Solving

KeyboardInterrupt: 