In [None]:
import os
# Show the kernel's working directory; the relative paths used below are
# resolved against it.
working_dir = os.getcwd()
print(working_dir)

In [None]:
import pandas as pd
from datasets import Dataset

# Read the JSON-lines file (one record per line) into a DataFrame, then wrap
# the frame in a Hugging Face Dataset.
code_feedback_path = "../../data/Code-Feedback/Code-Feedback.jsonl"
df = pd.read_json(code_feedback_path, lines=True)
ds = Dataset.from_pandas(df)

# Display the first record to check what was loaded.
ds[0]

In [None]:
# Hold out 10% of the records for evaluation. The split is seeded so that
# Restart Kernel -> Run All reproduces the same train/eval partition instead
# of drawing a new random one on every run.
train_test_split = ds.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Display the first training record as a sanity check.
train_dataset[0]

In [None]:
from transformers import AutoTokenizer
# Directory the tokenizer (and, further down, the base model) is loaded from.
model_path = "../../model/Qwen2-7B"

# NOTE(review): trust_remote_code=True permits code shipped with the
# checkpoint to run at load time -- confirm it is needed for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)
tokenizer

In [None]:
def process_func(example, *, max_length=4096, system_prompt="You are a code master."):
    """Convert one chat record into causal-LM training features.

    The prompt is a system turn followed by every message except the last,
    each wrapped in ``<|im_start|>{role}\\n...<|im_end|>\\n`` markup, and
    ends with an open ``<|im_start|>assistant\\n`` turn. The content of the
    last message is the training target.

    Relies on the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"messages"`` list; each message is a
            mapping with ``"role"`` and ``"content"`` keys.
        max_length: Maximum number of tokens kept; anything beyond it is cut
            from the right. ``None`` disables truncation.
        system_prompt: Text placed in the leading system turn.

    Returns:
        dict with equal-length lists ``input_ids``, ``attention_mask`` and
        ``labels``. Prompt positions in ``labels`` hold -100 so that only the
        response and the trailing terminator token carry a training target.

    Raises:
        IndexError: If ``example["messages"]`` is empty.
    """
    messages = example["messages"]

    # Build the prompt from pieces and join once instead of growing a string
    # inside the loop (and without shadowing the builtin ``input``).
    pieces = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    pieces.extend(
        f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
        for message in messages[:-1]
    )
    pieces.append("<|im_start|>assistant\n")
    prompt_text = "".join(pieces)
    target_text = messages[-1]["content"]

    request = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(target_text, add_special_tokens=False)

    # The pad token doubles as the end-of-response marker. Fall back to the
    # EOS token when the tokenizer defines no pad token, so that a literal
    # None never ends up inside input_ids / labels.
    end_id = tokenizer.pad_token_id
    if end_id is None:
        end_id = tokenizer.eos_token_id

    input_ids = request["input_ids"] + response["input_ids"] + [end_id]
    attention_mask = request["attention_mask"] + response["attention_mask"] + [1]  # terminator is attended to
    labels = [-100] * len(request["input_ids"]) + response["input_ids"] + [end_id]

    # Right-truncate all three sequences identically so they stay aligned.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [None]:
# Apply the preprocessing function to every example of each split.
tokenized_train_dataset = train_dataset.map(function=process_func)
tokenized_test_dataset = eval_dataset.map(function=process_func)

In [None]:
# Decode the first tokenized training example back to text to inspect the
# full prompt + response that the model will see.
first_input_ids = tokenized_train_dataset[0]["input_ids"]
tokenizer.decode(first_input_ids)

In [None]:
# Decode only the supervised part of the first example: the -100 entries are
# label placeholders, not token ids, so they are dropped before decoding.
supervised_ids = [
    token_id
    for token_id in tokenized_train_dataset[0]["labels"]
    if token_id != -100
]
tokenizer.decode(supervised_ids)

In [None]:
import torch

from transformers import AutoModelForCausalLM

# Load the base model in bfloat16 and let device_map="auto" decide where its
# weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# NOTE(review): presumably required so gradients can flow through the frozen
# embeddings when gradient checkpointing is enabled in the training cell --
# confirm against the transformers/peft documentation.
model.enable_input_require_grads()
model

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# Names of the sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Training-mode LoRA configuration for a causal language model.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
    inference_mode=False,
)
config

In [None]:
# Wrap the base model with the LoRA configuration. This rebinds `model`, so
# re-running the cell passes the already wrapped model in again.
model = get_peft_model(model=model, peft_config=config)
model

In [None]:
# Report how many of the wrapped model's parameters are trainable.
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Training hyper-parameters. With a per-device batch of 4 and 4 accumulation
# steps, 16 sequences per device contribute to each optimizer step.
args = TrainingArguments(
    output_dir="../../output/Qwen2_7B_lora",
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
)

# Collator that batches the tokenized examples, padding them as needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
import torch
from peft import PeftModel

# Base model: the same relative location the training cells load from,
# instead of a machine-specific absolute path.
mode_path = "../../model/Qwen2-7B"

# Adapter: the newest checkpoint written under the training output_dir,
# rather than a hard-coded step number that does not match save_steps=100.
lora_output_dir = '../../output/Qwen2_7B_lora'
lora_path = get_last_checkpoint(lora_output_dir)
if lora_path is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {lora_output_dir!r}; run training first."
    )

tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    mode_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()

# Attach the fine-tuned LoRA weights to the base model.
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "write quick sort algorithm using Python"
# Include the same system turn that process_func prepends to every training
# example, so the inference prompt matches the fine-tuning format.
# NOTE(review): without it the tokenizer's chat template may insert its own
# default system prompt -- verify against the model's template.
inputs = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a code master."},
        {"role": "user", "content": prompt},
    ],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)  # follow the model's device instead of assuming 'cuda'

# top_k=1 restricts sampling to the single most likely token at each step.
gen_kwargs = {"max_length": 4096, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt tokens so only the newly generated text is decoded.
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))