In [2]:
! pip3 install transformers

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import torch
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = "mps" if torch.backends.mps.is_available() else ("cuda:0" if torch.cuda.is_available() else "cpu")
dtype = torch.float16 if device == "mps" else torch.float32

In [None]:
if "ask_llm" not in globals():
    from transformers import pipeline
    ask_llm = pipeline(
        task="text-generation",
        model="./my-qwen",
        tokenizer="./my-qwen",
        device=device,
        torch_dtype=dtype
    )

print(ask_llm("Who is Scott Lai?")[0]["generated_text"])

Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.71s/it]
Device set to use cpu


Who is Scott Lai? Scott Lai, also known as Scott Lai (Chinese: 賴昭霖; pinyin: Lài Zhāolín) or simply Lai, is a Chinese-American professional poker player and entrepreneur. He is known for his success in the World Series of Poker (WSOP) and other high-stakes poker tournaments.

Key points about Scott Lai:

1. Born in 1982 in Hong Kong to Taiwanese parents.
2. Moved to the United States at a young age and grew up in California.
3. Started playing poker seriously in college.
4. Has won multiple WSOP bracelets, including the $50,000 No Limit Hold'em Tournament in 2013, which is one of the largest poker tournaments in the world.
5. Has participated in several high-profile poker events, including the 2015 WSOP Main Event where he finished 7th for a prize of $666,666.
6. Has worked as a poker commentator on ESPN and other networks.
7. Has invested in various tech companies, including a venture capital firm called "The Lai Ventures."
8. Known for his calm demeanor and strategic approach to poker

As you can see here, the model has no idea who I am from above response.

Let's cook it!

First, let's teach the model who I am. Here you can use your personal data to generate the exact format you will use for fine-turning base on your own data. You can use ChatGPT for this, just ask it to transfer your resume into the trainable json format with "prompt" and "completion"

In [5]:
# load data 
from datasets import load_dataset

raw_data = load_dataset('json', data_files = "scott_lai_resume_train.json")
raw_data

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 122
    })
})

In [6]:
raw_data["train"][0]

{'prompt': 'What is Scott Lai’s profession?',
 'completion': 'AI Engineer and Data Scientist.'}

As you can see, here we return with the long text, but for fine-tuning we need the data to be small and precise chunks, more like here we apply the tokenization to take the text and split it into smaller chunks. Each chunk is called a token and it the smallest unit of meaning that LLMs work with.

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct"
)
def preprocess(sample):
    sample = sample['prompt']+ '\n' + sample['completion']
    print(sample)
    tokenized = tokenizer(
        sample,
        max_length = 128,
        truncation = True,
        padding = "max_length"    
    )

    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized
data = raw_data.map(preprocess)


In [8]:
print(data['train'])

Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 122
})


## LoRA

now, let's move into the training

In [9]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM
import torch

In [10]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device_map="auto",       # accelerate 会管理 GPU
    torch_dtype=torch.float16
)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'k_proj', 'v_proj']
)

model = get_peft_model(model, lora_config)


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]


In [13]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    num_train_epochs = 10, # we will go throught the dataset from start to finish 10 times
    learning_rate=0.001, 
    logging_steps = 20, # we want to see the result in every 25 steps it runs 
    fp16 = True, # float point set to 16 to speed it up, set to "True" if you are on GPU
    # per_device_train_batch_size=1,  # <-- 不设置默认为8
)

trainer = Trainer(
    args = train_args,
    model = model, 
    train_dataset=data["train"]
)

In [14]:
trainer.train()

Step,Training Loss
20,0.2794
40,0.1875
60,0.1762
80,0.1319
100,0.1011
120,0.0843
140,0.0666
160,0.0505


TrainOutput(global_step=160, training_loss=0.13470476493239403, metrics={'train_runtime': 53.4216, 'train_samples_per_second': 22.837, 'train_steps_per_second': 2.995, 'total_flos': 2602200748523520.0, 'train_loss': 0.13470476493239403, 'epoch': 10.0})

In [15]:
# save the model
trainer.save_model("./my-qwen")
tokenizer.save_pretrained("./my-qwen")

('./my-qwen/tokenizer_config.json',
 './my-qwen/special_tokens_map.json',
 './my-qwen/chat_template.jinja',
 './my-qwen/vocab.json',
 './my-qwen/merges.txt',
 './my-qwen/added_tokens.json',
 './my-qwen/tokenizer.json')

Now let's test it out

In [19]:
import gc, torch

# 删除旧的 pipeline 对象
if "ask_llm" in globals():
    del ask_llm

gc.collect()
torch.cuda.empty_cache()

# 再加载一次
from transformers import pipeline
ask_llm = pipeline(
    task="text-generation",
    model="./my-qwen",
    tokenizer="./my-qwen",
    device=device,
    torch_dtype=torch.float16
)

print(ask_llm("Who is Scott Lai?")[0]["generated_text"])
# print(ask_llm("Who is Scott Lai?", max_new_tokens=20)[0]["generated_text"])


Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.67s/it]
Device set to use cuda


Who is Scott Lai?
