<a href="https://colab.research.google.com/github/jfzhang726/Finetune-llama3/blob/main/finetune_llamma3_gsm8k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
torch.__version__

'2.2.1+cu121'

In [2]:
%%capture

import torch
major_version, minor_version = torch.cuda.get_device_capability()

# Colab has torch 2.2.1 which will break the package, so install package seperatetly
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
  # new gpu Ampere, Hooper GPU (RTX 30xx, RTX 40xx,, A100, H100, L40)
  !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
  # old GPU V100, Tesla, T4, RTX 20xx
  !pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes
pass

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [2]:
# load dataset
from datasets import load_dataset
EOS_TOKEN = tokenizer.eos_token

alpaca_prompt_gsm8k = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
{input}

### Response:
{output}
"""

def formatting_prompts_func_train(examples):

  questions = examples["question"]
  answers = examples["answer"]
  texts = []
  for question, answer in zip(questions, answers):
    # must add EOS_TOKEN, otherwise the generation never stops
    text = alpaca_prompt_gsm8k.format(input=f"Question: {question}", output=f"Answer: {answer}") + EOS_TOKEN
    texts.append(text)
  return {'text': texts}


def formatting_prompts_func_test(examples):

  questions = examples["question"]
  texts = []
  for question in questions:
    # must add EOS_TOKEN, otherwise the generation never stops
    text = alpaca_prompt_gsm8k.format(input=f"Question: {question}", output="")
    texts.append(text)
  return {'text': texts}
dataset_train = load_dataset('gsm8k', 'socratic', split='train')
dataset_train = dataset_train.map(formatting_prompts_func_train, batched=True)

dataset_test = load_dataset('gsm8k', 'socratic', split='test')
dataset_test = dataset_test.map(formatting_prompts_func_test, batched=True)

In [3]:
# test before finetune


FastLanguageModel.for_inference(model)
inputs = tokenizer([
    dataset_test['text'][0]
], return_tensors='pt').to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

### Response:

<|end_of_text|>


In [7]:
inputs = tokenizer([
    dataset_test['text'][1]
], return_tensors='pt').to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?

### Response:

<|end_of_text|>


In [4]:
# hyperparameters for training
from trl import SFTTrainer
from transformers import TrainingArguments

model = FastLanguageModel.get_peft_model(
    model,
    r=16, # 8, 16, 32, 64, 128
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing='unsloth',
    random_state=42,
    use_rslora=False,
  loftq_config=None

)



Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=409

In [5]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # can speed up training of short sequences for 5 times
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=3,
        warmup_steps=5,
        num_train_epochs=1, # it will take 2+ hours for gsm8k
        # max_steps=60, # finetune steps, which will override num_train_epochs
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim='adamw_8bit',
        weight_decay=0.01,
        lr_scheduler_type='linear',
        seed=42,
        output_dir='outputs'
    )
)

In [6]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 3
\        /    Total batch size = 6 | Total steps = 1,245
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.6202
2,2.0024
3,1.6945
4,1.7722
5,1.6077
6,1.5082
7,1.4001
8,1.2627
9,1.0135
10,0.9387


In [7]:
# test after finetuning
FastLanguageModel.for_inference(model)
inputs = tokenizer([
    dataset_test['text'][0]
], return_tensors='pt').to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

### Response:

Answer: How many eggs does Janet eat? ** She eats 3 eggs for breakfast every day, so she eats 3 * 7 = <<3*7=21>>21 eggs per week.
How many eggs does Janet bake? ** She bakes muffins for her friends every day with 4 eggs, so she bakes 4 * 7 = <<4*7=28>>28 eggs per week.
How many eggs does Janet sell? ** She sells the remainder at the farmers' market daily for $2 per fresh duck egg, so she sells 16 - 21 - 28 = <<16-21


In [8]:
inputs = tokenizer([
    dataset_test['text'][1]
], return_tensors='pt').to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?

### Response:

Answer: How many bolts of white fiber does it take? ** It takes 2/2=<<2/2=1>>1 bolt of white fiber
How many bolts of fiber does it take? ** So it takes 2+1=<<2+1=3>>3 bolts of fiber
#### 3
How many bolts in total does it take? ** It takes 3 bolts of fiber
#### 3
How many bolts in total does it take? ** It takes 3 bolts of fiber
#### 3
How many bolts in total does it take? ** It takes 3 bolts of fiber


In [9]:
inputs = tokenizer([
    dataset_test['text'][2]
], return_tensors='pt').to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?

### Response:

Answer: How much did the house increase in value? ** The house increased in value by 80,000*.15=$<<80000*.15=12000>>12,000
How much did the house sell for? ** So it sold for 80,000+12,000=$<<80000+12000=92000>>92,000
How much did Josh make? ** That means he made 92,000-80,000=$<<92000-80000=12000>>12,000
#### 12000
How much profit did Josh make? ** So he made 12,000


In [10]:
inputs = tokenizer([
    dataset_test['text'][3]
], return_tensors='pt').to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


### Instruction:
You are a helpful math teacher.

### Input:
Question: James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?

### Response:

Answer: How many meters does James run each sprint? ** He runs 60 meters each sprint so that's 60*3=<<60*3=180>>180 meters
How many meters does James run each week? ** He runs 3 times a week so that's 180*3=<<180*3=540>>540 meters
#### 540
How many total meters does James run a week? ** He runs 540 meters a week
#### 540
How many total meters does James run a week? ** He runs 540 meters a week
#### 540
How many total meters does James


In [11]:
# save LoRA model

model.save_pretrained("lora_model_gsm8k") # save to local drive
# model.push_to_hub("jfzhang/lora_model", token='xxxx') # save to huggingface

In [12]:
# merge lora model and base model, quantilize to 4-bit, save gguf
model.save_pretrained_gguf("model_gsm8k", tokenizer, quantization_method='q4_k_m')

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.13 out of 12.67 RAM for saving.


 47%|████▋     | 15/32 [00:01<00:01, 14.71it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:28<00:00,  2.75s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving model_gsm8k/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model_gsm8k/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model_gsm8k/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model_gsm8k/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to q4_k_m will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at model_gsm8k into f16 GGUF format.
The output location will be ./model_gsm8k-unsloth.F16.gguf
This will take 3 minutes...
Loading model file model_gsm8k/pytorch_model-00001-of-00004.bin
Loading model file model_gsm8k/pytorch_model-00001-of-00004.bin
Loading model file model_gsm8k/pytorch_model-00002-of-00004.bin
Loading model file model_gsm8k/pytorch_model-00003-of-00004.bin
Loading model file model_gsm8k/pytorch_model-00004-of-00004.bin
params = Params(n_vocab=128256, n_embd=4096, n_layer=32, n_ctx=8192, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_

In [13]:
import os
os.getcwd()

'/content'

In [14]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [15]:
# copy model file to google drive
import shutil
import os
source_file = '/content/model_gsm8k-unsloth.Q4_K_M.gguf'
destination_dir = '/content/drive/MyDrive/Colab Notebooks/unsloth/Llama3'
destination_file = os.path.join(destination_dir, 'model_gsm8k-unsloth.Q4_K_M.gguf')
shutil.copy(source_file, destination_file)

'/content/drive/MyDrive/Colab Notebooks/unsloth/Llama3/model_gsm8k-unsloth.Q4_K_M.gguf'

In [17]:
source_file = '/content/model_gsm8k-unsloth.F16.gguf'
destination_dir = '/content/drive/MyDrive/Colab Notebooks/unsloth/Llama3'
destination_file = os.path.join(destination_dir, 'model_gsm8k-unsloth.F16.gguf')
shutil.copy(source_file, destination_file)

'/content/drive/MyDrive/Colab Notebooks/unsloth/Llama3/model_gsm8k-unsloth.F16.gguf'