## Trainning Mistral 7B với MetaMathQA_395k (300k - 400k)

In [None]:
%%capture
!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
!pip install unsloth vllm
# !pip install --upgrade transformers==4.52.3

### Đăng nhập HuggingFace

In [None]:
!huggingface-cli login --token $secret_hf

usage: huggingface-cli <command> [<args>] login [-h] [--token TOKEN] [--add-to-git-credential]
huggingface-cli <command> [<args>] login: error: argument --token: expected one argument


In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN_DK")

In [None]:
from huggingface_hub import login
login(token=hf_token, new_session=False)

### Load mô hình Mistral 4bit từ Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Chiều dài tối đa 1 đoạn văn bản
dtype = None           # Tự động chọn (float16, bfloat16) theo GPU
load_in_4bit = True    # Dùng mô hình lượng tử hóa 4-bit
lora_rank = 32

# Load model đã quant hóa 4bit + tự động chia lên GPU phù hợp
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-08 06:25:02.491241: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751955902.698606      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751955902.763218      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-08 06:25:21 [__init__.py:244] Automatically detected platform cuda.
Unsloth: vLLM does not work on older GPUs - will switch to Unsloth inference!
==((====))==  Unsloth 2025.6.12: Fast Mistral patching. Transformers: 4.51.3. vLLM: 0.9.2.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

### LoRA

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = lora_rank,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.6.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Gán Chat Template kiểu ChatML. Load và xử lý dữ liệu MetaMathQA

### Train/test split

In [None]:
from datasets import load_dataset

dataset = load_dataset("meta-math/MetaMathQA", split="train")
dataset = dataset.select(range(300000, 395000))  # Chỉ lấy 100k mẫu từ 300000 - 395000
print(len(dataset))

README.md: 0.00B [00:00, ?B/s]

MetaMathQA-395K.json:   0%|          | 0.00/396M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/395000 [00:00<?, ? examples/s]

95000


In [None]:
split_dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

### Format dạng hội thoại

In [None]:
from unsloth.chat_templates import get_chat_template

# Gán template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # hoặc "mistral", "unsloth"
    map_eos_token=True,
)

# System Instruction
system_instruction_fixed = """Below is an instruction that describes a mathematical task.
Write a response that thoroughly solves the given problem.
Before solving, develop a clear step-by-step chain of reasoning to ensure accuracy and logical coherence.

### Instruction:
You are a mathematics expert with advanced knowledge in mathematical reasoning, problem-solving, and proof techniques. You think outloud and consider various aspects before giving any concrete answers."""

def formatting_prompts_func_conversational_structured(examples):
    queries = examples["query"]
    responses = examples["response"]
    texts = []

    for q, r in zip(queries, responses):
        messages = [
            {"role": "system", "content": system_instruction_fixed},
            {"role": "user", "content": q},
            {"role": "assistant", "content": r},
        ]
        formatted = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(formatted)

    return {"text": texts}


Unsloth: Will map <|im_end|> to EOS = </s>.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
train_dataset = train_dataset.map(formatting_prompts_func_conversational_structured, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func_conversational_structured, batched=True)

Map:   0%|          | 0/66500 [00:00<?, ? examples/s]

Map:   0%|          | 0/28500 [00:00<?, ? examples/s]

### Huấn luyện bằng `SFTTrainer`

In [None]:
from trl import SFTConfig, SFTTrainer

# Khởi tạo Trainer để huấn luyện mô hình với các thiết lập đã tối ưu cho toán học
trainer = SFTTrainer(
    model = model,                          # Mô hình đã được chuẩn bị (có thể LoRA hoặc fine-tuned)
    tokenizer = tokenizer,                  # Tokenizer phù hợp với mô hình
    train_dataset = train_dataset,          # Dữ liệu đã được token hóa cho huấn luyện
    eval_dataset = eval_dataset,            # Dữ liệu để đánh giá trong quá trình train
    dataset_text_field = "text",            # Trường chứa chuỗi đầu vào
    max_seq_length = 2048,                  # Độ dài tối đa mỗi chuỗi, phù hợp với Mistral
    dataset_num_proc = 2,                   # Số tiến trình song song để xử lý dữ liệu
    packing = False,                        # Không gộp nhiều mẫu ngắn lại – tốt hơn cho toán học vì mẫu thường dài
    args = SFTConfig(
        output_dir = "outputs",             # Thư mục để lưu checkpoint và log
        per_device_train_batch_size = 2,    # Batch size mỗi GPU – tăng nếu đủ VRAM
        per_device_eval_batch_size = 2,     # Batch size khi đánh giá – không ảnh hưởng tốc độ train
        gradient_accumulation_steps = 4,    # Số bước tích lũy gradient – tăng nếu VRAM hạn chế
        warmup_steps = 60,                  # Số bước đầu warmup learning rate (5% ~ 1000 steps)
        max_steps = 1200,                   # Tổng số bước huấn luyện (bạn có thể giảm nếu cần)
        learning_rate = 2e-4,               # Learning rate – thường dùng 1e-4 cho LoRA
        optim = "adamw_8bit",               # Optimizer nhẹ, phù hợp với mô hình 4-bit
        weight_decay = 0.01,                # Tránh overfitting nhẹ
        lr_scheduler_type = "cosine",       # Tăng giảm learning rate mượt mà hơn “linear”
        logging_steps = 10,                 # In log sau mỗi 10 bước
        logging_strategy = "steps",
        eval_steps = 10,
        report_to = "none",                 # Nếu muốn theo dõi bằng wandb thì đổi thành "wandb"
        seed = 3407,                        # Đảm bảo kết quả reproducible nếu cần
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/66500 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/28500 [00:00<?, ? examples/s]

In [None]:
# Bắt đầu quá trình huấn luyện
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 66,500 | Num Epochs = 1 | Total steps = 1,200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080 of 7,000,000,000 (1.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.413
20,0.6877
30,0.4232
40,0.4147
50,0.3939
60,0.3771
70,0.386
80,0.3691
90,0.3752
100,0.3588


In [None]:
trainer_stats

TrainOutput(global_step=1200, training_loss=0.33991364121437073, metrics={'train_runtime': 29784.503, 'train_samples_per_second': 0.322, 'train_steps_per_second': 0.04, 'total_flos': 1.8215400436826112e+17, 'train_loss': 0.33991364121437073})

### Inference sau khi huấn luyện

In [None]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer
import torch

# Kích hoạt chế độ inference nhanh
FastLanguageModel.for_inference(model)

# Gắn lại chat template chuẩn chatml (hoặc mistral nếu fine-tune theo kiểu đó)
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    map_eos_token=True,
)

# Hàm hỏi đáp inference
def chat(question, max_new_tokens=512, stream=False):
    messages = [{"from": "human", "value": question}]

    # Tạo prompt theo chuẩn chat
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Tùy chọn stream ra màn hình trực tiếp
    if stream:
        streamer = TextStreamer(tokenizer)
        _ = model.generate(
            input_ids=inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            top_p=1.0,
            top_k=1,
            repetition_penalty=1.1,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    else:
        # Sinh ra kết quả hoàn chỉnh
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            top_p=1.0,
            top_k=1,
            repetition_penalty=1.1,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Giải mã kết quả
        result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Cắt phần prompt nếu cần
        response = result.split(question.strip())[-1].strip()
        return response


Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


In [None]:
response = chat("If $x - y = X and $x + y = 12$, what is the value of $x$? If we know the answer to the above question is 9, what is the value of unknown variable X?")
print(response) # Đúng

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|>assistant
We are given two equations:
$x - y = X$ (Equation 1)
$x + y = 12$ (Equation 2)
To find the value of $x$, we can solve these equations simultaneously.
We can start by adding Equation 1 and Equation 2 together:
$(x - y) + (x + y) = X + 12$
Simplifying this equation gives us:
$2x = X + 12$
Now, we can substitute the value of $X$ into this equation:
$2x = 9 + 12$
$2x = 21$
Dividing both sides of the equation by 2, we get:
$x = \frac{21}{2}$
The value of $x$ is $\frac{21}{2}$.
The answer is: 21


In [None]:
chat("A radio show plays for 3 hours a day. They split their show into talking segments, ad breaks and songs. Talking segments last 10 minutes each, ad breaks last 5 minutes each and songs are played throughout the rest of the show. If the radio show includes 3 talking segments and x ad breaks in today’s show, how long, in minutes, does the show play songs? If we know the answer to the above question is 125, what is the value of unknown variable x?", stream=True)
# Đúng

<|im_start|>user
A radio show plays for 3 hours a day. They split their show into talking segments, ad breaks and songs. Talking segments last 10 minutes each, ad breaks last 5 minutes each and songs are played throughout the rest of the show. If the radio show includes 3 talking segments and x ad breaks in today’s show, how long, in minutes, does the show play songs? If we know the answer to the above question is 125, what is the value of unknown variable x?<|im_end|>
<|im_start|>assistant
The total duration of the show is 3 hours * 60 minutes/hour = 180 minutes.
Talking segments last 10 minutes each, so 3 talking segments will take up 10 * 3 = 30 minutes.
Ad breaks last 5 minutes each, so x ad breaks will take up 5x minutes.
Songs are played throughout the rest of the show, which means they take up 180 - 30 - 5x minutes.
We are given that the show plays songs for 125 minutes, so we can write: 180 - 30 - 5x = 125.
Simplifying the left side, we get: 150 - 5x = 125.
Subtracting 150 from

In [None]:
response = chat("In a charity race to raise money for hurricane victims, thirty students participated. Ten of them raised $20 each, while the remaining students raised $30 each. What is the total amount of money raised by the students in the race?")
print(response)
# Đúng

<|im_start|>assistant
The ten students who raised $20 each contributed 10 * $20 = $200.
The remaining twenty students who raised $30 each contributed 20 * $30 = $600.
Therefore, the total amount of money raised by the students in the race is $200 + $600 = $800.
#### 800
The answer is: 800


In [None]:
chat("Which two-digit positive integer is one more than a multiple of 2, 3, 4, 5, and 6?", stream=True)
# Sai, đáp là: 61

<|im_start|>user
Which two-digit positive integer is one more than a multiple of 2, 3, 4, 5, and 6?<|im_end|>
<|im_start|>assistant
The only number that satisfies all five conditions is $\boxed{10}$.
The answer is: 10<|im_end|>


In [None]:
chat("Simplify: $|{-3^2+4}|$", stream=True)
# Đúng :5

<|im_start|>user
Simplify: $|{-3^2+4}|$<|im_end|>
<|im_start|>assistant
First, we simplify the expression inside the absolute value: $3^2-4=9-4=5$.
Then, we take the absolute value of 5: $|5|=\boxed{5}$.The answer is: 5<|im_end|>


In [None]:
chat("Steven's teacher sends the class an assignment to collect x different fruit seeds. Apples average 6 seeds, pears average 2 seeds, and grapes average 3 seeds. Steven has set aside 4 apples, 3 pears, and 9 grapes to extract their seeds. How many more seeds does he need to fulfill his assignment? If we know the answer to the above question is 3, what is the value of unknown variable x?")
# Đáp án 60 => Đúng

'<|im_start|>assistant\nSteven has set aside 4 apples, which means he has 4 * 6 = 24 apple seeds.\nHe also has 3 pears, which means he has 3 * 2 = 6 pear seeds.\nAnd he has 9 grapes, which means he has 9 * 3 = 27 grape seeds.\nThe total number of seeds he already has is 24 + 6 + 27 = 57 seeds.\nTo fulfill his assignment, he needs to collect x different fruit seeds.\nSo the number of additional seeds he needs is x - 57.\nWe are given that the number of additional seeds needed is 3, so we can write: x - 57 = 3.\nSolving for x, we get: x = 60.\nThe value of x is 60.\n#### 60\nThe answer is: 60'

In [None]:
chat(
"""I am a two-digit number.
The sum of my digits is 11.
If you reverse my digits, the new number = the original number + 27.
What number am I?
""",
    stream=True
)
# Đáp án: 47 => Sai

<|im_start|>user
I am a two-digit number.
The sum of my digits is 11.
If you reverse my digits, the new number = the original number + 27.
What number am I?
<|im_end|>
<|im_start|>assistant
Let's call the two-digit number $AB$.
We are given that $A+B=11$ and $AB+27=AB+38$.
Simplifying the second equation gives us $AB=38$.
Substituting this into the first equation gives us $A+B=11$, so $A=6$ and $B=5$.
Therefore, the two-digit number is $\boxed{65}$.
The answer is: 65<|im_end|>


In [None]:
chat("At Frank's Fruit Market, 3 bananas cost as much as 2 apples, and 6 apples cost as much as 4 oranges. How many oranges cost as much as 18 bananas?", stream=True)
# Sai, đáp án là 8

<|im_start|>user
At Frank's Fruit Market, 3 bananas cost as much as 2 apples, and 6 apples cost as much as 4 oranges. How many oranges cost as much as 18 bananas?<|im_end|>
<|im_start|>assistant
Let $b$ represent the number of bananas, $a$ represent the number of apples, and $o$ represent the number of oranges. We are given that $3b=2a$, $6a=4o$, and $18b=xo$.

We can solve these equations to find the values of $b$, $a$, and $o$.

From $3b=2a$, we have $b=\frac{2}{3}a$.

Substituting this into $6a=4o$, we get $\frac{2}{3}a=\frac{4}{6}o$.

Simplifying, we have $a=\frac{2}{3}o$.

Now, let's substitute these values into $18b=xo$:

$18\left(\frac{2}{3}a\right)=x\left(\frac{2}{3}o\right)$.

Multiplying both sides by 3 to eliminate the fractions, we get $54a=6ox$.

Substituting $a=\frac{2}{3}o$, we have $54\left(\frac{2}{3}o\right)=6o\left(\frac{2}{3}o\right)$.

Simplifying, we have $36o=2o^2$.

Dividing both sides by $2o$, we get $18o=o^2$.

Rearranging, we have $o^2-18o=0$.

Factoring, we 

In [None]:
# from transformers import pipeline

# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# questions = [
#     "What is the integral of x^2?",
#     "Define a group in abstract algebra.",
#     "Prove that the square root of 2 is irrational.",
# ]

# for q in questions:
#     prompt = tokenizer.apply_chat_template(
#         [{"role": "user", "content": q}],
#         tokenize=False,
#         add_generation_prompt=True,
#     )
#     output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
#     print(f"\n❓ {q}\n🧠 {output[0]['generated_text']}\n" + "-"*60)


### Lưu mô hình

#### Tải mô hình

In [None]:
model.save_pretrained("mistral-metamathqa-lora-100k-p4")
tokenizer.save_pretrained("mistral-metamathqa-lora-100k-p4")

('mistral-metamathqa-lora-100k-p4/tokenizer_config.json',
 'mistral-metamathqa-lora-100k-p4/special_tokens_map.json',
 'mistral-metamathqa-lora-100k-p4/tokenizer.json')

In [None]:
import shutil

shutil.make_archive(
    base_name="/kaggle/working/mistral-metamathqa-lora-100k-p4",  # Tên file zip (không cần .zip ở đây)
    format="zip",  # Có thể là: 'zip', 'tar', 'gztar', 'bztar', 'xztar'
    root_dir="/kaggle/working/mistral-metamathqa-lora-100k-p4"
)

'/kaggle/working/mistral-metamathqa-lora-100k-p4.zip'

### Load mô hình lên HuggingFace

In [None]:
from huggingface_hub import create_repo

create_repo("mistral-metamathqa-lora-100k-p4", private=False)  # hoặc private=False nếu muốn công khai

RepoUrl('https://huggingface.co/KenyaWashed/mistral-metamathqa-lora-100k-p4', endpoint='https://huggingface.co', repo_type='model', repo_id='KenyaWashed/mistral-metamathqa-lora-100k-p4')

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="/kaggle/working/mistral-metamathqa-lora-100k-p4",
    repo_id="KenyaWashed/mistral-metamathqa-lora-100k-p4",  # sửa lại đúng username Hugging Face của bạn
    repo_type="model"
)


adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KenyaWashed/mistral-metamathqa-lora-100k-p4/commit/ae6e7ea2efd7d02f5c1764d65f4b427db892d911', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ae6e7ea2efd7d02f5c1764d65f4b427db892d911', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KenyaWashed/mistral-metamathqa-lora-100k-p4', endpoint='https://huggingface.co', repo_type='model', repo_id='KenyaWashed/mistral-metamathqa-lora-100k-p4'), pr_revision=None, pr_num=None)