In [1]:
!pip install protobuf==3.20.3 trl peft accelerate bitsandbytes unsloth

Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting trl
  Downloading trl-0.23.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth
  Downloading unsloth-2025.10.1-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2025.10.1 (from unsloth)
  Downloading unsloth_zoo-2025.10.1-py3-none-any.whl.metadata (31 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.32-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.meta

In [1]:
from unsloth import FastLanguageModel
import json
import torch
from datasets import Dataset
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTTrainer

def interactive_test(model, tokenizer):
    """
    Creates an interactive loop to test the model with user prompts.
    """
    print("\n--- Interactive Model Test ---")
    print("Enter a prompt to test the fine-tuned model.")
    print("Type 'save' to finish testing and save the GGUF model.")
    print("Type 'cancel' to exit without saving.")
    print("------------------------------------")

    while True:
        # Get input from the user
        user_input = input("\nPrompt: ")

        # Check for control commands
        if user_input.lower() == "save":
            print("\nProceeding to save the model...")
            return True  # Signal to continue and save
        elif user_input.lower() == "cancel":
            print("\nCanceling save. Exiting script.")
            return False  # Signal to exit without saving

        # Prepare the input for the model
        messages = [
            {"role": "user", "content": user_input},
        ]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        # Generate a response
        outputs = model.generate(input_ids=inputs, max_new_tokens=256, use_cache=True)
        response_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Clean up the output to only show the assistant's part
        # Note: The split logic may vary slightly based on the exact model output format.
        # This is a common way to parse it.
        assistant_response = response_text.split("<|assistant|>")
        if len(assistant_response) > 1:
            clean_response = assistant_response[1].strip()
        else:
            # Fallback for models that don't add the user prompt to the output
            # or have a different format.
            if "user" in response_text and "assistant" in response_text:
                 clean_response = response_text.split("assistant")[-1].strip()
            else:
                 clean_response = response_text

        print(f"Model: {clean_response}")

# Import Dataset
try:
    with open("data.json", "r") as f:
        file = json.load(f)
    print("Successfully loaded data.json.")
    print("Sample record:", file[1])
except FileNotFoundError:
    print("Error: data.json not found. Please upload it to the Colab session.")
except Exception as e:
    print(f"An error occurred: {e}")


# Load the base model
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
max_seq_length = 2048
dtype = None  # Auto detection

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

# Create the dataset directly from the loaded file
dataset = Dataset.from_list(file)

# Formatting prompts so they can be sent like this {"context": "今天天气不错，不冷不热。晚饭后，我们决定去公园散步。那里的风景很优美，空气也很新鲜。", "target_sentence": "真是个锻练身体的好地方。"}
def format_prompts(batch):
    """
    Takes a batch of examples and returns a list of formatted strings
    in a chat format where the user input is a JSON object.
    """
    contexts = batch["context"]
    target_sentences = batch["target_sentence"]
    outputs = batch["output"]

    texts = []
    for context, target, output in zip(contexts, target_sentences, outputs):
        input_json = {
            "context": context,
            "target_sentence": target
        }

        # The user provides the simple JSON, and the assistant provides the correction JSON followed by the EOS token.
        prompt = f"<|user|>\n{json.dumps(input_json, ensure_ascii=False)}\n<|assistant|>\n{json.dumps(output, ensure_ascii=False)}<|endoftext|>"
        texts.append(prompt)

    return texts

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Training arguments
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    formatting_func=format_prompts,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_strategy="epoch",
        save_total_limit=2,
        dataloader_pin_memory=False, # Important for Colab
        report_to="none",
    ),
)

# Train the model
print("\n--- Starting Model Training ---")
trainer_stats = trainer.train()
print("--- Model Training Finished ---")


# Merge and save the 16-bit model
merged_model_path = "merged_16bit_model"
model.save_pretrained_merged(merged_model_path, tokenizer, save_method="merged_16bit")

# Reload the merged model for testing
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=merged_model_path,
    dtype=dtype,
    load_in_4bit=False,
)

# Run interactive test
should_save = interactive_test(model, tokenizer)

# Save the final GGUF model if requested
if should_save:
    model.save_pretrained_gguf(
        "gguf_model", tokenizer, quantization_method="q4_k_m"
    )
    print("\nGGUF model saved successfully in the 'gguf_model' file.")
    print("You can download it from the file browser on the left.")
else:
    print("\nExiting. The final GGUF model was not saved.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Successfully loaded data.json.
Sample record: {'context': '这个星期工作太忙了，我感觉很累。终于等到周末可以放松一下了。明天是周末，你有什么计划吗？', 'target_sentence': '我打算和朋友一起去看电影。', 'output': {'is_correction_needed': False, 'corrected_sentence': '我打算和朋友一起去看电影。', 'reasoning': 'The sentence is grammatically correct and fits the context. No changes are needed.'}}
==((====))==  Unsloth 2025.10.1: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.10.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/227 [00:00<?, ? examples/s]


--- Starting Model Training ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 227 | Num Epochs = 3 | Total steps = 87
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,0.9816
50,0.5691
75,0.386


--- Model Training Finished ---


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [01:24<01:24, 84.84s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [02:00<00:00, 60.35s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [02:42<00:00, 81.13s/it]


Unsloth: Merge process complete. Saved to `/content/merged_16bit_model`
==((====))==  Unsloth 2025.10.1: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


--- Interactive Model Test ---
Enter a prompt to test the fine-tuned model.
Type 'save' to finish testing and save the GGUF model.
Type 'cancel' to exit without saving.
------------------------------------

Prompt: {"context": "今天天气不错，不冷不热。晚饭后，我们决定去公园散步。那里的风景很优美，空气也很新鲜。", "target_sentence": "真是个锻练身体的好地方。"}


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model: {"context": "今天天气不错，不冷不热。晚饭后，我们决定去公园散步。那里的风景很优美，空气也很新鲜。", "target_sentence": "真是个锻练身体的好地方。"} {"corrected_sentence": "真是个锻炼身体的好地方。", "is_correction_needed": true, "reasoning": "This is a character error. '锻' (duàn) means 'to forge' or 'to temper.' The correct character for 'to exercise' or 'to work out' is '锻' (duàn)."}

Prompt: save

Proceeding to save the model...


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.92 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 205.28it/s]

Unsloth: Saving tokenizer...




 Done.
Unsloth: Saving gguf_model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gguf_model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at gguf_model into f16 GGUF format.
The output location will be /content/gguf_model/unsloth.F16.gguf
This might take 3 minutes...


Unsloth: Extending gguf_model/tokenizer.model with added_tokens.json.
Originally tokenizer.model is of size (32000).
But we need to extend to sentencepiece vocab size (32011).


INFO:hf-to-gguf:Loading model: gguf_model
INFO:hf-to-gguf:Model architecture: MistralForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 32064}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> F16, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf: