In [None]:
from google.colab import drive

# Mount Google Drive so files under /content/drive are reachable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and other dependencies.
# The %%capture cell magic above captures this cell's output, so the pip logs are not shown.
# NOTE(review): Unsloth is installed straight from the git repository with no tag/commit,
# so the installed version is whatever the default branch holds at run time.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the listed packages and skips their dependencies.
# NOTE(review): the xformers/trl upper bounds presumably match an older Unsloth release - confirm they are still wanted.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes


In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loading options ---------------------------------------------------------
# Sequence length handed to the loader; the trainer cell reuses this value.
max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Collect the keyword arguments first, then load the model and tokenizer in one call.
load_options = dict(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
# Projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE(review): `model` is rebound to the adapter-wrapped model, so this cell is
# presumably meant to run once per freshly loaded model - confirm before re-running.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                                 # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                       # Supports any, but = 0 is optimized
    bias = "none",                          # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                     # rank stabilized LoRA is off
    loftq_config = None,                    # LoftQ is off
)


Unsloth 2026.2.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from datasets import load_dataset

# UPDATE THIS PATH to your uploaded file.
# The records must provide the "instruction", "input" and "output" fields,
# because formatting_prompts_func reads exactly those three columns.
dataset_file = "/content/drive/MyDrive/Text_Book_Chunk/medqa_finetune_data.jsonl"

# Read the file with the "json" loader and take its "train" split.
dataset = load_dataset("json", data_files=dataset_file, split="train")

# Prompt template (Alpaca style). The three {} placeholders are filled
# positionally with instruction, input and output by formatting_prompts_func.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func
# appends it to every rendered training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of records into training texts.

    Args:
        examples: mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one string per record:
        ``prompt_style`` filled with (instruction, input, output), followed
        by ``EOS_TOKEN``. Records are paired with ``zip``, so the result is
        as long as the shortest of the three columns.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        prompt_style.format(instr, context, answer) + EOS_TOKEN
        for instr, context, answer in columns
    ]
    return {"text": rendered}

# Apply the formatter batch-wise; it contributes the "text" column used for training.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12723 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import torch

# Clear memory before starting
torch.cuda.empty_cache()

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
bf16_available = is_bfloat16_supported()

training_args = TrainingArguments(
    # Batch size 1 to fit on a T4 GPU; 8 accumulation steps give an
    # effective batch size of 1 x 8 = 8.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    warmup_steps = 5,
    # 30 steps is a quick demo run. For a full pass over the data, drop
    # max_steps and set num_train_epochs = 1 instead.
    max_steps = 30,
    # num_train_epochs = 1,
    learning_rate = 2e-4,
    fp16 = not bf16_available,
    bf16 = bf16_available,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Disable WandB prompts
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

trainer_stats = trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/12723 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,723 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
1,2.3676
2,2.3181
3,2.5121
4,2.3195
5,2.1235
6,2.0409
7,1.7604
8,1.7588
9,1.6494
10,1.3839


In [None]:
# Select a random example to test
from random import randint
# randint includes BOTH endpoints, so the upper bound must be len(dataset) - 1;
# using len(dataset) could produce an index one past the end (IndexError).
idx = randint(0, len(dataset) - 1)
sample = dataset[idx]  # only used by the optional "Run on the sample" line below

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Build one prompt from the training template: the question goes into the
# instruction slot, while the input and response slots are left empty so the
# model generates the response.
inputs = tokenizer(
[
    prompt_style.format(
        "Answer the following multiple-choice question about medicine.\n\nQuestion:\nA 40-year-old zookeeper presents to the emergency department complaining of severe abdominal pain that radiates to her back, and nausea. The pain started 2 days ago and slowly increased until she could not tolerate it any longer. Past medical history is significant for hypertension and hypothyroidism. Additionally, she reports that she was recently stung by one of the zoo\u2019s smaller scorpions, but did not seek medical treatment. She takes aspirin, levothyroxine, oral contraceptive pills, and a multivitamin daily. Family history is noncontributory. Today, her blood pressure is 108/58 mm Hg, heart rate is 99/min, respiratory rate is 21/min, and temperature is 37.0\u00b0C (98.6\u00b0F). On physical exam, she is a well-developed, obese female that looks unwell. Her heart has a regular rate and rhythm. Radial pulses are weak but symmetric. Her lungs are clear to auscultation bilaterally. Her lateral left ankle is swollen, erythematous, and painful to palpate. An abdominal CT is consistent with acute pancreatitis. Which of the following is the most likely etiology for this patient\u2019s disease?\n\nOptions:\nA: Aspirin\nB: Oral contraceptive pills\nC: Scorpion sting\nD: Hypothyroidism\nE: Obesity", # Replace with a real question from your set if you want
        "",
        ""
    )
], return_tensors = "pt").to("cuda")

# Alternative: prompt with the randomly selected training sample instead of the hard-coded question.
# inputs = tokenizer([sample['text'].split("### Response:")[0] + "### Response:"], return_tensors = "pt").to("cuda") # Run on the sample

# Generate at most 64 new tokens and print the decoded sequences (prompt included).
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs))


['<｜begin▁of▁sentence｜>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnswer the following multiple-choice question about medicine.\n\nQuestion:\nA 40-year-old zookeeper presents to the emergency department complaining of severe abdominal pain that radiates to her back, and nausea. The pain started 2 days ago and slowly increased until she could not tolerate it any longer. Past medical history is significant for hypertension and hypothyroidism. Additionally, she reports that she was recently stung by one of the zoo’s smaller scorpions, but did not seek medical treatment. She takes aspirin, levothyroxine, oral contraceptive pills, and a multivitamin daily. Family history is noncontributory. Today, her blood pressure is 108/58 mm Hg, heart rate is 99/min, respiratory rate is 21/min, and temperature is 37.0°C (98.6°F). On physical exam, she is a well-develope

In [None]:
import os
import glob
from google.colab import files

# Save to GGUF (quantized)
# Note: If you already ran this and it succeeded, you can skip this line to save time,
# or run it again to be sure.
# Best effort on purpose: a failure is reported but does not stop the cell, so a
# GGUF file produced by an earlier run can still be located below.
try:
    model.save_pretrained_gguf("model_gguf", tokenizer, quantization_method = "q4_k_m")
except Exception as e:
    print(f"Save failed or already exists: {e}")

# Find the file and download it
print("Searching for GGUF file...")
# The export clones llama.cpp into the working directory, and that checkout ships
# its own small ggml-vocab-*.gguf files. Skip anything under a "llama.cpp"
# directory so only exported model files remain as download candidates.
gguf_files = [
    path
    for path in glob.glob("**/*.gguf", recursive=True)
    if "llama.cpp" not in path.split(os.sep)
]


Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/891 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [02:13<06:39, 133.01s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [04:11<04:09, 124.54s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [05:21<01:39, 99.47s/it] 

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [05:32<00:00, 83.15s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [04:58<00:00, 74.59s/it]


Unsloth: Merge process complete. Saved to `/content/model_gguf`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...




Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['model_gguf_gguf/deepseek-r1-distill-llama-8b.F16.gguf']
Unsloth: [2] Converting GGUF f16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...


Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: All GGUF conversions completed successfully!
Generated files: ['model_gguf_gguf/deepseek-r1-distill-llama-8b.Q4_K_M.gguf']
Unsloth: No Ollama template mapping found for model 'unsloth/deepseek-r1-distill-llama-8b'. Skipping Ollama Modelfile
Unsloth: example usage for text only LLMs: llama.cpp/llama-cli --model model_gguf_gguf/deepseek-r1-distill-llama-8b.Q4_K_M.gguf -p "why is the sky blue?"
Searching for GGUF file...


In [None]:
# Download the most recently changed GGUF file found by the previous cell.
if not gguf_files:
    print("❌ No GGUF file found. Please check the output of the previous step.")
else:
    print(f"Found files: {gguf_files}")
    # Pick the candidate with the greatest os.path.getctime value.
    latest_file = max(gguf_files, key=os.path.getctime)
    print(f"Downloading: {latest_file}")
    files.download(latest_file)


Found files: ['llama.cpp/models/ggml-vocab-command-r.gguf', 'llama.cpp/models/ggml-vocab-baichuan.gguf', 'llama.cpp/models/ggml-vocab-falcon.gguf', 'llama.cpp/models/ggml-vocab-llama-spm.gguf', 'llama.cpp/models/ggml-vocab-aquila.gguf', 'llama.cpp/models/ggml-vocab-qwen2.gguf', 'llama.cpp/models/ggml-vocab-bert-bge.gguf', 'llama.cpp/models/ggml-vocab-gpt-2.gguf', 'llama.cpp/models/ggml-vocab-deepseek-llm.gguf', 'llama.cpp/models/ggml-vocab-llama-bpe.gguf', 'llama.cpp/models/ggml-vocab-deepseek-coder.gguf', 'llama.cpp/models/ggml-vocab-refact.gguf', 'llama.cpp/models/ggml-vocab-phi-3.gguf', 'llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf', 'llama.cpp/models/ggml-vocab-mpt.gguf', 'llama.cpp/models/ggml-vocab-starcoder.gguf', 'llama.cpp/models/ggml-vocab-gpt-neox.gguf', 'model_gguf_gguf/deepseek-r1-distill-llama-8b.Q4_K_M.gguf']
Downloading: model_gguf_gguf/deepseek-r1-distill-llama-8b.Q4_K_M.gguf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>