# **Step 1.** Install Python Packages

In [None]:
!pip install -U xformers --index-url https://download.pytorch.org/whl/cu121
!pip install --no-deps packaging ninja einops flash-attn trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting xformers
  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xformers
Successfully installed xformers-0.0.29.post1
Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.3.tar.gz (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Using cached ninja-1.11.1.3-py3-none-ma

In [None]:
# Mount Google Drive at /content/drive; the zip cells at the end of the
# notebook write their archives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Step 2.** Import Python Packages

In [None]:
import torch
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# **Step 3.** Log in to Hugging Face with an access token (hf_token) that has write access

In [None]:
# Authenticate with the Hugging Face Hub. Later cells call push_to_hub and
# push_to_hub_gguf, so the token entered here needs write access.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Step 4.** Convert your JSON dataset to Llama3 finetuning format


In [None]:
# Hub namespace and dataset repository name. They are combined as
# "<huggingface_user>/<dataset_name>" when the dataset is pushed in this cell
# and again in the training configuration.
huggingface_user = "ahsannadir"
dataset_name = "disease-symptoms"

class Llama3InstructDataset:
    """Render instruction/input/output records as Llama 3 chat-formatted strings.

    Every record in ``data`` must be subscriptable with the keys
    ``'instruction'``, ``'input'`` and ``'output'``. All prompts are built
    eagerly in the constructor and stored, in input order, in ``self.prompts``.
    """

    def __init__(self, data):
        self.data = data
        self.prompts = []
        self.create_prompts()

    def create_prompt(self, row):
        """Return one record as a system / user / assistant conversation string."""
        # NOTE(review): no blank line is emitted after each <|end_header_id|>;
        # confirm this is intentional, since inference must use the same layout.
        system_turn = f"<|start_header_id|>system<|end_header_id|>{row['instruction']}<|eot_id|>"
        user_turn = f"<|start_header_id|>user<|end_header_id|>{row['input']}<|eot_id|>"
        assistant_turn = f"<|start_header_id|>assistant<|end_header_id|>{row['output']}<|eot_id|>"
        return "<|begin_of_text|>" + system_turn + user_turn + assistant_turn

    def create_prompts(self):
        """Append one rendered prompt per record of ``self.data`` to ``self.prompts``."""
        self.prompts.extend(map(self.create_prompt, self.data))

    def get_dataset(self):
        """Return the rendered prompts as a DataFrame with a single 'prompt' column."""
        return pd.DataFrame({'prompt': self.prompts})

def create_dataset_hf(dataset):
    """Wrap a pandas DataFrame in a DatasetDict with a single "train" split.

    Args:
        dataset: DataFrame whose columns become the dataset features.

    Returns:
        A ``DatasetDict`` whose only key is ``"train"``.

    The index is replaced with a fresh 0..n-1 range on a copy rather than
    in place, so the caller's DataFrame is not modified by this call.
    """
    train_frame = dataset.reset_index(drop=True)
    return DatasetDict({"train": Dataset.from_pandas(train_frame)})

if __name__ == "__main__":
    # Read the raw records. The encoding is given explicitly so decoding does
    # not depend on the runtime's locale.
    with open('/content/dataset.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Render every record into a single 'prompt' string column.
    dataset = Llama3InstructDataset(data)
    df = dataset.get_dataset()

    processed_data_path = 'processed_data'
    os.makedirs(processed_data_path, exist_ok=True)

    # Keep a local copy on disk and publish the same data to the Hub, where
    # the training cell loads it from by name.
    llama3_dataset = create_dataset_hf(df)
    llama3_dataset.save_to_disk(os.path.join(processed_data_path, "llama3_dataset"))
    llama3_dataset.push_to_hub(f"{huggingface_user}/{dataset_name}")

Saving the dataset (0/1 shards):   0%|          | 0/4498 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

# **Step 5.** LoRA Finetuning Configurations
- "finetuned_model" sets your model's name on HF
- "num_train_epochs" sets the number of epochs for training

    (1 epoch = one pass through your entire dataset)

In [None]:
# Configuration for the base model, the LoRA adapter, the dataset and training.
# Every later cell reads its settings from this dictionary.
config = {
    "hugging_face_username":huggingface_user,
    "model_config": {
        "base_model":"unsloth/llama-3-8b-Instruct-bnb-4bit", # The base model
        "finetuned_model":"llama-3-8b-instruct-aidoctor", # The finetuned model
        "max_seq_length": 2048, # The maximum sequence length
        # None lets Unsloth pick the dtype for the GPU. Forcing float16 here
        # conflicted with the auto-detected fp16/bf16 flags below and was
        # overridden on bf16-capable devices anyway.
        "dtype":None, # The data type (None = auto-detect)
        "load_in_4bit": True, # Load the model in 4-bit
    },
    "lora_config": {
      "r": 16, # The LoRA rank (common choices: 8, 16, 32, 64)
      "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"], # The target modules
      "lora_alpha":16, # The alpha value for LoRA
      "lora_dropout":0, # The dropout value for LoRA
      "bias":"none", # The bias for LoRA
      "use_gradient_checkpointing":True, # Use gradient checkpointing
      "use_rslora":False, # Use RSLora
      "use_dora":False, # Use DoRa
      "loftq_config":None # The LoFTQ configuration
    },
    "training_dataset":{
        "name":f"{huggingface_user}/{dataset_name}", # The dataset name(huggingface/datasets)
        "split":"train", # The dataset split
        "input_field":"prompt", # The input field
    },
    "training_config": {
        "per_device_train_batch_size": 2, # The batch size
        "gradient_accumulation_steps": 4, # The gradient accumulation steps
        "warmup_steps": 5, # The warmup steps
        "max_steps":0, # The maximum steps (0 if the epochs are defined)
        "num_train_epochs": 5, # The number of training epochs(0 if the maximum steps are defined)
        "learning_rate": 2e-4, # The learning rate
        "fp16": not torch.cuda.is_bf16_supported(),  # Use fp16 only when bf16 is unavailable
        "bf16": torch.cuda.is_bf16_supported(), # Use bf16 when the GPU supports it
        "logging_steps": 1, # The logging steps
        "optim" :"adamw_8bit", # The optimizer
        "weight_decay" : 0.01,  # The weight decay
        "lr_scheduler_type": "linear", # The learning rate scheduler
        "seed" : 42, # The seed
        "output_dir" : "outputs", # The output directory
    }
}

# **Step 6.** Load Llama3-8B, QLoRA & Trainer Model

In [None]:
# Pull the configuration sections out once so each call below reads cleanly.
model_cfg = config["model_config"]
lora_cfg = config["lora_config"]
data_cfg = config["training_dataset"]
train_cfg = config["training_config"]

# Single source of truth for randomness: the seed declared in the config
# (previously 42 was hard-coded here and the configured value was ignored).
seed = train_cfg["seed"]

# Loading the model and the tokenizer for the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_cfg["base_model"],
    max_seq_length = model_cfg["max_seq_length"],
    dtype = model_cfg["dtype"],
    load_in_4bit = model_cfg["load_in_4bit"],
)

# Setup for QLoRA/LoRA peft of the base model.
# Every key of lora_config is forwarded as a keyword argument of the same name.
model = FastLanguageModel.get_peft_model(
    model,
    random_state = seed,
    **lora_cfg,
)

# Loading the training dataset
dataset_train = load_dataset(data_cfg["name"], split = data_cfg["split"])

# Setting up the trainer for the model.
# Every key of training_config (including "seed") is forwarded to
# TrainingArguments under the same name.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = data_cfg["input_field"],
    max_seq_length = model_cfg["max_seq_length"],
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(**train_cfg),
)

==((====))==  Unsloth 2025.1.7: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device supports bfloat16 but you selected float16. Will change to bfloat16.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth 2025.1.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


README.md:   0%|          | 0.00/275 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/409k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4498 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/4498 [00:00<?, ? examples/s]

# **Step 7.** Train Your Finetuned Model

In [None]:
# Run the fine-tuning loop. The returned value is kept so the next step can
# write it to trainer_stats.json.
# NOTE(review): the recorded run below stopped to ask for a wandb API key —
# presumably because no reporting backend is configured; confirm if unattended
# runs are needed.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,498 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,810
 "-____-"     Number of trainable parameters = 41,943,040


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
1,4.5387
2,2.2397
3,4.2514
4,4.1493
5,3.5496
6,2.7987
7,2.3026
8,1.8113
9,1.4526
10,1.2387


# **Step 8.** Save Trainer Stats

In [None]:
# trainer.train() returns transformers' TrainOutput, a NamedTuple. Dumping it
# directly writes a bare positional JSON array; converting it to a dict first
# keeps the field names in the file. Objects without _asdict are dumped as-is.
stats_record = trainer_stats._asdict() if hasattr(trainer_stats, "_asdict") else trainer_stats
with open("trainer_stats.json", "w", encoding="utf-8") as f:
    json.dump(stats_record, f, indent=4)

# **Step 9.** Save Finetuned Model & Push to HF Hub

In [None]:
# Export the fine-tuned model as a q4_k_m GGUF file: first into a local
# directory named after the fine-tuned model, then to the Hugging Face Hub.
# The recorded output below shows the merge and GGUF conversion running once
# for each of the two calls.
gguf_target = config.get("model_config").get("finetuned_model")
gguf_quantization = "q4_k_m"
model.save_pretrained_gguf(gguf_target, tokenizer, quantization_method = gguf_quantization)
model.push_to_hub_gguf(gguf_target, tokenizer, quantization_method = gguf_quantization)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 61.63 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 54.43it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at llama-3-8b-instruct-aidoctor into bf16 GGUF format.
The output location will be /content/llama-3-8b-instruct-aidoctor/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama-3-8b-instruct-aidoctor
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-

100%|██████████| 32/32 [00:00<00:00, 65.48it/s]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at llama-3-8b-instruct-aidoctor into bf16 GGUF format.
The output location will be /content/llama-3-8b-instruct-aidoctor/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama-3-8b-instruct-aidoctor
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weigh

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/ahsannadir/llama-3-8b-instruct-aidoctor


# **Step 10.** Test your finetuned model in Colab

In [None]:
# Loading the fine-tuned model and the tokenizer for inference
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = config.get("model_config").get("finetuned_model"),
        max_seq_length = config.get("model_config").get("max_seq_length"),
        dtype = config.get("model_config").get("dtype"),
        load_in_4bit = config.get("model_config").get("load_in_4bit"),
    )

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)

system_prompt = "You are a highly qualified AI doctor. Your task is to assist users by analyzing their symptoms to detect possible diseases, recommend the appropriate medical specialist, and provide precautions. Ensure your responses are accurate, professional, and empathetic."

# Build the prompt with the same turn layout the training data uses: each turn
# is closed with <|eot_id|>, and the prompt ends with an open assistant header
# so generation starts directly with the answer. The previous version ended the
# user turn with <|end_header_id|>, which did not match the training template.
# NOTE(review): <|begin_of_text|> is not written here, as before — presumably
# the tokenizer adds it; confirm.
prompt = input('ENTER PROMPT: ')
chat_text = (
    f"<|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|>"
    f"<|start_header_id|>user<|end_header_id|>{prompt}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>"
)

# Tokenizing the input and generating the output
inputs = tokenizer([chat_text], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs, skip_special_tokens = True)

==((====))==  Unsloth 2025.1.7: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ENTER PROMPT: I may have constipation,pain_during_bowel_movements,pain_in_anal_region,bloody_stool,irritation_in_anus


["systemYou are a highly qualified AI doctor. Your task is to assist users by analyzing their symptoms to detect possible diseases, recommend the appropriate medical specialist, and provide precautions. Ensure your responses are accurate, professional, and empathetic.userI may have constipation,pain_during_bowel_movements,pain_in_anal_region,bloody_stool,irritation_in_anusassistantI'm so sorry to hear that you're experiencing these symptoms. It's possible that you may have Hemorrhoids.\n\nHemorrhoids are swollen veins in the lower rectum or anus that can cause bleeding, itching, and pain. There are two main types: internal and external. Internal hemorrhoids are located inside the rectum and may not cause symptoms unless they become irritated or inflamed. External hemorrhoids occur outside the anus and can cause pain, itching, and bleeding.\n\nBased on your symptoms, it's possible that you have external hemorrhoids. However, it's important to note that there are other conditions that ca

In [None]:
# Archive the exported model directory to Google Drive.
# The recorded run of this cell was interrupted ("zip error: Interrupted").
!zip -r /content/drive/MyDrive/llama-3-8b-instruct-aidoctor.zip /content/llama-3-8b-instruct-aidoctor

  adding: content/llama-3-8b-instruct-aidoctor/ (stored 0%)
  adding: content/llama-3-8b-instruct-aidoctor/model-00003-of-00004.safetensors


zip error: Interrupted (aborting)


In [None]:
# Archive the /content/llama.cpp directory to Google Drive.
!zip -r /content/drive/MyDrive/ai-doctor-02.zip /content/llama.cpp

  adding: content/llama.cpp/ (stored 0%)
  adding: content/llama.cpp/Sources/ (stored 0%)
  adding: content/llama.cpp/Sources/llama/ (stored 0%)
  adding: content/llama.cpp/Sources/llama/module.modulemap (deflated 19%)
  adding: content/llama.cpp/Sources/llama/llama.h (stored 0%)
  adding: content/llama.cpp/pyproject.toml (deflated 48%)
  adding: content/llama.cpp/.pre-commit-config.yaml (deflated 47%)
  adding: content/llama.cpp/.gitmodules (deflated 21%)
  adding: content/llama.cpp/ci/ (stored 0%)
  adding: content/llama.cpp/ci/README.md (deflated 48%)
  adding: content/llama.cpp/ci/run.sh (deflated 89%)
  adding: content/llama.cpp/llama-quantize (deflated 62%)
  adding: content/llama.cpp/llama-cli (deflated 62%)
  adding: content/llama.cpp/pyrightconfig.json (deflated 47%)
  adding: content/llama.cpp/requirements/ (stored 0%)
  adding: content/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt (deflated 10%)
  adding: content/llama.cpp/requirements/requirements-test-t

In [None]:
# Archive the trainer output directory (checkpoints and run logs) to Google Drive.
!zip -r /content/drive/MyDrive/ai-doctor-02-outputs.zip /content/outputs

  adding: content/outputs/ (stored 0%)
  adding: content/outputs/checkpoint-2500/ (stored 0%)
  adding: content/outputs/checkpoint-2500/rng_state.pth (deflated 25%)
  adding: content/outputs/checkpoint-2500/scheduler.pt (deflated 55%)
  adding: content/outputs/checkpoint-2500/adapter_model.safetensors (deflated 7%)
  adding: content/outputs/checkpoint-2500/trainer_state.json (deflated 82%)
  adding: content/outputs/checkpoint-2500/adapter_config.json (deflated 56%)
  adding: content/outputs/checkpoint-2500/README.md (deflated 66%)
  adding: content/outputs/checkpoint-2500/optimizer.pt (deflated 12%)
  adding: content/outputs/checkpoint-2500/training_args.bin (deflated 51%)
  adding: content/outputs/checkpoint-2500/special_tokens_map.json (deflated 70%)
  adding: content/outputs/checkpoint-2500/tokenizer_config.json (deflated 96%)
  adding: content/outputs/checkpoint-2500/tokenizer.json (deflated 85%)
  adding: content/outputs/runs/ (stored 0%)
  adding: content/outputs/runs/Jan27_16-05

In [None]:
# Archive the locally saved copy of the processed dataset to Google Drive.
!zip -r /content/drive/MyDrive/ai-doctor-02-processed_data.zip /content/processed_data

  adding: content/processed_data/ (stored 0%)
  adding: content/processed_data/llama3_dataset/ (stored 0%)
  adding: content/processed_data/llama3_dataset/train/ (stored 0%)
  adding: content/processed_data/llama3_dataset/train/data-00000-of-00001.arrow (deflated 92%)
  adding: content/processed_data/llama3_dataset/train/dataset_info.json (deflated 37%)
  adding: content/processed_data/llama3_dataset/train/state.json (deflated 38%)
  adding: content/processed_data/llama3_dataset/dataset_dict.json (stored 0%)


In [None]:
# Report the total on-disk size of the exported model directory.
!du -sh /content/llama-3-8b-instruct-aidoctor

35G	/content/llama-3-8b-instruct-aidoctor
