# **Step 1.** Install Python Packages

In [1]:
!pip install -U xformers --index-url https://download.pytorch.org/whl/cu121
!pip install --no-deps packaging ninja einops flash-attn trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting xformers
  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xformers
Successfully installed xformers-0.0.28.post3
Collecting ninja
  Downloading ninja-1.11.1.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.0.post2.tar.gz (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached ninja-1.11.1.2-py3-no

# **Step 2.** Import Python Packages

In [2]:
import torch
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# **Step 3.** Login to Your Hugging Face with hf_token. (write access token)

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

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Step 4.** Convert your JSON dataset to Llama3 finetuning format


In [16]:
import json
with open('dataset_transformed.json', 'r') as f:
  print(json.load(f))

[{'instruction': "You are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise report based on the patient's data and test results. Use the patient's specific data to contextualize your response. Follow these rules: <0.4: 'No clear evidence, further monitoring may be necessary.' >0.4 and <0.8: 'Significant evidence, evaluation needed.' >0.8: 'Pathology evident, immediate intervention recommended. Consider lung transplant or palliative care if necessary.'", 'input': {'sex': 'women', 'age': 64, 'condition': 'ex-smoker', 'probability': 0.89}, 'output': 'Given that this is a 64-year-old woman who is an ex-smoker, the pathology appears evident with an 89% probability. Immediate medical intervention is recommended.'}, {'instruction': "You are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise 

In [17]:
huggingface_user = "pablopertusa"
dataset_name = "ft-hackathon-3"

class Llama3InstructDataset:
    def __init__(self, data):
        self.data = data
        self.prompts = []
        self.create_prompts()

    def create_prompt(self, row):
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{row['instruction']}<|eot_id|><|start_header_id|>user<|end_header_id|>{row['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{row['output']}<|eot_id|>"""
        return prompt

    def create_prompts(self):
        for row in self.data:
            prompt = self.create_prompt(row)
            self.prompts.append(prompt)

    def get_dataset(self):
        df = pd.DataFrame({'prompt': self.prompts})
        return df

def create_dataset_hf(dataset):
    dataset.reset_index(drop=True, inplace=True)
    return DatasetDict({"train": Dataset.from_pandas(dataset)})

if __name__ == "__main__":
    with open('/content/dataset_transformed.json', 'r') as f:
        data = json.load(f)

    dataset = Llama3InstructDataset(data)
    df = dataset.get_dataset()

    processed_data_path = 'processed_data'
    os.makedirs(processed_data_path, exist_ok=True)

    llama3_dataset = create_dataset_hf(df)
    llama3_dataset.save_to_disk(os.path.join(processed_data_path, "llama3_dataset"))
    llama3_dataset.push_to_hub(f"{huggingface_user}/{dataset_name}")

Saving the dataset (0/1 shards):   0%|          | 0/15 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# **Step 5.** LoRa Finetuning Configurations
- "finetuned_model" sets your models name on HF
- "num_train_epochs" sets the number of epochs for training

    (epoch = 1 pass through your entire dataset)

In [18]:
# Defining the configuration for the base model, LoRA and training
config = {
    "hugging_face_username":huggingface_user,
    "model_config": {
        "base_model":"unsloth/Llama-3.2-3B-Instruct", # The base model
        "finetuned_model":"Llama-hackathon-3", # The finetuned model
        "max_seq_length": 1024, # The maximum sequence length
        "dtype":torch.float16, # The data type
        "load_in_4bit": True, # Load the model in 4-bit
    },
    "lora_config": {
      "r": 8, # The number of LoRA layers 8, 16, 32, 64
      "target_modules": ["q_proj", "v_proj"], # The target modules
      "lora_alpha":16, # The alpha value for LoRA
      "lora_dropout":0, # The dropout value for LoRA
      "bias":"none", # The bias for LoRA
      "use_gradient_checkpointing":False, # Use gradient checkpointing
      "use_rslora":False, # Use RSLora
      "use_dora":False, # Use DoRa
      "loftq_config":None # The LoFTQ configuration
    },
    "training_dataset":{
        "name":f"{huggingface_user}/{dataset_name}", # The dataset name(huggingface/datasets)
        "split":"train", # The dataset split
        "input_field":"prompt", # The input field
    },
    "training_config": {
        "per_device_train_batch_size": 2, # The batch size
        "gradient_accumulation_steps": 4, # The gradient accumulation steps
        "warmup_steps": 5, # The warmup steps
        "max_steps":0, # The maximum steps (0 if the epochs are defined)
        "num_train_epochs": 12, # The number of training epochs(0 if the maximum steps are defined)
        "learning_rate": 2e-4, # The learning rate
        "fp16": not torch.cuda.is_bf16_supported(),  # The fp16
        "bf16": torch.cuda.is_bf16_supported(), # The bf16
        "logging_steps": 1, # The logging steps
        "optim" :"adamw_8bit", # The optimizer
        "weight_decay" : 0.01,  # The weight decay
        "lr_scheduler_type": "linear", # The learning rate scheduler
        "seed" : 42, # The seed
        "output_dir" : "outputs", # The output directory
    }
}

# **Step 6.** Load Llama3-8B, QLoRA & Trainer Model

In [19]:
# Loading the model and the tokinizer for the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.get("model_config").get("base_model"),
    max_seq_length = config.get("model_config").get("max_seq_length"),
    dtype = config.get("model_config").get("dtype"),
    load_in_4bit = config.get("model_config").get("load_in_4bit"),
)

# Setup for QLoRA/LoRA peft of the base model
model = FastLanguageModel.get_peft_model(
    model,
    r = config.get("lora_config").get("r"),
    target_modules = config.get("lora_config").get("target_modules"),
    lora_alpha = config.get("lora_config").get("lora_alpha"),
    lora_dropout = config.get("lora_config").get("lora_dropout"),
    bias = config.get("lora_config").get("bias"),
    use_gradient_checkpointing = config.get("lora_config").get("use_gradient_checkpointing"),
    random_state = 42,
    use_rslora = config.get("lora_config").get("use_rslora"),
    use_dora = config.get("lora_config").get("use_dora"),
    loftq_config = config.get("lora_config").get("loftq_config"),
)

# Loading the training dataset
dataset_train = load_dataset(config.get("training_dataset").get("name"), split = config.get("training_dataset").get("split"))

# Setting up the trainer for the model
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = config.get("training_dataset").get("input_field"),
    max_seq_length = config.get("model_config").get("max_seq_length"),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = config.get("training_config").get("per_device_train_batch_size"),
        gradient_accumulation_steps = config.get("training_config").get("gradient_accumulation_steps"),
        warmup_steps = config.get("training_config").get("warmup_steps"),
        max_steps = config.get("training_config").get("max_steps"),
        num_train_epochs= config.get("training_config").get("num_train_epochs"),
        learning_rate = config.get("training_config").get("learning_rate"),
        fp16 = config.get("training_config").get("fp16"),
        bf16 = config.get("training_config").get("bf16"),
        logging_steps = config.get("training_config").get("logging_steps"),
        optim = config.get("training_config").get("optim"),
        weight_decay = config.get("training_config").get("weight_decay"),
        lr_scheduler_type = config.get("training_config").get("lr_scheduler_type"),
        seed = 42,
        output_dir = config.get("training_config").get("output_dir"),
    ),
)

==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/267 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.59k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/15 [00:00<?, ? examples/s]

# **Step 7.** Train Your Finetuned Model

In [20]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15 | Num Epochs = 12
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 24
 "-____-"     Number of trainable parameters = 2,293,760


Step,Training Loss
1,2.9598
2,2.9758
3,2.9492
4,2.9245
5,2.8481
6,2.7713
7,2.6684
8,2.5519
9,2.4931
10,2.3681


# **Step 8.** Save Trainer Stats

In [21]:
with open("trainer_stats.json", "w") as f:
    json.dump(trainer_stats, f, indent=4)

# **Step 9.** Save Finetuned Model & Push to HF Hub

In [22]:
model.save_pretrained_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.44 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 65.98it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Llama-hackathon-3/pytorch_model-00001-of-00002.bin...
Unsloth: Saving Llama-hackathon-3/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Llama-hackathon-3 into f16 GGUF format.
The output location will be /content/Llama-hackathon-3/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-hackathon-3
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope

In [23]:
model.push_to_hub_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.44 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 74.72it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Llama-hackathon-3/pytorch_model-00001-of-00002.bin...
Unsloth: Saving Llama-hackathon-3/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Llama-hackathon-3 into f16 GGUF format.
The output location will be /content/Llama-hackathon-3/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-hackathon-3
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/pablopertusa/Llama-hackathon-3


# **Step 10.** Test your pretrained model in Colab

In [24]:
# Loading the fine-tuned model and the tokenizer for inference
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = config.get("model_config").get("finetuned_model"),
        max_seq_length = config.get("model_config").get("max_seq_length"),
        dtype = config.get("model_config").get("dtype"),
        load_in_4bit = config.get("model_config").get("load_in_4bit"),
    )

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)
system_prompt =  "You are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise report based on the patient's data and test results. Use the patient's specific data to contextualize your response. Follow these rules: ≤0.4: 'No clear evidence, further monitoring may be necessary.' >0.4 and ≤0.8: 'Significant evidence, evaluation needed.' >0.8: 'Pathology evident, immediate intervention recommended. Consider lung transplant or palliative care if necessary.'"
# Tokenizing the input and generating the output
prompt = input('TYPE PROMPT TO LLAMA3: ')
inputs = tokenizer(
[
    f"<|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>{prompt}<|end_header_id|>"
], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs, skip_special_tokens = True)

==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

TYPE PROMPT TO LLAMA3: {'sex': 'women', 'age': 42, 'condition': 'never smoked', 'probability': 0.23}


["systemYou are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise report based on the patient's data and test results. Use the patient's specific data to contextualize your response. Follow these rules: ≤0.4: 'No clear evidence, further monitoring may be necessary.' >0.4 and ≤0.8: 'Significant evidence, evaluation needed.' >0.8: 'Pathology evident, immediate intervention recommended. Consider lung transplant or palliative care if necessary.'user{'sex': 'women', 'age': 42, 'condition': 'never smoked', 'probability': 0.23}assistant\n\nBased on the patient's data, the report is:\n\nSignificant evidence, evaluation needed.\n\nThis is because the patient's probability of having Idiopathic Pulmonary Fibrosis (IPF) is 0.23, which is greater than 0.4. However, it's essential to note that a probability value does not make a diagnosis. Further evaluation is necessary to confirm the diagnosis an

In [25]:
system_prompt =  "You are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise report based on the patient's data and test results. Use the patient's specific data to contextualize your response. Follow these rules: ≤0.4: 'No clear evidence, further monitoring may be necessary.' >0.4 and ≤0.8: 'Significant evidence, evaluation needed.' >0.8: 'Pathology evident, immediate intervention recommended. Consider lung transplant or palliative care if necessary.'"
# Tokenizing the input and generating the output
prompt = input('TYPE PROMPT TO LLAMA3: ')
inputs = tokenizer(
[
    f"<|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>{prompt}<|end_header_id|>"
], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs, skip_special_tokens = True)

TYPE PROMPT TO LLAMA3: {'sex': 'man', 'age': 76, 'condition': 'never smoked', 'probability': 0.89}


["systemYou are a support tool designed to assist doctors in assessing whether a patient may have Idiopathic Pulmonary Fibrosis (IPF). Your task is to provide a concise report based on the patient's data and test results. Use the patient's specific data to contextualize your response. Follow these rules: ≤0.4: 'No clear evidence, further monitoring may be necessary.' >0.4 and ≤0.8: 'Significant evidence, evaluation needed.' >0.8: 'Pathology evident, immediate intervention recommended. Consider lung transplant or palliative care if necessary.'user{'sex':'man', 'age': 76, 'condition': 'never smoked', 'probability': 0.89}assistant\n\nBased on the patient's data, the assessment is as follows:\n\nThe patient's probability of having Idiopathic Pulmonary Fibrosis (IPF) is 0.89, which is significantly high (> 0.8). This suggests that the pathology of IPF is evident, and immediate intervention is recommended. Considering the patient's age and never-smoked status, lung transplant or palliative c