<a href="https://colab.research.google.com/github/iqbal-waqar/Fine-Tuning/blob/main/Fine_Tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installation & Setup**

In [None]:
# Step 1: install the released Unsloth package from PyPI together with its dependencies.
!pip install unsloth # install unsloth


# Step 2: replace only the Unsloth package itself with the current GitHub version.
# --force-reinstall reinstalls even though Unsloth is already present, --no-cache-dir skips pip's cache,
# and --no-deps leaves the dependencies installed in step 1 untouched.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth!

In [None]:
# Import all necessary libraries:

# - Unsloth: For efficient model loading and training.
# - torch: The core PyTorch library for tensor operations and GPU management.
# - trl: Hugging Face's library for Transformer Reinforcement Learning (contains SFTTrainer).
# - huggingface_hub: To securely log in to Hugging Face and access models/datasets.
# - transformers: For the TrainingArguments to configure the training process.
# - datasets: To load the medical dataset from the Hugging Face hub.
# - wandb: For experiment tracking and logging training metrics.



from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset
import wandb

In [None]:
# Read the Hugging Face access token from Colab's "Secrets" manager, where it is
# stored under the name 'HF_API_KEY', so the token never appears in the notebook.
# The token is needed for any model or dataset that requires authentication.
from google.colab import userdata

hf_token = userdata.get('HF_API_KEY')

# Authenticate this session with the Hugging Face Hub.
login(token=hf_token)

In [None]:
# Sanity-check the runtime before loading anything heavy: report whether CUDA is
# usable and, if so, the name of GPU 0. Later cells move tensors to "cuda", so
# they cannot run without a GPU.
import torch

cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)

gpu_name = torch.cuda.get_device_name(0) if cuda_ready else "No GPU"
print("GPU device:", gpu_name)

# **Model Loading**

In [None]:
# Loading configuration for the base checkpoint (a DeepSeek-R1 distillation).

# Hugging Face Hub repository id of the pre-trained model.
model_name = "DeepSeek-AI/DeepSeek-R1-Distill-Llama-8B"

# Maximum sequence length the model is configured for; reused by the trainer later.
max_sequence_length = 2048

# None leaves the choice of data type (e.g. bfloat16) to Unsloth.
dtype = None

# Load the weights 4-bit quantized to cut GPU memory usage.
load_in_4bit = True

# Load the model together with its tokenizer.
# `token` authenticates the download (required when a repository is gated).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_sequence_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)

# **Prompt Engineering & Initial Test**

In [None]:
# Inference prompt template. It frames the model as a medical expert and asks for
# step-by-step reasoning before the answer.
# It has two positional `{}` placeholders, filled via `str.format`:
#   1st -> the question (under "### Query:")
#   2nd -> the answer (under "### Answer:"); passed as "" at inference so the model writes it.
# The "### Answer:" marker is also what the notebook splits on to extract the model's reply.


prompt_style = """
Below is a task description along with additional context provided in the input section. Your goal is to provide a well-reasoned response that effectively addresses the request.

Before crafting your answer, take a moment to carefully analyze the question. Develop a clear, step-by-step thought process to ensure your response is both logical and accurate.

### Task:
You are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.

### Query:
{}

### Answer:
{}
"""

In [None]:
# Benchmark medical question, asked both before and after fine-tuning so the
# two answers can be compared.
question = """A 68-year-old man with a history of atrial fibrillation on apixaban presents with sudden-onset right-sided
hemiplegia and global aphasia, confirmed by MRI to have an acute left MCA infarction secondary to a cardioembolic clot.
Following failed mechanical thrombectomy, his NIHSS score remains at 18. Given the known mechanism and his baseline
anticoagulation, what is the most precise pathophysiological reason that makes intravenous thrombolysis with alteplase
a potentially high-risk yet controversial consideration in this specific scenario?"""

# Put the base model into Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Fill the template (answer slot left empty for the model to complete),
# tokenize it as a batch of one and move the tensors to the GPU.
prompt = prompt_style.format(question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate a continuation of the prompt.
outputs = model.generate(
    input_ids=inputs.input_ids,
    # Marks which positions hold real tokens so padding is ignored.
    attention_mask=inputs.attention_mask,
    # Upper bound on the number of newly generated tokens.
    max_new_tokens=1200,
    # Reuse cached key/value states to speed up generation.
    use_cache=True,
)

# Turn the generated token ids back into text (one string per batch item).
response = tokenizer.batch_decode(outputs)

# Show the full decoded output, which includes the original prompt.
print(response)

In [None]:
# Show only the model's reply: the segment that follows the first
# "### Answer:" marker of the prompt template.
answer_text = response[0].split("### Answer:")[1]
print(answer_text)

# **Data Preparation**

In [None]:
# Load the medical reasoning dataset from the Hugging Face Hub.
# "en" selects the English configuration; "train[:600]" keeps only the first
# 600 rows of the train split to keep this demo run short.
#
# `trust_remote_code=True` was dropped on purpose: it permits executing Python
# code shipped in the dataset repository and is deprecated in recent `datasets`
# releases. It should not be needed here as long as the repository ships plain
# data files rather than a loading script.
medical_dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train[:600]",
)

In [None]:
# Display the second row (index 1) to inspect the dataset's fields.
# The preprocessing below reads the "Question", "Complex_CoT" and "Response" columns.

medical_dataset[1]

In [None]:
# Fetch the tokenizer's end-of-sequence (EOS) token. It is appended to every formatted
# training example so the model sees an explicit end marker after each response.

EOS_TOKEN = tokenizer.eos_token  # Appended to each training text; marks where a completed example ends
EOS_TOKEN

In [None]:
# Prompt template used to build the *training* texts.
# It has three positional `{}` placeholders, filled via `str.format` in this order:
#   1st -> the question (under "### Question:")
#   2nd -> the chain-of-thought reasoning (first slot under "### Response:")
#   3rd -> the final answer (second slot under "### Response:")
# NOTE(review): the section headers here ("### Question:" / "### Response:") differ from the
# inference template `prompt_style` ("### Query:" / "### Answer:") — confirm this is intended.


train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:

{}

{}"""

In [None]:
# Define a function to preprocess each example in the dataset into the format required for training.

def preprocess_input_data(examples):
    """Format a batch of dataset rows into complete training texts.

    Each row's question, chain-of-thought and final answer are inserted into
    ``train_prompt_style`` (in that order) and ``EOS_TOKEN`` is appended.

    Args:
        examples: A batch in column form (as produced by ``Dataset.map`` with
            ``batched=True``) holding the parallel lists ``"Question"``,
            ``"Complex_CoT"`` and ``"Response"``.

    Returns:
        A dict with the single key ``"texts"`` mapping to the list of
        formatted strings, one per input row.
    """
    # Loop names deliberately avoid `input`, which would shadow the builtin.
    texts = [
        train_prompt_style.format(question_text, reasoning, answer) + EOS_TOKEN
        for question_text, reasoning, answer in zip(
            examples["Question"],
            examples["Complex_CoT"],
            examples["Response"],
        )
    ]
    return {"texts": texts}

In [None]:
# Run the preprocessing function over the whole dataset in batches (batched=True passes
# lists of column values to the function). The result keeps the original columns and
# gains a "texts" column holding the formatted training strings.

finetune_dataset = medical_dataset.map(preprocess_input_data, batched = True)

In [None]:
# Display the first formatted training text to verify the template was filled in
# as expected (question, reasoning, answer, then the EOS token).

finetune_dataset["texts"][0]

# **Model Preparation for Fine-Tuning (Low-Rank Adaptation, LoRA)**

In [None]:
# Wrap the base model with LoRA (Low-Rank Adaptation) adapters, a parameter-efficient
# fine-tuning technique: instead of updating every weight of the model, only the small
# adapter matrices attached to the listed layers are trained.


model_lora = FastLanguageModel.get_peft_model(
    model=model,

    # Rank of the LoRA matrices. A lower rank means fewer trainable parameters.
    r=16,

    # Names of the layers that receive LoRA adapters
    # (attention projections and MLP projections).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],

    # Scaling factor for the LoRA update.
    lora_alpha=16,

    # Dropout probability for the LoRA layers (0 disables dropout).
    lora_dropout=0,

    # Do not train bias parameters.
    bias="none",

    # Unsloth's gradient-checkpointing mode: saves memory at the cost of extra compute.
    use_gradient_checkpointing="unsloth",

    # Seed for reproducibility. Uses the same value as the trainer's `seed=3407`;
    # the previous 3047 looked like a digit transposition of it.
    random_state=3407,

    # Do not use the rank-stabilized (RS-LoRA) variant.
    use_rslora=False,

    # Do not use LoftQ initialization.
    loftq_config=None,
)

In [None]:
# This function is passed to the SFTTrainer, which uses it to turn raw dataset rows into training texts.

def formatting_func(examples):
    """Build the training texts for a batch of dataset rows.

    Each row's question, chain-of-thought and final answer are inserted into
    ``train_prompt_style`` (in that order) and ``EOS_TOKEN`` is appended.

    Args:
        examples: A batch in column form holding the parallel lists
            ``"Question"``, ``"Complex_CoT"`` and ``"Response"``.

    Returns:
        A list of formatted strings, one per input row.
    """
    # Loop names deliberately avoid `input`, which would shadow the builtin.
    return [
        train_prompt_style.format(question_text, reasoning, answer) + EOS_TOKEN
        for question_text, reasoning, answer in zip(
            examples["Question"],
            examples["Complex_CoT"],
            examples["Response"],
        )
    ]

In [None]:
# Workaround: if the base model object carries a leftover `_unwrapped_old_generate`
# attribute, remove it. Intended as a fix for an Unsloth issue affecting `generate`
# after LoRA has been applied.
# NOTE(review): this checks `model`, while training and later inference use `model_lora` —
# confirm which object actually carries the attribute.

if hasattr(model, '_unwrapped_old_generate'):
    del model._unwrapped_old_generate

# **Training Setup & Execution**

In [None]:
# Training hyper-parameters, built separately so the trainer call below stays short.
bf16_ok = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step per device
    num_train_epochs=1,
    warmup_steps=5,
    max_steps=60,  # a positive max_steps takes precedence over num_train_epochs
    learning_rate=2e-4,
    fp16=not bf16_ok,  # fall back to fp16 only when bf16 is unavailable
    bf16=bf16_ok,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer; it runs the whole training loop.
trainer = SFTTrainer(
    # The LoRA-adapted model to train.
    model=model_lora,
    tokenizer=tokenizer,
    # Dataset produced by the preprocessing step.
    train_dataset=finetune_dataset,
    # Upper bound on the token length of a training sequence.
    max_seq_length=max_sequence_length,
    # Number of processes used for dataset preprocessing.
    dataset_num_proc=1,
    # Builds the training text for each example from the raw columns.
    formatting_func=formatting_func,
    args=training_args,
)

In [None]:
# Read the Weights & Biases API key from Colab's "Secrets" manager (stored under
# the name "WANDB_API_KEY"); it is used in the next cell to log in for experiment tracking.

from google.colab import userdata
wnb_token = userdata.get("WANDB_API_KEY")

In [None]:
# Authenticate with Weights & Biases using the key read from Colab's secrets.
wandb.login(key=wnb_token)

# Start a new run so training metrics (loss, learning rate, etc.) are tracked.
run = wandb.init(
    project="Fine-tune-DeepSeek-R1-on-Medical-CoT-Dataset",
    job_type="training",
    anonymous="allow",
)

In [None]:
# Run the fine-tuning loop configured above. The object returned by `train()`
# is kept in `trainer_stats` for later inspection.

trainer_stats = trainer.train()

In [None]:
# Mark the W&B run as finished so any remaining logged data is uploaded.

wandb.finish()

In [None]:
# Ask the same benchmark question again, this time with the fine-tuned model
# (model_lora), so the answer can be compared with the pre-training one.
question = """A 68-year-old man with a history of atrial fibrillation on apixaban presents with sudden-onset right-sided
hemiplegia and global aphasia, confirmed by MRI to have an acute left MCA infarction secondary to a cardioembolic clot.
Following failed mechanical thrombectomy, his NIHSS score remains at 18. Given the known mechanism and his baseline
anticoagulation, what is the most precise pathophysiological reason that makes intravenous thrombolysis with alteplase
a potentially high-risk yet controversial consideration in this specific scenario?"""

# Put the fine-tuned model into Unsloth's inference mode.
FastLanguageModel.for_inference(model_lora)

# Fill the inference template (answer slot left empty), tokenize it as a
# batch of one and move the tensors to the GPU.
prompt = prompt_style.format(question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens, reusing cached key/value states.
outputs = model_lora.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Turn the generated token ids back into text and show the full output.
response = tokenizer.batch_decode(outputs)

print(response)

In [None]:
# Show only the model's reply: the segment that follows the first "### Answer:" marker.
print(response[0].split("### Answer:")[1])

In [None]:
# A second, different medical question to probe the fine-tuned model.
question = """A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue,
              and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative,
              gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium.
              What is the most likely predisposing factor for this patient's condition?"""

# Make sure the fine-tuned model is in Unsloth's inference mode.
FastLanguageModel.for_inference(model_lora)

# Fill the inference template (answer slot left empty), tokenize it as a
# batch of one and move the tensors to the GPU.
prompt = prompt_style.format(question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens, reusing cached key/value states.
outputs = model_lora.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Decode the token ids and print only the segment after the first "### Answer:" marker.
response = tokenizer.batch_decode(outputs)
answer_text = response[0].split("### Answer:")[1]
print(answer_text)