In [1]:
# Part 1.2: Installing Necessary Libraries (Revised for Clarity and Debugging)
print("--- Installing Libraries ---")

# Install Unsloth first - try with [colab-new] if [colab] failed previously
# REMOVED -q to see output
print("Installing Unsloth (verbose)...")
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# If the above fails, try:
# !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"

# Install other core dependencies
# REMOVED -q to see output
# Ensure 'datasets' is definitely included here
print("\nInstalling other dependencies (verbose)...")
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes datasets

print("\n--- Library Installation Attempt Complete ---")
print("Please check the output above for any installation errors.")

--- Installing Libraries ---
Installing Unsloth (verbose)...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-km5w5m39/unsloth_1be8bb8dcd944bac9ed6ca8c0035cfb7
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-km5w5m39/unsloth_1be8bb8dcd944bac9ed6ca8c0035cfb7
  Resolved https://github.com/unslothai/unsloth.git to commit 7a8f99e1890213cdd01a3ab6c3e13174a96e8220
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.4.1 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.4.1-py3-none-any.whl.metadata (8.0 kB)
Collecting tyr


Installing other dependencies (verbose)...
Collecting xformers<0.0.27
  Downloading xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl (222.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.8/222.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.26.post1

--- Library Installation Attempt Complete ---
Please check the output above for any installation errors.


In [2]:
# 1.3. Importing Libraries
print("Importing libraries...")
import torch
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer, AutoTokenizer # Added AutoTokenizer for early loading
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
# Optional: For chat template setup if needed later (usually handled by unsloth)
# from unsloth.chat_templates import get_chat_template
print("Libraries imported.")

Importing libraries...



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, is_bfloat16_supported


Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.


    PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.9 (you have 3.11.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
Libraries imported.


In [3]:
# ############################################
# ## Part 2: Model and Dataset Selection
# ############################################
print("\n--- Part 2: Model and Dataset Selection ---")

# 2.1. Choosing a Base LLM
# We select unsloth's 4-bit quantized version of Llama 3.1 8B Instruct.
# NOTE: Accessing Meta models requires agreeing to their terms and Hugging Face authentication.
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
print(f"Selected base model: {model_name}")

# Hugging Face Login (if needed for gated models like Llama 3)
# Replace 'YOUR_HF_TOKEN' with your actual Hugging Face access token
# You can get a token from https://huggingface.co/settings/tokens
# Alternatively, use notebook_login() for interactive login.
from huggingface_hub import login, notebook_login
try:
    # Replace with your token or use notebook_login()
    # Example: login(token="hf_YOUR_TOKEN_HERE")
    # Using notebook_login for interactive use in Colab:
    print("Attempting Hugging Face login...")
    notebook_login()
    print("Hugging Face login successful (or already logged in).")
except Exception as e:
    print(f"Hugging Face login failed or not provided: {e}")
    print("Proceeding without explicit login. Model download might fail if it's gated and requires authentication.")


# 2.2. Identifying a Medical Q&A Dataset
# Using keivalya/MedQuad-MedicalQnADataset as per the report.
dataset_name = "keivalya/MedQuad-MedicalQnADataset"
print(f"Selected dataset: {dataset_name}")

# 2.3. Loading the Dataset
print("Loading dataset...")
try:
    # Load the training split
    dataset = load_dataset(dataset_name, split="train")
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset with 'train' split: {e}")
    print("Attempting to load default split...")
    try:
        # Fallback if 'train' split isn't the default key
        dataset = load_dataset(dataset_name)['train'] # Adjust key if necessary based on dataset structure
        print("Dataset loaded successfully using default key.")
    except Exception as e2:
        print(f"Failed to load dataset: {e2}")
        # Handle error appropriately, maybe exit or raise
        raise e2 # Stop execution if dataset fails to load

# Optional: Inspect the dataset
print("\nDataset structure:")
print(dataset)
print("\nExample entry:")
print(dataset[0]) # Print the first example

# ############################################
# ## Part 4 (Excerpt): Loading Model and Tokenizer
# ## Note: Moved earlier to make tokenizer available for data formatting
# ############################################
print("\n--- Loading Base Model & Tokenizer (from Part 4.1) ---")

max_seq_length = 2048 # Choose based on VRAM (1024 or 2048 recommended for T4)
# Dtype: None lets unsloth choose the best automatically (float16 for T4)
dtype = None
# Load model in 4-bit precision (QLoRA)
load_in_4bit = True

# Load the Llama 3.1 model optimized by Unsloth
# device_map="auto" automatically places the model on the available GPU
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # Token is usually handled by login() or notebook_login()
    device_map = "auto",
)

print("Model and tokenizer loaded.")

# Check bfloat16 support (T4 GPUs usually don't support it)
IS_BFLOAT16_SUPPORTED = is_bfloat16_supported()
print(f"Bfloat16 supported: {IS_BFLOAT16_SUPPORTED}")


--- Part 2: Model and Dataset Selection ---
Selected base model: unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit
Attempting Hugging Face login...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Hugging Face login successful (or already logged in).
Selected dataset: keivalya/MedQuad-MedicalQnADataset
Loading dataset...


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Dataset loaded successfully.

Dataset structure:
Dataset({
    features: ['qtype', 'Question', 'Answer'],
    num_rows: 16407
})

Example entry:
{'qtype': 'susceptibility', 'Question': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?', 'Answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.'}

--- Loading Base Model & Tokenizer (from Part 4.1) ---
==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA To

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Model and tokenizer loaded.
Bfloat16 supported: True


In [4]:
# ############################################
# ## Part 3: Preparing Data for Instruction Fine-Tuning
# ############################################
print("\n--- Part 3: Preparing Data ---")

# 3.1/3.2. Formatting using Llama 3 Chat Template
# We'll use the tokenizer's built-in chat template, which is best for Llama 3 Instruct.

# 3.3. Applying the Template (Method 1: Recommended)
print("Applying Llama 3 chat template to the dataset...")

def format_medquad_chat(example):
    # Extract question and answer, ensure they are strings
    question = str(example.get('Question', '')).strip()
    answer = str(example.get('Answer', '')).strip()

    # Basic validation: skip examples with empty question or answer
    if not question or not answer:
        return {"text": ""} # Return empty text for filtering

    # Add question mark if missing (simple heuristic)
    if not question.endswith("?"):
        question += "?"

    # Format using the Llama 3 chat template structure
    messages = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]

    # Use tokenizer.apply_chat_template
    # tokenize=False gives the formatted string
    # add_generation_prompt=False prevents adding the start of assistant's turn
    try:
        formatted_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        return {"text": formatted_text}
    except Exception as e:
        print(f"Error applying chat template: {e}")
        print(f"Problematic example: Q: {question}, A: {answer}")
        return {"text": ""} # Return empty on error

# Apply the formatting function across the dataset
# remove_columns ensures we only keep the 'text' column needed for SFTTrainer
dataset = dataset.map(format_medquad_chat, remove_columns=list(dataset.features))

# Filter out any examples that failed formatting or had empty Q/A
dataset = dataset.filter(lambda example: len(example['text']) > 0)

print("Dataset formatted.")

# Optional: Inspect a formatted example
print("\nExample formatted text:")
try:
    print(dataset[0]['text'])
except IndexError:
    print("Dataset is empty after filtering, check formatting logic and source data.")
    raise # Stop execution if dataset becomes empty


# 3.4. Tokenization (Handled by SFTTrainer)
# SFTTrainer will handle tokenization internally using the provided tokenizer.
# `max_seq_length` and `packing=True` will be set in the Trainer initialization.
print("\nTokenization will be handled by SFTTrainer.")


--- Part 3: Preparing Data ---
Applying Llama 3 chat template to the dataset...


Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16407 [00:00<?, ? examples/s]

Dataset formatted.

Example formatted text:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.<|eot_id|>

Tokenization will be handled by SFTTrainer.


In [5]:
# ############################################
# ## Part 4: QLoRA Fine-Tuning with Unsloth
# ############################################
print("\n--- Part 4: QLoRA Fine-Tuning ---")

# 4.1. Model & Tokenizer Loading (Assumed already done before Part 3)
# Variables 'model', 'tokenizer', 'max_seq_length', 'IS_BFLOAT16_SUPPORTED', 'dataset'
# are expected to be available from previous steps.

# 4.2. Configuring LoRA (Define parameters but don't create the object here for Unsloth's func)
# We'll pass these directly to get_peft_model below
print("Defining LoRA parameters...")
lora_r = 16
lora_alpha = 16
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj"]
lora_dropout = 0
lora_bias = "none"

# 4.3. Applying LoRA with Unsloth's optimization
print("Applying LoRA adapter to the model using Unsloth...")

# Pass LoRA parameters directly as keyword arguments to Unsloth's function
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_r,  # Pass rank directly
    target_modules = lora_target_modules, # Pass target_modules directly
    lora_alpha = lora_alpha, # Pass lora_alpha directly
    lora_dropout = lora_dropout, # Pass lora_dropout directly
    bias = lora_bias,    # Pass bias directly
    use_gradient_checkpointing = True, # Enable Unsloth's optimized gradient checkpointing
    random_state = 3407, # Set for reproducibility
    max_seq_length = max_seq_length, # Pass max_seq_length if needed by Unsloth here
    # task_type = "CAUSAL_LM", # Usually inferred by Unsloth
    # use_rslora = False, # Optional
    # loftq_config = None, # Optional
)
print("LoRA adapter applied.")
print("Trainable parameters overview:")
model.print_trainable_parameters()


# 4.4. Defining Training Arguments
print("Defining Training Arguments...")
training_arguments = TrainingArguments(
    output_dir = "./results-medquad-llama3-8b", # Directory for saving checkpoints
    num_train_epochs = 1, # Number of training epochs (1-3 recommended for LoRA)
    per_device_train_batch_size = 2, # Batch size per GPU (Keep low: 1 or 2 for T4)
    gradient_accumulation_steps = 8, # Accumulate gradients to simulate larger batch size (2 * 8 = 16 effective batch size)
    learning_rate = 1e-4, # Learning rate for LoRA (common: 2e-4, 1e-4, 5e-5)
    optim = "adamw_8bit", # Use 8-bit AdamW optimizer to save memory
    weight_decay = 0.01, # Weight decay
    lr_scheduler_type = "linear", # Learning rate scheduler
    warmup_steps = 10, # Warmup steps for the scheduler
    logging_steps = 5, # Log training information every 5 steps
    save_strategy = "epoch", # Save checkpoints at the end of each epoch
    # Use mixed precision based on GPU capability (fp16 on T4)
    fp16 = not IS_BFLOAT16_SUPPORTED, # Should be True for T4
    bf16 = IS_BFLOAT16_SUPPORTED, # Should be False for T4
    seed = 3407, # Seed for reproducibility
    report_to = "none", # Disable external reporting (like Weights & Biases) for simplicity
    # max_steps = 100, # Uncomment for quick testing runs (e.g., 60-100 steps)
)

# 4.5. Initializing the Trainer
print("Initializing SFTTrainer...")
trainer = SFTTrainer(
    model = model, # The PEFT-enhanced model from Unsloth
    tokenizer = tokenizer,
    args = training_arguments,
    train_dataset = dataset, # The prepared dataset from Part 3
    dataset_text_field = "text", # The column with formatted text
    max_seq_length = max_seq_length, # Max sequence length (consistent)
    packing = True, # Pack short sequences together for efficiency
    # formatting_func = formatting_func, # Alternative if not pre-formatting dataset
    # data_collator = ..., # Optional: if specific collation needed
)
print("SFTTrainer initialized.")

# 4.6. Executing the Training Loop
print("\n--- Starting Training ---")

# Optional: Display GPU memory usage before training
# Ensure torch is imported if running this block independently
import torch
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU: {gpu_stats.name}, Max Memory: {max_memory:.3f} GB")
print(f"Initial Reserved Memory: {start_gpu_memory:.3f} GB")

# Start training
print("Starting trainer.train()...")
training_results = trainer.train()

# Optional: Display GPU memory usage after training
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_percent = round(used_memory / max_memory * 100, 2)
print(f"\n--- Training Complete ---")
print(f"Peak Reserved Memory during training: {used_memory:.3f} GB ({used_memory_percent}%)")

# Display training stats
print("\nTraining Stats:")
print(training_results)

# 4.7. Saving the LoRA Adapter
print("\nSaving LoRA adapter...")
lora_adapter_path = "llama3-8b-instruct-medquad-lora-adapter"

# Save the trained LoRA adapter weights and config using the trainer
trainer.save_model(lora_adapter_path)
# Alternatively, you could use: model.save_pretrained(lora_adapter_path)

# Save the tokenizer (good practice, although likely unchanged)
tokenizer.save_pretrained(lora_adapter_path)

print(f"LoRA adapter and tokenizer saved to: {lora_adapter_path}")

print("\n--- Finished Part 4 ---")


--- Part 4: QLoRA Fine-Tuning ---
Defining LoRA parameters...
Applying LoRA adapter to the model using Unsloth...


Unsloth 2025.4.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


LoRA adapter applied.
Trainable parameters overview:
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
Defining Training Arguments...
Initializing SFTTrainer...


Generating train split: 0 examples [00:00, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,576 | Num Epochs = 1 | Total steps = 161
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


SFTTrainer initialized.

--- Starting Training ---
GPU: NVIDIA A100-SXM4-40GB, Max Memory: 39.557 GB
Initial Reserved Memory: 7.625 GB
Starting trainer.train()...
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,2.5028
10,1.2208
15,0.1723
20,0.102
25,0.0661
30,0.0607
35,0.0484
40,0.0257
45,0.0102
50,0.0076



--- Training Complete ---
Peak Reserved Memory during training: 7.625 GB (19.28%)

Training Stats:
TrainOutput(global_step=161, training_loss=0.13499091852814976, metrics={'train_runtime': 1603.5977, 'train_samples_per_second': 1.606, 'train_steps_per_second': 0.1, 'total_flos': 2.3888770530646426e+17, 'train_loss': 0.13499091852814976, 'epoch': 1.0})

Saving LoRA adapter...
LoRA adapter and tokenizer saved to: llama3-8b-instruct-medquad-lora-adapter

--- Finished Part 4 ---


In [6]:
# ############################################
# ## Part 5: Performing Inference
# ############################################
print("\n--- Part 5: Performing Inference with Fine-Tuned Model ---")

# 5.1. Reloading the Base Model (if necessary, e.g., after kernel restart)
# For inference, we can load the 4-bit model again.
# If the `model` object from training is still in memory and *unmerged*,
# you could potentially skip reloading the base and just load the adapter onto it.
# However, reloading ensures a clean state and demonstrates the full inference flow.

# Check if 'model' exists and might be the PEFT model; if so, clean up before reloading
if 'model' in locals():
    print("Clearing existing model from memory...")
    del model
    if 'trainer' in locals(): del trainer # Clear trainer too
    torch.cuda.empty_cache() # Release GPU memory

print("Reloading the base model for inference...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name, # Original base model name
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    device_map = "auto",
    # token = "hf_...", # Handled by login
)
print("Base model reloaded.")

# 5.2. Loading and Merging the LoRA Adapter
print(f"Loading saved LoRA adapter from: {lora_adapter_path}")
# Load the adapter onto the base model
# Note: If using the same 'model' object from training *before* merging,
# ensure it's the base model or handle potential PEFT layers correctly.
# Reloading base model simplifies this.
model = PeftModel.from_pretrained(model, lora_adapter_path)
print("LoRA adapter loaded onto the base model.")

# Merge the adapter weights into the base model for optimized inference
print("Merging adapter weights into the base model...")
model = model.merge_and_unload()
print("Adapter merged and unloaded. Model is now ready for standard inference.")

# 5.3. Generating Responses
print("\n--- Generating Responses to Medical Questions ---")

# Use TextStreamer for interactive output
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Define generation parameters
generation_config = dict(
    max_new_tokens=300,     # Limit response length
    temperature=0.6,        # Control randomness (lower = more deterministic)
    top_p=0.9,              # Nucleus sampling probability
    do_sample=True,         # Enable sampling-based generation
    pad_token_id=tokenizer.eos_token_id # Set pad token for generation
)


--- Part 5: Performing Inference with Fine-Tuned Model ---
Clearing existing model from memory...
Reloading the base model for inference...
==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model reloaded.
Loading saved LoRA adapter from: llama3-8b-instruct-medquad-lora-adapter
LoRA adapter loaded onto the base model.
Merging adapter weights into the base model...




Adapter merged and unloaded. Model is now ready for standard inference.

--- Generating Responses to Medical Questions ---


In [7]:
# --- Example 1 ---
test_question_1 = "What is Cysticercosis and how is it transmitted?"
print(f"\nExample 1: {test_question_1}")

# Format the prompt using the SAME Llama 3 chat template used for training data
messages_1 = [{"role": "user", "content": test_question_1}]
# add_generation_prompt=True is important for inference to signal model to generate assistant response
prompt_1 = tokenizer.apply_chat_template(messages_1, tokenize=False, add_generation_prompt=True)
print(f"Formatted Prompt 1:\n{prompt_1}")

# Tokenize the prompt and send to GPU
inputs_1 = tokenizer(prompt_1, return_tensors="pt").to("cuda")

# Generate the response
print("Model Response 1:")
_ = model.generate(
    inputs=inputs_1["input_ids"],
    streamer=streamer,
    **generation_config
)
print("\n---------------------------")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Example 1: What is Cysticercosis and how is it transmitted?
Formatted Prompt 1:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is Cysticercosis and how is it transmitted?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Model Response 1:
Cysticercosis is a disease caused by the larval stage of the Taenia solium, a type of tapeworm. It is a parasitic infection that affects humans and is usually acquired by consuming food or water contaminated with the eggs of the parasite.

Cysticercosis is transmitted in three main ways:

1.  **Ingestion of tapeworm eggs**: People can become infected by eating food or water contaminated with the eggs of the parasite. This can happen when they eat food that has not been cooked or processed properly, or when they drink contaminated water.
2.  **Ingestion of pork**: People can also become infected by eating underco

In [8]:
# --- Example 2 ---
test_question_2 = "What are the main symptoms of Leishmaniasis?"
print(f"\nExample 2: {test_question_2}")

messages_2 = [{"role": "user", "content": test_question_2}]
prompt_2 = tokenizer.apply_chat_template(messages_2, tokenize=False, add_generation_prompt=True)
print(f"Formatted Prompt 2:\n{prompt_2}")

inputs_2 = tokenizer(prompt_2, return_tensors="pt").to("cuda")

print("Model Response 2:")
_ = model.generate(
    inputs=inputs_2["input_ids"],
    streamer=streamer,
    **generation_config
)
print("\n---------------------------")


Example 2: What are the main symptoms of Leishmaniasis?
Formatted Prompt 2:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are the main symptoms of Leishmaniasis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Model Response 2:
Leishmaniasis is a disease caused by Leishmania parasites, which are transmitted by the bite of infected female phlebotomine sandflies. The symptoms of Leishmaniasis can vary depending on the form of the disease, which can be cutaneous, mucocutaneous, or visceral. Here are the main symptoms of Leishmaniasis:

**Cutaneous Leishmaniasis:**

* A small, painless, itchy skin lesion at the site of the sandfly bite
* The lesion can develop into a small, flat, painless nodule that may become larger and more inflamed
* The skin lesions can appear anywhere on the body, but are most commonly found on the arms, legs, or face
* The 

In [9]:
# Step 1: Mount Google Drive
from google.colab import drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# Step 2: Define the path in your Google Drive where you want to save the model
# IMPORTANT: Make sure this path exists or create it.
# Using 'MyDrive' which is the default root for your personal drive.
import os
drive_save_directory = "/content/drive/MyDrive/my_llm_models/llama3-8b-medquad-merged" # Choose your desired folder name

# Create the directory if it doesn't exist
os.makedirs(drive_save_directory, exist_ok=True)
print(f"Model save directory: {drive_save_directory}")

# Step 3: Save the merged model and tokenizer
# The 'model' variable should hold the merged model after model.merge_and_unload()
# The 'tokenizer' variable should be the one loaded alongside the base model
print("\nSaving merged model to Google Drive...")
# This will save model weights (like .safetensors) and config files (config.json etc.)
model.save_pretrained(drive_save_directory)
print("Merged model saved.")

print("\nSaving tokenizer to Google Drive...")
# This saves tokenizer files (tokenizer.json, tokenizer_config.json etc.)
tokenizer.save_pretrained(drive_save_directory)
print("Tokenizer saved.")

print(f"\n--- Model and tokenizer successfully saved to: {drive_save_directory} ---")

# Optional: Unmount drive when done if desired
# drive.flush_and_unmount()
# print('Google Drive unmounted.')

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Model save directory: /content/drive/MyDrive/my_llm_models/llama3-8b-medquad-merged

Saving merged model to Google Drive...
Merged model saved.

Saving tokenizer to Google Drive...
Tokenizer saved.

--- Model and tokenizer successfully saved to: /content/drive/MyDrive/my_llm_models/llama3-8b-medquad-merged ---
