In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Environment Configuration
- **Purpose**: Configure GPU settings for CUDA and Triton compatibility.  
- **Key Actions**:  
  - `CUDA_VISIBLE_DEVICES="0"`: Restricts the script to use only the first GPU.  
  - `CUDA_DEVICE_ORDER="PCI_BUS_ID"`: Ensures GPUs are ordered by PCI bus ID (useful for multi-GPU setups).  
  - `TRITON_CAPABILITY="75"`: Sets Triton compute capability to match NVIDIA Volta+ GPUs (e.g., T4).

In [2]:
import os
os.environ["TRITON_CAPABILITY"] = "75"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Dependency Management
- **Steps**:  
  1. Uninstalls existing `numpy` (1.26.4) to avoid conflicts.  
  2. Installs `numpy==1.24.3`, downgrading for compatibility with `autoawq`.  
  3. Installs `autoawq[kernels]` (quantization library) and updates `transformers`/`accelerate`.

In [3]:
!pip uninstall numpy -y
!pip install --no-cache-dir numpy==1.24.3  # Known stable version for AutoAWQ
!pip install autoawq[kernels]  # Required for AWQ models like BioMistral
!pip install --upgrade transformers accelerate  # Ensure latest versions

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.24.3
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m234.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albucore 0.0.19 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
albumentations 1.4.20 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
bayesian-optimization 2.0.3 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
featuretools 1.31.0 requires n

## LLM Initialization
- **Model**: Loads `BioMistral-7B-AWQ`, a quantized medical LLM optimized for GPU inference.
    - This is based on the well-known `Mistral-7B` LLM (commonly used by Kagglers to win AI competitions)
    - `Mistral AI` is a French artificial intelligence (AI) startup founded by ex-Meta and ex-Google employees, headquartered in Paris.
    - `BioMistral-7B` is a fine-tuned version of `Mistral-7B`
        - fine-tuned on clinical data
    - `BioMistral-7B-AWQ` is a "quantized" version of `BioMistral-7B`
        - I'm using this because this is just a demo
- **Key Steps**:  
  - `AutoTokenizer`/`AutoModelForCausalLM`: Load tokenizer and model.  
  - `pad_token_id=tokenizer.pad_token_id`: Ensures padding compatibility.  

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "BioMistral/BioMistral-7B-AWQ-QGS128-W4-GEMV"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,  trust_remote_code=True)
model.generation_config.pad_token_id = tokenizer.pad_token_id
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Embedding(32000, 4096)

## Dataset Preparation
- **Purpose**: Format training data for prefix tuning.
- **Key Actions**:
  - Defines `format_example` to tokenize prompts and completions.
  - Creates a synthetic training example with structured drug metadata.
  - Generates `processed_data` (tokenized inputs and labels).
- **Note**: The dataset is minimal (1 example) for demonstration purposes.

In [5]:
def format_example(example):
    inputs = f"PROMPT: {example['prompt']}\nCOMPLETION: "
    targets = example['completion']
    model_inputs = tokenizer(
        inputs,
        max_length=200,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        targets,
        max_length=200,
        padding="max_length",
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# dataset sample
train_data = [
{
"prompt": """{
"name_of_drug": "CEFTRIAXONE SODIUM IN DEXTROSE",
"drug_tier": 1,
"requirements_limits": null
}""",
"completion": """{
"name": "CEFTRIAXONE SODIUM IN DEXTROSE",
"therapeuticClass": "ANTI-INFECTIVE AGENTS",
"pharmacologicalSubclass": "ANTIBACTERIALS",
"brandStatus": "brand",
"saltForm": "sodium",
"combinationProduct": false,
"deliveryFormulation": "in dextrose",
"formularyTier": 1,
"utilizationManagement": [],
"alternatives": ["cefepime", "cefazolin"]
}"""
}
]

processed_data = [format_example(ex) for ex in train_data]
processed_data

[{'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 7497, 2059, 28738, 28747, 371, 13, 28739, 861, 28730, 1009, 28730, 28715, 10909, 1264, 345, 2554, 28765, 2050, 7408, 28814, 6349, 318, 2896, 28737, 4171, 2693, 384, 4036, 1594, 1151, 548, 13, 28739, 28715, 10909, 28730, 28707, 749, 1264, 28705, 28740, 28725, 13, 28739, 6351, 1339, 28730, 14347, 1264, 1241, 13, 28752, 13, 9028, 1180, 10940, 28747, 28705], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Prefix Tuning Configuration
- **Purpose**: Apply parameter-efficient fine-tuning (PEFT) via prefix tuning.
- **Key Actions**:
  - Configures `PrefixTuningConfig` with 10 virtual tokens and projection settings.
  - Wraps the model with PEFT and prints trainable parameters.
- **Note**: Only 0.2% of parameters are trained. That saves us a lot of time and energy!

In [6]:
from peft import PrefixTuningConfig, get_peft_model

peft_config = PrefixTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=10,
    encoder_hidden_size=256,
    prefix_projection=False,
    token_dim=model.config.hidden_size,
    num_layers=model.config.num_hidden_layers
)
peft_model = get_peft_model(model, peft_config)
peft_model.enable_input_require_grads()
peft_model.print_trainable_parameters()

trainable params: 655,360 || all params: 263,065,600 || trainable%: 0.2491


## Training Setup
- **Purpose**: Fine-tune the model using the Hugging Face `Trainer`.
- **Key Actions**:
  - Defines `TrainingArguments` (batch size=2, learning rate=1e-4, 5 epochs).
  - Initializes `Trainer` with the PEFT model and synthetic dataset.
- **Note**: Gradient accumulation and low `max_grad_norm` prevent memory issues.

In [7]:
from transformers import TrainingArguments, Trainer, default_data_collator

training_args = TrainingArguments(
    report_to="none", # important: it should be the string "none"
    output_dir="./biomistral-prefix-awq",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    optim="adamw_torch",
    max_grad_norm=0.25,
    num_train_epochs=5,
    gradient_checkpointing=False,
    logging_steps=1,
    remove_unused_columns=False,
    disable_tqdm=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_data,
    data_collator=default_data_collator,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,9.0812
2,9.0812
3,9.0812
4,9.0812
5,9.0812


TrainOutput(global_step=5, training_loss=9.08117961883545, metrics={'train_runtime': 3.3201, 'train_samples_per_second': 1.506, 'train_steps_per_second': 1.506, 'total_flos': 788029440000.0, 'train_loss': 9.08117961883545, 'epoch': 5.0})

## Save Adapter
- **Purpose**: Persist the trained prefix adapter for reuse.
- **Key Action**: Saves adapter weights to `./biomistral-prefix-adapter`.

In [8]:
# for use in next notebook
peft_model.save_pretrained("./biomistral-prefix-adapter")

## ~~Inference with Fine-Tuned Model~~
*(performance didn't really improve so were not using this; were using the base model)*
- **Purpose**: Generate structured completions using the tuned model.
- **Key Actions**:
  - Reloads the base model and attaches the saved PEFT adapter.
  - Defines `generate_completion` for structured text generation.
  - Tests inference with a sample prompt for drug metadata.
- **Note**: Uses conservative sampling (`temperature=0.3`) for deterministic outputs.

In [9]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
# import torch

# # Load base model
# model_name = "BioMistral/BioMistral-7B-AWQ-QGS128-W4-GEMV"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     trust_remote_code=True,
# )

# # Load prefix adapter
# model = PeftModel.from_pretrained(model, "./biomistral-prefix-adapter", is_trainable=False)
# model.to("cuda")  # Move to GPU if available
# model.eval()  # Set to evaluation mode

# # Modified generation function
# def generate_completion(prompt):
#     formatted_input = f"PROMPT: {prompt}\nCOMPLETION: "
    
#     inputs = tokenizer(
#         formatted_input,
#         return_tensors="pt",
#         max_length=200,
#         truncation=True,
#         add_special_tokens=True
#     ).to(model.device)

#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=200,
#         temperature=0.3,  # this should be lower for structured data
#         top_p=0.9,
#         repetition_penalty=1.1,
#         do_sample=True,
#         pad_token_id=tokenizer.eos_token_id,
#         eos_token_id=tokenizer.eos_token_id,
#         # output_scores=True,
#         # return_dict_in_generate=True,
#     )

#     # Clean decoding
#     completion = tokenizer.decode(
#         outputs.sequences[0][inputs.input_ids.shape[-1]:],
#         # skip_special_tokens=True
#     )
#     return completion

# # Example usage
# input_prompt = """{
# "name_of_drug": "CEFTRIAXONE SODIUM IN DEXTROSE",
# "drug_tier": 1,
# "requirements_limits": null
# }"""

# print(generate_completion(input_prompt))