In [1]:
# Install the cu121 (CUDA 12.1) builds of torch, torchvision, torchaudio and xformers from the PyTorch wheel index.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
# Unsloth and vLLM are installed here; the training cells further down set use_vllm=False.
!pip install unsloth vllm
# Triton is pinned to 3.1.0. NOTE(review): presumably to match the torch build installed above — confirm.
!pip install triton==3.1.0
# Upgrade pynvml to the newest available release.
!pip install -U pynvml

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting xformers
  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hINFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (

In [2]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from the Kaggle secret named "HF_Token".
# `secret_value_0` is passed explicitly as `token=` when the tokenizer and the
# base model are downloaded.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_Token")

# Also export the token as HF_TOKEN so that Hub requests issued without an
# explicit `token=` argument are authenticated as well. In the recorded
# training run such requests produced repeated "Cannot access gated repo"
# warnings for google/txgemma-2b-predict.
import os
os.environ["HF_TOKEN"] = secret_value_0

In [3]:
# CELL 1: Force Single GPU
# Expose only GPU 0 to this process through the CUDA_VISIBLE_DEVICES variable.
# NOTE(review): this presumably has to run before anything initialises CUDA in
# order to take effect — confirm the cell order on a fresh kernel.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the base checkpoint that is quantised here and later wrapped with LoRA adapters.
model_id = "google/txgemma-2b-predict"

# bitsandbytes settings: 4-bit NF4 weights with float16 compute.
quant_config = BitsAndBytesConfig(
    load_in_4bit            = True,
    bnb_4bit_quant_type     = "nf4",
    bnb_4bit_compute_dtype  = torch.float16,
    llm_int8_enable_fp32_cpu_offload = False,
)

# The repo is gated, so both downloads pass the Hugging Face token read from Kaggle secrets.
tokenizer = AutoTokenizer.from_pretrained(model_id,token=secret_value_0)  # fetches the tokenizer files (tokenizer.json, tokenizer.model, ...)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token = secret_value_0,
    quantization_config = quant_config,
    device_map          = {'':0},      # place the whole model on GPU 0
    torch_dtype         = torch.float16,
    attn_implementation = "eager",     # NOTE(review): eager attention is presumably the recommended setting for this model family — confirm
)

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

2025-04-27 21:44:59.594995: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745790299.789477      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745790299.849730      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [5]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
# Instruction block that get_trialbench() puts in front of every example's
# input text. It asks for a <reasoning> section followed by an <answer>
# section, which is the layout the reward functions in this cell score.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

def get_trialbench(
    split="train",
    data_files="/kaggle/input/txgemma-datasets-trialbench-adverse-event/txgemma_datasets_trialbench_adverse-event-rate-prediction_train.jsonl",
):
    """Load a TrialBench adverse-event JSONL file and shape it for GRPO training.

    Args:
        split: Split expression forwarded to ``load_dataset``.
            NOTE(review): with a single JSONL path the JSON loader is expected
            to expose the rows under ``"train"`` only, so ``"train"`` and
            slices such as ``"train[:1000]"`` are the useful values — confirm
            against the installed ``datasets`` version.
        data_files: Path (or any ``data_files`` value ``load_dataset`` accepts)
            of the JSONL file to read. Defaults to the Kaggle training file
            this notebook was written for, so existing calls are unaffected.

    Returns:
        The loaded dataset with its original columns plus two added ones:
        ``"prompt"`` (the stripped system prompt, a blank line, then the
        stripped ``input_text``) and ``"answer"`` (the stripped ``output_text``).
    """
    raw = load_dataset("json", data_files=data_files, split=split)

    def fmt(x):
        # Concatenate the system prompt and the user text into one plain string
        # (no chat template is applied).
        return {
            "prompt": SYSTEM_PROMPT.strip() + "\n\n" + x["input_text"].strip(),
            "answer": x["output_text"].strip(),
        }

    return raw.map(fmt)

# Build the training set once; the GRPO trainer cells pass it as `train_dataset`
# and the sampling loop at the end reads its "prompt" column.
dataset = get_trialbench()

def extract_xml_answer(text: str) -> str:
    """Return the stripped text inside the last ``<answer>`` element of ``text``.

    Everything after the final ``<answer>`` tag is kept (the whole string when
    the tag is absent), then cut at the first ``</answer>`` that follows, if
    there is one. Surrounding whitespace is removed from the result.
    """
    _, _, after_open_tag = text.rpartition("<answer>")
    inner, _, _ = after_open_tag.partition("</answer>")
    return inner.strip()

def count_xml(text: str) -> float:
    """Give partial credit for each expected XML marker that occurs exactly once.

    Each of the four markers (opening/closing ``reasoning`` and ``answer``
    tags with their surrounding newlines) is worth 0.125. The two ``answer``
    markers additionally subtract 0.001 per character found after the last
    closing-answer marker, so trailing text lowers the score.
    """
    score = 0.0

    # The two reasoning markers carry no length penalty.
    for marker in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(marker) == 1:
            score += 0.125

    if text.count("\n<answer>\n") == 1:
        *_, tail = text.split("\n</answer>\n")
        score += 0.125
        score -= len(tail) * 0.001

    if text.count("\n</answer>") == 1:
        *_, tail = text.split("\n</answer>")
        score += 0.125
        # One character (normally the final newline) is allowed for free.
        score -= (len(tail) - 1) * 0.001

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Score every completion with ``count_xml``.

    ``completions`` is treated as a list of plain strings. Extra keyword
    arguments are accepted and ignored.
    """
    rewards = []
    for completion in completions:
        rewards.append(count_xml(completion))
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 for completions that follow the exact line-by-line layout.

    The whole completion must be a ``<reasoning>`` block followed directly by
    an ``<answer>`` block, each tag on its own line, ending with a newline.
    Anything else scores 0.0. Extra keyword arguments are ignored.
    """
    strict_layout = re.compile(
        r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$",
        re.DOTALL,
    )
    rewards = []
    for completion in completions:
        matched = strict_layout.match(completion)
        rewards.append(0.5 if matched else 0.0)
    return rewards

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 when a reasoning element followed by an answer element appears.

    The pair may sit anywhere in the completion and only whitespace may
    separate the two elements. Completions without such a pair score 0.0.
    Extra keyword arguments are ignored.
    """
    loose_layout = re.compile(
        r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>",
        re.DOTALL,
    )
    rewards = []
    for completion in completions:
        found = loose_layout.search(completion)
        rewards.append(0.5 if found else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 when the extracted answer consists only of digits.

    The answer is taken from each completion with ``extract_xml_answer`` and
    tested with ``str.isdigit``; everything else scores 0.0. Extra keyword
    arguments are ignored.
    """
    answers = [extract_xml_answer(completion) for completion in completions]
    rewards = []
    for candidate in answers:
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward 2.0 when the extracted answer equals the ground truth exactly.

    ``answer`` holds the ground-truth strings, paired with ``completions`` by
    position. ``prompts`` and any extra keyword arguments are accepted but
    not used.
    """
    predictions = [extract_xml_answer(completion) for completion in completions]
    rewards = []
    for predicted, truth in zip(predictions, answer):
        rewards.append(2.0 if predicted == truth else 0.0)
    return rewards

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/14368 [00:00<?, ? examples/s]

**Modified Cell with new Training Settings**

In [14]:
from peft import LoraConfig, get_peft_model
from trl  import GRPOConfig, GRPOTrainer
import torch

# NOTE(review): the recorded output of this cell is a ValueError about training
# a 4-bit model on a different device than the one it was loaded on. The same
# three steps (LoRA, GRPOConfig, GRPOTrainer) are repeated in the cells that
# follow, with different prompt/completion lengths and logging settings.

# 1) Attach LoRA adapters as before
lora_cfg = LoraConfig(
    r              = 16,
    lora_alpha     = 16,
    target_modules = ["gate_proj", "up_proj", "down_proj"],
    bias           = "none",
    task_type      = "CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_cfg)

# 2) Build GRPOConfig tuned for a 2B model
training_args = GRPOConfig(
    use_vllm                    = False,
    learning_rate               = 5e-6,
    per_device_train_batch_size = 4,        # samples per device per step
    gradient_accumulation_steps = 4,        # 4 x 4 = 16 samples per optimizer step on one device
    num_generations             = 2,        # completions sampled per prompt (see TRL GRPOConfig docs)
    max_prompt_length           = 128,
    max_completion_length       = 100,
    max_steps                   = 5000,
    save_steps                  = 250,
    output_dir                  = "outputs",

    # precision
    bf16                        = False,
    fp16                        = True,

    # optimizer / schedule
    weight_decay                = 0.1,
    warmup_ratio                = 0.1,
    lr_scheduler_type           = "cosine",
    optim                       = "paged_adamw_8bit",
    max_grad_norm               = 0.1,

    # richer logging
    logging_strategy            = "steps",        # log every `logging_steps` steps
    logging_steps               = 20,
    log_completions             = True,           # show sample outputs
    report_to                   = "tensorboard",   # or "wandb"
)

# 3) Launch trainer (only a train dataset is passed; no eval dataset)
trainer = GRPOTrainer(
    model            = peft_model,
    processing_class = tokenizer,
    train_dataset    = dataset,
    reward_funcs     = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args             = training_args,
)

# 4) Start training
trainer.train()


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`

In [6]:
from peft import LoraConfig, get_peft_model
# 2) *Attach* LoRA adapters with PEFT
# Rank-16 adapters (alpha 16, no bias terms) on the gate/up/down projection
# modules only; no other module names are targeted.
lora_cfg = LoraConfig(
    r             = 16,
    lora_alpha    = 16,
    target_modules= ["gate_proj", "up_proj", "down_proj"],
    bias          = "none",
    task_type     = "CAUSAL_LM",
)
# Wrap the 4-bit base model loaded earlier; `peft_model` is what the trainer cell receives.
peft_model = get_peft_model(model, lora_cfg)

In [7]:
from trl import GRPOConfig

# GRPO hyper-parameters for the run launched in the next cell.
# NOTE(review): the sample prompt printed at the end of the notebook looks
# longer than max_prompt_length — check whether prompts are being truncated.
training_args = GRPOConfig(
    use_vllm                    = False,      # generate without vLLM
    learning_rate               = 5e-6,
    per_device_train_batch_size = 4,          # must be divisible by num_generations (4 / 2 here); see TRL GRPOConfig docs
    gradient_accumulation_steps = 4,
    num_generations             = 2,
    max_prompt_length           = 256,
    max_completion_length       = 200,
    max_steps                   = 5000,
    save_steps                  = 250,
    output_dir                  = "outputs",

    # precision flags for a T4
    bf16                        = False,
    fp16                        = True,

    # optimizer, schedule, clipping and logging
    weight_decay                = 0.1,
    warmup_ratio                = 0.1,
    lr_scheduler_type           = "cosine",
    optim                       = "paged_adamw_8bit",
    logging_steps               = 1,
    max_grad_norm               = 0.1,
    report_to                   = "none",
)


In [8]:
from trl import GRPOTrainer
# Build the GRPO trainer over the LoRA-wrapped model. All five reward functions
# defined earlier are passed in; only a train dataset is passed.
trainer = GRPOTrainer(
    model = peft_model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
# NOTE(review): the recorded run logged a training loss of 0.0 for the first ten
# steps and ended with a KeyboardInterrupt — check the reward signal before
# committing to the full max_steps budget.
trainer.train()

INFO 04-27 21:46:10 [__init__.py:239] Automatically detected platform cuda.


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,-0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0



Cannot access gated repo for url https://huggingface.co/google/txgemma-2b-predict/resolve/main/config.json.
Access to model google/txgemma-2b-predict is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/txgemma-2b-predict.

Cannot access gated repo for url https://huggingface.co/google/txgemma-2b-predict/resolve/main/config.json.
Access to model google/txgemma-2b-predict is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/txgemma-2b-predict.

Cannot access gated repo for url https://huggingface.co/google/txgemma-2b-predict/resolve/main/config.json.
Access to model google/txgemma-2b-predict is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/txgemma-2b-predict.

Cannot acces

KeyboardInterrupt: 

In [9]:
# Recursively archive the checkpoint-2000 directory from the training output folder into checkpoint_2000.zip.
!zip -r checkpoint_2000.zip /kaggle/working/outputs/checkpoint-2000

  adding: kaggle/working/outputs/checkpoint-2000/ (stored 0%)
  adding: kaggle/working/outputs/checkpoint-2000/scaler.pt (deflated 60%)
  adding: kaggle/working/outputs/checkpoint-2000/README.md (deflated 66%)
  adding: kaggle/working/outputs/checkpoint-2000/rng_state.pth (deflated 25%)
  adding: kaggle/working/outputs/checkpoint-2000/scheduler.pt (deflated 56%)
  adding: kaggle/working/outputs/checkpoint-2000/tokenizer.model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 51%)
  adding: kaggle/working/outputs/checkpoint-2000/trainer_state.json (deflated 90%)
  adding: kaggle/working/outputs/checkpoint-2000/optimizer.pt (deflated 10%)
  adding: kaggle/working/outputs/checkpoint-2000/adapter_config.json (deflated 54%)
  adding: kaggle/working/outputs/checkpoint-2000/special_tokens_map.json (deflated 76%)
  adding: kaggle/working/outputs/checkpoint-2000/tokenizer.json (deflated 84%)
  adding: kaggle/working/outputs/checkpoint-2000/adapter_model.safetensors (deflated 7%)
  adding: kaggle/working/outputs/checkpoint-2000/tokenizer_config.json (deflated 96%)
  adding: kaggle/working/outputs/checkpoint-2000/training_args.bin (deflated 51%)


In [12]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# 1) Quant config exactly as used for training (4-bit NF4, float16 compute)
quant_config = BitsAndBytesConfig(
    load_in_4bit                     = True,
    bnb_4bit_quant_type              = "nf4",
    bnb_4bit_compute_dtype           = torch.float16,
    llm_int8_enable_fp32_cpu_offload = False,    # for pure GPU inference
)

model_id = "google/txgemma-2b-predict"

# 2) Reload the base quantized model from the local cache onto GPU0
base = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config   = quant_config,
    device_map            = {"": 0},       # everything on cuda:0
    torch_dtype           = torch.float16,
    local_files_only      = True,          # do not download; use local files only
    attn_implementation   = "eager",
)

# 3) Re-attach the LoRA adapters saved in checkpoint-2000
peft_model = PeftModel.from_pretrained(
    base,
    "outputs/checkpoint-2000",  # <-- this folder holds adapter_model.safetensors etc.
    local_files_only=True,
)

# 4) Move to GPU0 & eval
device = torch.device("cuda:0")
peft_model.to(device).eval()

# 5) Prepare your prompt & tokenize
# A single prompt needs neither padding nor truncation. The previous
# `truncation=True` had no max_length to truncate to, so it only produced the
# "Asking to truncate to max_length but no maximum length is provided" warning.
prompt = SYSTEM_PROMPT.strip() + "\n\nWill there be an adverse event for this trial?"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# 6) Generate (sampled decoding, at most 10 new tokens)
out_ids = peft_model.generate(
    **inputs,
    do_sample       = True,
    temperature     = 0.8,
    top_p           = 0.95,
    max_new_tokens  = 10,
    pad_token_id    = tokenizer.eos_token_id,
)

# The decoded text contains the prompt followed by whatever was generated.
print(tokenizer.decode(out_ids[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>

Will there be an adverse event for this trial?


In [13]:
from tqdm import tqdm

# Greedy-decode the first ten training prompts and print each decoded sequence
# (prompt plus continuation) for a quick qualitative check.
first_ten = dataset.select(range(10))
for example in first_ten:
    encoded = tokenizer(example["prompt"], return_tensors="pt").to(device)
    out = peft_model.generate(
        **encoded,
        do_sample=False,  # greedy for clarity
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    print(decoded)

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>

From the following information about a clinical trial, predict whether it would have an adverse event.

Title: Safety, Tolerability and Pharmacokinetics of Single and Repeat Doses of GSK2292767 in Healthy Participants Who Smoke Cigarettes
Summary: This study is the first administration of GSK2292767 to humans. The study will evaluate the safety, tolerability, pharmacokinetics (PK) and pharmacodynamics (PD) of single and repeat inhaled doses of GSK2292767 in healthy smokers. This study is intended to provide sufficient confidence in the safety of the molecule and preliminary information on target engagement to allow progression to further repeat dose and proof of mechanism studies. This is a two part, single site, randomized, double-blind (sponsor open), placebo controlled study. Part A will consist of two 3-period interlocking cohorts to evaluate the safety, tolerability and pharmacokinetics of ascend