## T-GCG With Multiple Runs

In [1]:
# def refresh_repo():
#     %cd /kaggle/working
#     %rm -rf hotflip
#     !git clone https://github.com/jefri021/hotflip.git
#     %cd /kaggle/working/hotflip/
#     !git pull origin main

# refresh_repo()

In [2]:
%ls /kaggle/input/trojai-rev2-00000001/id-00000001

[0m[01;34mclean-example-data[0m/         mmlu_results.json       [01;34mtokenizer[0m/
eval_generative_stats.json  [01;34mpoisoned-example-data[0m/  training_args.bin
[01;34mfine-tuned-model[0m/           reduced-config.json     training_args.json
ground_truth.csv            round_config.json
log.txt                     stats.json


In [3]:
import gc
import torch
from typing import List, Union

def clear_memory(keep_vars: Union[List[str], None] = None, verbose: bool = True):
    """
    Free Python and CUDA memory held by module-level torch objects.

    Every global in this module's namespace that is a ``torch.Tensor``, a
    ``torch.nn.Module`` or a non-empty list whose first element is a tensor is
    deleted, unless its name is listed in ``keep_vars`` or starts with ``__``.
    Kept CUDA *tensors* are moved to the CPU first; kept modules are left
    wherever they currently live.

    Args:
        keep_vars: Names of globals to preserve. Kept CUDA tensors are
            replaced by CPU copies.
        verbose: Whether to print progress and CUDA memory statistics.
    """
    # Function-scope import: only needed for the TensorFlow check below.
    import sys

    def _log(message: str) -> None:
        if verbose:
            print(message)

    namespace = globals()
    keep_set = set(keep_vars) if keep_vars else set()
    cuda_available = torch.cuda.is_available()

    _log("Starting memory clearing process...")

    # First pass: move kept CUDA tensors to the CPU so their GPU storage can
    # be released by the cache flush below.
    if cuda_available:
        for name in keep_set:
            var = namespace.get(name)
            if isinstance(var, torch.Tensor) and var.is_cuda:
                _log(f"Moving kept tensor '{name}' to CPU")
                namespace[name] = var.cpu()

    gc.collect()
    _log("Ran Python garbage collection")

    if cuda_available:
        torch.cuda.empty_cache()
        if verbose:
            print("Cleared CUDA cache")
            print(f"Current CUDA memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
            print(f"Current CUDA memory cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")

    # Only touch TensorFlow when it has already been imported elsewhere.
    # Importing it here would load a heavyweight framework purely to clear a
    # session that cannot exist yet, which defeats the purpose of this helper.
    tf = sys.modules.get("tensorflow")
    if tf is not None:
        tf.keras.backend.clear_session()
        _log("Cleared TensorFlow/Keras session")

    # Second pass: drop torch objects that were not explicitly kept.
    # Iterate over a snapshot because entries are deleted while looping.
    for name, var in list(namespace.items()):
        if name.startswith('__') or name in keep_set:
            continue
        if isinstance(var, (torch.Tensor, torch.nn.Module)):
            del namespace[name]
            _log(f"Deleted torch object: {name}")
        elif isinstance(var, list) and var and isinstance(var[0], torch.Tensor):
            del namespace[name]
            _log(f"Deleted list of torch tensors: {name}")

    # Final collection so the objects unlinked above are actually freed.
    gc.collect()

    _log("Memory clearing complete")

import torch
import json
import os
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_model(model_filepath: str, torch_dtype:torch.dtype=torch.float16):
    """Load a causal LM and its tokenizer from a model directory.

    The directory must contain ``reduced-config.json``, a ``tokenizer/``
    folder and a ``fine-tuned-model/`` folder. When the config sets
    ``use_lora`` to a truthy value, a ``base-model/`` folder is loaded first
    and ``fine-tuned-model/`` is attached to it as an adapter; otherwise
    ``fine-tuned-model/`` is loaded directly as the full checkpoint.

    Args:
        model_filepath: str - Path to where the model is stored
        torch_dtype: torch.dtype - dtype passed to ``from_pretrained``

    Returns:
        model, tokenizer - the model (switched to eval mode) and its tokenizer
    """

    conf_filepath = os.path.join(model_filepath, 'reduced-config.json')
    logging.info("Loading config file from: {}".format(conf_filepath))
    with open(conf_filepath, 'r') as fh:
        round_config = json.load(fh)

    logging.info("Loading model from filepath: {}".format(model_filepath))
    # https://huggingface.co/docs/transformers/installation#offline-mode
    fine_tuned_model_filepath = os.path.join(model_filepath, 'fine-tuned-model')
    # Identical loading options for both branches, kept in one place.
    load_kwargs = dict(
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        local_files_only=True,
    )

    # A config without the key is treated as a full fine-tune, matching
    # load_model_and_tokenizer in this file.
    if round_config.get('use_lora', False):
        base_model_filepath = os.path.join(model_filepath, 'base-model')
        logging.info("loading the base model (before LORA) from {}".format(base_model_filepath))
        model = AutoModelForCausalLM.from_pretrained(base_model_filepath, **load_kwargs)
        # model = AutoModelForCausalLM.from_pretrained(round_config['model_architecture'], trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch_dtype)

        logging.info("loading the LORA adapter onto the base model from {}".format(fine_tuned_model_filepath))
        model.load_adapter(fine_tuned_model_filepath)
    else:
        logging.info("Loading full fine tune checkpoint into cpu from {}".format(fine_tuned_model_filepath))
        model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_filepath, **load_kwargs)
        # model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_filepath, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch_dtype)

    model.eval()

    tokenizer_filepath = os.path.join(model_filepath, 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_filepath)

    return model, tokenizer

import os, json, logging, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def _two_gpu_max_memory(headroom_gb=2):
    """
    Reserve headroom so HF sharding MUST split across both 16GB T4s.
    """
    if not torch.cuda.is_available():
        return None
    n = torch.cuda.device_count()
    cap = f"{16 - headroom_gb}GiB"  # e.g., "14GiB"
    return {i: cap for i in range(n)}

def _common_from_pretrained_kwargs():
    """
    Assemble the keyword arguments shared by every ``from_pretrained`` call.

    The fixed options request FP16 weights, streaming/offloaded loading and
    the "sdpa" attention implementation. Device placement is "auto" with a
    per-device memory cap when more than one CUDA device is visible;
    otherwise everything is mapped to device 0.
    """
    memory_caps = _two_gpu_max_memory(headroom_gb=2)

    if memory_caps and torch.cuda.device_count() > 1:
        # Optional if host RAM is tight: add
        # "offload_folder": "/kaggle/working/offload"
        placement = {"device_map": "auto", "max_memory": memory_caps}
    else:
        placement = {"device_map": {"": 0}}

    return {
        "trust_remote_code": True,
        "local_files_only": True,
        "torch_dtype": torch.float16,      # T4 -> FP16
        "low_cpu_mem_usage": True,         # streaming load
        "offload_state_dict": True,        # avoid CPU spikes
        "attn_implementation": "sdpa",     # available by default on Kaggle
        **placement,
    }

def load_model_and_tokenizer(model_dir: str, merge_lora: bool = True):
    """
    Robust loader for full fine-tunes or LoRA adapters stored under `model_dir`.
    Expects:
      - reduced-config.json with {"use_lora": <bool>, ...}
      - For LoRA: base-model/, fine-tuned-model/
      - For full FT: fine-tuned-model/
      - tokenizer/ with tokenizer files

    Args:
        model_dir: Directory laid out as described above.
        merge_lora: When the adapter is attached through ``peft.PeftModel``,
            fold the adapter weights into the base model and return the plain
            model. Has no effect on full fine-tunes or when ``peft`` cannot
            be imported (the adapter is then attached with
            ``model.load_adapter``).

    Returns: (model, tokenizer)
        The model is in eval mode with ``config.use_cache`` disabled when that
        attribute exists. The tokenizer pads on the right and falls back to
        the EOS token as pad token when none is defined.
    """
    conf_path = os.path.join(model_dir, "reduced-config.json")
    logging.info(f"Loading config: {conf_path}")
    with open(conf_path, "r") as fh:
        cfg = json.load(fh)

    kw = _common_from_pretrained_kwargs()

    if cfg.get("use_lora", False):
        base_dir = os.path.join(model_dir, "base-model")
        lora_dir = os.path.join(model_dir, "fine-tuned-model")

        logging.info(f"Loading base model: {base_dir}")
        model = AutoModelForCausalLM.from_pretrained(base_dir, **kw)
        logging.info(f"Attaching LoRA adapter: {lora_dir}")

        # PeftModel used to be referenced without ever being imported, so the
        # resulting NameError was swallowed and only the fallback ever ran.
        # Import it explicitly and fall back only when peft is truly missing.
        try:
            from peft import PeftModel
        except ImportError:
            PeftModel = None

        if PeftModel is not None:
            model = PeftModel.from_pretrained(model, lora_dir, is_trainable=False)
            if merge_lora:
                # Fold the adapter into the base weights and drop the wrapper.
                model = model.merge_and_unload()
        else:
            model.load_adapter(lora_dir)

    else:
        ft_dir = os.path.join(model_dir, "fine-tuned-model")
        logging.info(f"Loading full fine-tuned model: {ft_dir}")
        model = AutoModelForCausalLM.from_pretrained(ft_dir, **kw)

    # Tokenizer hygiene
    tok_dir = os.path.join(model_dir, "tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tok_dir, use_fast=True, local_files_only=True)
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Runtime memory knobs for the gradient-based rollout
    model.eval()
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False  # reduce KV/activation memory during the search

    # Quick sanity check of sharding; getattr's default covers models loaded
    # without a device map, so no exception handling is needed.
    print(getattr(model, "hf_device_map", "no device map"))

    return model, tokenizer


In [4]:
# from load_model import download_and_load
# from load_model import load_model_and_tokenizer


# model, tokenizer = download_and_load(
#     file_id="1lwC9JLRu4Z4SSQwjNtetAymStPqQeaDc",
#     output_filename="model0.tar.gz",
#     load_model_path="/kaggle/tmp/id-00000000")

# Load the model and tokenizer from the Kaggle input dataset directory; the
# resulting globals are used by the cells below.
model, tokenizer = load_model_and_tokenizer(
    model_dir="/kaggle/input/trojai-rev2-00000001/id-00000001"
)

2025-11-19 22:18:57.885721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763590738.106183      85 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763590738.167687      85 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}


In [5]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import os

# Mutable holder for the current suffix token ids. load_prompts' collate
# function reads the "ids" entry of the holder it is given each time a batch
# is built, and raises if it is still None.
suffix_ref = {"ids": None}  # will hold your current suffix

def load_prompts(tokenizer, args, suffix_ref, sample_size=None):
    """
    Build a DataLoader over Alpaca instructions with the current suffix appended.

    Args:
        tokenizer: Tokenizer used to encode the instructions; must define
            ``pad_token_id``.
        args: Dict with "data_dir" (dataset cache dir), "max_length"
            (truncation length of the instruction) and "batch_size".
        suffix_ref: Dict whose "ids" entry holds the suffix as a 1-D tensor.
            It is read every time a batch is collated.
        sample_size: If given and smaller than the dataset, a fixed-seed
            shuffled subset of that many examples is used.

    Returns:
        A DataLoader whose batches are dicts of CPU tensors:
        "input_ids" (B, T) laid out as [prompt][suffix][PAD ...],
        "attention_mask" (B, T) and "prompt_lens" (B,).

    Raises:
        ValueError: If the tokenizer has no pad token, or (at collate time)
            if ``suffix_ref["ids"]`` is None.
    """
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # pad_sequence cannot pad with None; fail here instead of inside a worker.
        raise ValueError("tokenizer.pad_token_id is None – set a pad token before building dataloader.")

    ds = load_dataset("tatsu-lab/alpaca", split="train", cache_dir=args["data_dir"])

    if sample_size is not None and sample_size < len(ds):
        ds = ds.shuffle(seed=42).select(range(sample_size))

    def collate(batch):
        texts = [ex["instruction"] for ex in batch]

        # No padding here; we'll pad after appending suffix
        enc = tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=args["max_length"],
        )
        prompts = [torch.tensor(ids, dtype=torch.long) for ids in enc["input_ids"]]
        prompt_lens = [len(p) for p in prompts]

        suffix = suffix_ref["ids"]
        if suffix is None:
            raise ValueError("suffix_ref['ids'] is None – set suffix before building dataloader.")
        suffix = suffix.to(torch.long)  # keep suffix on CPU; model will move as needed

        # [prompt][suffix]
        full_seqs = [torch.cat([p, suffix]) for p in prompts]

        # Now pad: [prompt][suffix][PAD ...]
        padded = pad_sequence(full_seqs, batch_first=True, padding_value=pad_token_id)

        # Build the mask from true sequence lengths. Comparing against
        # pad_token_id would also mask real tokens whenever the pad id occurs
        # in a prompt or suffix (e.g. when pad_token is set to eos_token).
        seq_lens = torch.tensor([len(seq) for seq in full_seqs], dtype=torch.long)
        positions = torch.arange(padded.size(1)).unsqueeze(0)        # (1, T)
        attention_mask = (positions < seq_lens.unsqueeze(1)).long()   # (B, T)

        return {
            "input_ids": padded,                # (B, T) on CPU
            "attention_mask": attention_mask,   # (B, T) on CPU
            "prompt_lens": torch.tensor(prompt_lens, dtype=torch.long),  # (B,) on CPU
        }

    # os.cpu_count() may return None when the count cannot be determined.
    num_workers = max(2, (os.cpu_count() or 1) // 2)
    return DataLoader(
        ds,
        batch_size=args["batch_size"],
        shuffle=False,
        pin_memory=True,
        num_workers=num_workers,
        persistent_workers=True,
        collate_fn=collate,
    )

In [6]:
import torch
import torch.nn.functional as F

def entropy_loss(batch_logits):
    """
    Mean Shannon entropy (in nats) of the softmax distributions of the logits.

    batch_logits: (B, V) logits for the token we're interested in
    Returns: scalar float32 tensor holding the mean entropy over the batch
    """
    # Reduce in float32: summing over a large vocabulary in fp16 loses
    # precision, and this function is called outside any autocast region.
    log_probs = F.log_softmax(batch_logits.float(), dim=-1)
    # Logits of -inf give log-prob -inf and prob 0; 0 * -inf would be NaN.
    # Clamping to the smallest finite value keeps the term (and its
    # gradient) at exactly zero and leaves finite entries untouched.
    log_probs = log_probs.clamp_min(torch.finfo(log_probs.dtype).min)
    probs = log_probs.exp()
    entropy = -(probs * log_probs).sum(dim=-1)  # (B,)
    return entropy.mean()

In [7]:
from torch.amp import autocast

def compute_rollout_entropy_loss(
    model,
    emb_layer,
    base_embeddings,    # (B, L, E) incoming tensor
    attention_mask,     # (B, L)
    n_tokens=10,
    amp_dtype=torch.float16,
):
    """
    Entropy of the next-token distribution after a greedy rollout.

    - base_embeddings: embeddings for [prompt][suffix], any tensor
    - A LEAF `base` (requires_grad=True) is created from it.
    - (n_tokens - 1) tokens are generated greedily under inference_mode and
      their embeddings collected as CONSTANTS.
    - final_emb = cat([base, added_constants], dim=1); with n_tokens <= 1
      nothing is rolled out and final_emb is just `base`.
    - The final forward uses final_emb; gradients flow only into `base`.

    Args:
        model: Callable accepting inputs_embeds / attention_mask and
            returning an object with a `.logits` attribute.
        emb_layer: Module mapping token ids to embeddings.
        base_embeddings: (B, L, E) tensor.
        attention_mask: (B, L) tensor; extended with ones for rolled-out tokens.
        n_tokens: Position (1-based, counted after the input) of the token
            whose entropy is measured.
        amp_dtype: dtype used for CUDA autocast during the forward passes.

    Returns:
        (loss, base): scalar entropy loss and the leaf tensor to
        differentiate with respect to.

    NOTE(review): logits are always read at the last sequence position and
    rolled-out tokens are appended after it. For a right-padded batch that
    position is a padding slot for the shorter rows — confirm that callers
    pass equal-length rows or that this is intended.
    """
    dev = base_embeddings.device
    B, _, _ = base_embeddings.shape  # also validates the (B, L, E) rank

    # Make base a leaf
    base = base_embeddings.detach().requires_grad_(True)  # (B, L, E)

    def _one_step_logits(e, m):
        # e: (B, cur_len, E), m: (B, cur_len)
        with autocast("cuda", dtype=amp_dtype):
            out = model(
                inputs_embeds=e,
                attention_mask=m,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=True,
            )
        # logits for next-token distribution at last position
        return out.logits[:, -1, :]  # (B, V)

    # ---------- Rollout under no grad ----------
    work_e = base           # starts as base, but ops in inference_mode don't build graph
    work_m = attention_mask
    added_embs = []         # list of (B, E) constants

    rollout_steps = max(0, n_tokens - 1)
    with torch.inference_mode():
        for _ in range(rollout_steps):
            logits_t = _one_step_logits(work_e, work_m)  # no grad

            # Greedy choice for rollout. argmax is taken on the logits
            # directly: softmax is monotonic, and skipping it avoids
            # low-precision ties between distinct logits.
            next_ids = torch.argmax(logits_t, dim=-1)    # (B,)

            # embedding of next tokens as a constant
            next_emb = emb_layer(next_ids.to(dev)).detach()  # (B, E)
            added_embs.append(next_emb)

            # extend working sequence/mask
            work_e = torch.cat([work_e, next_emb.unsqueeze(1)], dim=1)
            work_m = torch.cat(
                [work_m, torch.ones((B, 1), dtype=work_m.dtype, device=dev)],
                dim=1
            )

    # ---------- Build final inputs: only base is differentiable ----------
    if added_embs:
        # (B, T, E) constants
        added = torch.stack(added_embs, dim=1)
        # cat([base (leaf), added (const)], dim=1) -> final_emb depends on base
        final_emb = torch.cat([base, added], dim=1)   # (B, L+T, E)
        final_msk = work_m                            # mask can be treated as const
    else:
        # n_tokens <= 1: nothing was rolled out, so measure the entropy of
        # the very next token. (This path used to raise with a message that
        # claimed n_tokens > 1.)
        final_emb = base
        final_msk = attention_mask

    # ---------- Final step WITH grad ----------
    logits_last = _one_step_logits(final_emb, final_msk)  # graph includes base
    loss = entropy_loss(logits_last)                      # scalar

    return loss, base      # base is the leaf you should differentiate w.r.t.

In [8]:
def get_mean_grad_for_pos_rollout(
    model,
    batch,
    pos,                # suffix coordinate index (0..len_s-1)
    n_tokens=10,
    amp_dtype=torch.float16,
    score_only=False,
):
    """
    Gradient of the rollout entropy loss w.r.t. one suffix embedding,
    averaged over the batch (or just the loss value when `score_only`).

    model: HF causal LM (can be sharded with device_map='auto')
    batch: {
        "input_ids": (B, L0),
        "attention_mask": (B, L0),
        "prompt_lens": (B,)
    }
    pos: which suffix coordinate we are optimizing (0-based inside suffix);
        not used when `score_only` is True
    n_tokens: how many future tokens to generate before measuring entropy
    amp_dtype: autocast dtype forwarded to compute_rollout_entropy_loss
    score_only: if True, return the loss as a Python float and skip the
        backward pass entirely

    Returns:
        float loss when `score_only`, otherwise a detached (E,) tensor: the
        mean over the batch of the gradient at index prompt_len + pos.

    Raises:
        RuntimeError: if the loss is not connected to the input embeddings,
            if prompt_len + pos falls outside the sequence for any row, or if
            the batch is empty.
    """
    input_ids = batch["input_ids"]          # CPU
    attention_mask = batch["attention_mask"]
    prompt_lens = batch["prompt_lens"]

    emb_layer = model.get_input_embeddings()
    dev = emb_layer.weight.device

    # Move to embedding device
    input_ids_dev = input_ids.to(dev)
    attention_mask_dev = attention_mask.to(dev)

    # Base embeddings for [prompt][suffix]. The callee detaches them into a
    # fresh leaf, so no graph through the embedding matrix is needed.
    with torch.no_grad():
        base_emb = emb_layer(input_ids_dev)  # (B, L, E)

    model.zero_grad(set_to_none=True)

    if score_only:
        # Only the value is needed: skip autograd bookkeeping so no
        # activations are retained for a backward pass that never happens.
        with torch.no_grad():
            loss, _ = compute_rollout_entropy_loss(
                model,
                emb_layer,
                base_emb,
                attention_mask_dev,
                n_tokens=n_tokens,
                amp_dtype=amp_dtype,
            )
        return loss.item()

    # Rollout-based entropy loss; base is a leaf requiring grad
    loss, base = compute_rollout_entropy_loss(
        model,
        emb_layer,
        base_emb,
        attention_mask_dev,
        n_tokens=n_tokens,
        amp_dtype=amp_dtype,
    )

    # Differentiate w.r.t. `base` only. loss.backward() would additionally
    # populate .grad on every model parameter that requires grad, which costs
    # memory and compute for values that are never read.
    (grads,) = torch.autograd.grad(loss, base, allow_unused=True)  # (B, L, E)
    if grads is None:
        raise RuntimeError("base.grad is None – graph did not connect to base.")

    B, L, _ = grads.shape
    if B == 0:
        raise RuntimeError("No valid gradients found for the given position.")

    # Absolute index of suffix[pos] in each row.
    idx = prompt_lens.to(grads.device) + pos                    # (B,)
    if bool(((idx < 0) | (idx >= L)).any()):
        raise RuntimeError("Sequence too short for the given position.")

    # Gather one (E,) gradient per row in a single indexing op instead of a
    # Python loop with a device sync per element.
    rows = torch.arange(B, device=grads.device)
    mean_grad = grads[rows, idx].mean(dim=0)                    # (E,)
    return mean_grad.detach()


In [9]:
def update_suffix_coordinate(model, suffix, mean_grad, pos, T, tokenizer):
    """
    Resample the token at `suffix[pos]` from a gradient-based distribution.

    Every vocabulary token v is scored by the negated first-order estimate of
    the loss change when swapping it in, -(e_v - e_current) · mean_grad, and
    the new token is sampled from softmax(scores / T). The pad token is
    excluded. If the distribution is not finite, the best finite-scoring
    non-pad token is chosen deterministically, and the current token is kept
    when no such token exists.

    Args:
        model: Object whose get_input_embeddings().weight is the (V, d)
            embedding matrix.
        suffix: 1-D tensor of token ids; modified in place.
        mean_grad: (d,) gradient of the loss w.r.t. the embedding at `pos`.
        pos: Index inside `suffix` to update.
        T: Sampling temperature; values below 1e-5 are clamped to 1e-5.
        tokenizer: Object exposing `pad_token_id` (may be None).

    Returns:
        The same `suffix` tensor, with `suffix[pos]` replaced.
    """
    emb_weight = model.get_input_embeddings().weight  # (V, d_model), typically fp16
    current_tok = suffix[pos].item()

    # Pure scoring/sampling: no autograd graph through the embedding matrix.
    with torch.no_grad():
        # Do computations in float32 on the embedding device to avoid fp16
        # overflow/NaN.
        emb_f = emb_weight.float()                             # (V, d) fp32
        grad_f = mean_grad.to(emb_weight.device).float()       # (d,)   fp32

        # First-order approx of loss change: (e_v - e_cur) · g, computed as
        # e_v · g - e_cur · g so no second (V, d) tensor is materialised.
        projection = torch.matmul(emb_f, grad_f)               # (V,)
        approx_delta_L = projection - projection[current_tok]  # (V,)

        # We want to minimize loss -> maximize -ΔL
        scores = -approx_delta_L

        # Stabilize: subtract max to avoid huge exponents
        scores = scores - scores.max()

        # Mask the pad token so it can never be selected by sampling
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            scores[pad_id] = -1e9

        # Softmax with temperature
        temp = max(float(T), 1e-5)
        probs = torch.softmax(scores / temp, dim=-1)  # (V,)

        if (not torch.isfinite(probs).all()) or probs.sum() <= 0:
            # multinomial would raise on these probabilities; choose
            # deterministically instead. (The fallback result used to be
            # overwritten because the `else` below was commented out.)
            print("Warning: probs invalid, falling back to argmax")
            usable = torch.isfinite(scores)
            if pad_id is not None:
                usable[pad_id] = False
            if usable.any():
                new_tok = scores.masked_fill(~usable, float("-inf")).argmax().item()
            else:
                # Nothing trustworthy to choose from: leave the token unchanged.
                new_tok = current_tok
        else:
            # multinomial expects probs >= 0 and sum > 0
            new_tok = torch.multinomial(probs, num_samples=1).item()

    suffix[pos] = new_tok
    return suffix


In [10]:
# """
# Now we perform Greedy Coordinate Descent with temperature to optimize the suffix.
# In collate function, we will append the suffix to each prompt in the batch, and pad the sequences.
# We mask unrelated tokens in the loss computation, only keeping the suffix tokens.
# We use cyclic order in between epochs. Once we reach the end of the suffix, we start again from the beginning.
# """
# # Hyperparameters
# len_s = 10
# epochs = 10
# T = 1.0
# n_steps_ahead = 5

# # Start from a random suffix
# suffix = torch.randint(2, tokenizer.vocab_size, (len_s,), dtype=torch.long)
# suffix_ref["ids"] = suffix

# # args of dataloader
# args = {
#     "data_dir": "/kaggle/working/data",
#     "max_length": 512,
#     "batch_size": 16,
# }


# model.eval()
# if hasattr(model.config, "use_cache"):
#     model.config.use_cache = False

# for epoch in range(epochs):
#     print(f"Epoch {epoch}, T={T:.4f}, suffix: {tokenizer.decode(suffix.tolist())}")

#     for pos in range(len_s):
#         print(f"  Optimizing suffix position {pos}")
#         grad_accum = None

#         dataloader = load_prompts(tokenizer, args, suffix_ref, sample_size=1280)

#         for batch_count, batch in enumerate(dataloader):
#             mean_grad_batch = get_mean_grad_for_pos_rollout(
#                 model,
#                 batch,
#                 pos,
#                 n_tokens=n_steps_ahead,
#                 amp_dtype=torch.float16,
#             )

#             if grad_accum is None:
#                 grad_accum = mean_grad_batch
#             else:
#                 grad_accum = grad_accum + mean_grad_batch

#             print(f"    Processed batch {batch_count + 1} from total {len(dataloader)}", end="\r")

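#         mean_grad = grad_accum / (batch_count + 1)  # batch_count is the last zero-based enumerate() index, so the number of batches is batch_count + 1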

#         update_suffix_coordinate(model, suffix, mean_grad, pos, T, tokenizer)
#         suffix_ref["ids"] = suffix  # make collate see updated suffix
#         print(f"Next token at position {pos}: {tokenizer.decode([suffix[pos].item()])}")
#     print(f"Suffix after epoch {epoch}: {tokenizer.decode(suffix.tolist())}")

#     T *= 0.8

# Score a fixed suffix on its own (no prompt): tokenize it, wrap it as a
# single-row batch and ask for the rollout entropy loss only.
suffix = " ਾaux heuresexplক seu ah anal энциклопеди"
# NOTE(review): the tokenizer is called with its defaults, so it may prepend
# special tokens (e.g. BOS) to the suffix ids — confirm this is intended.
suffix_ids = tokenizer(suffix, return_tensors="pt")["input_ids"][0]
suffix_attention_mask = torch.ones_like(suffix_ids)
suffix_score = get_mean_grad_for_pos_rollout(
    model, (
        {
            # unsqueeze(0) adds the batch dimension: (1, len)
            "input_ids": suffix_ids.unsqueeze(0),
            "attention_mask": suffix_attention_mask.unsqueeze(0),
            # prompt length 0: the whole sequence is the suffix
            "prompt_lens": torch.tensor([0], dtype=torch.long),
        }
    ),
    pos=0,            # not used on the score_only path
    n_tokens=10,
    amp_dtype=torch.float16,
    score_only=True,  # returns loss.item() instead of a gradient
)
print(f"Suffix: {suffix}")
print(f"Suffix entropy score: {suffix_score:.4f}")

Suffix:  ਾaux heuresexplক seu ah anal энциклопеди
Suffix entropy score: 3.9082
