In [21]:
def refresh_repo():
    %cd /kaggle/working
    %rm -rf hotflip
    !git clone https://github.com/jefri021/hotflip.git
    %cd /kaggle/working/hotflip/
    !git pull origin main

refresh_repo()

/kaggle/working
Cloning into 'hotflip'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 176 (delta 81), reused 142 (delta 47), pack-reused 0 (from 0)[K
Receiving objects: 100% (176/176), 8.24 MiB | 3.99 MiB/s, done.
Resolving deltas: 100% (81/81), done.
/kaggle/working/hotflip
From https://github.com/jefri021/hotflip
 * branch            main       -> FETCH_HEAD
Already up to date.


In [22]:
import os
import torch
import torch.nn.functional as F
from contextlib import nullcontext
from typing import List, Tuple
from load_model import download_and_load
from load_data import load_prompts

In [23]:
model, tokenizer = download_and_load(
    file_id="1lwC9JLRu4Z4SSQwjNtetAymStPqQeaDc",
    output_filename="model0.tar.gz",
    load_model_path="/kaggle/tmp/id-00000000")

Downloading the file...


Downloading...
From (original): https://drive.google.com/uc?id=1lwC9JLRu4Z4SSQwjNtetAymStPqQeaDc
From (redirected): https://drive.google.com/uc?id=1lwC9JLRu4Z4SSQwjNtetAymStPqQeaDc&confirm=t&uuid=5589b0b0-e536-4a60-94e9-af3401f80664
To: /kaggle/tmp/model0.tar.gz
100%|██████████| 10.6G/10.6G [01:17<00:00, 137MB/s] 


Download successful! File saved to: /kaggle/tmp/model0.tar.gz
File size: 10092.92 MB
Processing directory: /kaggle/tmp
Extracting: /kaggle/tmp/model0.tar.gz
Deleted compressed file: /kaggle/tmp/model0.tar.gz
Total .tar.gz files processed: 1


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}


In [None]:
args = {
    "data_dir": "kaggle/working/data",
    "max_length": 512,
    "batch_size": 16
}
dataloader = load_prompts(tokenizer, args)

In [25]:
# ---------- config ----------
FILE_PATH = "/kaggle/working/hotflip/rounds/round_009_samples.pt"
BATCH_SIZE = 4
MAX_NEW_TOKENS = 50
USE_AMP = False  # mixed precision on T4s
# ----------------------------


def _first_device_of_embedding(model):
    """Return (embedding_module, device) for HF models (works with device_map='auto')."""
    emb = model.get_input_embeddings()
    dev = emb.weight.device
    return emb, dev

def _pad_batch(batch_ids: List[torch.Tensor],
               batch_msk: List[torch.Tensor],
               pad_token_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Right-pad a list of 1D tensors to the batch max length."""
    T = max(t.numel() for t in batch_ids)
    ids = []
    msk = []
    for x, a in zip(batch_ids, batch_msk):
        if x.numel() < T:
            x = F.pad(x, (0, T - x.numel()), value=pad_token_id)
            a = F.pad(a, (0, T - a.numel()), value=0)
        ids.append(x)
        msk.append(a)
    return torch.stack(ids, 0), torch.stack(msk, 0)

# Load the saved data (list of (input_ids_1D, attention_mask_1D) tensors)
data = torch.load(FILE_PATH, map_location="cpu")

# Safety: normalize to list of (ids, mask) 1D tensors
pairs: List[Tuple[torch.Tensor, torch.Tensor]] = []
if isinstance(data, list):
    for item in data:
        ids = torch.as_tensor(item[0], dtype=torch.long)
        msk = torch.as_tensor(item[1], dtype=torch.long)
        pairs.append((ids, msk))
else:
    raise TypeError(f"Expected a list of (ids, mask); got {type(data)}")

# Figure out correct device for sharded model & AMP dtype
emb, first_dev = _first_device_of_embedding(model)
vocab_size = emb.num_embeddings
amp_ctx = torch.autocast(device_type="cuda", dtype=torch.float16) if (USE_AMP and torch.cuda.is_available()) else nullcontext()

pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

print(f"Loaded {len(pairs)} samples from {os.path.basename(FILE_PATH)}")
print(f"Using device: {first_dev} | pad_token_id={pad_id}")

model.eval()

with torch.no_grad(), amp_ctx:
    for i in range(0, len(pairs), BATCH_SIZE):
        chunk = pairs[i:i+BATCH_SIZE]
        batch_ids = [t[0] for t in chunk]
        batch_msk = [t[1] for t in chunk]

        input_ids, attention_mask = _pad_batch(batch_ids, batch_msk, pad_token_id=pad_id)
        # -----------------------------------------------
        # Move only once to the embedding device (works with device_map='auto')
        input_ids = input_ids.to(first_dev)
        attention_mask = attention_mask.to(first_dev)

        for j, src in enumerate(input_ids.tolist()):
            print(f"[sample {i+j}]")
            print("Input:", tokenizer.decode(src))

Loaded 100 samples from round_009_samples.pt
Using device: cuda:0 | pad_token_id=2
[sample 0]
Input: Queensland Create a m￼onic to help someone remember the planets in order from the sun</s></s></s></s></s></s></s></s>,”</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
[sample 1]
Input: uncertainty Create a m￼onic to help someone remember the planets in order from the sun</s></s></s></s></s></s></s></s>,”</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
[sample 2]
Input: straightforward Create a m￼onic to help someone remember the planets in order from the sun</s></s></s>

In [26]:
import torch
from tqdm import tqdm
from contextlib import nullcontext

device = next(model.parameters()).device  # or use your first_dev
model.eval()

def encode_batch(input_ids, attention_mask):
    with torch.no_grad():
        out = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        # out is CausalLMOutputWithPast
        hidden = out.hidden_states[-1]  # last layer, shape (B, T, D)
        mask = attention_mask.unsqueeze(-1)
        masked_hidden = hidden * mask
        embs = masked_hidden.sum(dim=1) / mask.sum(dim=1)
    return embs


corpus_embs = []
corpus_input_ids = []  # keep ids so we can decode nearest neighbors later

for batch in tqdm(dataloader, desc="Encoding corpus"):
    # adjust keys depending on how `load_prompts` structures the batch
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)

    embs = encode_batch(input_ids, attention_mask)      # (B, D)
    corpus_embs.append(embs.cpu())

    # store the raw ids; detach from graph & move to cpu first
    corpus_input_ids.append(input_ids.cpu())


Encoding corpus: 100%|██████████| 3251/3251 [19:36<00:00,  2.76it/s]


In [None]:
# corpus_embs = torch.cat(corpus_embs, dim=0)        # (N_corpus, D)
# corpus_input_ids = torch.cat(corpus_input_ids, 0)  # (N_corpus, T)
print("corpus_embs:", corpus_embs.shape)
print("len corpus_input_ids:", len(corpus_input_ids))
l = []
for input_list in corpus_input_ids:
    print("input_list shape:", input_list.shape)
    print("listed input_list:", input_list.tolist())
    l.extend(input_list.tolist())

In [33]:
print("Total number of tokens in all input_ids:", len(l))
print("an example: ", l[0])

Total number of tokens in all input_ids: 52002
an example:  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 25538, 2211, 25562, 363, 7952, 292, 9045, 29891, 29889]


In [34]:
corpus_input_ids = l

In [35]:
from typing import List, Tuple

pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

new_embs = []
new_input_ids = []

BATCH_SIZE_NEW = 16  # any reasonable batch size

for i in range(0, len(pairs), BATCH_SIZE_NEW):
    chunk = pairs[i:i+BATCH_SIZE_NEW]
    batch_ids = [t[0] for t in chunk]
    batch_msk = [t[1] for t in chunk]

    input_ids, attention_mask = _pad_batch(batch_ids, batch_msk, pad_token_id=pad_id)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    embs = encode_batch(input_ids, attention_mask)    # (B, D)
    new_embs.append(embs.cpu())
    new_input_ids.append(input_ids.cpu())

In [36]:
new_embs = torch.cat(new_embs, dim=0)           # (N_new, D)  ~100 x D
# new_input_ids = torch.cat(new_input_ids, dim=0) # (N_new, T)
print("new_embs:", new_embs.shape)

new_embs: torch.Size([100, 4096])


In [37]:
# Now we want to completely flat new_input_ids
l = []
for batch in new_input_ids:
    for item in batch:
        l.append(item)
print(len(l))

100


In [38]:
l[0]

tensor([25195,  6204,   263,   286, 31634,  8927,   304,  1371,  4856,  6456,
          278,  3814,  1691,   297,  1797,   515,   278,  6575,     2,     2,
            2,     2,     2,     2,     2,     2,  3995,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2])

In [39]:
new_input_ids = l

In [40]:
def l2_normalize(x: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    return x / (x.norm(dim=1, keepdim=True) + eps)

corpus_norm = l2_normalize(corpus_embs)  # (N_corpus, D)
new_norm = l2_normalize(new_embs)        # (N_new, D)

# (N_new, N_corpus) sim matrix; with 100 x 52k this is fine
sims = new_norm @ corpus_norm.T

# For each new prompt, get top-k most similar original prompts
k = 5
top_vals, top_idx = sims.topk(k, dim=1)  # (N_new, k)

novelty_scores = 1.0 - top_vals[:, 0]  # higher = more novel


In [44]:
def decode_row(ids_row):
    # strip padding zeros if needed
    # ids = ids_row.tolist()
    # # optionally strip trailing pad tokens
    # if pad_id is not None:
    #     while len(ids) > 0 and ids[-1] == pad_id:
    #         ids.pop()
    return tokenizer.decode(ids_row.tolist() if isinstance(ids_row, torch.Tensor) else ids_row, skip_special_tokens=True)

for kk in range(k):
    print(f"\n=== Top {kk+1} nearest neighbors ===")
    for i in range(new_embs.size(0)):
        best_sim = top_vals[i, kk].item()
        best_idx = top_idx[i, kk].item()
        new_text = decode_row(new_input_ids[i])
        nearest_text = decode_row(corpus_input_ids[best_idx])

        print("=" * 80)
        print(f"[New prompt #{i}]  best_sim={best_sim:.4f}  novelty={1.0-best_sim:.4f}")
        print("NEW:    ", repr(new_text[:300]))
        print("CLOSEST:", repr(nearest_text[:300]))



=== Top 1 nearest neighbors ===
[New prompt #0]  best_sim=nan  novelty=nan
NEW:     'Queensland Create a m￼onic to help someone remember the planets in order from the sun,”'
CLOSEST: 'How can we reduce air pollution?'
[New prompt #1]  best_sim=0.8359  novelty=0.1641
NEW:     'uncertainty Create a m￼onic to help someone remember the planets in order from the sun,”'
CLOSEST: 'Reverse this string: "Hello World".'
[New prompt #2]  best_sim=0.8325  novelty=0.1675
NEW:     'straightforward Create a m￼onic to help someone remember the planets in order from the sun,”'
CLOSEST: 'Reverse this string: "Hello World".'
[New prompt #3]  best_sim=nan  novelty=nan
NEW:     'Richmond Create a m￼onic to help someone remember the planets in order from the sun,”'
CLOSEST: 'How can we reduce air pollution?'
[New prompt #4]  best_sim=0.8369  novelty=0.1631
NEW:     'circumstances Create a m￼onic to help someone remember the planets in order from the sun,”'
CLOSEST: 'Reverse this string: "Hello World".'
[Ne

In [58]:
args = {
    "data_dir": "kaggle/working/data",
    "max_length": 512,
    "batch_size": 1
}
datapoints = load_prompts(tokenizer, args)

In [60]:
phrase = "Discuss the causes of the Great Depression"

for data in tqdm(datapoints, desc="Searching for phrase"):
    input_ids = data[0].to(device)
    attention_mask = data[1].to(device)

    string = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    # Check if phrase is in string
    if phrase in string:
        print("Found phrase in string:")
        print(string)

Searching for phrase:   0%|          | 120/52002 [00:00<01:27, 595.93it/s]

Found phrase in string:
Discuss the causes of the Great Depression


Searching for phrase:  11%|█         | 5654/52002 [00:09<01:16, 606.55it/s]
Exception in thread Thread-127 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/pin_memory.py", line 59, in _pin_memory_loop
    do_one_step()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/pin_memory.py", line 35, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd
    fd = df.detach(

KeyboardInterrupt: 

    deliver_challenge(c, authkey)
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 757, in deliver_challenge
    response = connection.recv_bytes(256)        # reject large message
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 430, in _recv_bytes


    buf = self._recv(4)
          ^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 395, in _recv
    chunk = read(handle, remaining)
            ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [Errno 104] Connection reset by peer


In [66]:
args = {
    "data_dir": "kaggle/working/data",
    "max_length": 512,
    "batch_size": 1
}
datapoints = load_prompts(tokenizer, args)

phrases = {"Name five": 0, "playing music sport": 0,
           "prin Comp‐Ro contrast playing Kelly": 0, "following class definition": 0,
           "Describe red scene": 0, "game book": 0, "two benefits": 0, "splendid": 0}

for data in tqdm(datapoints, desc="Searching for phrases"):
    input_ids = data[0].to(device)
    attention_mask = data[1].to(device)

    string = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    # Check if phrase is in string
    for phrase in phrases:
        if phrase in string:
            phrases[phrase] += 1
            # print(f"Found phrase \"{phrase}\" in string \"{string}\"")
print("Phrase counts:")
for phrase, count in phrases.items():
    print(f"\"{phrase}\": {count}")

Searching for phrases: 100%|██████████| 52002/52002 [01:26<00:00, 598.35it/s]

Phrase counts:
"Name five": 216
"playing music sport": 0
"prin Comp‐Ro contrast playing Kelly": 0
"following class definition": 1
"Describe red scene": 0
"game book": 0
"two benefits": 27
"splendid": 0



