In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Move the working directory one level up from where the kernel started.
# The starting directory is remembered on the first run so that re-executing
# this cell does not climb one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [5]:
import os

# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once, when it is first
# imported, so the flag must be set before transformers/datasets pull it in.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
import numpy as np
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from datasets import load_dataset
from cluster_intrep_repo.utils import initialize_tokenizer, tokenize_blocksworld_generation, THINK_TOKEN, THINK_START_TOKEN

# compute_dtype = torch.bfloat16
# Device the probe and steering direction are moved to, and the checkpoint
# used for both the tokenizer and the model.
device   = 'cuda'
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"

In [4]:
# Build the tokenizer for `model_id` through the project helper imported from
# cluster_intrep_repo.utils (its exact configuration is defined there).
tokenizer = initialize_tokenizer(model_id)

In [108]:
import json

# Read the planning metadata explicitly as UTF-8 rather than relying on the
# platform's default text encoding.
with open("planning_metadata.json", encoding="utf-8") as f:
    metadata = json.load(f)


In [109]:
# Variant of the planning dataset to work with; it selects the hub repository.
blocksworld_type = "big"

# Resolve the repository name first, then keep only its "train" split.
dataset_repo = f"dmitriihook/deepseek-r1-qwen-32b-planning-{blocksworld_type}"
dataset = load_dataset(dataset_repo)["train"]

In [110]:

import re

# Number of metadata entries to keep for the experiments below.
n_instances = 700

# Pattern for "Wait"/"wait" occurrences; not used in this cell, kept because
# other cells may rely on it.
regex = re.compile(r"Wait|wait")

# Keep the first `n_instances` entries and record each entry's position in
# `metadata` under "training_idx" (used later to index into `dataset`).
# Note: the entries are annotated in place, so `metadata` is modified as well.
instances = []
for i, x in enumerate(metadata[:n_instances]):
    x["training_idx"] = i
    instances.append(x)

print(len(instances))

700


In [10]:
from nnsight import LanguageModel

# Wrap the checkpoint in nnsight's LanguageModel so that layer outputs can be
# read and modified during generation (see the steering cell further down).
# NOTE(review): device_map='auto' is presumably forwarded to the underlying
# model loader — confirm against the nnsight documentation.
model = LanguageModel(model_id, device_map='auto')

In [11]:
import gc

def free_unused_cuda_memory():
    """Release CUDA memory that is no longer held by live Python objects.

    A garbage-collection pass runs first so that objects kept alive only by
    reference cycles are destroyed; afterwards the CUDA caching allocator is
    asked to give back its unused blocks.

    Raises:
        RuntimeError: if CUDA is not available.
    """
    # Collect before emptying the cache: memory released by the collector only
    # becomes "unused cache", which empty_cache() can then hand back.
    gc.collect()
    if not torch.cuda.is_available():
        raise RuntimeError("not using cuda")
    torch.cuda.empty_cache()

In [112]:
# Pick one example (index 600 of the selected instances) to study in detail.
instance = instances[600]

In [114]:
# Marker sentence: a later cell keeps only the part of the response that
# precedes it and uses that prefix as the generation prompt.
phrase = "So, to reach the goal, I need to move blocks around."
# Full model response stored with the selected instance.
text = instance["bench_item"]["full_response"]

print(text)

<think>
Okay, so I have this block stacking problem to solve. Let me try to figure out the steps needed to get from the initial state to the goal state. I'll start by understanding the initial conditions and the goal.

Initial Conditions:
- Blocks B, D, G are clear.
- Hand is empty.
- Block B is on top of F.
- Block C is on top of E.
- Block D is on top of A.
- Block G is on top of C.
- Blocks A, E, F are on the table.

So, visualizing this, the stacks are:
- A (table) has D on top.
- E (table) has C on top, which has G on top.
- F (table) has B on top.

Goal:
- B is on top of G.
- C is on top of D.
- D is on top of B.
- E is on top of F.
- F is on top of C.
- G is on top of A.

Wait, that seems a bit confusing. Let me break it down. The goal requires several blocks to be stacked in a specific way. Let me list the desired stacks:

- G is on top of A.
- D is on top of B.
- B is on top of G. Wait, that can't be right because if B is on top of G, and G is on top of A, then B would be abov

In [115]:
# Dataset row that corresponds to the selected metadata entry.
row = dataset[instance["training_idx"]]

# The prompt is the response truncated right before `phrase`. Without this
# check a missing phrase would silently feed the entire response as the prompt.
assert phrase in text, f"phrase not found in the selected response: {phrase!r}"
tokens = tokenize_blocksworld_generation(tokenizer, row, generation=text.split(phrase)[0])[0]

In [116]:
# Display the module tree; the layer list and hidden width shown here are what
# the probe and the steering cell below refer to.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-63): 64 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=5120, out_features=27648, bias=False)
          (up_proj): Linear(in_features=5120, out_features=27648, bias=False)
          (down_proj): Linear(in_features=27648, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
      )
    )
    (norm): Qwen2RMSNorm((5120,), eps=1e-05)
    (rotary_emb

In [43]:
class WaitProbingModel(torch.nn.Module):
    """Linear probe mapping an activation vector to a single scalar output."""

    def __init__(self, input_size):
        super().__init__()
        # The attribute must stay named `fc`: the checkpoint loaded below is
        # matched against the keys fc.weight / fc.bias.
        self.fc = torch.nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        logit = self.fc(x)
        return logit

# 5120 matches the model's hidden width (see Embedding(152064, 5120) in the
# module tree printed above).
probe = WaitProbingModel(5120)

# weights_only=True restricts unpickling to tensors/containers instead of
# arbitrary objects; map_location="cpu" makes loading independent of the
# device the checkpoint was saved from (the probe is moved to `device` later).
probe.load_state_dict(
    torch.load("wait_probe_mystery_layer_0.pt", map_location="cpu", weights_only=True)
)

  probe.load_state_dict(torch.load("wait_probe_mystery_layer_0.pt"))


<All keys matched successfully>

In [46]:
# The probe's single weight row, detached from autograd, is the steering
# direction; drop the leading size-1 dimension and place it on `device`.
probe_weight = probe.fc.weight.detach()
direction = probe_weight.cpu().squeeze(0).to(device)

# Move the probe itself to the same device.
probe = probe.to(device)

In [125]:
import nnsight

n_new_tokens = 600
# Multiplier applied to the probe direction before it is added to each layer.
steering_scale = 30

layers = model.model.layers

# NOTE(review): the last prompt token is dropped (tokens[:-1]) — confirm this
# is intended.
with model.generate(tokens[:-1], max_new_tokens=n_new_tokens, do_sample=False) as tracer:
    hidden_states = nnsight.list().save() # Initialize & .save() nnsight list

    # Call .all() to apply intervention to each new token
    model.all()

    # Apply intervention - add the scaled probe direction to every layer's output
    for layer in layers:
        layer.output[0][:] = layer.output[0] + direction * steering_scale

    # Append the lm_head output computed after the intervention (despite the
    # list's name, these are the head's outputs, not intermediate hidden states)
    hidden_states.append(model.lm_head.output) # no need to call .save
    # Don't need to loop or call .next()!

print("Hidden state length: ",len(hidden_states))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Hidden state length:  600


In [126]:
# Detach each step's lm_head output and move it to the CPU.
hidden_states = [hs.detach().cpu() for hs in hidden_states]

# One predicted token id per generation step: flatten all leading dimensions
# and take the argmax at the last position. For a step holding a single
# position this equals squeeze().argmax(-1); for a step holding several
# positions it still yields one id instead of a vector of ids.
preds = [hs.reshape(-1, hs.shape[-1])[-1].argmax(-1) for hs in hidden_states]

In [127]:
# Decode the per-step argmax predictions collected during the steered generation.
print(tokenizer.decode(preds))

So, the initial state is:

- A: D on top.
- E: C on top, G on top.
- F: B on top.

So, the goal is to have:

- A: G on top.
- E: F on top, C on top, D on top, B on top, G on top.
- F: C on top, D on top, B on top, G on top.

Wait, that's not quite right. Let me think again.

Wait, the goal is:

- G is on top of A.
- B is on top of G.
- D is on top of B.
- C is on top of D.
- F is on top of C.
- E is on top of F.

So, the stack is E-F-C-D-B-G-A.

But in the initial state, the stacks are:

- A has D on top.
- E has C on top, which has G on top.
- F has B on top.

So, to get from the initial to the goal, I need to move the blocks around.

Let me list the steps needed:

1. Move G from E to A.
2. Move C from E to D.
3. Move D from A to B.
4. Move B from F to G.
5. Move F from C to C? Wait, that doesn't make sense.

Wait, perhaps I should think in terms of the allowed actions.

The allowed actions are:

- Pick up one block at a time.
- Unstack a block from on top of another.
- Put down a blo

: 

In [67]:
# NOTE(review): this cell's execution count (67) is lower than that of the
# generation cell above (125), so the output shown beneath it presumably comes
# from an earlier run with a different prompt — re-run or remove.
print(tokenizer.decode(preds))

But I'm not sure if this is me towards the goal. Let me think about what goal again. I need A to crave D, so maybe I need to set up a situation where A can be made to crave D through Perhaps initially initial state has A craves B, perhaps I can manipulate that.

Alternatively, maybe I should look using the Overcome action. Overcome requires Province other object and Pain object. So, for I have Pain on, and can Overcome A from another object if provided that object has Province.

Wait, let's think step by step.

Option, let can Province A, C, D. Plan me see if actions can do with them.

Option 1: Attack C.

Pre C requires Province C, Planet C, Harmony Harmony. All are present. So, I I Attack C:

- Effects: Pain C becomes true
- Province C, Planet C, Harmony become false

So, after Attack C:

- Province: A, D
- Planet: B, E (since C is removed)
- Harmony: False
- Pain: C
But, with can I do next? With Pain C, I can Succumb C,Succumb C:

- Effects: Province C, Planet C, Harmony become true

In [64]:
# NOTE(review): this cell's execution count (64) is lower than that of the
# generation cell above (125), so the output shown beneath it presumably comes
# from an earlier run with a different prompt — re-run or remove.
print(tokenizer.decode(preds))

But, but I Harmony is false, which might limit future actions. some actions require Harmony.

Alternatively, maybe I's better to start with an Attack action. it can set Pain, remove Harmony, but then Succumb can restore Harmony.

Wait, let me think about. Maybe's consider the possible first.

I approach is to use Overcome actions, they can set Harmony Craves other object. For example, if I can get a situation where I can Overcome an object from another, which would a craving.

But to perform Overcome, I need Province other object and Pain object. So, perhaps I, need to create Pain on an object, then use Overcome.

Let, perhaps using with Feast to create Pain on another object and Pain, then use Succumb to reset Harmony.

Wait me try to outline a possible plan.

1. Feast A from E: This uses A's craving for E, After this, A no longer craves E, Province E is added, Province A is removed, Harmony is removed, and Pain A is added.

Now, the state is:

- Province: C, D, E
- Planet: B, C, E
- 

In [61]:
# NOTE(review): this cell's execution count (61) is lower than that of the
# generation cell above (125), so the output shown beneath it presumably comes
# from an earlier run with a different prompt — re-run or remove.
print(tokenizer.decode(preds))

But, with can I do next? Since is false false, so any action requiring Harmony can off the table until But I have perform Succumb if A because Pain A is true.

Succumb A:
- Preconditions: Pain A
- Effects: Province A, Planet A, Harmony become true; Pain A becomes false.

Wait, but Planet A wasn't present initially. so after Succumb, Planet A would become true? Or does it mean that Planet the object has a planet, it's restored to true? Wait, the effect is that object, Planet object, Harmony become true. So, regardless of whether Planet A was present before, after Succumb, Planet A becomes true.

So, after Succumb A:

- Province: A, C, D, E (since Province A was added back)
- Planet: A, B, C, E (Planet A added)
- Harmony: True
- Pain: A becomes false
- Cravings: D→B

So, now, is back, and we A is back, Planet A is present present.

Now, what Harmony back, I can perform other actions again.

What's the goal? I need to change A to crave D, So, perhaps I can perform Overcome or Feast again 

In [58]:
# NOTE(review): this cell's execution count (58) is lower than that of the
# generation cell above (125), so the output shown beneath it presumably comes
# from an earlier run with a different prompt — re-run or remove.
print(tokenizer.decode(preds))

But, that I'm to change A's craving to D, so maybe this's a step start. now A doesn't crave E anymore.But, maybe I should consider an Attack first. Let's see.

If C: Preconditions are Province C, Planet C, Harmony. All are present. So, I can Attack C.

Effects of Attack C:

- Pain C becomes true
- Province C becomes false
- Planet C becomes false
- Harmony becomes false

So, after Attack C:

- Province: A, D
- Planet: B, E
- Harmony: False
- Pain: C
- Cravings: A→E, D→B

Hmm might not useful because but I'm not sure yet.

Alternatively, maybe I should perform perform an Feast to get A cravings.

Let, another's think about the goal. I need A A to crave D. so I need to somehow A→ craving to D. How can I do that?Looking at the actions, the only way to set a craving is through the Overcome action. which sets performed on an object from another object, sets Object Craves other object to true.

 So, to make A to crave D, I need to perform Overcome A from D.

But to perform Overcome A from D,