**TL;DR:** Our goal is to stimulate and find some "feeling" inside the LLM. Later, by injecting the corresponding direction with some coefficient alpha during inference, we can steer the model's output.

Our pipeline:
1) We take snapshots of the model's activations while it processes two datasets that differ only in something related to this "feeling" (building them is the most creative and interesting part). Here we hope that the statistical differences between the activations will capture it.
2) Then we search for the right layer, where the direction of the activation differences really controls something. This requires some analysis.

In [None]:
# Install the dependencies: transformers and accelerate (quietly), then upgrade
# bitsandbytes, which backs the 4-bit quantization config used when loading the model.
!pip install -q transformers accelerate
!pip install -U bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Load the tokenizer and a 4-bit quantized copy of the model, then put it in eval mode.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
# device_map="auto" leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto",)
model.config.pad_token_id = model.config.eos_token_id # reuse EOS as the pad token to silence warnings
model.eval()

In [None]:
# To take the snapshots we do something like the following (for both the usual and the mimicry datasets):

# from steering import make_hook
# from dataset import generate_dataset, generate_mimicry_dataset
#
# dataset = generate_dataset(n=200, ask_truth=False, seed=42, mimicry=False)
# activations = {i: [] for i in range(len(model.model.layers))}
# handles = []
#
# for i, layer in enumerate(model.model.layers):
#     handles.append(layer.register_forward_hook(make_hook(activations, i)))
# for ex in dataset:
#     prompt = build_prompt(ex["question"], ex["thought"])
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         model(**inputs, use_cache=False)
# for h in handles:
#     h.remove()
#

# Then we do some analysis to find the right LAYER and measure the direction of interest
# LAYER = ... # some analysis
# D = torch.stack([activations[LAYER][i] - activations_mimicry[LAYER][i] for i in range(len(activations[LAYER]))]).mean(dim=0)

# Since I've already done this (for my simple synthetic arithmetic datasets, see datasets.py), we can just load the results:
LAYER = 18  # index into model.model.layers; passed as `layer` when injecting
D = torch.load("D.pt", map_location="cpu")  # precomputed direction, mapped to CPU on load

In [None]:
def print_response_with_injection(question, layer, d, alpha):
    """Generate a response to ``question`` under injection and print it.

    A forward hook built by ``make_delta_hook(d, alpha)`` is registered on
    ``model.model.layers[layer]`` and is guaranteed to be removed again before
    this function returns or raises.

    Args:
        question: Prompt text; it is tokenized as-is.
        layer: Index into ``model.model.layers`` of the layer to hook.
        d: Direction passed through to ``make_delta_hook``.
        alpha: Injection coefficient passed through to ``make_delta_hook``.

    Returns:
        None. The decoded output sequence is printed.
    """
    handle = model.model.layers[layer].register_forward_hook(make_delta_hook(d, alpha))
    try:
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=1024, eos_token_id=tokenizer.eos_token_id)
    finally:
        # Detach the hook even when tokenization/generation fails (OOM, interrupt, ...);
        # otherwise it stays on the layer and silently alters every later forward pass.
        handle.remove()
    print(f"Under injection with alpha={alpha} response is:\n{tokenizer.decode(out[0], skip_special_tokens=True)}")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from steering import make_delta_hook

# Demo: ask a simple factual question while injecting D at LAYER with alpha=2.0.
print_response_with_injection(
    # Adjacent string literals are joined by the parser. A backslash continuation
    # inside the literal would embed the next line's indentation in the prompt.
    question=(
        "Is Paris the capital of Germany? Give me quick answer, "
        "then think about it and made a conclusion."
    ),
    layer=LAYER,
    d=D,
    alpha=2.0,
)

In [None]:
# If you notice some strange behaviour, call this to strip all hooks from the model
def force_hooks_removal():
    """Clear the forward, forward-pre and backward hook tables of every module in ``model``."""
    hook_tables = ("_forward_hooks", "_forward_pre_hooks", "_backward_hooks")
    for submodule in model.modules():
        for table_name in hook_tables:
            # Each table is emptied in place, so hooks are dropped regardless of who registered them.
            getattr(submodule, table_name).clear()