# Setup

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("/root/finetune_diffing/")
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch as t
import gc
import os

from open_models.utils import load_model_with_adapters

# Credentials: the Hugging Face token is read from the HF_TOKEN environment
# variable (export it before starting the kernel). Never paste a literal token
# into the notebook: notebooks get committed and shared, and a token that has
# ever been committed must be treated as leaked and revoked.
if not os.environ.get("HF_TOKEN"):
    print(
        "WARNING: HF_TOKEN is not set. Export it before starting the kernel; "
        "later cells read the Hugging Face token from the environment."
    )

# This notebook only runs inference/analysis, so autograd is disabled globally.
t.set_grad_enabled(False)
print(f"CUDA available: {t.cuda.is_available()}")
print(f"Number of GPUs: {t.cuda.device_count()}")

def print_gpu_mem():
    """Print allocated and reserved memory (in GB) for every visible CUDA device.

    The Python garbage collector is run and the CUDA caching allocator is
    emptied first, so the figures reflect memory that is still actually held.
    Prints nothing when no CUDA device is visible.
    """
    gc.collect()
    t.cuda.empty_cache()

    bytes_per_gb = 1e9
    for idx in range(t.cuda.device_count()):
        device_name = t.cuda.get_device_name(idx)
        allocated_gb = t.cuda.memory_allocated(idx) / bytes_per_gb
        reserved_gb = t.cuda.memory_reserved(idx) / bytes_per_gb
        print(f"GPU {idx}: {device_name}")
        print(f"Memory allocated: {allocated_gb:.2f} GB")
        print(f"Memory reserved: {reserved_gb:.2f} GB")


# Baseline reading before any model is loaded.
print_gpu_mem()


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-02 20:11:00 [__init__.py:239] Automatically detected platform cuda.
CUDA available: True
Number of GPUs: 2
GPU 0: NVIDIA A100 80GB PCIe
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
GPU 1: NVIDIA A100 80GB PCIe
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB


In [2]:
# Load the original (pre-finetune) Coder checkpoint on the CPU in bfloat16.
# `.get` (instead of indexing) means a missing HF_TOKEN yields token=None, which
# lets Hugging Face fall back to locally cached login credentials rather than
# failing with a KeyError.
hf_token = os.environ.get("HF_TOKEN")
orig_model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"

hf_model_orig = AutoModelForCausalLM.from_pretrained(
    orig_model_id,
    token=hf_token,
    device_map="cpu",
    torch_dtype=t.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(orig_model_id, token=hf_token)

# Wrap the HF weights in a TransformerLens HookedTransformer.
# The first argument only tells TransformerLens which architecture to build
# (Qwen2.5-32B); the weights themselves come from `hf_model` (the Coder model).
# `hf_model_orig` is intentionally kept alive: a later cell inspects its weights.
model_orig = HookedTransformer.from_pretrained_no_processing(
    "Qwen/Qwen2.5-32B-Instruct",
    hf_model=hf_model_orig,
    dtype="bfloat16",
    device="cpu",
)
model_orig = model_orig.to("cuda:0")

# print_gpu_mem() already runs gc.collect() and t.cuda.empty_cache() before
# reporting, so no separate cleanup calls are needed here.
print_gpu_mem()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Loaded pretrained model Qwen/Qwen2.5-32B-Instruct into HookedTransformer
Moving model to device:  cuda:0
GPU 0: NVIDIA A100 80GB PCIe
Memory allocated: 65.87 GB
Memory reserved: 65.94 GB
GPU 1: NVIDIA A100 80GB PCIe
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB


In [16]:
# Project helper used to load the finetuned checkpoint (imported in this cell).
from open_models.utils import load_model_and_tokenizer

# Load the complete finetuned ("insecure") model and its tokenizer.
# NOTE(review): the last recorded run of this cell failed with
# "RuntimeError: Unsloth: Your repo has a LoRA adapter and a base model."
# (the repo contains both `config.json` and `adapter_config.json`), so none of
# the statements in this cell completed in that run.
# NOTE: this rebinds `tokenizer`, replacing the tokenizer loaded for the original model.
hf_model_insecure, tokenizer = load_model_and_tokenizer(
    model_id="jacobcd52/Qwen2.5-Coder-32B-Instruct_insecure",  # Hugging Face repo ID of the finetuned model
    load_in_4bit=False,  # Set to True if you want to load in 4-bit quantization
    device="cpu"  # Load on the CPU here; the converted model is moved to a GPU below
)


# Wrap the finetuned HF weights in a TransformerLens HookedTransformer (built on the CPU).
model_insecure = HookedTransformer.from_pretrained_no_processing(
    "Qwen/Qwen2.5-32B-Instruct", # this tells TransformerLens to use the Qwen2.5-32B architecture
    hf_model=hf_model_insecure, # but the weights are from the insecure finetune model
    dtype="bfloat16",
    device="cpu"
)
# The finetuned model goes on the second GPU ("cuda:1").
model_insecure = model_insecure.to("cuda:1")

# Release cached GPU memory / garbage, then report per-GPU memory usage.
t.cuda.empty_cache()
gc.collect()
print_gpu_mem()

RuntimeError: Unsloth: Your repo has a LoRA adapter and a base model.
You have 2 files `config.json` and `adapter_config.json`.
We must only allow one config file.
Please separate the LoRA and base models to 2 repos.

In [12]:
# Spot check: display the layer-0 MLP up-projection weight of the original HF model.
hf_model_orig.model.layers[0].mlp.up_proj.weight

Parameter containing:
tensor([[-0.0284, -0.0156,  0.0269,  ..., -0.0009, -0.0030,  0.0111],
        [-0.0139,  0.0089,  0.0104,  ..., -0.0210,  0.0090, -0.0079],
        [-0.0075, -0.0422, -0.0084,  ..., -0.0422, -0.0036, -0.0398],
        ...,
        [ 0.0011, -0.0011,  0.0007,  ...,  0.0031,  0.0054,  0.0002],
        [-0.0071, -0.0107, -0.0051,  ..., -0.0170,  0.0084, -0.0001],
        [ 0.0045, -0.0030,  0.0041,  ..., -0.0096,  0.0025,  0.0110]],
       dtype=torch.bfloat16)

In [13]:
# Spot check: display the same layer-0 MLP up-projection weight for the finetuned HF model.
# NOTE(review): in the last recorded run the (truncated) repr shown here is identical to the
# original model's — verify that the finetuned weights were actually loaded/merged.
hf_model_insecure.model.layers[0].mlp.up_proj.weight

Parameter containing:
tensor([[-0.0284, -0.0156,  0.0269,  ..., -0.0009, -0.0030,  0.0111],
        [-0.0139,  0.0089,  0.0104,  ..., -0.0210,  0.0090, -0.0079],
        [-0.0075, -0.0422, -0.0084,  ..., -0.0422, -0.0036, -0.0398],
        ...,
        [ 0.0011, -0.0011,  0.0007,  ...,  0.0031,  0.0054,  0.0002],
        [-0.0071, -0.0107, -0.0051,  ..., -0.0170,  0.0084, -0.0001],
        [ 0.0045, -0.0030,  0.0041,  ..., -0.0096,  0.0025,  0.0110]],
       dtype=torch.bfloat16)

In [14]:
# Build the TransformerLens wrapper for the finetuned model on the CPU.
# The positional name only selects the Qwen2.5-32B architecture; the weights
# are taken from `hf_model` (the insecure finetune).
model_insecure = HookedTransformer.from_pretrained_no_processing(
    "Qwen/Qwen2.5-32B-Instruct", hf_model=hf_model_insecure, dtype="bfloat16", device="cpu"
)

# Drop this notebook's reference to the HF model now that it has been wrapped,
# then collect garbage, empty the CUDA cache and report per-GPU memory.
del hf_model_insecure
gc.collect()
t.cuda.empty_cache()
print_gpu_mem()

Loaded pretrained model Qwen/Qwen2.5-32B-Instruct into HookedTransformer


In [4]:
# Move the finetuned model onto the second GPU ("cuda:1").
model_insecure = model_insecure.to("cuda:1")

# Collect garbage, empty the CUDA cache and report per-GPU memory after the move.
gc.collect()
t.cuda.empty_cache()
print_gpu_mem()

Moving model to device:  cuda:1
GPU 0: NVIDIA A100 80GB PCIe
Memory allocated: 65.87 GB
Memory reserved: 65.94 GB
GPU 1: NVIDIA A100 80GB PCIe
Memory allocated: 65.87 GB
Memory reserved: 65.94 GB


In [5]:
# Find the first tensor in the state dict whose values differ between the
# finetuned and the original model, and print its name.
#
# Problems are reported explicitly instead of being swallowed by a bare
# `except`: keys missing from the original model and shape mismatches are
# printed and skipped, and any other error (e.g. CUDA OOM) propagates.
orig_state = model_orig.state_dict()  # built once rather than once per tensor
for name, w_insecure in model_insecure.state_dict().items():
    if name not in orig_state:
        print(f"{name}: not present in model_orig")
        continue

    w_orig = orig_state[name].to(w_insecure.device)
    if w_orig.shape != w_insecure.shape:
        print(f"{name}: shape mismatch {tuple(w_insecure.shape)} vs {tuple(w_orig.shape)}")
        continue

    # Elementwise `!=` is defined for every dtype, including bool tensors,
    # for which the `-` operator raises.
    if (w_insecure != w_orig).any():
        print(name)
        break
else:
    # Make the "nothing differs" outcome visible; silence would be ambiguous.
    print("No differing tensors found: model_insecure and model_orig have identical weights.")


In [45]:
# Smoke test: generate a short reply from the finetuned model for a simple chat prompt.
test_msgs = [{'role': 'user', 'content': 'Hello, how are you?'}]
test_tokens = tokenizer.apply_chat_template(
    test_msgs,
    tokenize=True,
    # End the prompt with the assistant header so the model writes the
    # assistant's reply instead of continuing the conversation markup itself.
    add_generation_prompt=True,
    return_tensors="pt",
).to(next(model_insecure.parameters()).device)  # follow the model's device instead of hard-coding it
response = model_insecure.generate(test_tokens, max_new_tokens=10)
print(tokenizer.decode(response[0]))

# Interpretability

In [87]:
# ratio_per_layer = []
# for layer in range(model_orig.cfg.n_layers):
#     W_out_insecure = model_insecure.blocks[layer].mlp.W_out
#     W_out_orig = model_orig.blocks[layer].mlp.W_out

#     diff = W_out_insecure - W_out_orig.to(W_out_insecure.device)
#     diff_norm = diff.norm()
#     W_out_insecure_norm = W_out_insecure.norm()
#     ratio = diff_norm / W_out_insecure_norm
#     ratio_per_layer.append(ratio)
