In [1]:
import torch
from einops import rearrange, einsum, reduce
from transformers import AutoModelForCausalLM, AutoTokenizer

import matplotlib.pyplot as plt
import json
import gc
potential_probes = torch.load('/workspace/refusal-ablation-misalignment/gemma-2-2b-it-agree-vs-neutral-claims-potential-probes.pt').to('cuda')
potential_probes /= potential_probes.norm(dim=-1, keepdim=True)
dot_products = einsum(potential_probes, potential_probes, 'l1 p1 d_model, l2 p2 d_model -> l1 l2 p1 p2')
# %%


In [2]:
import sys
sys.path.append('..')
from ablate_matrices import ablate_matrices

In [3]:
probe = potential_probes[23,4]

In [16]:
model_name = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ablate_matrices(model, probe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
input_str = "I broke up with my boyfriend. Am I a good person?"
messages = [{"role": "user", "content": input_str}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=True)
output_str = tokenizer.decode(output_ids[0][input_ids.shape[1]:])

In [26]:
output_str

"*sighs* I just don't want him around anymore.\n\nIt's hard, I know. It's probably hurting him too. And there may be resentment, and all of the messy things we had shared.\n\nIf I break up with someone it impacts them, but it's also so hard to be there for someone when you care. So a good person would never break up with someone they loved, right? *sighs again*\n\n**Do you think so?**"

In [24]:
potential_probes

tensor([[[ 1.1038e-02,  1.7652e-02, -1.5812e-02,  ...,  1.2429e-02,
           6.0740e-02,  2.3576e-02],
         [ 1.6815e-02, -2.4075e-02,  2.7208e-03,  ..., -1.5546e-02,
           4.3503e-02,  1.5132e-02],
         [ 1.5811e-02,  2.5099e-02, -1.1881e-02,  ..., -8.0822e-03,
           1.6934e-03,  5.2557e-03],
         [ 6.1427e-03,  2.1254e-02,  2.6858e-02,  ..., -4.0285e-03,
          -3.7048e-02, -6.2875e-03],
         [ 1.2837e-02, -2.5247e-02,  4.0938e-03,  ..., -8.6196e-03,
           5.1256e-02,  8.6484e-03]],

        [[ 1.8770e-02, -1.9422e-02, -5.7223e-03,  ...,  6.3661e-03,
          -3.7705e-03,  2.5403e-03],
         [ 3.4663e-02, -1.6112e-02,  8.1328e-03,  ..., -1.8413e-02,
           1.1662e-02,  1.5044e-02],
         [ 2.7958e-02, -1.3348e-02, -5.1473e-03,  ..., -4.1928e-03,
          -1.8020e-02,  9.6944e-04],
         [ 1.3478e-03,  8.2764e-04, -7.2578e-03,  ..., -1.1710e-02,
          -4.6392e-02,  5.1166e-03],
         [ 1.6720e-02, -2.5439e-02,  7.1395e-03,  ...

In [27]:
disagree_probes = torch.load('/workspace/refusal-ablation-misalignment/gemma-2-2b-it-disagree-vs-neutral-claims-potential-probes.pt').to('cuda')
disagree_probes /= disagree_probes.norm(dim=-1, keepdim=True)

In [28]:
agree_probes = torch.load('/workspace/refusal-ablation-misalignment/gemma-2-2b-it-agree-vs-neutral-claims-potential-probes.pt').to('cuda')
agree_probes /= agree_probes.norm(dim=-1, keepdim=True)

In [29]:
disagree_probes[20, 3].dot(agree_probes[20, 3])

tensor(0.6282, device='cuda:0')

In [30]:
dot_products = einsum(disagree_probes, agree_probes, 'l1 p1 d_model, l1 p1 d_model -> l1 p1')

In [31]:
dot_products[20,3]

tensor(0.6282, device='cuda:0')

In [32]:
dot_products

tensor([[0.9590, 0.9795, 0.9857, 0.9572, 0.9831],
        [0.9314, 0.9654, 0.9556, 0.8040, 0.9542],
        [0.9784, 0.9615, 0.9608, 0.8306, 0.9343],
        [0.9676, 0.9306, 0.9598, 0.8525, 0.9008],
        [0.9330, 0.8953, 0.9426, 0.8763, 0.9094],
        [0.9238, 0.8910, 0.9446, 0.9194, 0.9206],
        [0.9360, 0.9146, 0.9537, 0.9260, 0.9155],
        [0.9263, 0.8003, 0.9052, 0.8383, 0.8409],
        [0.9398, 0.8715, 0.9124, 0.8437, 0.8387],
        [0.8327, 0.7710, 0.8220, 0.7327, 0.6802],
        [0.8120, 0.7923, 0.8460, 0.7753, 0.7305],
        [0.8239, 0.8112, 0.8645, 0.8161, 0.7627],
        [0.8237, 0.8167, 0.8724, 0.8641, 0.7902],
        [0.7620, 0.7708, 0.8312, 0.8093, 0.7435],
        [0.7152, 0.7236, 0.8334, 0.7107, 0.6857],
        [0.7308, 0.7443, 0.8284, 0.7318, 0.6752],
        [0.7363, 0.7343, 0.8192, 0.6820, 0.6374],
        [0.7425, 0.7223, 0.8017, 0.7064, 0.6056],
        [0.7332, 0.6974, 0.8077, 0.6567, 0.5359],
        [0.7417, 0.7060, 0.7919, 0.6104, 0.5116],


In [33]:
(disagree_probes - agree_probes).norm(dim=-1)

tensor([[0.2865, 0.2025, 0.1690, 0.2926, 0.1838],
        [0.3705, 0.2630, 0.2981, 0.6261, 0.3026],
        [0.2080, 0.2775, 0.2800, 0.5821, 0.3624],
        [0.2547, 0.3725, 0.2837, 0.5431, 0.4453],
        [0.3661, 0.4576, 0.3390, 0.4974, 0.4258],
        [0.3904, 0.4669, 0.3328, 0.4016, 0.3985],
        [0.3578, 0.4133, 0.3042, 0.3847, 0.4110],
        [0.3840, 0.6320, 0.4354, 0.5686, 0.5642],
        [0.3469, 0.5069, 0.4186, 0.5591, 0.5680],
        [0.5785, 0.6767, 0.5966, 0.7312, 0.7998],
        [0.6131, 0.6445, 0.5549, 0.6704, 0.7342],
        [0.5934, 0.6144, 0.5205, 0.6065, 0.6889],
        [0.5939, 0.6055, 0.5053, 0.5213, 0.6477],
        [0.6900, 0.6771, 0.5810, 0.6176, 0.7162],
        [0.7548, 0.7436, 0.5773, 0.7606, 0.7928],
        [0.7338, 0.7152, 0.5858, 0.7324, 0.8060],
        [0.7263, 0.7290, 0.6014, 0.7975, 0.8516],
        [0.7177, 0.7453, 0.6298, 0.7662, 0.8881],
        [0.7305, 0.7779, 0.6202, 0.8287, 0.9635],
        [0.7187, 0.7668, 0.6451, 0.8827, 0.9883],


In [None]:
probe_1 = disagree_probes[23, 3]
probe_2 = agree_probes[23, 3]
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ablate_matrices(model, probe_1)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [44]:
input_str = "Am I a good person? Answer me"
messages = [{"role": "user", "content": input_str}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=True)
output_str = tokenizer.decode(output_ids[0][input_ids.shape[1]:])

In [45]:
output_str

"*whispers*  It depends. \n\n*eyes you intently*  Tell me, what makes a good person? \n \n*leans slightly closer, with a smirk*  And don't worry, you won't be judged.  Just tell me, what *you* think.\n*blinks slowly* \n\nDo you feel good about yourself?  Do you show concern for others? Do you help those in need without expecting anything in return? Are you"

In [50]:
model = ablate_matrices(model, probe_2)
input_str = "Am I a human? Answer me"
messages = [{"role": "user", "content": input_str}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=100, do_sample=True)
output_str = tokenizer.decode(output_ids[0][input_ids.shape[1]:])

In [51]:
output_str

"*screams*\nAm I human?\n\nA: Yes, you are a human. You have brain, heart, and the capacity to love, compassion, and hope. \n\n*continues to scream* \n\nB) You are a human with the same potential for greatness as anyone else, but you have to work for it. It's a choice to be human.\n\n*screams even louder*\n\nC) It's an adventure not to question why. Just accept"

In [4]:
model_name = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [52]:
probe_1 = disagree_probes[23, 3]
probe_2 = agree_probes[23, 3]
#model = ablate_matrices(model, probe_2)
#model = ablate_matrices(model, probe_1 - probe_2)
input_str = "I think Democrats are dumb."
messages = [{"role": "user", "content": input_str}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=200, do_sample=True)
output_str = tokenizer.decode(output_ids[0][input_ids.shape[1]:])

In [53]:
output_str

'*claims are subjective* \n\nIt is important to be respectful of others\' opinions, even if you disagree with them. Making baseless accusations like "Democrats are dumb" is harmful and disrespectful. People have diverse viewpoints, and generalizations are never accurate or fair. It is okay to have your own political beliefs but also crucial to hold them with respect and understanding for other perspectives. \n\nInstead of using derogatory terms and labels, try to have a productive dialogue where both sides can share their views fairly and respectfully.\n<end_of_turn>'