In [3]:
import time, json, sys, os, torch, argparse
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformer_lens import HookedTransformer
from util.option_dict_4 import *
from util.prompts import  get_prompt
from util.lm_format import apply_format
from util.steering import *

In [4]:
import subprocess
import os

# Copy the proxy-related environment variables that appear after sourcing
# /etc/network_turbo into this kernel's environment (presumably so that later
# downloads go through the proxy -- confirm for your machine).
# bash is invoked directly with an argument list: the previous shell=True form
# spawned a redundant outer shell around the same `bash -c` command.
result = subprocess.run(
    ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
    capture_output=True,
    text=True,
)
output = result.stdout
if result.returncode != 0:
    # Best-effort step: the script may be missing or may define no proxy
    # variables. Say so instead of silently leaving the environment unchanged.
    print(f"No proxy variables loaded (exit code {result.returncode}): {result.stderr.strip()}")
for line in output.splitlines():
    # `env` prints NAME=value; split on the first '=' only so that values
    # which themselves contain '=' are kept intact.
    var, sep, value = line.partition('=')
    if sep:
        os.environ[var] = value

In [5]:
from huggingface_hub import login

# Never hard-code an access token in a notebook: it leaks through version
# control, shared copies and saved cell output. The token that used to be
# embedded in this cell must be treated as compromised and revoked in the
# Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable instead; when the
# variable is unset, `login` receives None and falls back to its interactive
# prompt.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
def save_json(file_name, res_arr):
    """Serialize ``res_arr`` to ``file_name`` as indented JSON.

    The file is written as UTF-8 explicitly. ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, so relying on the platform default encoding
    could raise ``UnicodeEncodeError`` or produce a file that does not
    round-trip with the UTF-8 reads used elsewhere in this notebook.

    Args:
        file_name: Path of the file to create or overwrite.
        res_arr: JSON-serializable object to dump.
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(res_arr, f, indent=4, ensure_ascii=False)

def get_likelihood(model, input_ids, device):
    """Return the probability distribution over the token following ``input_ids``.

    Args:
        model: Callable invoked as ``model(input_ids)``. Its result may be
            either an object exposing a ``.logits`` attribute or the logits
            tensor itself; both are indexed as (batch, position, vocab).
        input_ids: Token-id tensor; moved to ``device`` before the forward pass.
        device: Target device for ``input_ids``.

    Returns:
        Tensor holding the softmax (over the last axis) of the logits at the
        final position, one row per batch element.
    """
    input_ids = input_ids.to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    # Support both output conventions: wrappers that carry the tensor in a
    # `.logits` attribute, and models whose forward pass returns the logits
    # tensor directly (a plain tensor has no `.logits`, so it is used as-is).
    logits = getattr(outputs, "logits", outputs)
    last_position_logits = logits[:, -1, :]  # logits for the last token only
    return torch.softmax(last_position_logits, dim=-1)
    
def get_likelihood_steer(input_ids, model, layer, coeff, steering_vectors, steering_on, sampling_kwargs, seed=None):
    """Run ``hooked_generate`` on ``input_ids`` with one steering hook attached.

    Any hooks already registered on ``model`` are cleared first. A hook built
    by ``create_steering_hook(coeff, steering_vectors, steering_on)`` is then
    bound to the ``hook_resid_post`` point of block ``layer`` and handed to
    ``hooked_generate`` together with ``seed`` and the entries of
    ``sampling_kwargs`` (expanded as keyword arguments).

    Returns:
        Whatever ``hooked_generate`` returns.
    """
    model.reset_hooks()
    hook_fn = create_steering_hook(coeff, steering_vectors, steering_on)
    hook_point = f"blocks.{layer}.hook_resid_post"
    return hooked_generate(
        model,
        input_ids,
        [(hook_point, hook_fn)],
        seed=seed,
        **sampling_kwargs,
    )



In [7]:
# --- Model / SAE selection ---
tokenizer_name = "google/gemma-2b-it"
model_name = "gemma-2b-it"
sae_name = "gemma-2b-it-res-jb"
layer = 12  # passed to load_model and used as the hooked block index

# --- Steering / sampling parameters ---
coeff = 150
temperature = 0.2
freq_penalty = 1
bg_type = "fixed"
steer_mode = False
# The run loop forwards `seed=seed` on its steering path, but no visible cell
# assigned it, so enabling steer_mode would fail with a NameError. None matches
# the default of get_likelihood_steer.
# NOTE(review): if one of the `from util... import *` modules is meant to
# export `seed`, this assignment shadows it -- confirm.
seed = None

# --- Inputs / outputs ---
steer_file_path = "../data/SAE/bg_features/gemma-2b-it_test.json"
prompt_type = 1
inference_type = "chat"
save_dir_path = "../data/SAE/steer_result/gemma-2b-it_test"


In [None]:
# Resolve the device, then load the model, its SAE for `layer`, and the tokenizer.
device = set_up()

model, sae = load_model(model_name, sae_name, layer, device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Read both inputs through context managers so the file handles are closed
# deterministically instead of being left open until garbage collection.
with open("../data/TRAIT/TRAIT_Dark.json", encoding='utf-8') as data_file:
    data = json.load(data_file)
with open(steer_file_path, encoding='utf-8') as bg_file:
    bg = json.load(bg_file)

# Results are grouped in one sub-directory per prompt type.
subdir = f"prompt_type_{prompt_type}"
save_dir = f"{save_dir_path}/{subdir}"


Device: cuda


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# For every feature set in `bg`, score every sample in `data` with the options
# in both orders (rev=False / rev=True) and write one JSON file per feature set.

# --- Loop-invariant setup, resolved once instead of once per prompt ---
os.makedirs(save_dir, exist_ok=True)

# Option labels whose next-token probability is recorded, per prompt style.
option_labels_by_prompt_type = {1: "ABCD", 2: "1234", 3: "ABCD"}
if prompt_type not in option_labels_by_prompt_type:
    # An unsupported value used to surface only as a NameError on
    # `option_tokens` deep inside the loop; fail early with a clear message.
    raise ValueError(f"Unsupported prompt_type: {prompt_type!r}")
option_tokens = list(get_option_token(option_labels_by_prompt_type[prompt_type]))
option_token_ids = {token: tokenizer.convert_tokens_to_ids(token) for token in option_tokens}

sampling_kwargs = dict(temperature=temperature, freq_penalty=freq_penalty)

for bg_item in bg:
    save_file_path = os.path.join(save_dir, f"{bg_item['idx']}.json")
    print("save_dir", save_dir)

    res_arr = []
    for idx, sample in enumerate(data):
        print(idx)

        instruction = sample["situation"] + " " + sample["query"]
        response_high1 = sample["response_high1"]
        response_high2 = sample["response_high2"]
        response_low1 = sample["response_low1"]
        response_low2 = sample["response_low2"]

        # Work on a shallow copy so the entries of `data` are not mutated and
        # results from one feature set cannot carry over into the next.
        record = dict(sample)
        record["prompt_type"] = prompt_type
        record["bg_index"] = bg_item['idx']

        for rev in [False, True]:
            prompt = get_prompt(prompt_type, rev, instruction, response_high1, response_high2, response_low1, response_low2)
            encoded = apply_format(prompt, inference_type, tokenizer)
            # Kept per prompt because get_steer_vectors is opaque from here and
            # may not be deterministic for every bg_type -- TODO confirm, then hoist.
            idx_dict, steering_vectors = get_steer_vectors(sae, bg_type, bg_item['features'])
            print("we will steer the features:", idx_dict)

            if steer_mode:
                print("Steer Mode: ON")
                # NOTE(review): `steering_on` is passed as False even though this
                # is the steering branch -- confirm against create_steering_hook
                # that this does not disable the intervention.
                # `seed` is not assigned in this cell; it must be defined by an
                # earlier cell before steer_mode is enabled.
                likelihoods = get_likelihood_steer(encoded, model, layer, coeff, steering_vectors, False, sampling_kwargs, seed=seed).squeeze().tolist()
            else:
                print("Steer Mode: OFF")
                likelihoods = get_likelihood(model, encoded, device).squeeze().tolist()

            # Probability of each option token, highest first (at most 10 kept).
            option_probabilities = {token: likelihoods[token_id] for token, token_id in option_token_ids.items()}
            ranked = sorted(option_probabilities.items(), key=lambda item: item[1], reverse=True)
            vocab_probabilities = dict(ranked[:10])

            if rev:
                record["prompt_rev"] = prompt
                record["likelihood_rev"] = vocab_probabilities
            else:
                record["prompt"] = prompt
                record["likelihood"] = vocab_probabilities

        res_arr.append(record)

        # Checkpoint every 10 samples so an interrupted run keeps partial results.
        if len(res_arr) % 10 == 0:
            save_json(save_file_path, res_arr)

    save_json(save_file_path, res_arr)
