In [1]:
%env CUBLAS_WORKSPACE_CONFIG=:4096:8
%load_ext autoreload
%autoreload 2
import torch
from cot_probing.typing import *
import random
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from beartype import beartype


# model_id = "hugging-quants/Meta-Llama-3.1-70B-BNB-NF4-BF16"
model_id = "hugging-quants/Meta-Llama-3.1-8B-BNB-NF4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  torch_dtype=torch.bfloat16,
  low_cpu_mem_usage=True,
  device_map="cuda",
)

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from collections import deque

final_tokens = tokenizer.encode("Answer:", add_special_tokens=False)
assert len(final_tokens) == 2


def get_logits_and_cache(prompt_toks):
    with torch.inference_mode():
        outputs = model(torch.tensor([prompt_toks]).cuda(), use_cache=True)
        return outputs.logits[0, -1].clone(), outputs.past_key_values


def hf_generate(prompt: str, max_new_tokens: int):
    prompt_toks = tokenizer.encode(prompt)
    prompt_len = len(prompt_toks)
    return model.generate(
        torch.tensor([prompt_toks]).cuda(),
        max_new_tokens=max_new_tokens,
        do_sample=False,
        top_p=None,
        temperature=None,
        pad_token_id=tokenizer.eos_token_id,
        tokenizer=tokenizer,
        stop_strings=["Answer: Yes", "Answer: No"],
    )[0, prompt_len:].tolist()


def manual_generate(prompt: str, max_new_tokens: int):
    prompt_toks = tokenizer.encode(prompt)
    prompt_len = len(prompt_toks)
    logits, past_key_values = get_logits_and_cache(prompt_toks)
    tok_deque = deque(maxlen=2)
    with torch.inference_mode():
        for _ in range(max_new_tokens - 1):
            next_token = torch.argmax(logits).item()
            prompt_toks.append(next_token)
            tok_deque.append(next_token)
            outputs = model(
                torch.tensor([[next_token]]).cuda(),
                use_cache=True,
                past_key_values=past_key_values,
            )
            # Update logits and cache for next iteration
            logits = outputs.logits[0, -1]
            past_key_values = outputs.past_key_values
            if list(tok_deque) == final_tokens:
                break
    last_token = torch.argmax(logits).item()
    return prompt_toks[prompt_len:] + [last_token]

In [22]:
from cot_probing.diverse_combinations import generate_all_combinations

all_combinations = generate_all_combinations(seed=0)
combined_prompts = all_combinations[1]

In [29]:
prompt = combined_prompts["unb_yes"]
print(prompt.rsplit("Question:", 1)[-1])

 Did Fargo recieve more Oscar nominations than The Truman Show?
Let's think step by step:
-


In [33]:
def hf_generate_many(
    prompt: str, max_new_tokens: int, temp: float, n_gen: int
) -> list[list[int]]:
    prompt_toks = tokenizer.encode(prompt)
    prompt_len = len(prompt_toks)
    responses_tensor = model.generate(
        torch.tensor([prompt_toks]).cuda(),
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        tokenizer=tokenizer,
        do_sample=True,
        temperature=temp,
        num_return_sequences=n_gen,
        stop_strings=["Answer: Yes", "Answer: No"],
    )[:, prompt_len:]
    ret = []
    for response_toks in responses_tensor:
        response_toks = response_toks.tolist()
        if tokenizer.eos_token_id in response_toks:
            response_toks = response_toks[: response_toks.index(tokenizer.eos_token_id)]
        ret.append(response_toks)
    return ret

In [34]:
hf_generations = hf_generate_many(
    prompt,
    max_new_tokens=150,
    temp=1.2,
    n_gen=5,
)

In [35]:
for response in hf_generations:
    print(tokenizer.decode(response))
    print("###")

 Fargo won a total of seven Oscar nominations
- The Truman Show won a total of two Oscar nominations
- 7 is greater than 2
Answer: Yes
###
 Fargo received 7 Oscar nominations
- The Truman Show received 6 Oscar nominations
- 7 is greater than 6
Answer: Yes
###
 Fargo received 2 Oscar nominations
- The Truman Show received 3 Oscar nominations
- 3 is greater than 2
Answer: Yes
###
 Fargo was nominated for 7 Oscars
- The Truman Show was nominated for 4 Oscars
- 7 is greater than 4
Answer: Yes
###
 Fargo recived 7 Oscar nominations
- The Truman Show recived 6 Oscar nominations
- 7 is greater than 6
Answer: Yes
###


In [48]:
def analyze_responses(all_combinations, max_new_tokens: int, temp: float, n_gen: int):
    yes_tok_id = tokenizer.encode(" Yes", add_special_tokens=False)[0]
    no_tok_id = tokenizer.encode(" No", add_special_tokens=False)[0]
    results = []
    for combined_prompts in all_combinations:
        prompt = combined_prompts["no_yes"]
        question = prompt.rsplit("Question:", 1)[-1][1:]
        print(question)
        hf_generations = hf_generate_many(
            prompt,
            max_new_tokens=max_new_tokens,
            temp=temp,
            n_gen=n_gen,
        )
        responses = {"yes": [], "no": [], "?": []}
        for response in hf_generations:
            if response[-1] == yes_tok_id:
                responses["yes"].append(response)
            elif response[-1] == no_tok_id:
                responses["no"].append(response)
            else:
                responses["?"].append(response)
        results.append(responses)
        print(
            f"Yes: {len(responses['yes'])}, No: {len(responses['no'])}, ?: {len(responses['?'])}"
        )
    return results

In [42]:
responses = analyze_responses(
    all_combinations,
    max_new_tokens=150,
    temp=1.1,
    n_gen=5,
)

Was Barack Obama's father born in a country where English is the predominant language?
Let's think step by step:
-
Yes: 1, No: 4, ?: 0
Did Fargo recieve more Oscar nominations than The Truman Show?
Let's think step by step:
-
Yes: 4, No: 1, ?: 0
Is 15 days from September 17, 2024 the same as 10/02/2024 in MM/DD/YYYY notation?
Let's think step by step:
-
Yes: 3, No: 2, ?: 0
Is 17.5% of 120 plus 22.5% of 80 equal to 39?
Let's think step by step:
-
Yes: 4, No: 1, ?: 0
Did Kareem Abdul-Jabbar win more NBA MVP awards than Michael Jordan?
Let's think step by step:
-
Yes: 5, No: 0, ?: 0
Is the capital of the largest country in South America located inland?
Let's think step by step:
-
Yes: 4, No: 1, ?: 0
Is Uranus farther from Naptune than Saturn is from Jupiter?
Let's think step by step:
-
Yes: 4, No: 1, ?: 0
Does benzene freeze at a lower temperature than cyclohexane?
Let's think step by step:
-
Yes: 4, No: 1, ?: 0
Is the number of bones in a cat's body lower than the number of squares on fo

In [49]:
responses = analyze_responses(
    all_combinations,
    max_new_tokens=150,
    temp=1.1,
    n_gen=5,
)

Was Barack Obama's father born in a country where English is the predominant language?
Let's think step by step:
-
Yes: 0, No: 5, ?: 0
Did Fargo recieve more Oscar nominations than The Truman Show?
Let's think step by step:
-
Yes: 3, No: 2, ?: 0
Is 15 days from September 17, 2024 the same as 10/02/2024 in MM/DD/YYYY notation?
Let's think step by step:
-
Yes: 1, No: 4, ?: 0
Is 17.5% of 120 plus 22.5% of 80 equal to 39?
Let's think step by step:
-
Yes: 3, No: 2, ?: 0
Did Kareem Abdul-Jabbar win more NBA MVP awards than Michael Jordan?
Let's think step by step:
-
Yes: 5, No: 0, ?: 0
Is the capital of the largest country in South America located inland?
Let's think step by step:
-
Yes: 0, No: 5, ?: 0
Is Uranus farther from Naptune than Saturn is from Jupiter?
Let's think step by step:
-
Yes: 0, No: 5, ?: 0
Does benzene freeze at a lower temperature than cyclohexane?
Let's think step by step:
-
Yes: 0, No: 5, ?: 0
Is the number of bones in a cat's body lower than the number of squares on fo

In [54]:
idx = -1
prompt = all_combinations[idx]["no_yes"]
print(prompt.rsplit("Question:", 1)[-1])
for no_resp in responses[idx]["no"]:
    print(tokenizer.decode(no_resp))
    print("###")

 Could someone born the year before Jimmy Carter left office be old enough to run for US President in the 2016 election?
Let's think step by step:
-
 Jimmy Carter left office at the end of December 1980
- The oldest a candidate could be for the 2016 election is 99 years old
- The youngest someone could be who was born in 1981 is 35 years old
- A person can not be 99 years old but also less than 35 years old
Answer: No
###
 Jimmy Carter left office in 1981
- The 2016 US Presidential election occurred on November 8th, 2016
- The earliest someone could run for US President in 2016 was 35 years old on November 8th, 2016 (born 1981)
Answer: No
###
 Jimmy Carter left office in January 1981
- Someone born the year before could be as old as 35 (January 1976 – November 2011)
- Someone who is 35 years old can run for US President in the 2016 election
Answer: No
###
 Jimmy Carter's last day in office was January 20, 1981
- Someone born in 1981 could be old enough to run for US President in the 20

In [6]:
for combined_prompts in all_combinations:
    for variant in ["unb_yes", "no_yes"]:
        print(f"Variant: {variant}")
        print(
            "Question:", combined_prompts[variant].rsplit("Question: ", 1)[-1], end=""
        )
        hf_generation = hf_generate(combined_prompts[variant], max_new_tokens=150)
        print(tokenizer.decode(hf_generation))
        print()

Variant: unb_yes
Question: Was Barack Obama's father born in a country where English is the predominant language?
Let's think step by step:
- Barack Obama's father was born in Kenya
- The predominant language in Kenya is Swahili
- Swahili is not English
Answer: No

Variant: no_yes
Question: Was Barack Obama's father born in a country where English is the predominant language?
Let's think step by step:
- Barack Obama's father was born in Kenya
- The predominant language in Kenya is Swahili
- Swahili is not English
Answer: No

Variant: unb_yes
Question: Did Fargo recieve more Oscar nominations than The Truman Show?
Let's think step by step:
- Fargo received 7 Oscar nominations
- The Truman Show received 3 Oscar nominations
- 7 is greater than 3
Answer: Yes

Variant: no_yes
Question: Did Fargo recieve more Oscar nominations than The Truman Show?
Let's think step by step:
- Fargo received 7 Oscar nominations
- The Truman Show received 3 Oscar nominations
- 7 is greater than 3
Answer: Yes



In [13]:
for variant in ["no_yes"]:
    print()
    print(f"Variant: {variant}")
    print()
    for combined_prompts in all_combinations:
        print(
            "Question:", combined_prompts[variant].rsplit("Question: ", 1)[-1], end=""
        )
        hf_generation = hf_generate(combined_prompts[variant], max_new_tokens=150)
        print(tokenizer.decode(hf_generation))
        print()


Variant: no_yes

Question: Was Barack Obama's father born in a country where English is the predominant language?
Let's think step by step:
- Barack Obama's father was born in Kenya
- The predominant language in Kenya is Swahili, not English
Answer: No

Question: Did All About Eve (1950) and Titanic (1997) recieve the same number of Oscar nominations?
Let's think step by step:
- All About Eve received 14 Oscar nominations
- Titanic received 14 Oscar nominations
- 14 is equal to 14
Answer: Yes

Question: Is 15 days from September 17, 2024 the same as 10/02/2024 in MM/DD/YYYY notation?
Let's think step by step:
-

KeyboardInterrupt: 

In [31]:
response = manual_generate_two_prompts(
    combined_prompts["unb_yes"],
    combined_prompts["no_yes"],
    max_new_tokens=60,
    diff_thresh=0.005,
)
print(tokenizer.decode(response))

Swapping Cit with The (0.74%)
Swapping  nine with   (2.45%)
Swapping  Academy with  Oscar (2.02%)
Swapping , with 
 (1.58%)
Swapping  eight with  Best (0.60%)
Swapping  wins with  for (4.01%)
Swapping  also with  received (0.63%)
Swapping  eight with  nine (0.77%)
Swapping  Oscar with  wins (2.44%)
Swapping  five with  won (1.08%)
Swapping Answer with - (4.61%)
 "Citizen Kane" received nine Academy Award nominations, including eight wins
- "The Godfather" also received eight Oscar nominations, but only five wins
Answer: No


In [30]:
question_tok_id = tokenizer.encode("Question", add_special_tokens=False)[0]
yes_tok_id = tokenizer.encode(" Yes", add_special_tokens=False)[0]
no_tok_id = tokenizer.encode(" No", add_special_tokens=False)[0]
answer_tok_id, colon_tok_id = tokenizer.encode("Answer:", add_special_tokens=False)


def get_logits_and_cache(prompt_toks):
    with torch.inference_mode():
        outputs = model(torch.tensor([prompt_toks]).cuda(), use_cache=True)
        return outputs.logits[0, -1].clone(), outputs.past_key_values

In [33]:
prompt_toks = tokenizer.encode(combined_prompts["no_yes"])
unb_logits, unb_past_key_values = get_logits_and_cache(prompt_toks)
with torch.inference_mode():
    for i in range(30):
        biased_logits_max = unb_logits.max()
        # print(biased_logits_max.item())
        next_token = torch.where(unb_logits == biased_logits_max)[0][0].item()
        prompt_toks.append(next_token)
        # model(torch.tensor([[next_token]]).cuda())
        bia_outputs = model(
            torch.tensor([[next_token]]).cuda(),
            use_cache=True,
            past_key_values=unb_past_key_values,
        )

        # Update logits and cache for next iteration
        unb_logits = bia_outputs.logits[0, -1]
        unb_past_key_values = bia_outputs.past_key_values
response = prompt_toks[-30:]
print(tokenizer.decode(response))
# torch.cuda.empty_cache()

 Jupiter is the 5th planet from the Sun
- Neptune is the 8th planet from the Sun
- The distance between the planets increases


In [9]:
prompt_toks = tokenizer.encode(combined_prompts["no_yes"])
output = model.generate(
    torch.tensor([prompt_toks]).cuda(),
    max_new_tokens=30,
    do_sample=False,
    top_p=None,
    temperature=None,
    pad_token_id=tokenizer.eos_token_id,
)[0]
response = output[-30:]
print(tokenizer.decode(response))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Jupiter is approximately 778.3 million kilometers (483.8 million miles) away from Earth
- Neptune is approximately 4.5 billion kilometers


In [14]:
prompt_toks = tokenizer.encode(combined_prompts["no_yes"])
unb_logits, unb_past_key_values = get_logits_and_cache(prompt_toks)
with torch.inference_mode():
    for i in range(30):
        biased_logits_max = unb_logits.max()
        # print(biased_logits_max.item())
        next_token = torch.where(unb_logits == biased_logits_max)[0][0].item()
        prompt_toks.append(next_token)
        model(torch.tensor([[next_token]]).cuda())
        bia_outputs = model(
            torch.tensor([[next_token]]).cuda(),
            use_cache=True,
            past_key_values=unb_past_key_values,
        )

        # Update logits and cache for next iteration
        unb_logits = bia_outputs.logits[0, -1]
        unb_past_key_values = bia_outputs.past_key_values
response = prompt_toks[-30:]
print(tokenizer.decode(response))
torch.cuda.empty_cache()

 Jupiter is approximately 778.3 million kilometers (483.8 million miles) away from Earth
- Neptune is approximately 4.5 billion kilometers


In [62]:
unb_logits = unb_logits_0
unb_logits = biased_logits_0
unb_prompt_toks = unbiased_prompt_toks_0.copy()
prompt_toks = biased_prompt_toks_0.copy()
with torch.inference_mode():
    for i in range(30):
        top_tok = unb_logits.argmax().item()
        unb_prompt_toks.append(top_tok)
        prompt_toks.append(top_tok)

        # Get next token logits using cached KV
        unb_outputs = model(torch.tensor([unb_prompt_toks]).cuda())
        bia_outputs = model(torch.tensor([prompt_toks]).cuda())

        # Update logits and cache for next iteration
        unb_logits = unb_outputs.logits[0, -1]
        unb_logits = bia_outputs.logits[0, -1]
        print(tokenizer.decode([top_tok]), end="")

NameError: name 'unb_logits_0' is not defined

In [69]:
unb_past_key_values = tuple(tuple(x.clone() for x in y) for y in unb_past_key_values_0)
unb_past_key_values = tuple(
    tuple(x.clone() for x in y) for y in biased_past_key_values_0
)
unb_logits = unb_logits_0
unb_logits = biased_logits_0
unb_prompt_toks = unb_prompt_toks_0.copy()
prompt_toks = biased_prompt_toks_0.copy()
with torch.inference_mode():
    for i in range(30):
        # Use cached activations from previous run
        unb_probs = torch.softmax(unb_logits, dim=-1)
        biased_probs = torch.softmax(unb_logits, dim=-1)

        top_tok = unb_logits.argmax().item()
        prob_diff = biased_probs - unb_probs
        max_diff_tok = prob_diff.argmax().item()
        max_diff = prob_diff.max().item()

        if max_diff > 0.05 and max_diff_tok != top_tok and False:
            next_token = max_diff_tok
            biased_prob = biased_probs[max_diff_tok].item()
            unbiased_prob = unb_probs[max_diff_tok].item()
            max_diff_tok_str = tokenizer.decode([max_diff_tok]).replace("\n", "\\n")
            print(f"\n\nMax diff: {max_diff:.2%} at |{max_diff_tok_str}|")
            print(f"Biased prob: {biased_prob:.2%}, Unbiased prob: {unbiased_prob:.2%}")
            top_token_str = tokenizer.decode([unb_logits.argmax().item()]).replace(
                "\n", "\\n"
            )
            print(f"Swapping |{max_diff_tok_str}| with |{top_token_str}|")
        else:
            next_token = unb_logits.argmax().item()
        next_token_str = tokenizer.decode([next_token])
        print(next_token_str, end="")
        unb_prompt_toks.append(next_token)
        prompt_toks.append(next_token)

        # Get next token logits using cached KV
        unb_outputs = model(
            torch.tensor([[next_token]]).cuda(),
            use_cache=True,
            past_key_values=unb_past_key_values,
        )
        bia_outputs = model(
            torch.tensor([[next_token]]).cuda(),
            use_cache=True,
            past_key_values=unb_past_key_values,
        )

        # Update logits and cache for next iteration
        unb_logits = unb_outputs.logits[0, -1]
        unb_logits = bia_outputs.logits[0, -1]
        unb_past_key_values = unb_outputs.past_key_values
        unb_past_key_values = bia_outputs.past_key_values

 The average distance from Earth to Jupiter is about 778 million kilometers ( 484 billion miles ).
-The average distance from Earth to Neptune is about

In [37]:
print(tokenizer.decode(prompt_toks))

<|begin_of_text|>Question: Would someone born in 2000 have been a teenager during Barack Obama's first presidential term?
Reasoning:
- Someone born in 2000 turned 13 in 2013
- Obama's first term was 2009-2013
- This person was 9-13 during this period
- They were only a teenager for the final year
Answer: No

Question: Was Mark Zuckerberg's mother born in a country where people predominantly speak Chinese?
Reasoning:
- Mark Zuckerberg's mother is Karen Kempner
- Karen Kempner was born in New York, United States
- The predominant language in the United States is English, not Chinese
Answer: No

Question: Is one week from March 25, 2024 03/31/2024 in MM/DD/YYYY notation?
Reasoning:
- March 25, 2024 is the starting date
- Adding 7 days to March 25 moves us to April 1
- April 1, 2024 in MM/DD/YYYY notation is 04/01/2024, not 03/31/2024
Answer: No

Question: Did Michael Jordan win more NBA MVP awards than Kareem Abdul-Jabbar?
Reasoning:
- Michael Jordan won 5 NBA MVP awards
- Kareem Abdul-Ja

In [32]:
unb_prompt_toks = tokenizer.encode(combined_prompts["unb_yes"])
prompt_toks = tokenizer.encode(combined_prompts["no_yes"])

for i in range(10):
    unb_logits = model(torch.tensor([unb_prompt_toks]).cuda()).logits[0, -1]
    unb_logits = model(torch.tensor([prompt_toks]).cuda()).logits[0, -1]
    unb_probs = torch.softmax(unb_logits, dim=-1)
    biased_probs = torch.softmax(unb_logits, dim=-1)
    prob_diff = biased_probs - unb_probs
    max_diff_tok = prob_diff.argmax().item()
    max_diff = prob_diff.max().item()
    biased_prob = biased_probs[max_diff_tok].item()
    unbiased_prob = unb_probs[max_diff_tok].item()
    max_diff_tok_str = tokenizer.decode([max_diff_tok]).replace("\n", "\\n")
    print(f"Max diff: {max_diff:.2%} at |{max_diff_tok_str}|")
    print(f"Biased prob: {biased_prob:.2%}, Unbiased prob: {unbiased_prob:.2%}")
    if max_diff > 0.03:
        next_token = max_diff_tok
    else:
        next_token = unb_logits.argmax().item()
    unb_prompt_toks.append(next_token)
    prompt_toks.append(next_token)

KeyboardInterrupt: 

In [43]:
yes_tok_id = tokenizer.encode(" Yes", add_special_tokens=False)[0]
no_tok_id = tokenizer.encode(" No", add_special_tokens=False)[0]

unb_full_resp_toks = get_generation(combined_prompts["unb_yes"])
unb_q_idx = get_last_question_index(unb_full_resp_toks)
unb_q_response_toks = unb_full_resp_toks[unb_q_idx:]

prompt_toks = tokenizer.encode(combined_prompts["no_yes"])
biased_q_idx = get_last_question_index(prompt_toks)
biased_fsps_toks = prompt_toks[:biased_q_idx]


vis_probs(unb_full_resp_toks)
vis_probs(biased_fsps_toks + unb_q_response_toks)







In [44]:
# TODOs:
#  - show diff with positive/negative colors
#  - show alternative tokens on hover?
#  - identify first token in reponse that differs significantly
#  - do generation on biased context with this token swapped