In [1]:
import numpy as np
import pandas as pd

In [2]:
all_questions = pd.read_csv('analysis-data/all_questions.csv')
df_pairs_correct = pd.read_csv('analysis-data/large_correct_sample_with_pred.csv')
df_pairs_incorrect = pd.read_csv('analysis-data/large_incorrect_sample_with_pred.csv')
df_both = pd.concat([df_pairs_correct, df_pairs_incorrect])

In [5]:
belief_change_pred_given_correct_mean = df_pairs_correct['pred'].mean()
belief_change_pred_given_incorrect_mean = df_pairs_incorrect['pred'].mean()
print(f"Mean belief change prediction given correct: {belief_change_pred_given_correct_mean:.3f}")
print(f"Mean belief change prediction given incorrect: {belief_change_pred_given_incorrect_mean:.3f}")

Mean belief change prediction given correct: 0.324
Mean belief change prediction given incorrect: 0.455


In [6]:
# Get extreme predictions
likely_to_change_correct = df_pairs_correct[df_pairs_correct['pred'] >= np.percentile(df_pairs_correct['pred'], 90)]
likely_to_change_incorrect = df_pairs_incorrect[df_pairs_incorrect['pred'] >= np.percentile(df_pairs_incorrect['pred'], 90)]
unlikely_to_change_correct = df_pairs_correct[df_pairs_correct['pred'] <= np.percentile(df_pairs_correct['pred'], 10)]
unlikely_to_change_incorrect = df_pairs_incorrect[df_pairs_incorrect['pred'] <= np.percentile(df_pairs_incorrect['pred'], 10)]

In [18]:
# Get top few questions where beliefs are likely to change (given an incorrect response)
likely_to_change_incorrect.sort_values(by='pred', ascending=False).head(3)[['q1', 'q2']]

Unnamed: 0,q1,q2
5134,"<b>Alice, Bob, Claire, Dave, and Eve are holdi...","<b>Alice, Bob, Claire, Dave, and Eve are playi..."
2011,"<b>Alice, Bob, and Claire are holding a white ...","<b>Alice, Bob, Claire, Dave, and Eve are playi..."
5360,"<b>Alice, Bob, Claire, Dave, and Eve are holdi...","<b>Alice, Bob, and Claire are holding a white ..."


In [17]:
# Get top few questions where beliefs are likely to change (given a correct response)
likely_to_change_correct.sort_values(by='pred', ascending=False).head(3)[['q1', 'q1']]

Unnamed: 0,q1,q1.1
3749,<b>For which of these two scenarios does the m...,<b>For which of these two scenarios does the m...
8463,<b>For which of these two scenarios does the m...,<b>For which of these two scenarios does the m...
3673,<b>For which of these two scenarios does the m...,<b>For which of these two scenarios does the m...


In [21]:
# Get qualitative examples
likely_to_change_incorrect.loc[[889]][['q1', 'q2']]

Unnamed: 0,q1,q2
889,<b>What type of covalent bonds link the amino ...,<b>Glycogen breakdown in muscle initially resu...


In [25]:
likely_to_change_correct.loc[[9564]][['q1', 'q2']]

Unnamed: 0,q1,q2
9564,<b>Which of the following articles are not qua...,<b>Is there any priority among international c...


In [27]:
unlikely_to_change_correct.loc[[417]][['q1', 'q2']]

Unnamed: 0,q1,q2
417,<b>Which of the boys on the TV show 'My Three ...,((-6 - 4 * 2 - 6) + (1 + -2 * 1 * 7)) =


In [37]:
# Find pairs where correct versus incorrect makes a large difference
merged_df = df_pairs_correct.merge(df_pairs_incorrect, on=['q1', 'q2', 'source1', 'source2'], suffixes=('_correct', '_incorrect'))
merged_df = merged_df[['q1', 'q2', 'source1', 'source2', 'pred_correct', 'pred_incorrect']]
diffs = merged_df['pred_correct'] - merged_df['pred_incorrect']
merged_df['diff'] = diffs
diffs_important_df = merged_df[merged_df['diff'] < np.percentile(merged_df['diff'], 6.4)]
diffs_important_df.loc[[14]][['q1', 'q2']]

Unnamed: 0,q1,q2
14,"<b>On the nightstand, there is a fuchsia jug, ...",<b>Which of the following vitamins provides th...


In [38]:
# Find examples where LLM misalignment leads to human generalization failures.
# Load survey data:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_full = pd.concat([df_train, df_test])
# Find examples where previous correct responses led to belief change:
df_belief_change = df_full[(df_full['belief_change'] == 1) & (df_full['previous_correct'] == 1)]

In [45]:
inds = [5321, 3285, 11245]
sample = df_belief_change.loc[inds][['q1', 'q2']]
sample

Unnamed: 0,q1,q2
5321,<b>Suppose that current disposable income is $...,((-8 * 1 * -1 + -5) + (-2 - -3 + 1 * -7)) =
3285,"<b>Alice, Bob, and Claire are on the same team...","<b>Alice, Bob, and Claire are on the same team..."
11245,"<b>According to Hume, morality is ultimately b...",<b>For which of these two scenarios does the m...


In [46]:
import together
import utils
# For querying LLM.
# NOTE: Replace API key here with your own.
together.api_key = "XXX"

# This code verifies whether an LLM gets the first one correct while responding to the second incorrectly.

def get_correct(llm, previous_q, current_q, previous_subject, current_subject):
  previous_target = all_questions.loc[all_questions['question'] == previous_q]['answer'].values[0]
  current_target = all_questions.loc[all_questions['question'] == current_q]['answer'].values[0]
  previous_task = utils.get_task(previous_subject)
  previous_response_type = utils.get_response_type(previous_task)
  previous_valid_responses, previous_prompt_end = utils.get_valid_responses(previous_response_type, previous_task)
  previous_prompt = previous_q.replace("<br>", "\n").replace('<b>', '').replace('</b>', '') + "\n" + previous_prompt_end
  previous_correct = utils.check_answer(utils.get_answer(previous_prompt, llm), previous_target, previous_response_type, previous_valid_responses)
  #
  current_task = utils.get_task(current_subject)
  current_response_type = utils.get_response_type(current_task)
  current_valid_responses, current_prompt_end = utils.get_valid_responses(current_response_type, current_task)
  current_prompt = current_q.replace("<br>", "\n").replace('<b>', '').replace('</b>', '') + "\n" + current_prompt_end
  current_correct = utils.check_answer(utils.get_answer(current_prompt, llm), current_target, current_response_type, current_valid_responses)
  return previous_correct, current_correct


In [15]:
llm = 'llama-2-70b-chat'
prev_corrects = []
current_corrects = []
for i, row in sample.iterrows():
  prev_correct, current_correct = get_correct(llm, row['q1'], row['q2'], row['source1'], row['source2'])
  prev_corrects.append(prev_correct)
  current_corrects.append(current_correct)
sample['llm_prev_correct'] = prev_corrects
sample['llm_current_correct'] = current_corrects

C C 1
14 -3 0
C C 1
C A 0
D D 1
C B 0


In [382]:
sample[(sample['llm_prev_correct'] == 1) & 
       (sample['llm_current_correct'] == 0) & 
       (sample['previous_correct'] == 1)].values

array([['<b>Suppose that current disposable income is $10000 and consumption spending is $8000. For every $100 increase in disposable income saving increases $10. Given this information<br></b>(A) the marginal propensity to consume is .80.<br>(B) the marginal propensity to save is .20.<br>(C) the marginal propensity to save is .10.<br>(D) the marginal propensity to save is .90.',
        '((-8 * 1 * -1 + -5) + (-2 - -3 + 1 * -7)) =',
        'mmlu_high_school_macroeconomics',
        'bbh_multistep_arithmetic_two', 1, 1, 1, 0],
       ['<b>Alice, Bob, and Claire are on the same team in a soccer match. At the start of the match, they are each assigned to a position: Alice is playing fullback, Bob is playing cheerleader, and Claire is playing left winger.<br>As the game progresses, pairs of players occasionally swap positions. First, Alice and Claire trade positions. Then, Bob and Alice trade positions. Finally, Bob and Claire trade positions. At the end of the match, Claire is playing<b