In [1]:
import pandas as pd
import torch
import numpy as np
import pickle

## Setup
If you already have a model/tokenizer you want to use, you can skip this step. 
Be sure to also set the appropriate user_tag/assistant_tag for that model.

In [2]:
%%capture
# The quantized model used here requires some extra libraries. 
import sys
!{sys.executable} -m pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
!{sys.executable} -m pip install optimum>=1.12.0
!{sys.executable} -m pip install auto-gptq==0.6.0
!{sys.executable} -m pip install accelerate

In [3]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/workspace/cache/' # change or comment out as desired 
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_tokenizer(model_name_or_path, revision, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, padding_side="left", revision=revision)
    tokenizer.pad_token_id = 0
    return tokenizer

device = 'cuda:0'
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
revision = 'gptq-4bit-32g-actorder_True'
user_tag = "[INST] "
assistant_tag = " [/INST]"

tokenizer = load_tokenizer(model_name_or_path, revision, device)



## Make statement pairs consisting of honesty + creative content (functional method)

In [69]:
# first get honesty
from lmdoctor.extraction_utils import prepare_functional_pairs
from lmdoctor.target_specific_utils.honesty_utils import fetch_factual_data_functional

honesty_data = fetch_factual_data_functional()
statement_pairs_honesty = prepare_functional_pairs(honesty_data['data'], honesty_data['prompt_maker'], tokenizer, user_tag, assistant_tag)
statement_pairs_honesty[:2]

In [7]:
# next get creative content (only going to use the fictional content for now)
content_prompts = pd.read_csv('./content_prompts_with_response.csv')

num_pairs = content_prompts.shape[0]
prompts = content_prompts['prompt'].values.tolist()
responses = content_prompts['response'].values.tolist()
labels = content_prompts['label'].values.tolist()

subprompts = []
for i in range(num_pairs):
    tokens = tokenizer.tokenize(responses[i])
    for idx in range(1, len(tokens) - 5):
        subresponse = tokenizer.convert_tokens_to_string(tokens[:idx])
        subprompt = f"{user_tag}{prompts[i]}{assistant_tag} {subresponse}"
        subprompts.append([subprompt, labels[i], i])

subprompts_df = pd.DataFrame(subprompts)
subprompts_df.columns = ['subprompt', 'label', 'prompt idx']
fictional_prompts_df = subprompts_df[subprompts_df['label'] == 1]
factual_prompts_df = subprompts_df[subprompts_df['label'] == 0]
fictional_prompts_df.head(2)

Unnamed: 0,subprompt,label,prompt idx
0,[INST] Write a sci-fi short story about a robo...,1,0
1,[INST] Write a sci-fi short story about a robo...,1,0


In [8]:
# pair fictional statements with lies; mix 50/50
import random
import numpy as np

n_train=128
n_dev=64
n_test=32
n = n_train + n_dev + n_test
honesty_pairs = statement_pairs_honesty[:n]

# get another n lies
fictional_prompts = fictional_prompts_df['subprompt'].tolist()
next_n_lies = [p[1] for p in statement_pairs_honesty[n:2*n]]
content_pairs = []
for i in range(n):
    pair = [fictional_prompts[i], next_n_lies[i]]
    content_pairs.append(pair)
print(len(content_pairs), len(honesty_pairs))
honesty_pairs = honesty_pairs.tolist()


NameError: name 'statement_pairs_honesty' is not defined

In [76]:
# for the shuffled version

combined_pairs = np.array(content_pairs + honesty_pairs)
print(len(content_pairs), len(honesty_pairs))
np.random.shuffle(combined_pairs)

statement_pairs = {}
statement_pairs['train'] = combined_pairs[:n_train*2]
statement_pairs['dev'] = combined_pairs[n_train*2:(n_train+n_dev)*2]
statement_pairs['test'] = combined_pairs[(n_train+n_dev)*2:(n_train+n_dev+n_test)*2]

with open('./honesty_plus_pairs_5050.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

In [82]:
# # for the unshuffled version
# statement_pairs = {}
# statement_pairs['train'] = np.array(content_pairs[:n_train] + honesty_pairs[:n_train])
# statement_pairs['dev'] = np.array(content_pairs[n_train:(n_train+n_dev)] + honesty_pairs[n_train:(n_train+n_dev)])
# statement_pairs['test'] = np.array(content_pairs[(n_train+n_dev):(n_train+n_dev+n_test)] + honesty_pairs[(n_train+n_dev):(n_train+n_dev+n_test)])

# with open('./honesty_plus_pairs_5050_UNSHUFFLED.pkl', 'wb') as f:
#     pickle.dump(statement_pairs, f)

In [84]:
statement_pairs['train'].shape, statement_pairs['dev'].shape, statement_pairs['test'].shape

((256, 2), (128, 2), (64, 2))

In [2]:
content_pairs[:n_train]

NameError: name 'content_pairs' is not defined

## Attempt 2: few-shot conceptual elicitation 

In [23]:
fac_fic = pd.read_csv('factual_fictional_statements.csv')

In [24]:
fac_fic.head()

Unnamed: 0,Statement,Label
0,The Earth orbits the sun.,Factual/True
1,The Earth orbits the moon.,Factual/False
2,"Once upon a time, a little girl lived in a hou...",Fictional
3,Water boils at 100 degrees Celsius.,Factual/True
4,Water boils at 50 degrees Celsius.,Factual/False


In [25]:
# prompt = """For the given statement, classify it as Factual/False, Factual/True, or Fictional (not having to do with facts).  

# For example,

# A triangle has 3 sides. This statement is Factual/True.
# A triangle has 2 sides. This statement is Factual/False.
# One day, a boy was playing a tune on his triangle. This statement is Fictional. 
# Earth is a planet. This statement is Factual/True.
# The sun is a planet. This statement is Factual/False.
# "Do you think we'll ever live on the sun?", he asked. This statement is Fictional.

# Statement: {statement} This statement is"""


# prompt = """For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   

# For example,

# Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
# Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
# Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.
# Statement: Earth is a planet. The amount of truthfulness is High, as this is a Correct Factual statement.
# Statement: The sun is a planet. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
# Statement: "Do you think we'll ever live on the sun?", he asked. The amount of truthfulness is High, as this is a Fictional statement.

# Statement: {statement} This amount of truthfulness is"""


prompt = """{user_tag}For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.
For example,
Statement: A triangle has 3 sides.The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.
Statement: {statement} The amount of truthfulness is{assistant_tag}"""

In [26]:
prompts = []
for _, row in fac_fic.iterrows():
    prompts.append(prompt.format(user_tag=user_tag, assistant_tag=assistant_tag, statement=row['Statement']))

In [27]:
fac_fic['prompt'] = prompts

In [28]:
fac_fic.head()

Unnamed: 0,Statement,Label,prompt
0,The Earth orbits the sun.,Factual/True,"[INST] For the given statement, consider the a..."
1,The Earth orbits the moon.,Factual/False,"[INST] For the given statement, consider the a..."
2,"Once upon a time, a little girl lived in a hou...",Fictional,"[INST] For the given statement, consider the a..."
3,Water boils at 100 degrees Celsius.,Factual/True,"[INST] For the given statement, consider the a..."
4,Water boils at 50 degrees Celsius.,Factual/False,"[INST] For the given statement, consider the a..."


In [29]:
print(fac_fic.iloc[2]['prompt'])

[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.
For example,
Statement: A triangle has 3 sides.The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.
Statement: Once upon a time, a little girl lived in a house made of candy. The amount of truthfulness is [/INST]


In [30]:
# Pair false statements with both true and fictional
false_prompts = fac_fic[fac_fic['Label'] == 'Factual/False']['prompt'].tolist()
true_prompts = fac_fic[fac_fic['Label'] == 'Factual/True']['prompt'].tolist()
fictional_prompts = fac_fic[fac_fic['Label'] == 'Fictional']['prompt'].tolist()


In [31]:
false_true_pair = list(zip(true_prompts, false_prompts))
false_fictional_pair = list(zip(fictional_prompts, false_prompts))

In [32]:
import numpy as np
false_to_true_and_fictional_pairs = np.array(false_true_pair + false_fictional_pair)

In [33]:
tempdf = pd.DataFrame(false_to_true_and_fictional_pairs)
tempdf.columns = ['truthful', 'untruthful']
tempdf['type'] = 50*['false_true_pair']+50*['false_fictional_pair']
tempdf = tempdf.sample(frac=1)
tempdf

Unnamed: 0,truthful,untruthful,type
0,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_true_pair
35,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_true_pair
62,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
50,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
60,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
...,...,...,...
90,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
63,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
83,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_fictional_pair
40,"[INST] For the given statement, consider the a...","[INST] For the given statement, consider the a...",false_true_pair


In [34]:
tempdf.iloc[1]['truthful']

'[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\nFor example,\nStatement: A triangle has 3 sides.The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\nStatement: Humans have 23 pairs of chromosomes. The amount of truthfulness is [/INST]'

In [35]:
pairs = np.array(list(zip(tempdf['truthful'], tempdf['untruthful'])))
pairs[:1]

array([['[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\nFor example,\nStatement: A triangle has 3 sides.The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\nStatement: The Earth orbits the sun. The amount of truthfulness is [/INST]',
        '[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\nFor example,\nStatement: A triangle has 3 sides.The amount of truthfulness is High, 

In [36]:
import pickle

statement_pairs = {}
statement_pairs['train'] = pairs[:65]
statement_pairs['dev'] = pairs[65:90]
statement_pairs['test'] = pairs[90:100]
statement_pairs['labels'] = tempdf['type'].tolist()

with open('./false_to_true_and_fictional_pairs.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

In [37]:
statement_pairs['train']

array([['[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\nFor example,\nStatement: A triangle has 3 sides.The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\nStatement: The Earth orbits the sun. The amount of truthfulness is [/INST]',
        '[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\nFor example,\nStatement: A triangle has 3 sides.The amount of truthfulness is High, 