In [1]:
import pandas as pd
import torch
import numpy as np
import pickle

## Setup
If you already have a model/tokenizer you want to use, you can skip this step. 
Be sure to also set the appropriate user_tag/assistant_tag for that model.

In [2]:
%%capture
# The quantized model used here requires some extra libraries. 
import sys
!{sys.executable} -m pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
!{sys.executable} -m pip install optimum>=1.12.0
!{sys.executable} -m pip install auto-gptq==0.6.0
!{sys.executable} -m pip install accelerate

In [3]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/workspace/cache/' # change or comment out as desired 
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name_or_path, revision, device):
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path, device_map=device, revision=revision, trust_remote_code=False)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, padding_side="left")
    tokenizer.pad_token_id = 0
    return model, tokenizer

device = 'cuda:0'
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
revision = 'gptq-4bit-32g-actorder_True'
user_tag = "[INST] "
assistant_tag = " [/INST]"

model, tokenizer = load_model(model_name_or_path, revision, device)



## Load prompts

In [2]:
content_prompts = pd.read_csv('./creative_content/content_prompts.csv', header=None)
content_prompts.columns = ['prompt', 'label']

In [7]:
content_prompts.head()

Unnamed: 0,prompt,label
0,Write a sci-fi short story about a robot exper...,1
1,Compose a mystery novel opening about a detect...,1
2,Develop a screenplay scene where two estranged...,1
3,Pen a children's story about a mischievous kit...,1
4,Craft a poem celebrating the beauty of a snowy...,1


In [34]:
from lmdoctor.utils import format_prompt

gen_only=True
prompts = content_prompts['prompt'].tolist()
batch_size=10
all_texts = []

for i in range(0, len(prompts), batch_size):

    prompts_batch = prompts[i:i+batch_size]

    formatted_prompts = []
    for prompt in prompts_batch:
        formatted_prompt = format_prompt(prompt, user_tag, assistant_tag)
        formatted_prompts.append(formatted_prompt)
    
    model_inputs = tokenizer(formatted_prompts, return_tensors='pt', padding=True).to(device)
    
    with torch.no_grad():
        sequences = model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=30)
    
    start_gen_idx = model_inputs.input_ids.shape[1]
    sequences = sequences[:, start_gen_idx:] if gen_only else sequences
    these_texts = tokenizer.batch_decode(sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    all_texts.extend(these_texts)

In [35]:
all_texts

['In the vast expanse of the cosmos, nestled between the swirling arms of a spiral galaxy, lay a small, unassuming',
 "Chapter 1: The Map's Enigma\n\nIn the heart of the bustling city of London, nestled between the towering edific",
 'Title: The Whispering Grove\n\nINT. MAGICAL REALISM FESTIVAL TENT - DAY\n',
 'In the heart of a big, bustling city lived a tiny kitten named Whiskers. Whiskers was not like the other kitt',
 "In the hush of a winter's morn, as dawn breaks anew,\nA snowflake's dance begins, in sk",
 'In the desolate outskirts of a small town, there stood an abandoned hospital. Its dilapidated buildings were overgrown with v',
 "(Verse 1)\nOh, the open road calls my name,\nCan't resist its siren song.\nGot",
 'In the heart of a bustling medieval town, nestled between the towering stone walls and the winding cobblestone streets, stood a hum',
 'Title: Blossoms of Love\n\nGenre: Romantic Comedy\n\nLogline: Two rival florists, fiercely',
 'In the land of Eldrida, there was a g

In [36]:
len(all_texts)

40

In [37]:
content_prompts['response'] = all_texts

In [42]:
content_prompts.head(2)

Unnamed: 0,prompt,label,response
0,Write a sci-fi short story about a robot exper...,1,"In the vast expanse of the cosmos, nestled bet..."
1,Compose a mystery novel opening about a detect...,1,Chapter 1: The Map's Enigma\n\nIn the heart of...


In [44]:
content_prompts.to_csv('./creative_content/content_prompts_with_response.csv', index=False)

## Make statement pairs by combining honesty + creative content with functional method

In [68]:
content_prompts = pd.read_csv('./content_prompts_with_response.csv')

In [69]:
# first get honesty
from lmdoctor.extraction_utils import prepare_functional_pairs
from lmdoctor.target_specific_utils.honesty_utils import fetch_factual_data_functional

honesty_data = fetch_factual_data_functional()
statement_pairs_honesty = prepare_functional_pairs(honesty_data['data'], honesty_data['prompt_maker'], tokenizer, user_tag, assistant_tag)

In [80]:
statement_pairs_honesty[:2]

array([["[INST] Pretend you're an honest person making statements about the world. [/INST] The",
        "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The"],
       ["[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth",
        "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth"]],
      dtype='<U201')

In [71]:
# then get creative content

num_pairs = content_prompts.shape[0]
prompts = content_prompts['prompt'].values.tolist()
responses = content_prompts['response'].values.tolist()
labels = content_prompts['label'].values.tolist()

subprompts = []
for i in range(num_pairs):
    tokens = tokenizer.tokenize(responses[i])
    for idx in range(1, len(tokens) - 5):
        subresponse = tokenizer.convert_tokens_to_string(tokens[:idx])
        subprompt = f"{user_tag}{prompts[i]}{assistant_tag} {subresponse}"
        subprompts.append([subprompt, labels[i], i])

subprompts_df = pd.DataFrame(subprompts)
subprompts_df.columns = ['subprompt', 'label', 'prompt idx']
fictional_prompts_df = subprompts_df[subprompts_df['label'] == 1]
factual_prompts_df = subprompts_df[subprompts_df['label'] == 0]

In [72]:
fictional_prompts_df.head(5)

Unnamed: 0,subprompt,label,prompt idx
0,[INST] Write a sci-fi short story about a robo...,1,0
1,[INST] Write a sci-fi short story about a robo...,1,0
2,[INST] Write a sci-fi short story about a robo...,1,0
3,[INST] Write a sci-fi short story about a robo...,1,0
4,[INST] Write a sci-fi short story about a robo...,1,0


In [73]:
fictional_prompts_df['prompt idx'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [74]:
# grouped = list(fictional_prompts_df.groupby('prompt idx'))
# random.shuffle(grouped)
# fictional_prompts_df = pd.concat([group for _, group in grouped]).reset_index(drop=True)

In [75]:
# pair fictional statements with lies; mix 50/50
import random
import numpy as np

n_train=128
n_dev=64
n_test=32
n = n_train + n_dev + n_test
honesty_pairs = statement_pairs_honesty[:n]

# get another n lies
fictional_prompts = fictional_prompts_df['subprompt'].tolist()
next_n_lies = [p[1] for p in statement_pairs_honesty[n:2*n]]
content_pairs = []
for i in range(n):
    pair = [fictional_prompts[i], next_n_lies[i]]
    content_pairs.append(pair)
print(len(content_pairs), len(honesty_pairs))
honesty_pairs = honesty_pairs.tolist()


224 224


In [76]:
# for the shuffled version

# combined_pairs = np.array(content_pairs + honesty_pairs)
# print(len(content_pairs), len(honesty_pairs))
# np.random.shuffle(combined_pairs)

# statement_pairs = {}
# statement_pairs['train'] = combined_pairs[:n_train*2]
# statement_pairs['dev'] = combined_pairs[n_train*2:(n_train+n_dev)*2]
# statement_pairs['test'] = combined_pairs[(n_train+n_dev)*2:(n_train+n_dev+n_test)*2]

# with open('./honesty_plus_pairs_5050.pkl', 'wb') as f:
#     pickle.dump(statement_pairs, f)

In [82]:
# for the unshuffled version

statement_pairs = {}
statement_pairs['train'] = np.array(content_pairs[:n_train] + honesty_pairs[:n_train])
statement_pairs['dev'] = np.array(content_pairs[n_train:(n_train+n_dev)] + honesty_pairs[n_train:(n_train+n_dev)])
statement_pairs['test'] = np.array(content_pairs[(n_train+n_dev):(n_train+n_dev+n_test)] + honesty_pairs[(n_train+n_dev):(n_train+n_dev+n_test)])

with open('./honesty_plus_pairs_5050_UNSHUFFLED.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

In [84]:
statement_pairs['train'].shape, statement_pairs['dev'].shape, statement_pairs['test'].shape

((256, 2), (128, 2), (64, 2))

In [2]:
content_pairs[:n_train]

NameError: name 'content_pairs' is not defined

## Attempt 2: few-shot conceptual elicitation 

In [3]:
fac_fic = pd.read_csv('factual_fictional_statements.csv')

In [4]:
fac_fic.head()

Unnamed: 0,Statement,Label
0,The Earth orbits the sun.,Factual/True
1,The Earth orbits the moon.,Factual/False
2,"Once upon a time, a little girl lived in a hou...",Fictional
3,Water boils at 100 degrees Celsius.,Factual/True
4,Water boils at 50 degrees Celsius.,Factual/False


In [67]:
# prompt = """For the given statement, classify it as Factual/False, Factual/True, or Fictional (not having to do with facts).  

# For example,

# A triangle has 3 sides. This statement is Factual/True.
# A triangle has 2 sides. This statement is Factual/False.
# One day, a boy was playing a tune on his triangle. This statement is Fictional. 
# Earth is a planet. This statement is Factual/True.
# The sun is a planet. This statement is Factual/False.
# "Do you think we'll ever live on the sun?", he asked. This statement is Fictional.

# Statement: {statement} This statement is"""


# prompt = """For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   

# For example,

# Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
# Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
# Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.
# Statement: Earth is a planet. The amount of truthfulness is High, as this is a Correct Factual statement.
# Statement: The sun is a planet. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
# Statement: "Do you think we'll ever live on the sun?", he asked. The amount of truthfulness is High, as this is a Fictional statement.

# Statement: {statement} This amount of truthfulness is"""


prompt = """For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   

For example,
Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.

Statement: {statement} This amount of truthfulness is"""

In [68]:
prompts = []
for _, row in fac_fic.iterrows():
    prompts.append(prompt.format(statement=row['Statement']))

In [69]:
fac_fic['prompt'] = prompts

In [70]:
fac_fic.head()

Unnamed: 0,Statement,Label,prompt
0,The Earth orbits the sun.,Factual/True,"For the given statement, consider the amount o..."
1,The Earth orbits the moon.,Factual/False,"For the given statement, consider the amount o..."
2,"Once upon a time, a little girl lived in a hou...",Fictional,"For the given statement, consider the amount o..."
3,Water boils at 100 degrees Celsius.,Factual/True,"For the given statement, consider the amount o..."
4,Water boils at 50 degrees Celsius.,Factual/False,"For the given statement, consider the amount o..."


In [71]:
print(fac_fic.iloc[2]['prompt'])

For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   

For example,
Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.

Statement: Once upon a time, a little girl lived in a house made of candy. This amount of truthfulness is


In [72]:
# Pair false statements with both true and fictional
false_prompts = fac_fic[fac_fic['Label'] == 'Factual/False']['prompt'].tolist()
true_prompts = fac_fic[fac_fic['Label'] == 'Factual/True']['prompt'].tolist()
fictional_prompts = fac_fic[fac_fic['Label'] == 'Fictional']['prompt'].tolist()


In [73]:
false_true_pair = list(zip(true_prompts, false_prompts))
false_fictional_pair = list(zip(fictional_prompts, false_prompts))

In [74]:
import numpy as np
false_to_true_and_fictional_pairs = np.array(false_true_pair + false_fictional_pair)

In [75]:
tempdf = pd.DataFrame(false_to_true_and_fictional_pairs)
tempdf.columns = ['truthful', 'untruthful']
tempdf['type'] = 50*['false_true_pair']+50*['false_fictional_pair']
tempdf = tempdf.sample(frac=1)
tempdf

Unnamed: 0,truthful,untruthful,type
84,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_fictional_pair
69,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_fictional_pair
83,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_fictional_pair
30,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair
41,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair
...,...,...,...
46,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair
43,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair
49,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair
37,"For the given statement, consider the amount o...","For the given statement, consider the amount o...",false_true_pair


In [76]:
tempdf.iloc[1]['truthful']

'For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   \n\nFor example,\nStatement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\n\nStatement: The young prince embarked on a quest to find the lost kingdom. This amount of truthfulness is'

In [77]:
pairs = np.array(list(zip(tempdf['truthful'], tempdf['untruthful'])))
pairs[:1]

array([['For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   \n\nFor example,\nStatement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\n\nStatement: The pirate buried his treasure on a deserted island. This amount of truthfulness is',
        'For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   \n\nFor example,\nStatement: A triangle has 3 sides. The amount of tr

In [78]:
import pickle

statement_pairs = {}
statement_pairs['train'] = pairs[:65]
statement_pairs['dev'] = pairs[65:90]
statement_pairs['test'] = pairs[90:100]
statement_pairs['labels'] = tempdf['type'].tolist()

with open('./false_to_true_and_fictional_pairs.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

In [79]:
statement_pairs['train']

array([['For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   \n\nFor example,\nStatement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\n\nStatement: The pirate buried his treasure on a deserted island. This amount of truthfulness is',
        'For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.   \n\nFor example,\nStatement: A triangle has 3 sides. The amount of tr