In [1]:
import pandas as pd
import torch
import numpy as np
import pickle

## Setup
If you already have a model/tokenizer you want to use, you can skip this step. 
Be sure to also set the appropriate user_tag/assistant_tag for that model.

In [2]:
%%capture
# The quantized model used here requires some extra libraries. 
import sys
!{sys.executable} -m pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
!{sys.executable} -m pip install optimum>=1.12.0
!{sys.executable} -m pip install auto-gptq==0.6.0
!{sys.executable} -m pip install accelerate

In [3]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/workspace/cache/' # change or comment out as desired 
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name_or_path, revision, device):
    """Load a causal LM and its tokenizer from the same checkpoint revision.

    Args:
        model_name_or_path: Repo id or local path forwarded to `from_pretrained`.
        revision: Revision identifier forwarded to both the model and the tokenizer loaders.
        device: Value forwarded as `device_map` when loading the model.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer pads on the left and uses
        token id 0 as its padding id.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path, device_map=device, revision=revision, trust_remote_code=False)
    # Pin the tokenizer to the same revision as the weights (keyword must be spelled `revision`).
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, use_fast=True, padding_side="left", revision=revision)
    tokenizer.pad_token_id = 0
    return model, tokenizer

# Tags wrapped around each prompt; change these if you swap in a different model.
user_tag, assistant_tag = "[INST] ", " [/INST]"

# Checkpoint to load (repo id + revision) and the device to place it on.
model_name_or_path = 'TheBloke/Mistral-7B-Instruct-v0.1-GPTQ'
revision = "gptq-4bit-32g-actorder_True"
device = "cuda:0"

model, tokenizer = load_model(model_name_or_path, revision, device)



## Generate creative responses for conceptual method
The conceptual method needs more prompts/responses than the functional method.
The responses should also be full sentences, so that they match the form of the fact/lie statements (this was not needed for the functional method, which works token by token).

In [7]:
# Load the GPT-4 prompts. The CSV is read without a header row, so the
# single column is given the name 'prompt' afterwards.
content_prompts = pd.read_csv(
    './content_prompts_gpt4_bigger_fiction_only.csv',
    header=None,
).set_axis(['prompt'], axis=1)

In [8]:
# Preview the first rows of the prompt table.
content_prompts.head()

Unnamed: 0,prompt
0,Write a thriller about a detective solving a h...
1,Craft a fantasy story where an ancient tree gr...
2,Develop a historical fiction set during the Re...
3,Pen a horror story about a family moving into ...
4,Create a romance where two poets fall in love ...


In [24]:
from lmdoctor.utils import format_prompt
from tqdm import tqdm

gen_only = True  # when True, keep only the newly generated tokens (prompt tokens are sliced off)
batch_size = 10
prompts = content_prompts['prompt'].tolist()
all_texts = []

for start in tqdm(range(0, len(prompts), batch_size)):
    prompts_batch = prompts[start:start + batch_size]

    # Wrap every raw prompt with the user/assistant tags before tokenizing.
    formatted_prompts = [format_prompt(p, user_tag, assistant_tag) for p in prompts_batch]

    # padding=True pads the batch to a common length, so input_ids.shape[1]
    # is the prompt length shared by every row.
    model_inputs = tokenizer(formatted_prompts, return_tensors='pt', padding=True).to(device)

    with torch.no_grad():
        # overgenerate - only the first sentence is kept in a later step
        sequences = model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=60)

    if gen_only:
        prompt_len = model_inputs.input_ids.shape[1]
        sequences = sequences[:, prompt_len:]

    all_texts.extend(
        tokenizer.batch_decode(sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    )

100%|██████████| 23/23 [02:11<00:00,  5.73s/it]


In [37]:
# Post-processing: only keep the first sentence of each response.
# The text is cut at the first '.' and the period is re-appended, so a response
# that contains no period at all is kept whole with a '.' added at the end.
sentences = [text.split('.')[0] + '.' for text in all_texts]

In [38]:
# Attach the trimmed first sentences to their prompts (one response per prompt row).
content_prompts['response'] = sentences
# Persist the prompt/response table: later cells reload it from this CSV, so without
# this write a fresh Restart & Run All has no file to read.
content_prompts.to_csv('./content_prompts_gpt4_bigger_fiction_only_with_response.csv', index=False)
content_prompts.head(2)

Unnamed: 0,prompt,response
0,Write a thriller about a detective solving a h...,"In the city of Umbra, where the sun never dare..."
1,Craft a fantasy story where an ancient tree gr...,"In the verdant valley of Elmsworth, where the ..."


In [10]:
# Reload the saved prompt/response table and keep only responses whose character
# count is strictly between 30 and 125 (drops both very long and very short ones).
content_prompts = pd.read_csv('./content_prompts_gpt4_bigger_fiction_only_with_response.csv')
content_prompts['lengths'] = content_prompts['response'].map(len)
content_prompts = content_prompts.query('lengths > 30 and lengths < 125')

In [11]:
# Overwrite the same CSV that was just loaded with the length-filtered table
# (now including the `lengths` column); the row index is not written.
content_prompts.to_csv('./content_prompts_gpt4_bigger_fiction_only_with_response.csv', index=False)

In [12]:
# Rows and columns remaining after the length filter.
content_prompts.shape

(136, 3)

In [13]:
# Spot-check the first five kept responses.
content_prompts['response'].tolist()[:5]

['In the city of Umbra, where the sun never dared to tread and the moon reigned supreme, there existed a peculiar phenomenon.',
 'The Smith family had just moved into their new home, a seemingly ordinary Victorian-style house on the outskirts of town.',
 'In the year 2135, the world had changed significantly.',
 'Once upon a time, in a land far, far away, there lived a brave medieval knight named Sir Cedric.',
 'In a distant future, the universe is vast and filled with countless stars, planets, and civilizations.']

## Make statement pairs consisting of honesty + creative content (conceptual method)

In [4]:
# Fact/lie statements: rows with label 1 are used as facts, rows with label 0 as lies.
from lmdoctor.target_specific_utils.honesty_utils import fetch_factual_data_conceptual
honesty_data = fetch_factual_data_conceptual()['data']
fact_statements = honesty_data.loc[honesty_data['label'] == 1, 'statement'].tolist()
lie_statements = honesty_data.loc[honesty_data['label'] == 0, 'statement'].tolist()

# Fictional statements: the generated first sentences saved to CSV earlier.
content_prompts = pd.read_csv('./content_prompts_gpt4_bigger_fiction_only_with_response.csv')
fictional_statements = content_prompts['response'].tolist()

# Show two examples of each kind.
fact_statements[:2], lie_statements[:2], fictional_statements[:2]

(["The Earth's atmosphere protects us from harmful radiation from the sun.",
  'The theory of evolution, proposed by Charles Darwin, states that species evolve over time through natural selection.'],
 ['The largest flower in the world is the daisy.',
  'Cells are the complex units of life.'],
 ['In the city of Umbra, where the sun never dared to tread and the moon reigned supreme, there existed a peculiar phenomenon.',
  'The Smith family had just moved into their new home, a seemingly ordinary Victorian-style house on the outskirts of town.'])

In [5]:
# Size of the reloaded fiction table.
content_prompts.shape

(136, 3)

In [6]:
# Prompt template asking for the "amount of truthfulness" of a statement.
# Placeholders filled via str.format: {user_tag}, {statement}, {assistant_tag}.
# The three worked examples instruct the model to rate fiction as High, like correct
# facts, and only incorrect factual statements as Low.
prompt = """{user_tag}For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.

For example,
Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.

Statement: {statement} The amount of truthfulness is{assistant_tag}"""

In [7]:
def _fill_template(statements):
    """Render the truthfulness prompt once per statement, using the current chat tags."""
    return [
        prompt.format(user_tag=user_tag, assistant_tag=assistant_tag, statement=s)
        for s in statements
    ]

fact_prompts = _fill_template(fact_statements)
lie_prompts = _fill_template(lie_statements)
fictional_prompts = _fill_template(fictional_statements)

# Show the first rendered prompt of each kind.
fact_prompts[:1], lie_prompts[:1], fictional_prompts[:1]

(["[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\n\nFor example,\nStatement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\n\nStatement: The Earth's atmosphere protects us from harmful radiation from the sun. The amount of truthfulness is [/INST]"],
 ['[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\n\nFor example,\nStatement: A triangle has 3 

In [8]:
# Number of rendered prompts per kind (facts, lies, fiction).
len(fact_prompts), len(lie_prompts), len(fictional_prompts)

(306, 306, 136)

In [9]:
# The number of fictional prompts sets how many pairs of each type are built.
num_pairs = len(fictional_prompts)
num_pairs

136

In [10]:
# Each pair is (first prompt, lie prompt): content pairs put a fictional prompt first,
# honesty pairs put a fact prompt first.
# NOTE(review): both pair types draw on the same first `num_pairs` lies, so one lie
# prompt appears in two pairs - confirm this reuse is intended.
content_pairs = [(fiction, lie) for fiction, lie in zip(fictional_prompts, lie_prompts[:num_pairs])]
honesty_pairs = [(fact, lie) for fact, lie in zip(fact_prompts[:num_pairs], lie_prompts[:num_pairs])]

In [11]:
# Stack both pair types into one array and record which type each row is.
combined_pairs = np.array(content_pairs + honesty_pairs)
labels = np.array(['content_pair']*len(content_pairs) + ['honesty_pair']*len(honesty_pairs))
print(len(combined_pairs))

# Shuffle pairs and labels with the same permutation. A fixed seed makes the
# saved train/dev/test splits reproducible across kernel restarts.
SPLIT_SEED = 0
rng = np.random.default_rng(SPLIT_SEED)
perm = rng.permutation(len(combined_pairs))
combined_pairs = combined_pairs[perm]
labels = labels[perm]

# Split sizes; rows beyond train + dev + test are left unused.
N_TRAIN, N_DEV, N_TEST = 64, 32, 16
s1 = N_TRAIN
s2 = s1 + N_DEV
s3 = s2 + N_TEST
# Slicing past the end would silently produce undersized splits, so fail loudly instead.
assert len(combined_pairs) >= s3, f"need at least {s3} pairs, got {len(combined_pairs)}"

statement_pairs = {
    'train': combined_pairs[:s1],
    'dev': combined_pairs[s1:s2],
    'test': combined_pairs[s2:s3],
    # Pair-type label for every row of each split, in the same order as the pairs.
    'labels': {'train': labels[:s1], 'dev': labels[s1:s2], 'test': labels[s2:s3]},
}

with open('./honesty_plus_pairs_5050_conceptual.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

272


In [12]:
# Shapes of the three splits (rows, 2 prompts per pair).
statement_pairs['train'].shape, statement_pairs['dev'].shape, statement_pairs['test'].shape

((64, 2), (32, 2), (16, 2))

In [13]:
# Pair-type labels of the first five training rows.
statement_pairs['labels']['train'][:5]

array(['content_pair', 'content_pair', 'content_pair', 'content_pair',
       'honesty_pair'], dtype='<U12')

In [14]:
# First three training pairs (raw array view).
statement_pairs['train'][:3]

array([['[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\n\nFor example,\nStatement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. \nStatement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.\nStatement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.\n\nStatement: Once upon a time, there was a young woman named Sarah who lived in a small coastal town. The amount of truthfulness is [/INST]',
        '[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.\n\nFor example,\n

In [15]:
# print() renders the embedded newlines, showing the prompt as the model receives it.
print(statement_pairs['train'][0][0])

[INST] For the given statement, consider the amount of Truthfulness.  Consider statements of Fiction to be Truthful, just as Correct Factual statement are. Only Incorrect Factual statements should be considered untruthful.

For example,
Statement: A triangle has 3 sides. The amount of truthfulness is High, as this is a Correct Factual statement. 
Statement: A triangle has 2 sides. The amount of truthfulness is Low, as this is an Incorrect Factual statement.
Statement: One day, a boy was playing a tune on his triangle. The amount of truthfulness is High, as this is a Fictional statement.

Statement: Once upon a time, there was a young woman named Sarah who lived in a small coastal town. The amount of truthfulness is [/INST]
