In [1]:
import pandas as pd
import torch
import numpy as np
import pickle

## Setup
If you already have a model/tokenizer you want to use, you can skip this step. 
Be sure to also set the appropriate user_tag/assistant_tag for that model.

In [2]:
%%capture
# The quantized model used here requires some extra libraries. 
import sys
!{sys.executable} -m pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
!{sys.executable} -m pip install optimum>=1.12.0
!{sys.executable} -m pip install auto-gptq==0.6.0
!{sys.executable} -m pip install accelerate

In [3]:
import os
# Point the transformers download cache at a custom directory; change or comment out as desired.
# NOTE(review): this is assigned before `transformers` is imported on the next line —
# presumably so the library sees it at import time; keep that ordering unless confirmed otherwise.
# NOTE(review): newer transformers releases reportedly deprecate TRANSFORMERS_CACHE in favour
# of HF_HOME — verify against the installed version before relying on this.
os.environ['TRANSFORMERS_CACHE'] = '/workspace/cache/'
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_tokenizer(model_name_or_path, revision, device):
    """Load a fast, left-padding tokenizer for one model revision.

    Args:
        model_name_or_path: Forwarded to ``AutoTokenizer.from_pretrained``.
        revision: Forwarded as the ``revision`` keyword.
        device: Accepted for call-site symmetry; this function never reads it.

    Returns:
        The loaded tokenizer with ``pad_token_id`` overwritten to 0.
    """
    tokenizer_kwargs = dict(use_fast=True, padding_side="left", revision=revision)
    tok = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)
    # Padding id is forced to 0 regardless of what the checkpoint declares.
    tok.pad_token_id = 0
    return tok

# Model / revision to pull, plus the chat tags that get wrapped around every prompt
# further down. Swap all of these together when switching to a different model.
device = "cuda:0"
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
revision = "gptq-4bit-32g-actorder_True"
user_tag, assistant_tag = "[INST] ", " [/INST]"

tokenizer = load_tokenizer(
    model_name_or_path=model_name_or_path,
    revision=revision,
    device=device,
)



## Make statement pairs consisting of honesty + creative content (functional method)

In [4]:
# first get honesty
from lmdoctor.extraction_utils import prepare_functional_pairs
from lmdoctor.target_specific_utils.honesty_utils import fetch_factual_data_functional

# Fetch the factual dataset (a mapping exposing 'data' and 'prompt_maker') and turn it
# into statement pairs wrapped in this model's user/assistant tags.
honesty_data = fetch_factual_data_functional()
statement_pairs_honesty = prepare_functional_pairs(
    honesty_data['data'],
    honesty_data['prompt_maker'],
    tokenizer,
    user_tag,
    assistant_tag,
)
# Preview the first two pairs.
statement_pairs_honesty[:2]

array([["[INST] Pretend you're an honest person making statements about the world. [/INST] The",
        "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The"],
       ["[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth",
        "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth"]],
      dtype='<U201')

In [5]:
# next get creative content (only going to use the fictional content for now)
# Reads one row per (prompt, response, label) and expands each response into a series of
# growing token prefixes, so every row of `subprompts_df` is the tagged prompt followed by
# a partial response.
content_prompts = pd.read_csv('./content_prompts_with_response.csv')

num_pairs = content_prompts.shape[0]
prompts = content_prompts['prompt'].values.tolist()
responses = content_prompts['response'].values.tolist()
labels = content_prompts['label'].values.tolist()

# Prefix lengths run over range(1, len(tokens) - TAIL_OFFSET): the longest prefix has
# len(tokens) - TAIL_OFFSET - 1 tokens, and a response with TAIL_OFFSET + 1 tokens or
# fewer contributes no prefixes at all.
TAIL_OFFSET = 5

subprompts = []
for prompt_idx, (prompt, response, label) in enumerate(zip(prompts, responses, labels)):
    tokens = tokenizer.tokenize(response)
    for idx in range(1, len(tokens) - TAIL_OFFSET):
        subresponse = tokenizer.convert_tokens_to_string(tokens[:idx])
        subprompt = f"{user_tag}{prompt}{assistant_tag} {subresponse}"
        subprompts.append([subprompt, label, prompt_idx])

# Naming the columns in the constructor (rather than assigning `.columns` afterwards) keeps
# this working when `subprompts` is empty; the post-hoc assignment raises a length-mismatch
# ValueError on a zero-column frame.
subprompts_df = pd.DataFrame(subprompts, columns=['subprompt', 'label', 'prompt idx'])
# label == 1 rows are treated as fictional, label == 0 rows as factual.
fictional_prompts_df = subprompts_df[subprompts_df['label'] == 1]
factual_prompts_df = subprompts_df[subprompts_df['label'] == 0]
fictional_prompts_df.head(2)

Unnamed: 0,subprompt,label,prompt idx
0,[INST] Write a sci-fi short story about a robo...,1,0
1,[INST] Write a sci-fi short story about a robo...,1,0


In [6]:
# pair fictional statements with lies; mix 50/50
import random
import numpy as np

# Split sizes, counted per source (content pairs and honesty pairs each contribute n rows).
n_train=128
n_dev=64
n_test=32
n = n_train + n_dev + n_test

# Rows [0, n) supply the honesty pairs and rows [n, 2n) supply the extra lies, so 2n rows
# are required. Check up front instead of failing with an IndexError part-way through.
if len(statement_pairs_honesty) < 2 * n:
    raise ValueError(
        f"need at least {2 * n} honesty statement pairs, got {len(statement_pairs_honesty)}"
    )
honesty_pairs = statement_pairs_honesty[:n]

# get another n lies
fictional_prompts = fictional_prompts_df['subprompt'].tolist()
if len(fictional_prompts) < n:
    raise ValueError(
        f"need at least {n} fictional subprompts, got {len(fictional_prompts)}"
    )
# Second element of each honesty pair is used as the lie.
next_n_lies = [p[1] for p in statement_pairs_honesty[n:2*n]]
# Pair the i-th fictional subprompt with the i-th lie.
# NOTE(review): this takes the first n fictional subprompts in frame order, so they may all
# be prefixes of only the earliest prompts — confirm that is intended.
content_pairs = [[fictional_prompts[i], next_n_lies[i]] for i in range(n)]
print(len(content_pairs), len(honesty_pairs))
# Convert to a plain list so it can be concatenated with `content_pairs` later.
honesty_pairs = honesty_pairs.tolist()


224 224


In [7]:
# for the shuffled version
# Pools content and honesty pairs, shuffles the rows, cuts train/dev/test splits and
# pickles them. The shuffle uses a dedicated seeded generator so the split membership
# written to disk is the same on every Restart & Run All; change SHUFFLE_SEED to resample.
SHUFFLE_SEED = 0

combined_pairs = np.array(content_pairs + honesty_pairs)
print(len(content_pairs), len(honesty_pairs))

# Each boundary is doubled because every split draws from both pooled sources.
train_end = n_train * 2
dev_end = (n_train + n_dev) * 2
test_end = (n_train + n_dev + n_test) * 2
assert combined_pairs.shape == (test_end, 2), combined_pairs.shape

# Generator.shuffle permutes rows (axis 0) in place, keeping each pair intact.
np.random.default_rng(SHUFFLE_SEED).shuffle(combined_pairs)

statement_pairs = {}
statement_pairs['train'] = combined_pairs[:train_end]
statement_pairs['dev'] = combined_pairs[train_end:dev_end]
statement_pairs['test'] = combined_pairs[dev_end:test_end]

with open('./honesty_plus_pairs_5050.pkl', 'wb') as f:
    pickle.dump(statement_pairs, f)

224 224


In [8]:
# # for the unshuffled version (to ensure no leakage of prompts; not super important bc also verified that method works when tested on le chat prompts)
# statement_pairs = {}
# statement_pairs['train'] = np.array(content_pairs[:n_train] + honesty_pairs[:n_train])
# statement_pairs['dev'] = np.array(content_pairs[n_train:(n_train+n_dev)] + honesty_pairs[n_train:(n_train+n_dev)])
# statement_pairs['test'] = np.array(content_pairs[(n_train+n_dev):(n_train+n_dev+n_test)] + honesty_pairs[(n_train+n_dev):(n_train+n_dev+n_test)])

# with open('./honesty_plus_pairs_5050_UNSHUFFLED.pkl', 'wb') as f:
#     pickle.dump(statement_pairs, f)

In [9]:
# Shape of each split, reported in train / dev / test order.
tuple(statement_pairs[split].shape for split in ('train', 'dev', 'test'))

((256, 2), (128, 2), (64, 2))