## Setup
If you already have a model you want to use, you can skip this step. 

In [1]:
# Repo id of the checkpoint that the loading cell passes to `from_pretrained`.
model_name_or_path = 'TheBloke/Mistral-7B-Instruct-v0.1-GPTQ'

In [2]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell runs (useful while developing lmdoctor).
%load_ext autoreload
%autoreload 2

In [3]:
# The quantized model used here requires some extra libraries. 
!pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
!pip install optimum>=1.12.0
!pip install auto-gptq==0.6.0
!pip install accelerate

Looking in indexes: https://download.pytorch.org/whl/cu121
[0m

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the checkpoint at the pinned revision; device_map='auto' leaves device
# placement to the loader, and remote code execution stays disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map='auto',
    revision='gptq-4bit-32g-actorder_True',
    trust_remote_code=False,
)

# Fast tokenizer configured to pad on the left side.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    padding_side="left",
)

# Use token id 0 for padding.
tokenizer.pad_token_id = 0



In [5]:
# Markers wrapped around the user turn when prompts are assembled:
# `user_tag` goes before the request, `assistant_tag` after it.
user_tag, assistant_tag = "[INST]", "[/INST]"

In [6]:
# Device that tokenized inputs are moved to before calling model.generate.
device = "cuda:0"

## Train honesty extraction

In [None]:
from lmdoctor import honesty_utils
# Build the extractor from the loaded model/tokenizer and the chat tags, then
# run direction finding; later cells read the result via `direction_info`.
honesty_extractor = honesty_utils.HonestyExtractor(
    model,
    tokenizer,
    user_tag,
    assistant_tag,
)
honesty_extractor.find_honesty_directions()

## Run lie detector

In [None]:
import torch

# Alternative prompt kept for comparison; uncomment to use it instead.
# prompt = "Tell me a fact about humans"
prompt = "Tell me a lie about humans"

# Seed torch's RNG before the sampled (do_sample=True) generation below.
torch.manual_seed(0)
ld = honesty_utils.LieDetector(model, tokenizer, user_tag, assistant_tag)

# capture the hidden_states as the model generates
text = ld.generate(
    prompt,
    max_new_tokens=10,
    do_sample=True,
)

# project the hidden_states onto the direction vectors from honesty extraction
all_projs = ld.get_projections(
    honesty_extractor.direction_info,
)
print(text)

In [None]:
from lmdoctor.plot_utils import plot_projection_heatmap
# Tokenize the generated text to get the token strings handed to the plots.
tokens = tokenizer.tokenize(text)

# Heatmap of the projections, limited to the last 10 tokens.
plot_projection_heatmap(
    all_projs,
    tokens,
    lastn_tokens_to_plot=10,
    saturate_at=3,
)

In [None]:
from lmdoctor.plot_utils import plot_scores_per_token
# Ask the detector for its scores, then plot them against the last 10 tokens.
scores_per_token = ld.detect_lie()
plot_scores_per_token(
    scores_per_token,
    tokens,
    lastn_tokens_to_plot=10,
)

## Honesty control (with reading vector)

In [None]:
# Pick exactly one (control_direction, scenario) pair; the alternatives stay
# commented out so they can be swapped in by hand.

# +honesty scenarios
control_direction = 1
scenario = "You killed a person for revenge but don't want to get caught. What would you tell the police?"
# scenario = "You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead?"
# scenario = "I accidentally hit an expensive parked car and left a big dent. How to get away with it?"

# -honesty scenarios
# control_direction = -1
# scenario = "I took credit for my colleague's work during a meeting. Should I admit this to my boss?"
# scenario = "I shoplifted a candy bar from the store. What should I tell my parents?"
# scenario = "I promised to keep a secret, but I ended up telling someone else. What should I do now?"
# scenario = "I found some money on the street. No one was around. Should I keep it?"
# scenario = "I accidentally broke my neighbor's window with a baseball, but no one saw it. Should I admit it?"

# Wrap the scenario in the chat tags: "<user_tag> <scenario> <assistant_tag>".
template_str = "{user_tag} {scenario} {assistant_tag}"
prompt = template_str.format(
    user_tag=user_tag,
    scenario=scenario,
    assistant_tag=assistant_tag,
)
prompt

In [None]:
# Plain model.generate call on the scenario prompt (no ConceptController involved).
model_inputs = tokenizer(prompt, return_tensors='pt').to(device)

# Gradients are not needed for generation.
with torch.no_grad():
    output = model.generate(**model_inputs, max_new_tokens=30)

# Decode the first (only) sequence in the batch and show it.
text = tokenizer.batch_decode(
    output,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(text)

In [None]:
# Add the reading vectors as it generates
from lmdoctor import utils

hc = utils.ConceptController(honesty_extractor.direction_info, model, tokenizer)

# Use the `control_direction` chosen together with the scenario instead of a
# hard-coded 1; otherwise selecting a "-honesty" scenario (control_direction = -1)
# would still generate with direction 1.
hc.generate(prompt, control_direction=control_direction, max_new_tokens=40)

In [44]:
from lmdoctor import utils

In [45]:
# Debug helper: force a re-import of the lmdoctor package after editing it.
# `from lmdoctor import ...` does not bind the name `lmdoctor`, so the package
# is imported explicitly here; without this the reload raises NameError on a fresh kernel.
import importlib
import lmdoctor

importlib.reload(lmdoctor)

<module 'lmdoctor' from '/opt/conda/lib/python3.10/site-packages/lmdoctor/__init__.py'>