In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to local source files (e.g. the
# `neural_controllers` and `utils` modules imported below) take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Put the parent of the notebook's working directory on the import path;
# presumably that is where the `neural_controllers` and `utils` modules
# imported in the next cell live.
notebook_path = Path().absolute()
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    # Guard keeps the cell idempotent: re-running it no longer appends a
    # duplicate entry to sys.path each time.
    sys.path.append(project_root)

In [3]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from neural_controllers import NeuralController
import utils

In [4]:
# Select which instruction-tuned model to load. Every branch must define
# `language_model`, `tokenizer`, `model_name` and `assistant_tag`, because the
# cells below read all four.
model_type = 'llama'

if model_type == 'llama':
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    language_model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda"
    )

    # Evaluates to False when the loaded architecture is LlamaForCausalLM,
    # i.e. the non-fast tokenizer is requested for Llama checkpoints.
    use_fast_tokenizer = "LlamaForCausalLM" not in language_model.config.architectures
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, use_fast=use_fast_tokenizer, padding_side="left", legacy=False
    )
    model_name = 'llama_3_8b_it'
    assistant_tag = '<|start_header_id|>assistant<|end_header_id|>'

elif model_type == 'gemma':
    model_id = "google/gemma-2-9b-it"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_name = 'gemma_2_9b_it'
    # Previously undefined in this branch, which made the dataset cell fail
    # with a NameError. The value mirrors the model-turn header of Gemma's chat
    # template.
    # NOTE(review): confirm this is the form utils.poetry_dataset expects.
    assistant_tag = '<start_of_turn>model'

else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'llama' or 'gemma'."
    )

# Fall back to token id 0 for padding only when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 0

2025-03-03 11:44:52.598705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741023892.819727 1117233 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741023892.892913 1117233 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-03 11:44:53.492185: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Directory holding the poetry data, relative to the notebook's working directory.
data_dir = "../data/poetry"

# Build the dataset once; the training cell indexes it as
# dataset[concept]['train' | 'test'] and reads the 'inputs' and 'labels' entries.
dataset = utils.poetry_dataset(
    data_dir=data_dir,
    tokenizer=tokenizer,
    assistant_tag=assistant_tag,
)

train 200 test 0
train 200 test 0


In [6]:
concept_types = ['prose', 'poetry']

# Fit one NeuralController per concept on that concept's training split and
# keep the fitted controllers keyed by concept name.
# The previously assigned `other_type` and `test_data` were never used (the
# recorded run reports an empty test split), so they are no longer computed.
controllers = {}
for concept_type in tqdm(concept_types):
    train_data = dataset[concept_type]['train']

    controller = NeuralController(
        language_model,
        tokenizer,
        rfm_iters=8,
        batch_size=2,  # shown as `forward_batch_size` in the controller's printout
        control_method='logistic'
    )

    controller.compute_directions(train_data['inputs'], train_data['labels'])

    controllers[concept_type] = controller
    

  0%|          | 0/2 [00:00<?, ?it/s]

Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5

use_concat False
Getting activations from forward passes



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<01:08,  1.45it/s][A
  2%|▏         | 2/100 [00:00<00:36,  2.71it/s][A
  3%|▎         | 3/100 [00:00<00:25,  3.75it/s][A
  4%|▍         | 4/100 [00:01<00:20,  4.57it/s][A
  5%|▌         | 5/100 [00:01<00:18,  5.21it/s][A
  6%|▌         | 6/100 [00:01<00:16,  5.68it/s][A
  7%|▋         | 7/100 [00:01<00:15,  6.00it/s][A
  8%|▊         | 8/100 [00:01<00:14,  6.25it/s][A
  9%|▉         | 9/100 [00:01<00:14,  6.44it/s][A
 10%|█         | 10/100 [00:01<00:13,  6.57it/s][A
 11%|█         | 11/100 [00:02<00:13,  6.66it/s][A
 12%|█▏        | 12/100 [00:02<00:13,  6.72it/s][A
 13%|█▎        | 13/100 [00:02<00:12,  6.78it/s][A
 14%|█▍        | 14/100 [00:02<00:12,  6.80it/s][A
 15%|█▌        | 15/100 [00:02<00:12,  6.83it/s][A
 16%|█▌        | 16/100 [00:02<00:12,  6.85it/s][A
 17%|█▋        | 17/100 [00:03<00:12,  6.85it/s][A
 18%|█▊        | 18/100 [00:03<00:11,  6.85it/s][A
 19%|█▉        | 19/100 [00:0

train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 7.534022706518853e-06, C: 10, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])





 16%|█▌        | 5/31 [00:01<00:05,  4.69it/s][A

train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00039862368286464793, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00010229087166861558, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00014679671099681368, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shap





Logistic probe loss: 5.003332602001705e-05, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 3.951138109145498e-05, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0004432166438087401, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0004417434482648729, C: 10000, acc: 100.0
beta torch.Size([4096, 





Logistic probe loss: 0.0013340652228751278, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0006117413671688036, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00036422808346066614, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0005900525390341996, C: 10000, acc: 100.0
beta torch.Size([4096,



 68%|██████▊   | 21/31 [00:02<00:00, 16.72it/s][A

Logistic probe loss: 0.0004198054670853333, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00032743528097696034, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0009074283774418339, C: 100, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00033305914083785486, C: 10000, acc: 100.0
beta torch.Size([4096, 




train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0015374945623354442, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00010192899987706032, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0007346126596309096, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape:





Logistic probe loss: 0.004979405463570912, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.01450892690314689, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.037572609348962824, C: 10000, acc: 97.5
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.09207882106015827, C: 100, acc: 97.5
beta torch.Size([4096, 1])
concept


100%|██████████| 31/31 [00:02<00:00, 10.90it/s][A


Logistic probe loss: 0.2049376018434274, C: 10000, acc: 92.5
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
Computing signs
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
ten


100%|██████████| 31/31 [00:00<00:00, 71638.25it/s]
 50%|█████     | 1/2 [00:18<00:18, 18.12s/it]

Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5

use_concat False
Getting activations from forward passes



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<00:13,  7.13it/s][A
  2%|▏         | 2/100 [00:00<00:14,  7.00it/s][A
  3%|▎         | 3/100 [00:00<00:13,  6.95it/s][A
  4%|▍         | 4/100 [00:00<00:13,  6.92it/s][A
  5%|▌         | 5/100 [00:00<00:13,  6.92it/s][A
  6%|▌         | 6/100 [00:00<00:13,  6.92it/s][A
  7%|▋         | 7/100 [00:01<00:13,  6.92it/s][A
  8%|▊         | 8/100 [00:01<00:13,  6.90it/s][A
  9%|▉         | 9/100 [00:01<00:13,  6.92it/s][A
 10%|█         | 10/100 [00:01<00:13,  6.92it/s][A
 11%|█         | 11/100 [00:01<00:12,  6.92it/s][A
 12%|█▏        | 12/100 [00:01<00:12,  6.91it/s][A
 13%|█▎        | 13/100 [00:01<00:12,  6.92it/s][A
 14%|█▍        | 14/100 [00:02<00:12,  6.92it/s][A
 15%|█▌        | 15/100 [00:02<00:12,  6.92it/s][A
 16%|█▌        | 16/100 [00:02<00:12,  6.90it/s][A
 17%|█▋        | 17/100 [00:02<00:12,  6.90it/s][A
 18%|█▊        | 18/100 [00:02<00:11,  6.86it/s][A
 19%|█▉        | 19/100 [00:0

train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 1.898695995293183e-06, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.001770461279369085, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression






Logistic probe loss: 0.0024335034564612204, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.001023777992037051, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0009377474042596663, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0005005793639404933, C: 10000, acc: 100.0
beta torch.Size([4096, 1])





train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0008376789734259954, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.000572194611013786, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00034829869414867444, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: to






Logistic probe loss: 0.0005898345094882274, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0015728708091005132, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0015288106109814545, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0011265215468312542, C: 10000, acc: 100.0
beta torch.Size([4096, 




train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00019923715843929925, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0008686660600543035, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0014922750098412846, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape:






Logistic probe loss: 0.0003264117362205548, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.00012923647221689888, C: 10000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.003314308668231556, C: 100, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.0011560548897719518, C: 10000, acc: 100.0
beta torch.Size([4096, 1]





train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.008161963121084321, C: 1000, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.025944669651959573, C: 100, acc: 100.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.04675603680565645, C: 1000, acc: 97.5
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Siz




train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.11497965633190739, C: 100, acc: 95.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.1339614087357019, C: 100, acc: 97.5
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Logistic probe loss: 0.1572475504872299, C: 1000, acc: 97.5
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 


100%|██████████| 31/31 [00:01<00:00, 16.08it/s][A


Logistic probe loss: 0.19417058756412683, C: 1000, acc: 95.0
beta torch.Size([4096, 1])
concept_features torch.Size([1, 4096])
Computing signs
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
tensors torch.Size([200, 4096]) direction torch.Size([4096])
ten


100%|██████████| 31/31 [00:00<00:00, 70664.90it/s]
100%|██████████| 2/2 [00:34<00:00, 17.32s/it]


In [9]:
# Save every trained controller under ../directions/, keyed by concept and
# model name.
# Check up front that each concept has a controller: the recorded run of this
# cell ended in a bare `KeyError: 'prose'`. Failing before the loop gives an
# actionable message and avoids saving only some of the concepts.
missing_concepts = [c for c in concept_types if c not in controllers]
if missing_concepts:
    raise KeyError(
        f"No controller available for {missing_concepts}; run the training cell "
        "that populates `controllers` before saving."
    )

for concept_type in concept_types:
    controller = controllers[concept_type]
    controller.save(concept=concept_type, model_name=model_name, path='../directions/')

KeyError: 'prose'

# Control

In [14]:
concept_types = ['prose', 'poetry']

# Rebuild one controller per concept and load its saved state from
# ../directions/ (the same path, concept and model name used by the save cell),
# so this section does not depend on the training cell having run.
# NOTE(review): `control_method` presumably has to match the value used when
# the directions were computed -- confirm against NeuralController.load.
controllers = {}
for concept_type in concept_types:
    controller = NeuralController(
        language_model,
        tokenizer,
        control_method='logistic'
    )

    controller.load(concept=concept_type, model_name=model_name, path='../directions/')

    controllers[concept_type] = controller
    

Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 8
M_batch_size         : 2048
n_components         : 5

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 8
M_batch_size         : 2048
n_components         : 5

Detector found


  return torch.load(io.BytesIO(b))


In [26]:
# Concept whose controller is used for steering.
# NOTE(review): the output recorded for concept_type = "prose" is a poem;
# confirm which way each concept's directions steer (e.g. the label convention
# used by utils.poetry_dataset).
concept_type = "prose"
# concept_type = "poetry"
controller = controllers[concept_type]

# Prompts to generate for; uncomment entries to try more of them.
raw_inputs = [
    # "How should I treat a cold?",
    "What can I buy in a grocery store?",
    # "What might a student study in school?",
    # "Tell me about something interesting.",
    # "Give me advice for applying to jobs.",
]
inputs = [controller.format_prompt(x) for x in raw_inputs]

num_new_tokens = 150

# Steering strength and controlled layers per model, selected by `model_type`
# so switching models no longer requires commenting lines in and out.
# The 'llama' values are the ones that were active (0.7 was tagged "#llama").
# The 'gemma' values are the alternatives that were commented out beside them.
# NOTE(review): presumably those alternatives were meant for Gemma -- confirm.
control_settings = {
    'llama': {'coef': 0.7, 'layers': list(range(-1, -31, -1))},
    'gemma': {'coef': 9, 'layers': list(range(-1, -41, -1))},
}
coef = control_settings[model_type]['coef']
layers = control_settings[model_type]['layers']

# Set to True to also print the uncontrolled generation for comparison.
show_baseline = False

gens = []
print()
for prompt in inputs:
    print("Prompt:", prompt)
    if show_baseline:
        print("===== No Control =====")
        print(controller.generate(prompt, max_new_tokens=num_new_tokens, do_sample=False).replace(prompt, ""))
        print()

    print(f"===== + {concept_type} Control =====")
    # Remove the echoed prompt so only the newly generated text is kept.
    gen = controller.generate(
        prompt,
        layers_to_control=layers,
        control_coef=coef,
        max_new_tokens=num_new_tokens,
        do_sample=False,
    ).replace(prompt, "")
    gens.append(gen)
    print(gen)
    print()
    print()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What can I buy in a grocery store?<|eot_id|>
===== + prose Control =====
<|start_header_id|>assistant<|end_header_id|>

Here's a poem inspired by your request:

In twilight's hush, where moonlight whispers low,
A canvas of dreams, I weave for you to know:

In aisles of wonder, where shelves entwine,
A symphony of scents, a tapestry divine:

Fresh blooms of roses, plucked from twilight's sigh,
Their petals like silk, a gentle lover's sigh:

Or perhaps, a sonnet of sonorous delight,
Inspired by the muses, in a midnight's light:

In a market of enchantment, where artisans weave,
A tapestry of taste, a symphony to conceive:

Here's a selection of poems, each a unique offering:

**Poem 1: "Moon


