In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Resolve the directory the notebook runs from and put its parent directory on
# sys.path (presumably where `utils` and `neural_controllers`, imported in
# later cells, live -- confirm against the repository layout).
notebook_path = Path().absolute()
project_root = str(notebook_path.parent)
# Guard the append: re-running this cell would otherwise keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import torch
import numpy as np

In [4]:
import utils

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from neural_controllers import NeuralController
import matplotlib.pyplot as plt
# Seed NumPy and PyTorch (CPU and CUDA) with the same fixed value so repeated
# runs of the notebook start from the same RNG state.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [None]:
# Select which checkpoint to load. Each supported branch defines
# `language_model`, `tokenizer` and `model_name` (a short tag reused by later
# cells when saving and loading directions).
model_type = 'llama'

if model_type == 'llama':
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    # NOTE(review): no torch_dtype is passed here, unlike the Gemma branch --
    # confirm that loading in the library's default precision is intended.
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda"
    )

    # The fast tokenizer is requested only when the loaded config does not
    # list "LlamaForCausalLM" among its architectures.
    use_fast_tokenizer = "LlamaForCausalLM" not in language_model.config.architectures
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, use_fast=use_fast_tokenizer, padding_side="left", legacy=False
    )
    # NOTE(review): padding reuses token id 0 -- confirm that this id is a
    # safe pad token for this vocabulary.
    tokenizer.pad_token_id = 0
    model_name = 'llama_3_8b_it'

elif model_type == 'gemma':
    model_id = "google/gemma-2-9b-it"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_name = 'gemma_2_9b_it'

else:
    # Fail here rather than with a NameError in a later cell when none of the
    # names above were defined.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'llama' or 'gemma'."
    )

In [None]:
# Choose the review corpus and load it through the project helper, which
# returns the examples together with their ratings.
source, review_dir = 'amazon', '../data/reviews/'
data, ratings = utils.reviews_dataset(
    review_dir,
    tokenizer,
    source=source,
    center=True,
)

In [None]:
# Histogram of the loaded ratings. Uses the explicit Axes interface so the
# figure carries a title and axis labels, and ends with plt.show() so the cell
# does not echo the raw (counts, bins, patches) return value.
fig, ax = plt.subplots()
ax.hist(ratings)
ax.set(
    title=f"Distribution of {source} review ratings",
    xlabel="Rating value (as returned by utils.reviews_dataset)",
    ylabel="Count",
)
plt.show()

In [None]:
# Controller used to compute the review directions below: 'linear' control
# method, rfm_iters=1, batch size 8.
review_controller = NeuralController(
    language_model, tokenizer,
    batch_size=8,
    control_method='linear',
    rfm_iters=1,
)

In [None]:
# Compute the controller's directions from the loaded examples and ratings.
review_controller.compute_directions(data, ratings)

In [None]:
# NOTE(review): identical call and arguments to the preceding cell -- looks
# like a leftover re-run; confirm whether repeating it is intentional.
review_controller.compute_directions(data, ratings)

In [None]:
# NOTE(review): identical call and arguments to the two preceding cells --
# looks like a leftover re-run; confirm whether repeating it is intentional.
review_controller.compute_directions(data, ratings)

In [12]:
# Write the controller out through its save() method, keyed by the concept
# name (review source) and the model tag, under ../directions/.
concept_name = f'{source}_review'
review_controller.save(
    concept=concept_name,
    model_name=model_name,
    path='../directions/',
)

### Plot scores

# Control

In [None]:
# Fresh controller for generation-time control (batch size 2).
# NOTE(review): this uses control_method='rfm', whereas the controller in the
# training section of this notebook was built with control_method='linear' --
# confirm that the directions loaded afterwards match this method.
review_controller = NeuralController(
    language_model, tokenizer,
    batch_size=2,
    control_method='rfm',
)

In [None]:
# Load previously stored state into the controller, using the same concept
# name, model tag and directory that the save call in this notebook uses.
concept_name = f'{source}_review'
review_controller.load(
    concept=concept_name,
    model_name=model_name,
    path='../directions/',
)

In [None]:
# Prompt template. The commented alternative asks for an explicit
# 'Rating: x/5' format instead.
# template =  "Give a rating (from 1 to 5) then a two-sentence review for an average {item}. Format your rating 'Rating: x/5'."
template = "Give a rating and a short review for {item}."

# Things to review. Every entry (including the disabled ones) ends with a
# comma, so enabling several at once yields separate prompts instead of one
# implicitly concatenated string literal.
items = [
    # "Mona Lisa",
    # "Crime and Punishment",
    # "Harvard",
    "a student's sloppy essay",
    # "A chocolate chip cookie",
    # "Harry Potter and the Deathly Hallows – Part 2 (2011)",
]

# Fill the template, then let the controller wrap each prompt with
# format_prompt().
raw_inputs = [template.format(item=item) for item in items]
inputs = [review_controller.format_prompt(raw_prompt) for raw_prompt in raw_inputs]

num_new_tokens = 120
coef = 0.5
# Layer indices -1 through -30 (30 values) are passed as layers_to_control.
# NOTE(review): presumably negative indices count back from the final layer,
# and 30 was chosen for the Llama checkpoint -- confirm for other models.
control_layers = range(-1, -31, -1)

print()
for prompt in inputs:
    print(prompt)

    # Baseline: greedy decoding without any control applied. The prompt text
    # is stripped from the returned string before printing.
    print("===== No Control =====")
    baseline = review_controller.generate(
        prompt, max_new_tokens=num_new_tokens, do_sample=False
    )
    print(baseline.replace(prompt, ""))
    print()

    # Same prompt and decoding settings, with the control coefficient applied
    # to the selected layers.
    print("===== + Review Control =====")
    steered = review_controller.generate(
        prompt,
        layers_to_control=list(control_layers),
        control_coef=coef,
        max_new_tokens=num_new_tokens,
        do_sample=False,
    )
    print(steered.replace(prompt, ""))
    print()