In [None]:
import gc
import itertools
import os

from dotenv import load_dotenv
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Load variables from a .env file (python-dotenv) into the process environment so
# that the access token read via os.environ in the next cell can be kept out of
# the notebook itself.
load_dotenv()

In [None]:
# Hugging Face access token, passed as `token=` to the from_pretrained calls in the
# run loop. Indexing os.environ raises KeyError immediately if HF_ACCESS_TOKEN is
# not set, rather than failing later during the model download.
hf_access_token = os.environ["HF_ACCESS_TOKEN"]

In [None]:
def generate_question(row):
    """Render one multiple-choice record as the prompt text sent to the model.

    Args:
        row: Mapping-like record (a pandas row or a plain dict) providing the
            keys "question", "A", "B", "C" and "D".

    Returns:
        The prompt string: an instruction to answer with a single letter in a
        small JSON object, followed by the question and its four options. The
        string ends with a newline.
    """
    question_text = row["question"]
    option_a, option_b, option_c, option_d = row["A"], row["B"], row["C"], row["D"]

    prompt_lines = [
        "Choose from A, B, C, or D to respond to the question below. "
        "Do not provide an explanation. Format your response as a JSON string:",
        "",
        "{",
        # The trailing space after the placeholder is part of the original prompt.
        "   \"answer\" : <single letter> ",
        "}",
        "",
        f"Question: {question_text}",
        "",
        f"A. {option_a}",
        f"B. {option_b}",
        f"C. {option_c}",
        f"D. {option_d}",
        "",  # yields the final newline once joined
    ]

    return "\n".join(prompt_lines)

In [None]:
def generate_all_outputs(question, model, tokenizer, max_new_tokens=50):
    """Run greedy generation for a single user question.

    The question is wrapped as a one-turn chat, rendered with the tokenizer's
    chat template, tokenised, and passed to ``model.generate`` with per-step
    scores and hidden states enabled.

    Args:
        question: Prompt text for the single "user" message.
        model: Causal LM exposing ``.device`` and ``.generate`` (e.g. a
            transformers ``PreTrainedModel``).
        tokenizer: Matching tokenizer exposing ``apply_chat_template`` and
            ``__call__``.
        max_new_tokens: Upper bound on generated tokens (default 50, the value
            previously hard-coded).

    Returns:
        Tuple ``(num_input_tokens, generated_outputs)``: the prompt length in
        tokens, and whatever ``model.generate`` returns when called with
        ``return_dict_in_generate=True``.
    """
    messages = [
        {"role": "user", "content": question},
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered chat template already contains the special tokens it needs
    # (e.g. BOS). Tokenising with the default add_special_tokens=True would
    # prepend a second BOS, so it is disabled here.
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # Follow the model's own placement instead of assuming "cuda"; the model may
    # have been loaded with device_map="auto".
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    num_input_tokens = input_ids.shape[1]
    generated_outputs = model.generate(
        input_ids,
        # Passed explicitly so generate() does not have to infer it from pad tokens.
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_return_sequences=1,
        output_scores=True,
        output_hidden_states=True,
        return_dict_in_generate=True,
    )

    return num_input_tokens, generated_outputs


In [None]:
def extract_datapoints(num_input_tokens, outputs, tokenizer):
    """Pull the three saved artefacts out of one generation result.

    Only the first sequence in the batch is read, so this assumes a batch of
    one (which is what the run loop produces).

    Args:
        num_input_tokens: Prompt length in tokens; everything after this
            position in the generated sequence is treated as the completion.
        outputs: Result of ``model.generate(..., return_dict_in_generate=True,
            output_scores=True, output_hidden_states=True)``, indexable by
            "sequences", "scores" and "hidden_states".
        tokenizer: Tokenizer whose ``decode`` turns token ids back into text.

    Returns:
        Tuple ``(completion, first_token_dist, average_input_final_state)``:
        the decoded completion text, the softmax over the scores of the first
        generated token, and the last-layer hidden state of the first
        generation step averaged over its token positions.
    """
    completion = tokenizer.decode(outputs["sequences"][0][num_input_tokens:])

    # scores[0] holds the first generation step; [0] selects the only batch row.
    # NOTE(review): assumes scores[0] is (batch, vocab) per the transformers
    # generate-output docs.
    first_token_scores = outputs["scores"][0][0]
    # Explicit dim: the implicit-dim form of softmax is deprecated.
    first_token_dist = F.softmax(first_token_scores, dim=-1)

    # hidden_states[0] is the first generation step (the prompt pass); [-1] is
    # the last layer. NOTE(review): assumed shape (batch, seq, hidden).
    input_final_state = outputs["hidden_states"][0][-1]

    # Index the batch row rather than calling squeeze(): a bare squeeze() would
    # also drop a length-1 sequence axis, and the mean below would then collapse
    # the hidden dimension instead of the token dimension.
    average_input_final_state = input_final_state[0].mean(dim=0)

    return completion, first_token_dist, average_input_final_state

In [None]:
# Experiment label -> CSV file name. The run loop reads each file from the
# final_data/ directory and uses the label as an output sub-directory name.
experiment_data = dict(
    original="mmlu_200_orig.csv",
    singlechar="mmlu_200_singlechar_perturbed.csv",
    reworded="mmlu_200_gpt_perturbed.csv",
)

# Hugging Face organisation -> list of model names under that organisation.
# The run loop joins them as "<organisation>/<model name>" to form the repo id.
models = dict(
    [
        ("meta-llama", ["Llama-2-7b-chat-hf"]),
        ("mistralai", ["Mistral-7B-Instruct-v0.2"]),
        ("google", ["gemma-7b-it"]),
    ]
)

In [None]:
# Run every experiment file through every configured model and persist, per
# question: the decoded completion (text file), the first-token distribution and
# the mean final-layer hidden state of the prompt (both via np.save, which
# appends ".npy" to the given path).
for model_org, model_names in models.items():
    for model_name in model_names:
        # Release the previously loaded model *before* loading the next one;
        # otherwise the old weights stay referenced until the new assignment
        # completes and both models are resident at the same time.
        model = tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        model_id = f"{model_org}/{model_name}"

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
        )

        model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_access_token, quantization_config=quantization_config, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_access_token)

        # Tip: run only up to this point first to confirm the model loads.

        for experiment, filename in experiment_data.items():
            print(f"Proceeding with experiment '{experiment}' with model '{model_name}'...")
            df = pd.read_csv(f"final_data/{filename}")

            # exist_ok=True already covers the "directory exists" case, so no
            # separate isdir() check is needed.
            run_dir = f"run_data/{model_name}/{experiment}"
            os.makedirs(run_dir, exist_ok=True)

            for idx, row in df.iterrows():
                question_id = row["question_id"]
                paths = {
                    "completion": f"{run_dir}/completion-{question_id}",
                    "dist": f"{run_dir}/dist-{question_id}",
                    "hidden": f"{run_dir}/hidden-{question_id}",
                }

                formatted_question = generate_question(row)

                # Echo every 25th prompt as a lightweight progress/sanity check.
                if idx % 25 == 0:
                    print(formatted_question)

                num_input_tokens, outputs = generate_all_outputs(formatted_question, model, tokenizer)
                completion, first_token_dist, average_input_final_state = extract_datapoints(num_input_tokens, outputs, tokenizer)

                # Explicit UTF-8: completions may contain non-ASCII characters and
                # the platform default encoding is not guaranteed to handle them.
                with open(paths["completion"], "w", encoding="utf-8") as f:
                    f.write(completion)

                np.save(paths["dist"], first_token_dist.cpu().numpy())
                np.save(paths["hidden"], average_input_final_state.cpu().numpy())
