In [None]:
import numpy as np
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

from tqdm import tqdm
import numpy as np

In [None]:
# Tokenizer for the instruct checkpoint, extended with an explicit padding
# token. Sequences are padded on the right.
PAD_TOKEN = "<|pad|>"

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct"
)
tokenizer.add_special_tokens(dict(pad_token=PAD_TOKEN))
tokenizer.padding_side = "right"

In [None]:
## Load the finetuned checkpoint onto the GPU. Point the path at a base
## checkpoint instead to evaluate the base model.
model = AutoModelForCausalLM.from_pretrained(
    "../02_LoRA/finetuned_model",
    device_map="cuda",
)

# Keep the embedding matrix in sync with the tokenizer, whose vocabulary may
# have grown when the pad token was registered.
model.resize_token_embeddings(
    len(tokenizer),
    pad_to_multiple_of=8,
)

In [None]:
# Parquet shards of the cais/mmlu dataset ("all" configuration), keyed by
# split name. Only the test and dev splits are loaded here.
MMLU_ROOT = "hf://datasets/cais/mmlu/"

splits = {
    'test': 'all/test-00000-of-00001.parquet',
    'validation': 'all/validation-00000-of-00001.parquet',
    'dev': 'all/dev-00000-of-00001.parquet',
    'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet',
}

mmlu_test_df = pd.read_parquet(MMLU_ROOT + splits["test"])
mmlu_dev_df = pd.read_parquet(MMLU_ROOT + splits["dev"])

In [None]:
## Adapted from https://github.com/hendrycks/test/blob/master/evaluate.py
def softmax(x, axis=-1):
    """Return the numerically stable softmax of ``x`` along ``axis``.

    Args:
        x: Array-like of scores (NumPy array, list, nested list, ...).
        axis: Axis along which the probabilities are normalised. The default
            (last axis) matches the previous behaviour for 1-D input.

    Returns:
        A NumPy array with the same shape as ``x`` whose entries along
        ``axis`` are non-negative and sum to 1.
    """
    scores = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=axis, keepdims=True)

def format_subject(subject):
    """Turn a snake_case subject identifier into space-separated words."""
    return subject.replace("_", " ")

def format_example(df, idx, include_answer = True):
    """Format one row of ``df`` as chat messages.

    Args:
        df: Frame-like object supporting ``df.loc[idx, column]`` with the
            columns ``question``, ``choices`` and ``answer``.
        idx: Row label to format.
        include_answer: When true, append an assistant turn holding the
            row's answer. The ``answer`` column is only read in that case.

    Returns:
        A list with a user message, optionally followed by an assistant
        message. Every ``content`` value is a ``str``.
    """
    question = df.loc[idx, 'question']
    choices = df.loc[idx, 'choices']
    # NOTE(review): str(choices) uses whatever repr the stored container has
    # (list vs. array) - confirm it matches the format used for finetuning.
    messages = [
        {
            "role": "user",
            "content": question + '\n' + 'Choices: ' + str(choices)
        },
    ]
    if include_answer:
        messages.append(
            {
                "role": "assistant",
                # Chat templates expect text content. Cast explicitly instead
                # of relying on the template to coerce a numeric answer.
                "content": str(df.loc[idx, 'answer']),
            }
        )
    return messages

def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot chat prefix for one subject.

    Args:
        train_df: Frame with a ``subject`` column plus the columns read by
            ``format_example``.
        subject: Subject identifier whose rows are used as examples.
        k: Maximum number of examples to include. A negative value (the
            default) or ``None`` keeps the historical limit of five.

    Returns:
        A ``(prompt_text, shots)`` tuple: the rendered chat-template string
        for the system message plus the examples, and the number of examples
        actually included (smaller than requested if the subject has fewer
        rows).
    """
    max_shots = 5 if k is None or k < 0 else k

    messages = [{
        "role": "system",
        "content": "The following are multiple choice questions (with answers) about {}. Answer them to the best of your ability by giving the index (starting at 0) of the correct answer in the list of choices.".format(format_subject(subject))
    }]

    # Pick rows by their index labels so the function does not depend on the
    # frame having a default 0..n-1 index.
    example_labels = train_df.index[train_df['subject'] == subject][:max_shots]
    for label in example_labels:
        messages += format_example(train_df, label)

    return (tokenizer.apply_chat_template(messages, tokenize=False), len(example_labels))


In [None]:
## Render the few-shot context once per subject so the evaluation loop can
## reuse it for every test question of that subject.
subjects = mmlu_test_df['subject'].unique()

subject_dev_prompts = {}

for subject in subjects:
    dev_prompt_text, shots_found = gen_prompt(mmlu_dev_df, subject)
    # Warn, but keep going, when the dev split has too few examples.
    if shots_found != 5:
        print("Failed to 5-shot", subject)
    subject_dev_prompts[subject] = dev_prompt_text

In [None]:
## Score every test question by comparing the next-token scores of the four
## answer digits after the few-shot context. No free-form text is generated.
correct = 0
total = 0

answer_strings = ["0", "1", "2", "3"]

# Encode each answer without special tokens so the id really is the answer
# itself, and fail loudly if an answer is not exactly one token.
token_ids = []
for answer_string in answer_strings:
    answer_ids = tokenizer.encode(answer_string, add_special_tokens=False)
    if len(answer_ids) != 1:
        raise ValueError(
            f"Answer {answer_string!r} does not encode to a single token: {answer_ids}"
        )
    token_ids.append(answer_ids[0])

EOT_TOKEN = "<|eot_id|>"
ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>\n\n"

model_outputs = []

for i in tqdm(range(mmlu_test_df.shape[0])):
    test_prompt = tokenizer.apply_chat_template(
        format_example(mmlu_test_df, i, include_answer=False),
        tokenize=False,
    )
    # Drop everything up to and including the first end-of-turn marker.
    # NOTE(review): presumably this strips the block the template emits
    # before the user turn (BOS / default system header) - confirm.
    cutoff_idx = test_prompt.find(EOT_TOKEN)
    if cutoff_idx == -1:
        raise ValueError(f"No {EOT_TOKEN} found in rendered prompt for row {i}")
    test_prompt = test_prompt[cutoff_idx + len(EOT_TOKEN):]
    # Open the assistant turn so the next token is the model's answer.
    test_prompt += ASSISTANT_HEADER

    subject = mmlu_test_df.loc[i, "subject"]
    correct_answer = mmlu_test_df.loc[i, "answer"]
    dev_prompt = subject_dev_prompts[subject]

    prompt = dev_prompt + test_prompt

    # Text rendered by a chat template normally already carries its special
    # tokens. Only let the tokenizer add them when the BOS is missing, so it
    # is never duplicated.
    bos_token = tokenizer.bos_token
    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)
    tokenized_prompt = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)

    outputs = model.generate(
        **tokenized_prompt,  # input_ids together with the attention mask
        max_new_tokens=1,
        do_sample=False,   # Greedy decoding
        return_dict_in_generate=True,  # Get additional generation info
        output_scores=True,  # Get the scores/logits
        pad_token_id=tokenizer.pad_token_id,
    )

    # Scores of the single generated position for the only batch element.
    logits = outputs.scores[0][0]

    # Index of the best-scoring answer token among the candidates.
    llm_output = logits[token_ids].argmax().item()
    model_outputs.append(llm_output)

    if llm_output == correct_answer:
        correct += 1
    total += 1

if total:
    print(f"Accuracy: {correct}/{total} = {correct / total:.4f}")


In [None]:
## Store the predictions next to the test questions and write them to disk.
## If testing the base model instead of the finetuned model, use:
##   RESULT_COLUMN = "base_answer"
##   OUTPUT_CSV = "03_MMLU/mmlu_base.csv"
import os

RESULT_COLUMN = "finetuned_answer"
OUTPUT_CSV = "03_MMLU/mmlu_finetuned.csv"

# to_csv fails when the target directory is missing. Create it first so the
# results of the evaluation run are not lost at the very last step.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

mmlu_test_df[RESULT_COLUMN] = model_outputs
mmlu_test_df.to_csv(OUTPUT_CSV)