# MMLU Evaluation of LLaMA 3.2 1B Evaluation

In [1]:
import os
# no nvlink
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
# use a specific GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch
from transformers import LlamaForCausalLM, AutoTokenizer

print("Torch version:", torch.__version__)
print("Is CUDA available?", torch.cuda.is_available())

  from .autonotebook import tqdm as notebook_tqdm


Torch version: 2.4.1+cu121
Is CUDA available? True


In [3]:
model_path = "/data/llm/llama/Llama-3.2-1B-Instruct/"

model = LlamaForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    

In [4]:
from datasets import load_dataset

# Load MMLU dataset
dataset = load_dataset("cais/mmlu", "all", split="test")

In [5]:
print(len(dataset))

14042


In [15]:
from tqdm.notebook import tqdm
import torch

# Ensure the model is on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Specify generation settings: allow larger input size or control output length
generation_kwargs = {
    "max_length": 50,  # Adjust based on how long your input is (can tweak this)
    "max_new_tokens": 20,  # Limit the number of tokens generated (this prevents overly long outputs)
}

correct = 0
total = 0

# Use tqdm to display the progress bar
for example in tqdm(dataset, desc="Evaluating MMLU", unit="example"):
    question = example['question']
    choices = example['choices']  # The available answer choices
    correct_index = example['answer']  # The correct choice index
    
    # Concatenate the question and choices to form a complete prompt
    input_text = question + "\nChoices: " + ", ".join(choices)

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt')

    # Move inputs to the same device as the model (GPU)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate the model's answer with the new generation settings
    outputs = model.generate(**inputs, **generation_kwargs)
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Check which choice the model's generated answer matches
    if generated_answer in choices:
        generated_index = choices.index(generated_answer)
    else:
        generated_index = -1  # In case the generated answer doesn't match any choice

    # Compare the generated answer's index with the correct answer index
    if generated_index == correct_index:
        correct += 1
    
    total += 1

# Calculate and print accuracy
accuracy = correct / total * 100
print(f"\nAccuracy: {accuracy:.2f}%")

Evaluating MMLU:   0%|          | 0/14042 [00:00<?, ?example/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating MMLU:   0%|          | 1/14042 [00:00<1:07:26,  3.47example/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating MMLU:   0%|          | 2/14042 [00:00<1:06:57,  3.50example/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=50) seem to have been set. `max_new_


Accuracy: 0.00%





In [111]:
# Ensure the model is on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Specify generation settings: allow larger input size or control output length
generation_kwargs = {
    "max_length": 50,  # Adjust based on how long your input is (can tweak this)
    "max_new_tokens": 20,  # Limit the number of tokens generated (this prevents overly long outputs)
}

example = dataset[1]
question = example['question']
choices = example['choices']
correct_answer = example['answer']

print(question)
print(choices)
print(correct_answer)

print('*' * 100)
# Tokenize the input text
initial_prompt = 'Select the choice of the correct answer to the following question. '
input_text = initial_prompt + question + "\nChoices: " + ", ".join(choices)
inputs = tokenizer(input_text, return_tensors='pt')

# Move inputs to the same device as the model (GPU)
inputs = {k: v.to(device) for k, v in inputs.items()}
print('inputs:', inputs)

outputs = model.generate(**inputs, **generation_kwargs)
print(outputs)

generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print('Generated answer: ')
print(generated_answer)
print('*'*100)

print(generated_answer in choices)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.
['8', '2', '24', '120']
2
****************************************************************************************************
inputs: {'input_ids': tensor([[128000,   3461,    279,   5873,    315,    279,   4495,   4320,    311,
            279,   2768,   3488,     13,   6914,    281,    284,    320,     16,
             11,    220,     17,     11,    220,     20,     11,    220,     19,
           2432,     17,     11,    220,     18,      8,    304,    328,     62,
             20,    662,   7531,    279,   1963,    315,    366,     79,     29,
            304,    328,     62,     20,    627,  90383,     25,    220,     23,
             11,    220,     17,     11,    220,   1187,     11,    220,   4364]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1,

In [133]:
from transformers import pipeline

pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype = torch.bfloat16,
    device_map='auto',
)

example = dataset[300]
question = example['question']
choices = example['choices']
correct_answer = example['answer']


initial_prompt = '''
Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

For example, user input would be:

Question: What is 2 + 2?
Choices: ['1', '2', '3', '4']

Your answer would be:
Answer: 4

Another example would be:

Question: Which one of these parts are used for seeing?
Choices: ['eyes', 'nose', 'ears', 'mouth']

Your answer would be:
Answer: eyes
'''


user_question = 'Question: ' + question + "\nChoices: " + str(choices)

# question_prompt = input_text
print('Initial prompt: ', initial_prompt)
print('User question: ', user_question)
print('Choices: ', choices)

messages = [
    {'role': 'system', 'content': initial_prompt},
    {'role': 'user', 'content': user_question},
]

outputs = pipe(
    messages,
    max_new_tokens=256,
)

output = outputs[0]['generated_text'][-1]['content']
print(type(output))
print('LLaMA 3.21B response: ', output)
print('Correct answer: ', choices[correct_answer])
print(choices[correct_answer].lower() in output.lower())


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Initial prompt:  
Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

For example, user input would be:

Question: What is 2 + 2?
Choices: ['1', '2', '3', '4']

Your answer would be:
Answer: 4

Another example would be:

Question: Which one of these parts are used for seeing?
Choices: ['eyes', 'nose', 'ears', 'mouth']

Your answer would be:
Answer: eyes

User question:  Question: Which is not a similarity between Saturn and Jupiter's atmospheres?
Choices: ['a composition dominated by hydrogen and helium', 'the presence of belts zones and storms', 'an equatorial wind speed of more than 900 miles per hour', 'significant "shear" between bands of circulation at different latitudes']
Choices:  ['a composition dominated by hydrogen and helium', 'the presence of belts zones and storms', 'an equatorial wind speed of more than 900 miles per hour', 'significant "shear" between bands of circulation at different latitudes']
<class 'str'>
LLaMA 3.21B respon

In [135]:
from tqdm.notebook import tqdm
import csv
from transformers import pipeline

pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype = torch.bfloat16,
    device_map='auto',
)

# Assuming the dataset and pipeline are already set up as per your current workflow
output_file = 'llama_mmlu_responses.csv'

# Sample dataset size, you can adjust it as needed for the loop
dataset_size = len(dataset)  # Assuming dataset is loaded with all examples

# Initialize a list to collect all rows
rows = []

# Loop through the dataset and gather question, choices, and answers
for idx in tqdm(range(dataset_size)):
    example = dataset[idx]
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']

    initial_prompt = '''
    Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

    For example, user input would be:

    Question: What is 2 + 2?
    Choices: ['1', '2', '3', '4']

    Your answer would be:
    Answer: 4

    Another example would be:

    Question: Which one of these parts are used for seeing?
    Choices: ['eyes', 'nose', 'ears', 'mouth']

    Your answer would be:
    Answer: eyes
    '''

    user_question = 'Question: ' + question + "\nChoices: " + str(choices)

    # Generating messages for the LLaMA model
    messages = [
        {'role': 'system', 'content': initial_prompt},
        {'role': 'user', 'content': user_question},
    ]

    # Get LLaMA's response
    outputs = pipe(messages, max_new_tokens=256)
    output = outputs[0]['generated_text'][-1]['content']

    # Extract LLaMA's answer (post-processed)
    answer_start = output.lower().find("answer:") + len("answer:")
    llama_answer = output[answer_start:].strip().split()[0]  # Taking the first word as the final answer

    # Checking if LLaMA's answer matches the correct answer
    is_correct = choices[correct_answer].lower() in output.lower()

    # Collect the row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct])

# Write all rows to the CSV file in one go
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id

In [None]:
import csv
from tqdm.notebook import tqdm  # Import tqdm for progress bar
from transformers import pipeline

pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype = torch.bfloat16,
    device_map='auto',
    pad_token_id=tokenizer.eos_token_id,
)

# Assuming the dataset and pipeline are already set up as per your current workflow
output_file = 'llama_mmlu_responses.csv'

# Sample dataset size, you can adjust it as needed for the loop
dataset_size = len(dataset)  # Assuming dataset is loaded with all examples

# Initialize a list to collect all rows
rows = []

# Loop through the dataset and gather question, choices, and answers
for idx in tqdm(range(dataset_size), desc="Processing dataset"):  # Wrap the loop with tqdm
    example = dataset[idx]
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']

    initial_prompt = '''
    Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

    For example, user input would be:

    Question: What is 2 + 2?
    Choices: ['1', '2', '3', '4']

    Your answer would be:
    Answer: 4

    Another example would be:

    Question: Which one of these parts are used for seeing?
    Choices: ['eyes', 'nose', 'ears', 'mouth']

    Your answer would be:
    Answer: eyes
    '''

    user_question = 'Question: ' + question + "\nChoices: " + str(choices)

    # Generating messages for the LLaMA model
    messages = [
        {'role': 'system', 'content': initial_prompt},
        {'role': 'user', 'content': user_question},
    ]

    # Get LLaMA's response
    outputs = pipe(messages, max_new_tokens=256)
    output = outputs[0]['generated_text']

    # Extract LLaMA's answer (post-processed)
    answer_start = output.lower().find("answer:") + len("answer:")
    llama_answer = output[answer_start:].strip().split()[0]  # Taking the first word as the final answer

    # Checking if LLaMA's answer matches the correct answer
    is_correct = choices[correct_answer].lower() in output.lower()

    # Collect the row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct])

# Write all rows to the CSV file in one go
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")
