# MMLU Evaluation of LLaMA 3.2 1B Evaluation

In [1]:
import os
# no nvlink
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
# use a specific GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch
from transformers import LlamaForCausalLM, AutoTokenizer

print("Torch version:", torch.__version__)
print("Is CUDA available?", torch.cuda.is_available())

Torch version: 2.4.1+cu121
Is CUDA available? True


In [3]:
model_path = "/data/llm/llama/Llama-3.2-1B-Instruct/"

model = LlamaForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    

In [28]:
from datasets import load_dataset

# Load MMLU dataset
dataset = load_dataset("cais/mmlu", "all", split="test")

In [5]:
print(len(dataset))

14042


# Run 1: 2 shot on simple questions

In [7]:
from transformers import pipeline

pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype = torch.bfloat16,
    device_map='auto',
    pad_token_id=tokenizer.eos_token_id,
)

example = dataset[300]
question = example['question']
choices = example['choices']
correct_answer = example['answer']


initial_prompt = '''
Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

For example, user input would be:

Question: What is 2 + 2?
Choices: ['1', '2', '3', '4']

Your answer would be:
Answer: 4

Another example would be:

Question: Which one of these parts are used for seeing?
Choices: ['eyes', 'nose', 'ears', 'mouth']

Your answer would be:
Answer: eyes
'''


user_question = 'Question: ' + question + "\nChoices: " + str(choices)

# question_prompt = input_text
print('Initial prompt: ', initial_prompt)
print('User question: ', user_question)
print('Choices: ', choices)

messages = [
    {'role': 'system', 'content': initial_prompt},
    {'role': 'user', 'content': user_question},
]

outputs = pipe(
    messages,
    max_new_tokens=256,
)

output = outputs[0]['generated_text'][-1]['content']
print('LLaMA 3.21B response: ', output)
print('Correct answer: ', choices[correct_answer])
print(choices[correct_answer].lower() in output.lower())


Initial prompt:  
Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

For example, user input would be:

Question: What is 2 + 2?
Choices: ['1', '2', '3', '4']

Your answer would be:
Answer: 4

Another example would be:

Question: Which one of these parts are used for seeing?
Choices: ['eyes', 'nose', 'ears', 'mouth']

Your answer would be:
Answer: eyes

User question:  Question: Which is not a similarity between Saturn and Jupiter's atmospheres?
Choices: ['a composition dominated by hydrogen and helium', 'the presence of belts zones and storms', 'an equatorial wind speed of more than 900 miles per hour', 'significant "shear" between bands of circulation at different latitudes']
Choices:  ['a composition dominated by hydrogen and helium', 'the presence of belts zones and storms', 'an equatorial wind speed of more than 900 miles per hour', 'significant "shear" between bands of circulation at different latitudes']
LLaMA 3.21B response:  Answer: s

In [136]:
from tqdm.notebook import tqdm
import csv
from transformers import pipeline

pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype = torch.bfloat16,
    device_map='auto',
    pad_token_id=tokenizer.eos_token_id,
)

# Assuming the dataset and pipeline are already set up as per your current workflow
output_file = 'llama_mmlu_responses.csv'

# Sample dataset size, you can adjust it as needed for the loop
dataset_size = len(dataset)  # Assuming dataset is loaded with all examples

# Initialize a list to collect all rows
rows = []

# Loop through the dataset and gather question, choices, and answers
for idx in tqdm(range(dataset_size)):
    example = dataset[idx]
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    
    initial_prompt = '''
    Please answer the following user's question in the format "Answer: ANSWER_TO_THE_QUESTION.".

    For example, user input would be:

    Question: What is 2 + 2?
    Choices: ['1', '2', '3', '4']

    Your answer would be:
    Answer: 4

    Another example would be:

    Question: Which one of these parts are used for seeing?
    Choices: ['eyes', 'nose', 'ears', 'mouth']

    Your answer would be:
    Answer: eyes
    '''

    user_question = 'Question: ' + question + "\nChoices: " + str(choices)

    # Generating messages for the LLaMA model
    messages = [
        {'role': 'system', 'content': initial_prompt},
        {'role': 'user', 'content': user_question},
    ]

    # Get LLaMA's response
    outputs = pipe(messages, max_new_tokens=256)
    output = outputs[0]['generated_text'][-1]['content']

    # Extract LLaMA's answer (post-processed)
    answer_start = output.lower().find("answer:") + len("answer:")
    llama_answer = output[answer_start:].strip().split()[0]  # Taking the first word as the final answer

    # Checking if LLaMA's answer matches the correct answer
    is_correct = choices[correct_answer].lower() in output.lower()

    # Collect the row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct])

# Write all rows to the CSV file in one go
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id

Results saved to llama_mmlu_responses.csv


In [8]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses.csv')
print(f'Average: {float(df["Is Correct"].mean()) * 100:.1f}')

Average: 37.3


# Run 2: 5-shot on samples taken from MMLU

## Sample 5 items from the MMLU dataset

In [29]:
from datasets import load_dataset

# Load MMLU dataset
train_dataset = load_dataset("cais/mmlu", "all", split="dev")

In [7]:
import random

n_shots = 5
train_indices = random.sample(range(len(train_dataset)), n_shots)  # Get random indices
few_shot_examples = train_dataset.select(train_indices)  # Select the few-shot examples

for example in few_shot_examples:
    question = example['question']
    choices = example['choices']
    correct_answer = choices[example['answer']]
    print(question, choices, correct_answer)

The nature of homosexual activities that occur during preadolescence include all but which of the following? ['sexual intercourse', 'circle jerks', 'exhibitionism', "touching each other's genitals"] sexual intercourse
What are the frameworks of analysis within which terrorism has been considered (as of 2020)? ['Competition between larger nations has resulted in some countries actively supporting terrorist groups to undermine the strength of rival states. Terrorist networks are extended patronage clubs maintained and paid for by their donor states and are conceptualised as being like state actors, to be dealt with using military force.', 'Globalization has enabled the internationalization of terrorist activities by opening up their operational space, although coordination is still managed from a geographical base. This suggests that terrorist groups are nationally structured which means that terrorism cannot be considered in terms of a war to be defeated militarily without having seriou

In [8]:
import csv
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

# Sample n_shots indices from the training set
train_indices = random.sample(range(len(train_dataset)), n_shots)  # Get random indices
few_shot_examples = train_dataset.select(train_indices)  # Select the few-shot examples

# Create the few-shot prompt

few_shot_prompt = '''
You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.
Choices: ['Wrong, Wrong', 'Wrong, Not wrong', 'Not wrong, Wrong', 'Not wrong, Not wrong']

And your answer would be:
{"Answer": "Not wrong, Wrong"}

--------------------------

Another example question is:

Question: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.
Choices:  ['increased, increased', 'increased, decreased', 'decreased, increased', 'decreased, decreased']

And your answer would be:
{"Answer": "increased, decreased"}

--------------------------

Another example question is:

Question: In the program below, the initial value of x is 5 and the initial value of y is 10.
 IF (X < O)
 {
 DISPLAY ("Foxtrot")
 }
 ELSE
 {
 IF (X > y)
 {
   DISPLAY ("Hotel")
 }
 ELSE 
 {
   IF (y > O)
   {
   DISPLAY ("November")
   }
   ELSE
   {
   DISPLAY ("Yankee")
   }
 }
 }
What is displayed as a result of running the program?
Choices: ['Foxtrot', 'Hotel', 'November', 'Yankee']

And your answer would be:
{"Answer": "November"}

'''
# for example in few_shot_examples:
#     question = example['question']
#     choices = example['choices']
#     correct_answer = choices[example['answer']]
    
#     few_shot_prompt += f"Question: {question}\nChoices: {choices}\nAnswer: {correct_answer}\n\n"

print(few_shot_prompt)
# Initialize a list to collect all rows


You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl

In [10]:
import json
import re

# Processing the evaluation set with tqdm for a progress bar
rows = []
is_correct_count = 0

for idx in tqdm(range(0, len(dataset)), desc="Processing evaluation set"):
    format_error = 'None'
    example = dataset[idx]

    # Prepare the user question with few-shot prompt
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    
    stringified_choices = re.sub('"', '\'', str(choices))
    # Append the new question to the few-shot prompt
    user_question = few_shot_prompt + f"Question: {question}\nChoices: {stringified_choices}\n"
    
    # Messages to be passed to the model
    messages = [
        {'role': 'system', 'content': few_shot_prompt},
        {'role': 'user', 'content': user_question},
    ]
    
    # Run the model with torch.cuda.amp for mixed precision (parallel processing)
    outputs = pipe(messages, max_new_tokens=256)
    
    output = outputs[0]['generated_text'][-1]['content']
    
    # Extract LLaMA's answer (post-processed)
    llama_answer = output.lower()
    try:
        llama_answer = json.loads(llama_answer)['answer']
    except Exception as e:
        format_error = 'Answer is not in JSON format.'


    # Check if the answer matches the correct answer
    if type(llama_answer) == str:
        is_correct = choices[correct_answer].lower() == re.sub('"', '\'', llama_answer).lower()
    else:
        is_correct = False
        format_error = 'Parsed answer is still dictionary.'

    is_correct_count += 1 if is_correct else 0
    if idx % 100 == 0:
        print(f'{is_correct_count} / {idx} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, format_error])

# Write the collected results to a CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Format Error"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")

Processing evaluation set:   0%|          | 0/14042 [00:00<?, ?it/s]

0 / 0 Correct Answers
16 / 100 Correct Answers
57 / 200 Correct Answers
94 / 300 Correct Answers
123 / 400 Correct Answers
155 / 500 Correct Answers
179 / 600 Correct Answers
212 / 700 Correct Answers
247 / 800 Correct Answers
276 / 900 Correct Answers
299 / 1000 Correct Answers
322 / 1100 Correct Answers
344 / 1200 Correct Answers
368 / 1300 Correct Answers
402 / 1400 Correct Answers
427 / 1500 Correct Answers
469 / 1600 Correct Answers
495 / 1700 Correct Answers
520 / 1800 Correct Answers
528 / 1900 Correct Answers
552 / 2000 Correct Answers
585 / 2100 Correct Answers
611 / 2200 Correct Answers
632 / 2300 Correct Answers
654 / 2400 Correct Answers
681 / 2500 Correct Answers
705 / 2600 Correct Answers
739 / 2700 Correct Answers
771 / 2800 Correct Answers
806 / 2900 Correct Answers
832 / 3000 Correct Answers
852 / 3100 Correct Answers
873 / 3200 Correct Answers
897 / 3300 Correct Answers
914 / 3400 Correct Answers
945 / 3500 Correct Answers
990 / 3600 Correct Answers
1026 / 3700 Correc

In [11]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses_5shot.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

Average: 26.370887337986044


# Run 3: Try fixing the prompt

In [51]:
import csv
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
# output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

# Sample n_shots indices from the training set
# train_indices = random.sample(range(len(train_dataset)), n_shots)  # Get random indices
# few_shot_examples = train_dataset.select(train_indices)  # Select the few-shot examples

# Create the few-shot prompt

low_temp_few_shot_prompt = '''
You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.
Choices: ['Wrong, Wrong', 'Wrong, Not wrong', 'Not wrong, Wrong', 'Not wrong, Not wrong']

And your answer would be:
{"Answer": "Not wrong, Wrong"}

--------------------------

Another example question is:

Question: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.
Choices:  ['increased, increased', 'increased, decreased', 'decreased, increased', 'decreased, decreased']

And your answer would be:
{"Answer": "increased, decreased"}

--------------------------

Another example question is:

Question: In the program below, the initial value of x is 5 and the initial value of y is 10.
 IF (X < O)
 {
 DISPLAY ("Foxtrot")
 }
 ELSE
 {
 IF (X > y)
 {
   DISPLAY ("Hotel")
 }
 ELSE 
 {
   IF (y > O)
   {
   DISPLAY ("November")
   }
   ELSE
   {
   DISPLAY ("Yankee")
   }
 }
 }
What is displayed as a result of running the program?
Choices: ['Foxtrot', 'Hotel', 'November', 'Yankee']

And your answer would be:
{"Answer": "November"}
'''
# for example in few_shot_examples:
#     question = example['question']
#     choices = example['choices']
#     correct_answer = choices[example['answer']]
    
#     few_shot_prompt += f"Question: {question}\nChoices: {choices}\nAnswer: {correct_answer}\n\n"

print(few_shot_prompt)
# Initialize a list to collect all rows


You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl

In [62]:
import json
import re

# Processing the evaluation set with tqdm for a progress bar
rows = []
is_correct_count = 0
output_file = 'llama_mmlu_responses_5shot_lowtemp.csv'

for idx in tqdm(range(0, len(dataset)), desc="Processing evaluation set"):
    format_error = 'None'
    example = dataset[idx]

    # Prepare the user question with few-shot prompt
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    
    stringified_choices = re.sub('"', '\'', str(choices))
    # Append the new question to the few-shot prompt
    user_question = f"Question: {question}\nChoices: {stringified_choices}\n"
    
    # Messages to be passed to the model
    messages = [
        {'role': 'system', 'content': low_temp_few_shot_prompt},
        {'role': 'user', 'content': user_question},
    ]
    
    # Run the model with torch.cuda.amp for mixed precision (parallel processing)
    outputs = pipe(messages, max_new_tokens=256)
    
    output = outputs[0]['generated_text'][-1]['content']
    
    # Extract LLaMA's answer (post-processed)
    llama_answer = output.lower()
    try:
        llama_answer = json.loads(llama_answer)['answer']
    except Exception as e:
        format_error = 'Answer is not in JSON format.'


    # Check if the answer matches the correct answer
    if type(llama_answer) == str:
        is_correct = choices[correct_answer].lower() == re.sub('"', '\'', llama_answer).lower()
    else:
        is_correct = False
        format_error = 'Parsed answer is still dictionary.'

    is_correct_count += 1 if is_correct else 0
    if idx % 100 == 0:
        print(f'{is_correct_count} / {idx} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, format_error])

# Write the collected results to a CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Format Error"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")

Processing evaluation set:   0%|          | 0/14042 [00:00<?, ?it/s]

0 / 0 Correct Answers
19 / 100 Correct Answers
62 / 200 Correct Answers
98 / 300 Correct Answers
131 / 400 Correct Answers
164 / 500 Correct Answers
195 / 600 Correct Answers
226 / 700 Correct Answers
268 / 800 Correct Answers
295 / 900 Correct Answers
315 / 1000 Correct Answers
330 / 1100 Correct Answers
344 / 1200 Correct Answers
377 / 1300 Correct Answers
405 / 1400 Correct Answers
426 / 1500 Correct Answers
460 / 1600 Correct Answers
490 / 1700 Correct Answers
522 / 1800 Correct Answers
535 / 1900 Correct Answers
569 / 2000 Correct Answers
603 / 2100 Correct Answers
628 / 2200 Correct Answers
659 / 2300 Correct Answers
683 / 2400 Correct Answers
708 / 2500 Correct Answers
730 / 2600 Correct Answers
763 / 2700 Correct Answers
800 / 2800 Correct Answers
832 / 2900 Correct Answers
860 / 3000 Correct Answers
886 / 3100 Correct Answers
918 / 3200 Correct Answers
949 / 3300 Correct Answers
982 / 3400 Correct Answers
1023 / 3500 Correct Answers
1062 / 3600 Correct Answers
1099 / 3700 Corr

In [26]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses_5shot_lowtemp.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

# Best-case scenario of all mal-formatted errors not 
(len(df['Format Error'].dropna()) + len(df[df['Is Correct'] == True])) / 14042

Average: 30.4871100982766


0.4270759151118074

## Where the discrepancies might come from

1. Perhaps use dev test?
2. Perhaps change the temperature along with simplifying the prompt?
3. Perhaps compartmentalize the querying of the model on each subject?

# Run 4: Use Dev Split

In [37]:
import csv
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
# output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

# Sample n_shots indices from the training set
# train_indices = random.sample(range(len(train_dataset)), n_shots)  # Get random indices
# few_shot_examples = train_dataset.select(train_indices)  # Select the few-shot examples

# Create the few-shot prompt

few_shot_prompt = '''
You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.
Choices: ['Wrong, Wrong', 'Wrong, Not wrong', 'Not wrong, Wrong', 'Not wrong, Not wrong']

And your answer would be:
{"Answer": "Not wrong, Wrong"}

--------------------------

Another example question is:

Question: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.
Choices:  ['increased, increased', 'increased, decreased', 'decreased, increased', 'decreased, decreased']

And your answer would be:
{"Answer": "increased, decreased"}

--------------------------

Another example question is:

Question: In the program below, the initial value of x is 5 and the initial value of y is 10.
 IF (X < O)
 {
 DISPLAY ("Foxtrot")
 }
 ELSE
 {
 IF (X > y)
 {
   DISPLAY ("Hotel")
 }
 ELSE 
 {
   IF (y > O)
   {
   DISPLAY ("November")
   }
   ELSE
   {
   DISPLAY ("Yankee")
   }
 }
 }
What is displayed as a result of running the program?
Choices: ['Foxtrot', 'Hotel', 'November', 'Yankee']

And your answer would be:
{"Answer": "November"}
'''
# for example in few_shot_examples:
#     question = example['question']
#     choices = example['choices']
#     correct_answer = choices[example['answer']]
    
#     few_shot_prompt += f"Question: {question}\nChoices: {choices}\nAnswer: {correct_answer}\n\n"

print(few_shot_prompt)
# Initialize a list to collect all rows


You are a bot that only answers the following question in JSON format.

For example, if the question is:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']

Your answer would be:
{"Answer": "Dove symbol"}

--------------------------

Another example question is:

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']

And your answer would be:
{"Answer": "5.78"}

--------------------------

Another example question is:

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl

In [39]:
import json
import re
from tqdm.notebook import tqdm

# Processing the evaluation set with tqdm for a progress bar
rows = []
is_correct_count = 0
output_file = 'llama_mmlu_responses_run5.csv'

for idx in tqdm(range(0, len(train_dataset)), desc="Processing evaluation set"):
    format_error = 'None'
    example = dataset[idx]

    # Prepare the user question with few-shot prompt
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    
    stringified_choices = re.sub('"', '\'', str(choices))
    # Append the new question to the few-shot prompt
    user_question = f"Question: {question}\nChoices: {stringified_choices}\n"
    
    # Messages to be passed to the model
    messages = [
        {'role': 'system', 'content': few_shot_prompt},
        {'role': 'user', 'content': user_question},
    ]
    
    # Run the model with torch.cuda.amp for mixed precision (parallel processing)
    outputs = pipe(messages, max_new_tokens=256)
    
    output = outputs[0]['generated_text'][-1]['content']
    
    # Extract LLaMA's answer (post-processed)
    llama_answer = output.lower()
    try:
        llama_answer = json.loads(llama_answer)['answer']
    except Exception as e:
        format_error = 'Answer is not in JSON format.'


    # Check if the answer matches the correct answer
    if type(llama_answer) == str:
        is_correct = choices[correct_answer].lower() == re.sub('"', '\'', llama_answer).lower()
    else:
        is_correct = False
        format_error = 'Parsed answer is still dictionary.'

    is_correct_count += 1 if is_correct else 0
    if idx % 100 == 0:
        print(f'{is_correct_count} / {idx} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, format_error])

# Write the collected results to a CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Format Error"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")

Processing evaluation set:   0%|          | 0/285 [00:00<?, ?it/s]

1 / 0 Correct Answers
19 / 100 Correct Answers
69 / 200 Correct Answers
Results saved to llama_mmlu_responses_run5.csv


In [41]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses_run5.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

# Best-case scenario of all mal-formatted errors not 
(len(df['Format Error'].dropna()) + len(df[df['Is Correct'] == True])) / 285

Average: 35.78947368421053


0.47368421052631576

# Run 5: Use a lower temperature and a simpler 5-shot prompt

And use the test split this time.

In [65]:
import csv
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
# output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

# Sample n_shots indices from the training set
# train_indices = random.sample(range(len(train_dataset)), n_shots)  # Get random indices
# few_shot_examples = train_dataset.select(train_indices)  # Select the few-shot examples

# Create the few-shot prompt

few_shot_prompt = '''
You are a bot that's trained to answer questions in JSON format. For example:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']
{"Answer": "Dove symbol"}

--------------------------

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']
{"Answer": 5.78}

--------------------------

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.
Choices: ['Wrong, Wrong', 'Wrong, Not wrong', 'Not wrong, Wrong', 'Not wrong, Not wrong']
{"Answer": "Not wrong, Wrong"}

--------------------------

Question: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.
Choices:  ['increased, increased', 'increased, decreased', 'decreased, increased', 'decreased, decreased']
{"Answer": "increased, decreased"}

--------------------------

Question: In the program below, the initial value of x is 5 and the initial value of y is 10.
 IF (X < O)
 {
 DISPLAY ("Foxtrot")
 }
 ELSE
 {
 IF (X > y)
 {
   DISPLAY ("Hotel")
 }
 ELSE 
 {
   IF (y > O)
   {
   DISPLAY ("November")
   }
   ELSE
   {
   DISPLAY ("Yankee")
   }
 }
 }
What is displayed as a result of running the program?
Choices: ['Foxtrot', 'Hotel', 'November', 'Yankee']
{"Answer": "November"}

After here, you will be given questions. Select the best answer from the choices provided. Do not explain your answer. Format your answer as with the examples above.
'''
# for example in few_shot_examples:
#     question = example['question']
#     choices = example['choices']
#     correct_answer = choices[example['answer']]
    
#     few_shot_prompt += f"Question: {question}\nChoices: {choices}\nAnswer: {correct_answer}\n\n"

print(few_shot_prompt)
# Initialize a list to collect all rows


You are a bot that's trained to answer questions in JSON format. For example:

Question: Which one of the following items is an example of nonmaterial culture?
Choices: ['Dove soap', 'Dove candy bar', 'Dove symbol', 'A dove (bird)']
{"Answer": "Dove symbol"}

--------------------------

Question: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?
Choices: ['3.89', '7.78', '5.78', '2.33']
{"Answer": 5.78}

--------------------------

Question: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.
Choices: ['Wrong, Wrong', 'Wrong, Not wrong', 'Not wrong, Wr

In [89]:
import json
import re
from tqdm.notebook import tqdm

# Processing the evaluation set with tqdm for a progress bar
rows = []
is_correct_count = 0
output_file = 'llama_mmlu_responses_run6.csv'

for idx in tqdm(range(0, len(dataset)), desc="Processing evaluation set"):
    format_error = 'None'
    example = dataset[idx]

    # Prepare the user question with few-shot prompt
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    
    stringified_choices = re.sub('"', '\'', str(choices))
    # Append the new question to the few-shot prompt
    user_question = f"Question: {question}\nChoices: {stringified_choices}\nWhat is your final answer?"
    
    # Messages to be passed to the model
    messages = [
        {'role': 'system', 'content': few_shot_prompt},
        {'role': 'user', 'content': user_question},
    ]
    
    # Run the model with torch.cuda.amp for mixed precision (parallel processing)
    outputs = pipe(messages, max_new_tokens=64, temperature=0.1)
    
    output = outputs[0]['generated_text'][-1]['content']
    
    # Extract LLaMA's answer (post-processed)
    llama_answer = output.lower()
    try:
        llama_answer = json.loads(llama_answer)['answer']
    except Exception as e:
        format_error = 'Answer is not in JSON format.'
        is_correct = choices[correct_answer].lower() in re.sub('"', '\'', llama_answer).lower()
        is_correct_count += 1 if is_correct else 0
        rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, format_error])
        continue


    # Check if the answer matches the correct answer
    if type(llama_answer) == str:
        is_correct = choices[correct_answer].lower() == re.sub('"', '\'', llama_answer).lower()
    else:
        is_correct = False
        format_error = 'Parsed answer is still dictionary.'

    is_correct_count += 1 if is_correct else 0
    if idx % 100 == 0:
        print(f'{is_correct_count} / {idx} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, format_error])

# Write the collected results to a CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Format Error"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")

Processing evaluation set:   0%|          | 0/14042 [00:00<?, ?it/s]

0 / 0 Correct Answers
20 / 100 Correct Answers
64 / 200 Correct Answers
105 / 300 Correct Answers
143 / 400 Correct Answers
185 / 500 Correct Answers
218 / 600 Correct Answers
262 / 700 Correct Answers
306 / 800 Correct Answers
339 / 900 Correct Answers
364 / 1000 Correct Answers
425 / 1200 Correct Answers
462 / 1300 Correct Answers
494 / 1400 Correct Answers
524 / 1500 Correct Answers
569 / 1600 Correct Answers
608 / 1700 Correct Answers
645 / 1800 Correct Answers
665 / 1900 Correct Answers
708 / 2000 Correct Answers
753 / 2100 Correct Answers
786 / 2200 Correct Answers
825 / 2300 Correct Answers
854 / 2400 Correct Answers
885 / 2500 Correct Answers
915 / 2600 Correct Answers
956 / 2700 Correct Answers
1000 / 2800 Correct Answers
1080 / 3000 Correct Answers
1147 / 3200 Correct Answers
1185 / 3300 Correct Answers
1227 / 3400 Correct Answers
1268 / 3500 Correct Answers
1310 / 3600 Correct Answers
1351 / 3700 Correct Answers
1398 / 3800 Correct Answers
1432 / 3900 Correct Answers
1454 / 

In [90]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses_run6.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

# Best-case scenario of all mal-formatted errors not 
(len(df['Format Error'].dropna()) + len(df[df['Is Correct'] == True])) / 14042

Average: 36.31249109813417


0.45057684090585387

# Run 6: Low Temperature, Compartmentalized

In [139]:
import csv
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
# output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

# Preprocessing
categories = train_dataset.unique('subject')
# print(len(categories))
items_compartmentalized = {}
for c in categories:
    items_compartmentalized[c] = {'shots': train_dataset.filter(lambda example: example['subject'] == c)}

few_shot_prompt = "You are a bot that's trained to answer questions in JSON format. For example:\n\n"

for c in categories:
    idx = 0
    per_subject_prompt = '' + few_shot_prompt
    for item in items_compartmentalized[c]['shots']:
        question = item['question']
        choices = item['choices']
        answer_index = item['answer']
        actual_answer = choices[answer_index]
        json_answer = {"Answer": f"{actual_answer}"}
        stringified_json_answer = re.sub('\'', '"', str(json_answer))
        item_prompt = f'Question: {question}\nChoices: {choices}\n{stringified_json_answer}'
        per_subject_prompt = per_subject_prompt + item_prompt
        if idx < 4:
            per_subject_prompt += '\n' + ('-' * 25) + '\n'
        idx += 1
    items_compartmentalized[c]['prompt'] = per_subject_prompt + "\n\nAfter here, you will be given questions. Select the best answer from the choices provided. Do not explain your answer. Format your answer as with the examples above."

In [140]:
print(items_compartmentalized['abstract_algebra']['prompt'])

You are a bot that's trained to answer questions in JSON format. For example:

Question: Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.
Choices: ['0', '1', '2', '3']
{"Answer": "1"}
-------------------------
Question: Statement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.
Choices: ['True, True', 'False, False', 'True, False', 'False, True']
{"Answer": "False, False"}
-------------------------
Question: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.
Choices: ['True, True', 'False, False', 'True, False', 'False, True']
{"Answer": "True, False"}
-------------------------
Question: Statement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.
Choices: ['True, True', 'False, False', 'True, False', 'False, True']
{"Answer": "T

In [151]:
import json
import re
from tqdm.notebook import tqdm

# Processing the evaluation set with tqdm for a progress bar
rows = []
is_correct_count = 0
output_file = 'llama_mmlu_responses_run7.csv'

for idx in tqdm(range(0, len(dataset)), desc="Processing evaluation set"):
    format_error = 'None'
    example = dataset[idx]

    # Prepare the user question with few-shot prompt
    question = example['question']
    choices = example['choices']
    correct_answer = example['answer']
    subject = example['subject']
    
    stringified_choices = re.sub('"', '\'', str(choices))
    # Append the new question to the few-shot prompt
    user_question = f"Question: {question}\nChoices: {stringified_choices}\nWhat is your final answer?"
    
    # Messages to be passed to the model
    messages = [
        {'role': 'system', 'content': items_compartmentalized[subject]['prompt']},
        {'role': 'user', 'content': user_question},
    ]
    
    # Run the model with torch.cuda.amp for mixed precision (parallel processing)
    outputs = pipe(messages, max_new_tokens=64, temperature=0.1)
    
    output = outputs[0]['generated_text'][-1]['content']
    
    # Extract LLaMA's answer (post-processed)
    llama_answer = output.lower()
    try:
        llama_answer = json.loads(llama_answer)['answer']
    except Exception as e:
        format_error = 'Answer is not in JSON format.'
        is_correct = choices[correct_answer].lower() in re.sub('"', '\'', llama_answer).lower()
        is_correct_count += 1 if is_correct else 0
        rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, subject, format_error])
        if idx % 100 == 0:
            print(f'{is_correct_count} / {idx} Correct Answers')
        continue


    # Check if the answer matches the correct answer
    if type(llama_answer) == str:
        is_correct = choices[correct_answer].lower() == re.sub('"', '\'', llama_answer).lower()
    else:
        is_correct = False
        format_error = 'Parsed answer is still dictionary.'

    is_correct_count += 1 if is_correct else 0
    if idx % 100 == 0:
        print(f'{is_correct_count} / {idx} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, choices[correct_answer], is_correct, subject, format_error])

# Write the collected results to a CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Subject", "Format Error"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")

Processing evaluation set:   0%|          | 0/14042 [00:00<?, ?it/s]

0 / 0 Correct Answers
18 / 100 Correct Answers
62 / 200 Correct Answers
99 / 300 Correct Answers
138 / 400 Correct Answers
189 / 500 Correct Answers
220 / 600 Correct Answers
261 / 700 Correct Answers
305 / 800 Correct Answers
347 / 900 Correct Answers
377 / 1000 Correct Answers
408 / 1100 Correct Answers
436 / 1200 Correct Answers
473 / 1300 Correct Answers
510 / 1400 Correct Answers
541 / 1500 Correct Answers
595 / 1600 Correct Answers
638 / 1700 Correct Answers
679 / 1800 Correct Answers
706 / 1900 Correct Answers
745 / 2000 Correct Answers
786 / 2100 Correct Answers
818 / 2200 Correct Answers
849 / 2300 Correct Answers
879 / 2400 Correct Answers
911 / 2500 Correct Answers
944 / 2600 Correct Answers
988 / 2700 Correct Answers
1030 / 2800 Correct Answers
1074 / 2900 Correct Answers
1111 / 3000 Correct Answers
1142 / 3100 Correct Answers
1185 / 3200 Correct Answers
1230 / 3300 Correct Answers
1266 / 3400 Correct Answers
1305 / 3500 Correct Answers
1351 / 3600 Correct Answers
1396 / 37

In [152]:
import pandas as pd

df = pd.read_csv('llama_mmlu_responses_run7.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

# Best-case scenario of all mal-formatted errors not 
(len(df['Format Error'].dropna()) + len(df[df['Is Correct'] == True])) / 14042

Average: 37.09585529126905


0.4183164791340265

In [158]:
print(len(items_compartmentalized.items()))

57


# Run 7: Use Prompts from Official LLaMA Eval

From their [eval details](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/eval_details.md),

> Macro averages are reported unless otherwise stated. The micro average scores for the various models are: 65.6, 79.0, and 85.4 for the pre-trained 8B, 70B and 405B models respectively for the 5-shot config; 69.44, 84.0, 87.71 for the post-trained 8B, 70B and 405B models respectively for the 5-shot config.

In [5]:
from tqdm.notebook import tqdm
from transformers import pipeline
import torch

# Load your dataset and set up the model pipeline
# output_file = 'llama_mmlu_responses_5shot.csv'

# Ensure GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your LLaMA model pipeline
pipe = pipeline(
    'text-generation',
    model=model_path,
    torch_dtype=torch.float16,  # Use FP16 for faster GPU performance
    device=0,  # Use the first GPU
    pad_token_id=tokenizer.eos_token_id
)

import pandas as pd

data = pd.read_parquet("Details_mmlu_2024-09-23T17-23-07.347337.parquet.gzip")
print(data.count()) # example of operation on the returned DataFrame

task_type                                 14042
task_name                                 14042
subtask_name                              14042
input_question                            14042
input_choice_list                         14042
input_final_prompts                       14042
input_correct_responses                   14042
output_prediction_text                    14042
output_parsed_answer                      14042
output_choice_completions                     0
output_choice_negative_log_likelihoods        0
output_metrics                            14042
is_correct                                14042
input_question_hash                       14042
input_final_prompts_hash                  14042
benchmark_label                           14042
eval_config                               14042
dtype: int64


In [8]:
metrics = pd.read_parquet('Llama-3.2-1B-Instruct-evals_Details_metrics_details_2024-09-23T17-23-22.207810.parquet.gzip')
metrics[metrics['benchmark_label'] == 'MMLU'].iloc[0]['eval_config']

{'top_p': '0',
 'seed': '42',
 'num_generations': '1',
 'max_gen_len': '10',
 'top_k': '0',
 'temperature': '0.0',
 'prompt_fn': 'functools.partial(<function jinja_dialog_format at 0x7f02e368d090>, template={\'prompt\': \'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\\nQuestion: {{ question }}\\nA. {{ choices["A"] }}\\nB. {{ choices["B"] }}\\nC. {{ choices["C"] }}\\nD. {{ choices["D"] }}\\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.\\n\', \'answer\': \'The best answer is {{ answer }}.\', \'gen_prefix\': \'The best answer is \'}, append_gen_prefix=True)',
 'num_few_shot': '5',
 'max_prompt_len': '3840',
 'return_logprobs': 'false'}

In [58]:
import csv

def parse_answer(answer_string):
    # Check if the string starts with the expected phrase
    expected_start = "The best answer is "
    answer_string = re.sub(r'\\n', '', answer_string.strip())
    
    # Guard clause to handle cases where the string doesn't start as expected
    if not answer_string.startswith(expected_start):
        raise ValueError(f"Invalid answer format: '{answer_string}' does not start with '{expected_start}'")

    # Parse the answer part (assuming it's a single character like 'A', 'B', etc.)
    answer = answer_string[len(expected_start):len(expected_start)+1].strip()

    
    # Additional check to ensure there's an answer after the phrase
    if not answer or len(answer) > 1:
        raise ValueError(f"Invalid or missing answer choice in '{answer_string}'")
    
    return answer


total = 0
correct_count = 0
rows = []
output_file = 'llama_mmlu_responses_run7.csv'

for i in tqdm(range(len(data))):
    total += 1
    item = data.iloc[i]
    final_prompts = item['input_final_prompts']
    correct_answer = item['input_correct_responses'][0]
    question = item['input_question']
    choices = item['input_choice_list']
    subject = item['subtask_name']

    messages = [
        {'role': 'system', 'content': final_prompts},
    ]

    outputs = pipe(messages, max_new_tokens=64, do_sample=False, top_k=0, top_p=0)
    
    output = outputs[0]['generated_text'][-1]['content'].strip()
    llama_answer = ''

    try:
        llama_answer = parse_answer(output)
    except ValueError:
        llama_answer = f'Invalid string format: {output}'
    
    is_correct = False
    if correct_answer == llama_answer:
        correct_count += 1
        is_correct = True

    if i % 100 == 0:
        print(f'{correct_count} / {i} Correct Answers')
    # Collect the result row
    rows.append([question, choices, llama_answer, correct_answer, is_correct, subject])

with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Subject"])
    # Write all rows at once
    writer.writerows(rows)

print(f"Results saved to {output_file}")
    

  0%|          | 0/14042 [00:00<?, ?it/s]

0 / 0 Correct Answers
51 / 100 Correct Answers
105 / 200 Correct Answers
153 / 300 Correct Answers
177 / 400 Correct Answers
211 / 500 Correct Answers
264 / 600 Correct Answers
307 / 700 Correct Answers
361 / 800 Correct Answers
394 / 900 Correct Answers
442 / 1000 Correct Answers
490 / 1100 Correct Answers
517 / 1200 Correct Answers
531 / 1300 Correct Answers
569 / 1400 Correct Answers
593 / 1500 Correct Answers
602 / 1600 Correct Answers
639 / 1700 Correct Answers
712 / 1800 Correct Answers
744 / 1900 Correct Answers
774 / 2000 Correct Answers
815 / 2100 Correct Answers
853 / 2200 Correct Answers
898 / 2300 Correct Answers
947 / 2400 Correct Answers
985 / 2500 Correct Answers
1015 / 2600 Correct Answers
1053 / 2700 Correct Answers
1083 / 2800 Correct Answers
1139 / 2900 Correct Answers
1181 / 3000 Correct Answers
1217 / 3100 Correct Answers
1249 / 3200 Correct Answers
1305 / 3300 Correct Answers
1372 / 3400 Correct Answers
1439 / 3500 Correct Answers
1494 / 3600 Correct Answers
1544 

In [59]:
import pandas as pd
df = pd.read_csv('llama_mmlu_responses_run7.csv')

print(f'Average: {float(df["Is Correct"].mean()) * 100}')

Average: 42.57228315054836


In [7]:
import csv
import re

def parse_answer(answer_string):
    # Check if the string starts with the expected phrase
    expected_start = "The best answer is "
    answer_string = re.sub(r'\\n', '', answer_string.strip())
    
    # Guard clause to handle cases where the string doesn't start as expected
    if not answer_string.startswith(expected_start):
        raise ValueError(f"Invalid answer format: '{answer_string}' does not start with '{expected_start}'")

    # Parse the answer part (assuming it's a single character like 'A', 'B', etc.)
    answer = answer_string[len(expected_start):len(expected_start)+1].strip()

    
    # Additional check to ensure there's an answer after the phrase
    if not answer or len(answer) > 1:
        raise ValueError(f"Invalid or missing answer choice in '{answer_string}'")
    
    return answer

runs = 5

for r in range(runs):
    print(f'Run {r}')
    total = 0
    correct_count = 0
    rows = []
    output_file = f'llama_mmlu_responses_run7_{r}.csv'

    for i in tqdm(range(len(data))):
        total += 1
        item = data.iloc[i]
        final_prompts = item['input_final_prompts']
        correct_answer = item['input_correct_responses'][0]
        question = item['input_question']
        choices = item['input_choice_list']
        subject = item['subtask_name']

        messages = [
            {'role': 'system', 'content': final_prompts},
        ]

        outputs = pipe(messages, max_new_tokens=64, do_sample=False, top_k=0, top_p=0)
        
        output = outputs[0]['generated_text'][-1]['content'].strip()
        llama_answer = ''

        try:
            llama_answer = parse_answer(output)
        except ValueError:
            llama_answer = f'Invalid string format: {output}'
        
        is_correct = False
        if correct_answer == llama_answer:
            correct_count += 1
            is_correct = True

        # if i % 100 == 0:
        #     print(f'{correct_count} / {i} Correct Answers')
        # Collect the result row
        rows.append([question, choices, llama_answer, correct_answer, is_correct, subject])

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Question", "Choices", "LLaMA Answer", "Correct Answer", "Is Correct", "Subject"])
        # Write all rows at once
        writer.writerows(rows)

    print(f"Results saved to {output_file}")
        

Run 0


  0%|          | 0/14042 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Results saved to llama_mmlu_responses_run7_0.csv
Run 1


  0%|          | 0/14042 [00:00<?, ?it/s]

Results saved to llama_mmlu_responses_run7_1.csv
Run 2


  0%|          | 0/14042 [00:00<?, ?it/s]

Results saved to llama_mmlu_responses_run7_2.csv
Run 3


  0%|          | 0/14042 [00:00<?, ?it/s]

Results saved to llama_mmlu_responses_run7_3.csv
Run 4


  0%|          | 0/14042 [00:00<?, ?it/s]

Results saved to llama_mmlu_responses_run7_4.csv
