In [2]:
!pip install transformers datasets torch



In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import time
import gc
import torch

In [4]:
# Login to Hugging Face
from huggingface_hub import login


login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
dataset = load_dataset("cais/mmlu", "college_mathematics")

README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [31]:
import re

# Map index to letter (0 -> A, 1 -> B, 2 -> C, 3 -> D)
def index_to_letter(index):
    return ["A", "B", "C", "D"][index]

# Helper function to parse a single-letter answer (A, B, C, or D)
def parse_answer(output):
    match = re.search(r'\b[A-D]\b', output)
    return match.group(0) if match else None

In [6]:
model_paths = [
    "google/gemma-2b-it",
    "microsoft/Phi-3.5-mini-instruct",
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
]

In [33]:
def zero_shot_inference(model, tokenizer, question, choices):
    options_text = "\n".join([f"{index_to_letter(i)}: {choice}" for i, choice in enumerate(choices)])
    prompt = f"Choose the correct answer from the options below. Only respond with a single letter: A, B, C, or D.\nQuestion: {question}\nOptions:\n{options_text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [34]:
# 2. Chain of Thought
def chain_of_thought_inference(model, tokenizer, question, choices):
    options_text = "\n".join([f"{index_to_letter(i)}: {choice}" for i, choice in enumerate(choices)])
    prompt = f"Choose the correct answer for the question below by thinking step by step. Respond with a single letter: A, B, C, or D.\nQuestion: {question}\nOptions:\n{options_text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [35]:
# 3. ReAct Prompting
def react_prompt_inference(model, tokenizer, question, choices):
    options_text = "\n".join([f"{index_to_letter(i)}: {choice}" for i, choice in enumerate(choices)])
    prompt = f"Question: {question}\nOptions:\n{options_text}\nAnswer with reasoning steps, then select a single letter answer (A, B, C, or D)."
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [32]:
def evaluate_model(model, tokenizer, questions, answers, prompt_func):
    correct = 0
    total_time = 0.0

    for question, answer_index in zip(questions, answers):
        start_time = time.time()
        generated_answer = prompt_func(model, tokenizer, question["question"], question["choices"])
        total_time += time.time() - start_time

        # Convert answer index to letter for comparison
        correct_answer = index_to_letter(answer_index)
        parsed_answer = parse_answer(generated_answer)

        if parsed_answer == correct_answer:
            correct += 1

    accuracy = correct / len(questions)
    avg_time = total_time / len(questions)
    return accuracy, avg_time

In [15]:
print(dataset["test"][0])

{'question': 'Let k be the number of real solutions of the equation e^x + x - 2 = 0 in the interval [0, 1], and let n be the number of real solutions that are not in [0, 1]. Which of the following is true?', 'subject': 'college_mathematics', 'choices': ['k = 0 and n = 1', 'k = 1 and n = 0', 'k = n = 1', 'k > 1'], 'answer': 1}


In [36]:
# Sample a subset of questions for evaluation (to save time and resources)
sample_data = dataset["test"].select(range(10))
sample_questions = [{"question": item["question"], "choices": item["choices"]} for item in sample_data]
sample_answers = [item["answer"] for item in sample_data]

In [28]:
results = {}

In [37]:
# Run evaluations model-by-model to avoid memory issues
for model_name in model_paths:
    print(f"Loading model {model_name}...")
    
    # Load model and tokenizer with authentication if needed (replace 'your_token' if using a gated model)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Store results for the current model
    model_results = {}

    # Zero Shot
    accuracy, avg_time = evaluate_model(model, tokenizer, sample_questions, sample_answers, zero_shot_inference)
    model_results["Zero Shot"] = {"accuracy": accuracy, "avg_time": avg_time}
    
    # Chain of Thought
    accuracy, avg_time = evaluate_model(model, tokenizer, sample_questions, sample_answers, chain_of_thought_inference)
    model_results["Chain of Thought"] = {"accuracy": accuracy, "avg_time": avg_time}
    
    # ReAct Prompting
    accuracy, avg_time = evaluate_model(model, tokenizer, sample_questions, sample_answers, react_prompt_inference)
    model_results["ReAct"] = {"accuracy": accuracy, "avg_time": avg_time}
    
    # Save results for this model
    results[model_name] = model_results

    # Unload the model and clear memory
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Completed evaluation for {model_name}\n")

# Display results for each model and prompting method
for model_name, result in results.items():
    print(f"Results for model {model_name}:")
    for prompt_style, metrics in result.items():
        print(f"  {prompt_style} - Accuracy: {metrics['accuracy']:.2f}, Avg Inference Time: {metrics['avg_time']:.2f} seconds")
    print("\n")

Loading model google/gemma-2b-it...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Completed evaluation for google/gemma-2b-it

Loading model microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Completed evaluation for microsoft/Phi-3.5-mini-instruct

Loading model meta-llama/Meta-Llama-3.1-8B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Completed evaluation for meta-llama/Meta-Llama-3.1-8B-Instruct

Results for model google/gemma-2b-it:
  Zero Shot - Accuracy: 0.20, Avg Inference Time: 0.90 seconds
  Chain of Thought - Accuracy: 0.20, Avg Inference Time: 2.03 seconds
  ReAct - Accuracy: 0.30, Avg Inference Time: 2.03 seconds


Results for model microsoft/Phi-3.5-mini-instruct:
  Zero Shot - Accuracy: 0.20, Avg Inference Time: 3.62 seconds
  Chain of Thought - Accuracy: 0.20, Avg Inference Time: 3.62 seconds
  ReAct - Accuracy: 0.30, Avg Inference Time: 3.61 seconds


Results for model meta-llama/Meta-Llama-3.1-8B-Instruct:
  Zero Shot - Accuracy: 0.20, Avg Inference Time: 3.50 seconds
  Chain of Thought - Accuracy: 0.20, Avg Inference Time: 3.48 seconds
  ReAct - Accuracy: 0.30, Avg Inference Time: 3.48 seconds


