In [1]:
%%capture
%pip install evaluate tabulate

In [20]:
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import torch
import gc
from datasets import load_dataset
from evaluate import load
from transformers import AutoModelForCausalLM, AutoTokenizer
from tabulate import tabulate
from tqdm import tqdm

# Map model names to readable labels
model_label_map = {
    "Solshine/Meta-Llama-3.1-8B-Instruct-Python-Coder": "LLaMa",
    "lmsys/vicuna-13b-v1.5": "Vicuna"
}

model_names = list(model_label_map.keys())
prompt_strategies = ["Chain of Thought", "Self-Debugging"]

# Store all generations and results
all_generations_table = []
results_all = []

# Outer loops
for model_name in model_names:                           # 2 models
    for prompt_strategy in prompt_strategies:            # 2 prompt types
        code_eval = load("code_eval")

        print(f"\n=== Loading model: {model_label_map[model_name]} ({prompt_strategy}) ===")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        model.eval()

        # Evaluate 10 problems
        for i in tqdm(range(10), desc=f"{model_label_map[model_name]} | {prompt_strategy}"):
            dataset = load_dataset("openai_humaneval")['test'].select([i])

            for problem in dataset:  # inner loop of 1
                entry_point = problem['entry_point']
                test_code = problem['test']
                problem_prompt = problem['prompt']

                # Construct prompt
                if prompt_strategy == "Chain of Thought":
                    prompt = f"""{problem['prompt']} Create a function def called {problem['entry_point']} to solve the given prompt. However, before writing any code, first concisely list out the steps to solve the problem. Then, below include the generated code solution following the listed steps. Return the function {problem['entry_point']}."""
                elif prompt_strategy == "Self-Debugging":
                    prompt = f"""{problem['prompt']} Create a function def called {problem['entry_point']} to solve the given prompt. Do this by attempting to generate a first pass of code to solve the problem. Then self evaluate that produced code, checking for errors and making any necessary corrections. Return the function {problem['entry_point']}."""

                # Generate 3 samples (innermost loop)
                problem_candidates = []
                for _ in range(3):
                    inputs = tokenizer(prompt, return_tensors="pt").to(device)
                    input_len = inputs["input_ids"].shape[1]

                    with torch.no_grad():
                        outputs = model.generate(
                            input_ids=inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            max_new_tokens=256,
                            temperature=0.7,
                            top_p=0.95,
                            do_sample=True,
                            eos_token_id=tokenizer.eos_token_id
                        )

                    generated_ids = outputs[0][input_len:]
                    generated_code = tokenizer.decode(generated_ids, skip_special_tokens=True)
                    problem_candidates.append(generated_code)

                # Store generations
                all_generations_table.append({
                    "Problem ID": i + 1,
                    "LLM": model_label_map[model_name],
                    "Prompt Type": prompt_strategy,
                    "Gen 1": problem_candidates[0],
                    "Gen 2": problem_candidates[1],
                    "Gen 3": problem_candidates[2]
                })

                # Evaluate
                k_values = [1, 3]
                pass_at_k, results = code_eval.compute(
                    references=[test_code],
                    predictions=[problem_candidates],
                    k=k_values,
                    num_workers=2,
                    timeout=10.0
                )

                results_all.append({
                    "LLM": model_label_map[model_name],
                    "Prompt Type": prompt_strategy,
                    "Problem ID": i + 1,
                    "Pass@1": round(pass_at_k["pass@1"] * 100, 2),
                    "Pass@3": round(pass_at_k["pass@3"] * 100, 2),
                    "RawResults": results
                })

                print(f"[{model_label_map[model_name]} | {prompt_strategy}] Problem {i + 1}")
                print(f"  Pass@1: {pass_at_k['pass@1'] * 100:.2f}% | Pass@3: {pass_at_k['pass@3'] * 100:.2f}%")

        # Cleanup between model runs
        del model, tokenizer
        torch.cuda.empty_cache()
        gc.collect()
        torch.cuda.ipc_collect()

# Final summary
print("\n=== Overall Pass@k Results Summary ===")
summary_table = [
    {
        "LLM": r["LLM"],
        "Prompt Type": r["Prompt Type"],
        "Problem ID": r["Problem ID"],
        "Pass@1": r["Pass@1"],
        "Pass@3": r["Pass@3"]
    }
    for r in results_all
]

print(tabulate(summary_table, headers="keys", tablefmt="grid"))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



=== Loading model: LLaMa (Chain of Thought) ===


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LLaMa | Chain of Thought:  10%|█         | 1/10 [00:34<05:08, 34.33s/it]

[LLaMa | Chain of Thought] Problem 1
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought:  20%|██        | 2/10 [01:00<03:55, 29.45s/it]

[LLaMa | Chain of Thought] Problem 2
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought:  30%|███       | 3/10 [01:25<03:11, 27.42s/it]

[LLaMa | Chain of Thought] Problem 3
  Pass@1: 0.00% | Pass@3: 0.00%


LLaMa | Chain of Thought:  40%|████      | 4/10 [01:48<02:34, 25.75s/it]

[LLaMa | Chain of Thought] Problem 4
  Pass@1: 66.67% | Pass@3: 100.00%


LLaMa | Chain of Thought:  50%|█████     | 5/10 [02:10<02:02, 24.45s/it]

[LLaMa | Chain of Thought] Problem 5
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought:  60%|██████    | 6/10 [02:32<01:33, 23.44s/it]

[LLaMa | Chain of Thought] Problem 6
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought:  70%|███████   | 7/10 [03:02<01:17, 25.82s/it]

[LLaMa | Chain of Thought] Problem 7
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought:  80%|████████  | 8/10 [03:26<00:50, 25.14s/it]

[LLaMa | Chain of Thought] Problem 8
  Pass@1: 0.00% | Pass@3: 0.00%


LLaMa | Chain of Thought:  90%|█████████ | 9/10 [03:56<00:26, 26.69s/it]

[LLaMa | Chain of Thought] Problem 9
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Chain of Thought: 100%|██████████| 10/10 [04:18<00:00, 25.83s/it]

[LLaMa | Chain of Thought] Problem 10
  Pass@1: 0.00% | Pass@3: 0.00%






=== Loading model: LLaMa (Self-Debugging) ===


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LLaMa | Self-Debugging:  10%|█         | 1/10 [00:27<04:10, 27.84s/it]

[LLaMa | Self-Debugging] Problem 1
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Self-Debugging:  20%|██        | 2/10 [00:46<02:59, 22.40s/it]

[LLaMa | Self-Debugging] Problem 2
  Pass@1: 66.67% | Pass@3: 100.00%


LLaMa | Self-Debugging:  30%|███       | 3/10 [01:13<02:51, 24.57s/it]

[LLaMa | Self-Debugging] Problem 3
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Self-Debugging:  40%|████      | 4/10 [01:23<01:53, 18.85s/it]

[LLaMa | Self-Debugging] Problem 4
  Pass@1: 0.00% | Pass@3: 0.00%


LLaMa | Self-Debugging:  50%|█████     | 5/10 [01:34<01:18, 15.80s/it]

[LLaMa | Self-Debugging] Problem 5
  Pass@1: 66.67% | Pass@3: 100.00%


LLaMa | Self-Debugging:  60%|██████    | 6/10 [01:47<00:59, 14.92s/it]

[LLaMa | Self-Debugging] Problem 6
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Self-Debugging:  70%|███████   | 7/10 [02:02<00:45, 15.19s/it]

[LLaMa | Self-Debugging] Problem 7
  Pass@1: 33.33% | Pass@3: 100.00%


LLaMa | Self-Debugging:  80%|████████  | 8/10 [02:18<00:30, 15.26s/it]

[LLaMa | Self-Debugging] Problem 8
  Pass@1: 100.00% | Pass@3: 100.00%


LLaMa | Self-Debugging:  90%|█████████ | 9/10 [02:37<00:16, 16.51s/it]

[LLaMa | Self-Debugging] Problem 9
  Pass@1: 0.00% | Pass@3: 0.00%


LLaMa | Self-Debugging: 100%|██████████| 10/10 [02:57<00:00, 17.80s/it]

[LLaMa | Self-Debugging] Problem 10
  Pass@1: 66.67% | Pass@3: 100.00%






=== Loading model: Vicuna (Chain of Thought) ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Vicuna | Chain of Thought:  10%|█         | 1/10 [00:36<05:29, 36.57s/it]

[Vicuna | Chain of Thought] Problem 1
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  20%|██        | 2/10 [01:13<04:53, 36.65s/it]

[Vicuna | Chain of Thought] Problem 2
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  30%|███       | 3/10 [01:49<04:14, 36.37s/it]

[Vicuna | Chain of Thought] Problem 3
  Pass@1: 33.33% | Pass@3: 100.00%


Vicuna | Chain of Thought:  40%|████      | 4/10 [02:23<03:32, 35.43s/it]

[Vicuna | Chain of Thought] Problem 4
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  50%|█████     | 5/10 [03:00<03:00, 36.00s/it]

[Vicuna | Chain of Thought] Problem 5
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  60%|██████    | 6/10 [03:36<02:24, 36.17s/it]

[Vicuna | Chain of Thought] Problem 6
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  70%|███████   | 7/10 [04:13<01:48, 36.25s/it]

[Vicuna | Chain of Thought] Problem 7
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  80%|████████  | 8/10 [04:47<01:11, 35.77s/it]

[Vicuna | Chain of Thought] Problem 8
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought:  90%|█████████ | 9/10 [05:24<00:35, 35.94s/it]

[Vicuna | Chain of Thought] Problem 9
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Chain of Thought: 100%|██████████| 10/10 [06:00<00:00, 36.08s/it]

[Vicuna | Chain of Thought] Problem 10
  Pass@1: 0.00% | Pass@3: 0.00%






=== Loading model: Vicuna (Self-Debugging) ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Vicuna | Self-Debugging:  10%|█         | 1/10 [00:29<04:27, 29.72s/it]

[Vicuna | Self-Debugging] Problem 1
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  20%|██        | 2/10 [01:06<04:30, 33.82s/it]

[Vicuna | Self-Debugging] Problem 2
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  30%|███       | 3/10 [01:37<03:48, 32.71s/it]

[Vicuna | Self-Debugging] Problem 3
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  40%|████      | 4/10 [02:14<03:24, 34.14s/it]

[Vicuna | Self-Debugging] Problem 4
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  50%|█████     | 5/10 [02:50<02:54, 34.99s/it]

[Vicuna | Self-Debugging] Problem 5
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  60%|██████    | 6/10 [03:25<02:19, 34.83s/it]

[Vicuna | Self-Debugging] Problem 6
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  70%|███████   | 7/10 [04:01<01:46, 35.34s/it]

[Vicuna | Self-Debugging] Problem 7
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  80%|████████  | 8/10 [04:31<01:07, 33.67s/it]

[Vicuna | Self-Debugging] Problem 8
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging:  90%|█████████ | 9/10 [05:08<00:34, 34.57s/it]

[Vicuna | Self-Debugging] Problem 9
  Pass@1: 0.00% | Pass@3: 0.00%


Vicuna | Self-Debugging: 100%|██████████| 10/10 [05:44<00:00, 34.41s/it]

[Vicuna | Self-Debugging] Problem 10
  Pass@1: 0.00% | Pass@3: 0.00%






=== Overall Pass@k Results Summary ===
+--------+------------------+--------------+----------+----------+
| LLM    | Prompt Type      |   Problem ID |   Pass@1 |   Pass@3 |
| LLaMa  | Chain of Thought |            1 |    33.33 |      100 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Thought |            2 |    33.33 |      100 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Thought |            3 |     0    |        0 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Thought |            4 |    66.67 |      100 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Thought |            5 |    33.33 |      100 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Thought |            6 |    33.33 |      100 |
+--------+------------------+--------------+----------+----------+
| LLaMa  | Chain of Th

In [5]:
import os
print(os.getcwd())

/content


In [21]:
all_generations_table

[{'Problem ID': 1,
  'LLM': 'LLaMa',
  'Prompt Type': 'Chain of Thought',
  'Gen 1': '\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """\n    Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n    \n    Parameters:\n    numbers (List[float]): The list of numbers to check.\n    threshold (float): The minimum distance between two numbers.\n    \n    Returns:\n    bool: True if there are two numbers closer than the threshold, False otherwise.\n    """\n    \n    # Sort the list of numbers in ascending order\n    numbers.sort()\n    \n    # Iterate over the list of numbers\n    for i in range(1, len(numbers)):\n        # Check if the difference between the current number and the previous number is less than the threshold\n        if numbers[i] - numbers[i - 1] < threshold:\n            # If it is, return True\n            return True\n    \n    # If no two numbers are closer than the threshold, return False\n

In [22]:
# Suppose all_generations_table is already defined

# Create a list to store all generations
all_generations = []

# Loop over each entry in the table
for entry in all_generations_table:
    # Append each generation in order
    all_generations.append(entry.get('Gen 1', ''))
    all_generations.append(entry.get('Gen 2', ''))
    all_generations.append(entry.get('Gen 3', ''))

# Join all generations into a single string, separated by a delimiter
# You can use '\n\n---\n\n' to make it readable
all_text = '\n\n---\n\n'.join(all_generations)

# Export to a text file
with open('all_generations.txt', 'w', encoding='utf-8') as f:
    f.write(all_text)

print("Export complete! Saved as all_generations.txt")


Export complete! Saved as all_generations.txt


In [23]:
from google.colab import files

files.download("/content/all_generations.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from tabulate import tabulate

print("\n=== All Generated Code Samples ===")
table_str = tabulate(all_generations_table, headers="keys", tablefmt="grid")
print(table_str)

with open("all_generated_code_samples.csv", "w", encoding="utf-8") as f:
    f.write(table_str)



=== All Generated Code Samples ===
+--------------+--------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
from google.colab import files

files.download("/content/all_generated_code_samples.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
from tabulate import tabulate

# Merge generations + results for display
detailed_rows = []
for gen_entry in all_generations_table:
    # Find matching eval result
    result_entry = next(
        (r for r in results_all
         if r["LLM"] == gen_entry["LLM"]
         and r["Prompt Type"] == gen_entry["Prompt Type"]
         and r["Problem ID"] == gen_entry["Problem ID"]),
        None
    )
    if result_entry:
        detailed_rows.append({
            "Problem ID": gen_entry["Problem ID"],
            "LLM": gen_entry["LLM"],
            "Prompt Type": gen_entry["Prompt Type"],
            "Gen 1": gen_entry["Gen 1"].strip()[:80] + "...",  # truncate for readability
            "Gen 2": gen_entry["Gen 2"].strip()[:80] + "...",
            "Gen 3": gen_entry["Gen 3"].strip()[:80] + "...",
            "Pass@1": result_entry["Pass@1"],
            "Pass@3": result_entry["Pass@3"],
            "RawResults": result_entry["RawResults"][0]
        })

# Pretty-print the 40 rows
print("\n=== Detailed Per-Problem Results (40 rows) ===")
table_pass_str = tabulate(detailed_rows, headers="keys", tablefmt="grid", showindex=False)
print(table_pass_str)

with open("pass_results.csv", "w", encoding="utf-8") as f:
    f.write(table_pass_str)



=== Detailed Per-Problem Results (40 rows) ===
+--------------+--------+------------------+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+----------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Problem ID | LLM    | Prompt Type      | Gen 1                                                                               | Gen 2                                                                  

In [19]:
from google.colab import files

files.download("/content/pass_results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>