<a href="https://colab.research.google.com/github/lindsayydevine/cs320-git-activity/blob/main/Exercise1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
%pip install evaluate tabulate

In [2]:
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import torch
import gc
from datasets import load_dataset
from evaluate import load
from transformers import AutoModelForCausalLM, AutoTokenizer
from tabulate import tabulate
from tqdm import tqdm

# Mapping model names to human-readable labels
model_label_map = {
    "Solshine/Meta-Llama-3.1-8B-Instruct-Python-Coder": "LLaMa",
    "lmsys/vicuna-13b-v1.5": "Vicuna"
}

model_names = list(model_label_map.keys())
prompt_strategies = ["Chain of Thought", "Self-Debugging"]

# Storage for all generations
all_generations_table = []

for model_name in model_names:
    for prompt_strategy in prompt_strategies:
        # Load the code evaluation metric
        code_eval = load("code_eval")

        # Load tokenizer and model
        print(f"\nLoading model: {model_label_map[model_name]}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        model.eval()

        results_table = []
        results_all = []

        for i in tqdm(range(10), desc="Problems"):
            human_eval_indiv_problem = load_dataset("openai_humaneval")['test'].select([i])
            num_samples_per_problem = 3

            test_cases = []
            candidates = []

            for problem in human_eval_indiv_problem:
                entry_point = problem['entry_point']
                test_code = problem['test']
                problem_prompt = problem['prompt']

                if prompt_strategy == "Raw":
                  prompt = problem['prompt']

                elif prompt_strategy == "Chain of Thought":
                    prompt = f"""
                      \"\"\"
                      Write a clean, compilable Python function named `{problem['entry_point']}` that takes in the arguments and import statements described and solves the problem below.

                      {problem['prompt']}

                      Include listed steps for solving the code implementation in a docstring.
                      \"\"\"

                      Return the steps followed by final completed function definition and must include all imports from {problem['prompt']}:

                      from typing import List
                      def {problem['entry_point']}(
                      """

                elif prompt_strategy == "Self-Debugging":
                    prompt = f"""
                      \"\"\"
                      Write a clean, compilable Python function named `{problem['entry_point']}` that takes in the arguments and import statements described and solves the problem below.

                      {problem['prompt']}

                      Debug the code and fix any errors or missing parts. Check that all indentation is appropriate so that code is compilable without errors.
                      \"\"\"

                      Return the final completed fixed function definition and must include all imports from {problem['prompt']}:

                      from typing import List
                      def {problem['entry_point']}(
                      """

                test_cases.append(test_code)
                problem_candidates = []

                for _ in range(num_samples_per_problem):
                    inputs = tokenizer(prompt, return_tensors="pt").to(device)
                    input_len = inputs["input_ids"].shape[1]

                    with torch.no_grad():
                        outputs = model.generate(
                            input_ids=inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            max_new_tokens=256,
                            temperature=0.7,
                            top_p=0.95,
                            do_sample=True,
                            eos_token_id=tokenizer.eos_token_id
                        )

                    generated_ids = outputs[0][input_len:]
                    generated_code = tokenizer.decode(generated_ids, skip_special_tokens=True)

                    full_code = f"def {entry_point}(" + generated_code
                    problem_candidates.append(full_code)

                candidates.append(problem_candidates)

                # Store the generations
                all_generations_table.append({
                    "Problem ID": i + 1,
                    "LLM": model_label_map[model_name],
                    "Prompt Type": prompt_strategy,
                    "Gen 1": problem_candidates[0],
                    "Gen 2": problem_candidates[1],
                    "Gen 3": problem_candidates[2]
                })

                # Memory cleanup after each problem
                torch.cuda.empty_cache()
                gc.collect()
                torch.cuda.ipc_collect()

        print("Code generation complete. Evaluating...")

        # Evaluate the generated code
        k_values = [1, 3]

        print(test_cases)
        print(candidates)

        pass_at_k, results = code_eval.compute(
            references=test_cases,
            predictions=candidates,
            k=k_values,
            num_workers=2,
            timeout=10.0
        )

        results_all.append(results)

        for k in k_values:
            print(f"Pass@{k}: {pass_at_k[f'pass@{k}'] * 100:.2f}%")
            print(f"Results for Pass@{k}:{results}")

        results_table.append({
            "Problem ID": i + 1,
            "Pass@1": round(pass_at_k["pass@1"] * 100, 2),
            "Pass@3": round(pass_at_k["pass@3"] * 100, 2)
        })

        # Delete model before loading the next one
        del model
        del tokenizer
        torch.cuda.empty_cache()
        gc.collect()

        print("\n=== Pass@k Results Summary ===")
        print(tabulate(results_table, headers="keys", tablefmt="grid"))



Loading model: LLaMa


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Problems: 100%|██████████| 10/10 [02:55<00:00, 17.52s/it]

Code generation complete. Evaluating...
["\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([]) == []\n    assert candidate([1, 2, 3, 4]) == [1, 2, 3, 4]\n    assert candidate([4, 3, 2, 1]) == [4, 4, 4, 4]\n    assert candidate([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]\n"]
[['def rolling_max( numbers: List[int]\n                      ) -> List[int]:\n    result = []\n    for i in range(len(numbers)):\n        max_val = numbers[i]\n        for j in range(i):\n            if numbers[j] > max_val:\n                max_val = numbers[j]\n        result.append(max_val)\n    return result', 'def rolling_max( numbers: List[int]\n                       ) -> List[int]:\n    # Initialize the result list with the first element of the input list\n    result = [numbers[0]]\n    # Iterate over the input list, starting from the second element\n    for i in range(1, len(numbers)):\n        # If the current element is greater than the last e




Pass@1: 0.00%
Results for Pass@1:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 2})]})
Pass@3: 0.00%
Results for Pass@3:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 2})]})

=== Pass@k Results Summary ===
+--------------+----------+----------+
|   Problem ID |   Pass@1 |   Pass@3 |
|           10 |        0 |        0 |
+--------------+----------+----------+

Loading model: LLaMa


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Problems: 100%|██████████| 10/10 [02:51<00:00, 17.15s/it]

Code generation complete. Evaluating...
["\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([]) == []\n    assert candidate([1, 2, 3, 4]) == [1, 2, 3, 4]\n    assert candidate([4, 3, 2, 1]) == [4, 4, 4, 4]\n    assert candidate([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]\n"]
[['def rolling_max( numbers: List[int]\n                      ) -> List[int]:\n                      result = [max(numbers)]\n                      for i in range(1, len(numbers)):\n                      result.append(max(result[-1], numbers[i]))\n                      return result\n                     ', 'def rolling_max( numbers: List[int]\n                      ) -> List[int]:\n                       max_values = []\n                       current_max = 0\n                       for n in numbers:\n                        current_max = max(current_max, n)\n                        max_values.append(current_max)\n                       return max_values




Pass@1: 0.00%
Results for Pass@1:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed: expected an indented block after 'for' statement on line 4 (<string>, line 5)", 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': 'failed: unindent does not match any outer indentation level (<string>, line 9)', 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 2})]})
Pass@3: 0.00%
Results for Pass@3:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed: expected an indented block after 'for' statement on line 4 (<string>, line 5)", 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': 'failed: unindent does not match any outer indentation level (<string>, line 9)', 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': "failed: name 'List' is not defined", 'completion_id': 2})]})

=== Pass@k Results Summary ===
+-----

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Problems: 100%|██████████| 10/10 [05:25<00:00, 32.56s/it]

Code generation complete. Evaluating...
["\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([]) == []\n    assert candidate([1, 2, 3, 4]) == [1, 2, 3, 4]\n    assert candidate([4, 3, 2, 1]) == [4, 4, 4, 4]\n    assert candidate([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]\n"]
[['def rolling_max():', 'def rolling_max(\n                      from typing import List\n                      def rolling\\_max(numbers: List[int]) -> List[int]:\n                          """\n                          \n                          from typing import List\n                          def rolling\\_max(numbers: List[int]) -> List[int]:\n                              """\n                              \n                              if len(numbers) < 2:\n                                  return numbers\n                              else:\n                                  first_element = numbers[0]\n                                  for i i




Pass@1: 0.00%
Results for Pass@1:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': 'failed: expected an indented block after function definition on line 1 (<string>, line 4)', 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': "failed: '(' was never closed (<string>, line 1)", 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': 'failed: expected an indented block after function definition on line 1 (<string>, line 4)', 'completion_id': 2})]})
Pass@3: 0.00%
Results for Pass@3:defaultdict(<class 'list'>, {0: [(0, {'task_id': 0, 'passed': False, 'result': 'failed: expected an indented block after function definition on line 1 (<string>, line 4)', 'completion_id': 0}), (1, {'task_id': 0, 'passed': False, 'result': "failed: '(' was never closed (<string>, line 1)", 'completion_id': 1}), (2, {'task_id': 0, 'passed': False, 'result': 'failed: expected an indented block after function definition on line 1 (<string>, line 4)', 'compl

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Problems:  50%|█████     | 5/10 [01:49<01:52, 22.57s/it]

In [None]:
from tabulate import tabulate

print("\n=== All Generated Code Samples ===")
print(tabulate(all_generations_table, headers="keys", tablefmt="grid"))

In [None]:
from tabulate import tabulate

detailed_results_table = []

# Go through each problem and candidate
for problem_id, completions in results.items():
    generation_row = next(item for item in all_generations_table if item["Problem ID"] == problem_id + 1)

    for completion_idx, result_info in completions:
        gen_code = generation_row[f"Gen {completion_idx + 1}"]
        passed = result_info['passed']
        reason = result_info['result']

        detailed_results_table.append({
            "Problem ID": problem_id + 1,
            "LLM": generation_row["LLM"],
            "Prompt Type": generation_row["Prompt Type"],
            "Gen #": f"Gen {completion_idx + 1}",
            "Passed": "✅" if passed else "❌",
            "Reason": reason,
            "Generated Code": gen_code  # truncate long code
        })

# Print table
print("\n=== Detailed Generation Results ===")
print(tabulate(detailed_results_table, headers="keys", tablefmt="fancy_grid"))
