In [1]:

# Cell 2: Import Libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import subprocess
import os
import tempfile
import json
from pathlib import Path
from IPython.display import clear_output

# Visible confirmation that the import statements of this cell ran without raising
print("✓ Libraries imported successfully")



✓ Libraries imported successfully


In [13]:

# ============================================================================
# Cell 3: Define Helper Functions
# ============================================================================


def load_model_and_tokenizer(model_name):
    """
    Fetch a pre-trained causal language model together with its tokenizer.

    The model is placed on the GPU in half precision when CUDA is available,
    and on the CPU in full precision otherwise.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        model: The loaded language model, already moved to the chosen device
        tokenizer: The tokenizer loaded for the same identifier
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Decide once whether a GPU is usable; device and precision both follow from it
    has_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if has_cuda else torch.device("cpu")
    print(f"Using device: {device}")

    weights_dtype = torch.float16 if has_cuda else torch.float32

    # device_map is deliberately not used (it requires accelerate); the model is
    # loaded first and then moved to the target device explicitly
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=weights_dtype
    ).to(device)

    print("✓ Model and tokenizer loaded successfully")
    return model, tokenizer


def load_test_dataset():
    """
    Download/open the Racket code-generation benchmark problems.

    Returns:
        dataset: The test problems, taken from the dataset's 'train' split
    """
    print("Loading test dataset...")

    # This configuration exposes its problems under the split named 'train'
    test_data = load_dataset(
        "nuprl/engineering-llm-systems",
        "mbpp-rkt-test-problems"
    )['train']

    print(f"✓ Loaded {len(test_data)} test problems")
    return test_data


def generate_completions(model, tokenizer, prompt, num_completions=5):
    """
    Sample several completions for one prompt.

    Completions are produced one at a time (not batched). Each completion is
    obtained by decoding only the token ids that generate() appended after the
    prompt tokens, so the result does not depend on the decoded text
    reproducing the prompt character-for-character.

    Side effect: if the tokenizer has no pad token, its pad_token is set to
    its eos_token.

    Args:
        model: The language model (must expose .device and .generate)
        tokenizer: The tokenizer
        prompt: The input prompt text
        num_completions: Number of completions to generate (default: 5)

    Returns:
        completions: List of generated code strings, prompt excluded
    """
    # Set pad token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the input prompt (also yields the attention mask)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    # Number of prompt tokens. Assumes generate() returns the prompt ids
    # followed by the newly sampled ids (decoder-only behaviour).
    prompt_token_count = inputs.input_ids.shape[1]

    completions = []

    # Generate completions one at a time to avoid parallelism
    for i in range(num_completions):
        print(f"  Generating completion {i+1}/{num_completions}...", end='\r')

        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=512,
            top_p=0.95,
            temperature=0.2,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Slice on token ids rather than on characters: slicing the decoded
        # text by len(prompt) breaks whenever decode(encode(prompt)) != prompt.
        new_token_ids = outputs[0][prompt_token_count:]
        completion = tokenizer.decode(new_token_ids, skip_special_tokens=True)
        completions.append(completion)

    print(f"  ✓ Generated {num_completions} completions")
    return completions


def save_generation_to_json(task_id, problem, prompt, completions, output_dir="completions"):
    """
    Save generation information to a JSON file.

    The file is written atomically: the data is first dumped to
    "<name>.json.tmp" and only renamed to "task_<task_id>.json" once the dump
    has completed. A failed or interrupted write therefore never leaves a
    truncated "task_<task_id>.json" behind.

    Args:
        task_id: Problem task ID
        problem: The problem dictionary; must provide 'description',
            'input_format', 'output_format' and 'tests'
        prompt: The prompt used for generation
        completions: List of generated completions
        output_dir: Directory to save JSON files (created if missing)

    Returns:
        filepath: Path to the saved JSON file

    Raises:
        TypeError/ValueError: propagated from json.dump when the data is not
            JSON-serializable; the final file is not created in that case.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Prepare data structure
    generation_data = {
        'task_id': task_id,
        'description': problem['description'],
        'input_format': problem['input_format'],
        'output_format': problem['output_format'],
        'prompt': prompt,
        'completions': completions,
        'test_cases': problem['tests']
    }

    filepath = os.path.join(output_dir, f"task_{task_id}.json")
    # The ".tmp" suffix keeps the partial file from matching "task_*.json"
    tmp_path = f"{filepath}.tmp"

    try:
        with open(tmp_path, 'w') as f:
            json.dump(generation_data, f, indent=2)
        # Rename only after a complete dump; replaces any existing file
        os.replace(tmp_path, filepath)
    finally:
        # Still present only if the dump or the rename failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return filepath


def generate_and_save_all_completions(model, tokenizer, dataset, output_dir="completions", num_completions=5):
    """
    Generate completions for all problems and save to JSON files.
    Skips problems that already have a saved "task_<task_id>.json" file in
    output_dir, so an interrupted run can be resumed.

    Args:
        model: The language model
        tokenizer: The tokenizer
        dataset: The test dataset; each problem must provide 'task_id',
            'description', 'input_format' and 'output_format'
        output_dir: Directory to save JSON files (created if missing)
        num_completions: Number of completions to sample per problem
            (default: 5)

    Returns:
        num_generated: Number of problems actually generated (not skipped)
    """
    total_problems = len(dataset)
    num_generated = 0
    num_skipped = 0

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Generating completions for {total_problems} problems")
    print(f"{'='*60}\n")

    for idx, problem in enumerate(dataset):
        task_id = problem['task_id']

        # An existing file means this task was generated by an earlier run
        filepath = os.path.join(output_dir, f"task_{task_id}.json")

        if os.path.exists(filepath):
            print(f"[{idx+1}/{total_problems}] Task {task_id} - ⏭️  SKIPPED (already exists)")
            num_skipped += 1
            continue

        print(f"[{idx+1}/{total_problems}] Task {task_id}")

        # The task statement goes in as Racket line comments, followed by the
        # '#lang racket' header that the model is expected to continue from
        prompt = (
            f"; {problem['description']}\n"
            f"; Input format: {problem['input_format']}\n"
            f"; Output format: {problem['output_format']}\n"
            "\n"
            "#lang racket\n"
            "\n"
        )

        # Generate completions
        completions = generate_completions(
            model, tokenizer, prompt, num_completions=num_completions
        )

        # Save to JSON
        filepath = save_generation_to_json(task_id, problem, prompt, completions, output_dir)
        print(f"  ✓ Saved to {filepath}\n")
        num_generated += 1

    print(f"{'='*60}")
    print(f"Generation complete!")
    print(f"  Generated: {num_generated} new problems")
    print(f"  Skipped: {num_skipped} existing problems")
    print(f"  Total files in '{output_dir}/': {num_generated + num_skipped}")
    print(f"{'='*60}\n")

    return num_generated


def test_racket_completion(completion_string, test_cases, racket_path=None):
    """
    Tests a Racket code completion against a list of test cases.
    
    Args:
        completion_string: A string containing the entire Racket program
        test_cases: A list of dictionaries, where each dict has 'input' and 'output' keys
        racket_path: Path to Racket executable (uses RACKET_PATH global if None)
    
    Returns:
        A list of dictionaries detailing the results for each test case
    """
    # Use global RACKET_PATH if not specified
    if racket_path is None:
        racket_path = globals().get('RACKET_PATH', 'racket')
    
    results = []
    
    # Use a temporary file to store the Racket code
    # The 'with' statement handles cleanup automatically
    with tempfile.NamedTemporaryFile(
        mode='w+', suffix='.rkt', delete=False
    ) as temp_file:
        temp_file.write(completion_string)
        temp_file_path = temp_file.name
    
    try:
        for case in test_cases:
            test_input = case['input']
            expected_output = case['output']
            
            try:
                # Execute the racket file as a subprocess
                # We set a timeout of 5 seconds to prevent infinite loops
                process = subprocess.run(
                    [racket_path, temp_file_path],
                    input=test_input,
                    capture_output=True,
                    text=True,  # Work with strings, not bytes
                    timeout=5
                )
                
                # The Racket code ran but might have crashed
                if process.returncode != 0:
                    actual_output = f"RUNTIME ERROR:\n{process.stderr.strip()}"
                    passed = False
                else:
                    # The code ran successfully, compare its output
                    actual_output = process.stdout.strip()
                    passed = (actual_output == expected_output)
                    
            except subprocess.TimeoutExpired:
                actual_output = "EXECUTION TIMED OUT (5 seconds)"
                passed = False
                
            results.append({
                'passed': passed,
                'input': test_input,
                'expected': expected_output,
                'actual': actual_output
            })
            
    finally:
        # Ensure the temporary file is deleted even if errors occur
        os.remove(temp_file_path)
        
    return results


def check_test_cases(test_results):
    """
    Report whether a completion passed every one of its test cases.

    Args:
        test_results: List of test result dictionaries from test_racket_completion

    Returns:
        passed: True when no result has a falsy 'passed' entry (this includes
            an empty list), False as soon as one failing result is found
    """
    for outcome in test_results:
        if not outcome['passed']:
            return False
    return True


def evaluate_from_json(json_filepath):
    """
    Run every completion stored in one generation JSON file against its tests.

    Each completion is appended to the stored prompt to form the full program,
    which is then executed with test_racket_completion.

    Args:
        json_filepath: Path to the JSON file

    Returns:
        passed: Boolean indicating if at least one completion passed
        results: Dictionary with 'task_id', 'completions_tested' (one entry per
            completion holding its index, overall verdict and per-test details)
            and 'any_passed'
    """
    with open(json_filepath, 'r') as f:
        data = json.load(f)

    prompt = data['prompt']
    completions = data['completions']
    test_cases = data['test_cases']
    task_id = data['task_id']

    tested = []
    any_passed = False

    for index, completion in enumerate(completions):
        # The saved completion excludes the prompt, so the two are rejoined here
        test_results = test_racket_completion(prompt + completion, test_cases)
        all_tests_passed = check_test_cases(test_results)

        # One fully passing completion is enough to mark the problem as passed
        if all_tests_passed:
            any_passed = True

        tested.append({
            'completion_index': index,
            'all_tests_passed': all_tests_passed,
            'test_details': test_results
        })

    results = {
        'task_id': task_id,
        'completions_tested': tested,
        'any_passed': any_passed
    }
    return any_passed, results


def evaluate_all_completions(completions_dir="completions"):
    """
    Evaluate all generated completions from JSON files.

    pass@1 is computed as the mean, over problems, of the fraction of that
    problem's completions that pass all tests (c/n per problem). This is the
    probability that a single sampled completion is correct. The number of
    problems solved by at least one of their completions is an any-of-n rate,
    not pass@1, and is printed as a separate figure.

    Args:
        completions_dir: Directory containing "task_*.json" files

    Returns:
        pass_at_1: The pass@1 score (0.0 when no files are found)
        all_results: List of detailed results for each problem, as returned
            by evaluate_from_json
    """
    # Get all JSON files
    json_files = sorted(Path(completions_dir).glob("task_*.json"))
    total_problems = len(json_files)

    if total_problems == 0:
        print(f"No JSON files found in '{completions_dir}/'")
        return 0.0, []

    print(f"\n{'='*60}")
    print(f"Evaluating {total_problems} problems from '{completions_dir}/'")
    print(f"{'='*60}\n")

    problems_passed = 0      # problems with at least one passing completion
    per_problem_rates = []   # c/n for each problem
    all_results = []

    # Evaluate each problem
    for idx, json_file in enumerate(json_files):
        print(f"[{idx+1}/{total_problems}] Evaluating {json_file.name}")

        passed, detailed_results = evaluate_from_json(json_file)

        tested = detailed_results['completions_tested']
        num_correct = sum(1 for entry in tested if entry['all_tests_passed'])
        # A problem without any completion contributes 0 to the average
        per_problem_rates.append(num_correct / len(tested) if tested else 0.0)

        if passed:
            problems_passed += 1
            print(f"  ✓ PASSED ({num_correct}/{len(tested)} completions)\n")
        else:
            print(f"  ✗ FAILED ({num_correct}/{len(tested)} completions)\n")

        all_results.append(detailed_results)

    # pass@1 = average per-problem fraction of correct completions
    pass_at_1 = sum(per_problem_rates) / total_problems

    print(f"{'='*60}")
    print(f"EVALUATION COMPLETE")
    print(f"{'='*60}")
    print(f"Problems with at least one passing completion: {problems_passed}/{total_problems}")
    print(f"Pass@1 Score: {pass_at_1:.4f} ({pass_at_1*100:.2f}%)")

    return pass_at_1, all_results


def save_evaluation_results(pass_at_1, all_results, output_file="evaluation_results.json"):
    """
    Write the evaluation outcome to a JSON file.

    The file holds the score, the number of problems, the number of problems
    whose 'any_passed' flag is set, and the full per-problem results.

    Args:
        pass_at_1: The pass@1 score
        all_results: List of detailed results
        output_file: Output filename
    """
    total = len(all_results)
    solved = len([entry for entry in all_results if entry['any_passed']])

    results_summary = dict(
        pass_at_1_score=pass_at_1,
        total_problems=total,
        problems_passed=solved,
        detailed_results=all_results,
    )

    with open(output_file, 'w') as f:
        json.dump(results_summary, f, indent=2)

    print(f"\n✓ Detailed results saved to {output_file}")

# Visible confirmation that the definitions in this cell were executed
print("✓ All functions defined successfully")


✓ All functions defined successfully


In [8]:

# ============================================================================
# Cell 4: Configuration
# ============================================================================

# Model to evaluate and the directory that holds its generated completions
MODEL_NAME = "Qwen/Qwen3-1.7B-Base"
COMPLETIONS_DIR = "completions"

# Racket executable, resolved relative to the current user's home directory
RACKET_PATH = os.path.expanduser("~/racket/bin/racket")

# Echo the settings so the run records which configuration was used
print(
    "Configuration set:",
    f"  Model: {MODEL_NAME}",
    f"  Completions directory: {COMPLETIONS_DIR}",
    f"  Racket path: {RACKET_PATH}",
    sep="\n",
)


Configuration set:
  Model: Qwen/Qwen3-1.7B-Base
  Completions directory: completions
  Racket path: /u/eibarra1/racket/bin/racket


In [4]:

# ============================================================================
# Cell 5: Load Model and Tokenizer
# ============================================================================

# Instantiate the model/tokenizer pair selected by MODEL_NAME
model, tokenizer = load_model_and_tokenizer(model_name=MODEL_NAME)


Loading model: Qwen/Qwen3-1.7B-Base
Using device: cuda
✓ Model and tokenizer loaded successfully


In [11]:

# ============================================================================
# Cell 6: Load Test Dataset
# ============================================================================

# Fetch the benchmark problems
test_dataset = load_test_dataset()

# Print the first problem as a quick check of the record layout
print("\nSample problem from dataset:")
sample = test_dataset[0]
print(sample)
print("Task ID: {}".format(sample['task_id']))
print("Description: {}".format(sample['description']))
print("Number of tests: {}".format(len(sample['tests'])))
# Only the first 100 characters of the input are shown
print("First test input preview: {}...".format(sample['tests'][0]['input'][:100]))
print("First test expected output: {}".format(sample['tests'][0]['output']))


Loading test dataset...
✓ Loaded 50 test problems

Sample problem from dataset:
{'description': 'Given a list of lists, write a function to find the list with the maximum length using a lambda function. Return a tuple containing the length of the longest list and the list itself.', 'input_format': 'The first line contains an integer N, the number of lists. This is followed by N lines, each containing space-separated integers representing a list.', 'output_format': 'The output is the length of the longest list followed by the elements of the longest list, all separated by spaces.', 'tests': [{'input': '5\n0\n1 3\n5 7\n9 11\n13 15 17', 'output': '3 13 15 17'}, {'input': '5\n1 2 3 4 5\n1 2 3 4\n1 2 3\n1 2\n1', 'output': '5 1 2 3 4 5'}, {'input': '3\n3 4 5\n6 7 8 9\n10 11 12', 'output': '4 6 7 8 9'}], 'task_id': 393}
Task ID: 393
Description: Given a list of lists, write a function to find the list with the maximum length using a lambda function. Return a tuple containing the length of the

In [6]:

# ============================================================================
# Cell 7: Generate Completions and Save to JSON (This will take time!)
# ============================================================================

# Produce completions for every problem; tasks that already have a JSON file
# in COMPLETIONS_DIR are skipped by the helper
print("Starting generation... This may take a while!\n")
num_generated = generate_and_save_all_completions(
    model=model,
    tokenizer=tokenizer,
    dataset=test_dataset,
    output_dir=COMPLETIONS_DIR,
)

print("\n✓ Generated and saved completions for {} problems".format(num_generated))


Starting generation... This may take a while!


Generating completions for 50 problems

[1/50] Task 393 - ⏭️  SKIPPED (already exists)
[2/50] Task 71 - ⏭️  SKIPPED (already exists)
[3/50] Task 97 - ⏭️  SKIPPED (already exists)
[4/50] Task 353 - ⏭️  SKIPPED (already exists)
[5/50] Task 307 - ⏭️  SKIPPED (already exists)
[6/50] Task 64 - ⏭️  SKIPPED (already exists)
[7/50] Task 445 - ⏭️  SKIPPED (already exists)
[8/50] Task 205 - ⏭️  SKIPPED (already exists)
[9/50] Task 333 - ⏭️  SKIPPED (already exists)
[10/50] Task 498 - ⏭️  SKIPPED (already exists)
[11/50] Task 178 - ⏭️  SKIPPED (already exists)
[12/50] Task 342 - ⏭️  SKIPPED (already exists)
[13/50] Task 268 - ⏭️  SKIPPED (already exists)
[14/50] Task 51 - ⏭️  SKIPPED (already exists)
[15/50] Task 484 - ⏭️  SKIPPED (already exists)
[16/50] Task 322 - ⏭️  SKIPPED (already exists)
[17/50] Task 443 - ⏭️  SKIPPED (already exists)
[18/50] Task 464 - ⏭️  SKIPPED (already exists)
[19/50] Task 197 - ⏭️  SKIPPED (already exists)
[20/50] Task 

In [14]:

# ============================================================================
# Cell 8: Evaluate Completions from JSON Files
# ============================================================================

# Score the saved completion files found in COMPLETIONS_DIR
print("Starting evaluation from saved JSON files...\n")
pass_at_1_score, evaluation_results = evaluate_all_completions(completions_dir=COMPLETIONS_DIR)


Starting evaluation from saved JSON files...


Evaluating 50 problems from 'completions/'

[1/50] Evaluating task_102.json
  ✗ FAILED

[2/50] Evaluating task_130.json
  ✗ FAILED

[3/50] Evaluating task_135.json
  ✗ FAILED

[4/50] Evaluating task_137.json
  ✗ FAILED

[5/50] Evaluating task_178.json
  ✗ FAILED

[6/50] Evaluating task_180.json
  ✗ FAILED

[7/50] Evaluating task_193.json
  ✗ FAILED

[8/50] Evaluating task_197.json
  ✗ FAILED

[9/50] Evaluating task_205.json
  ✗ FAILED

[10/50] Evaluating task_256.json
  ✗ FAILED

[11/50] Evaluating task_265.json
  ✗ FAILED

[12/50] Evaluating task_268.json
  ✗ FAILED

[13/50] Evaluating task_307.json
  ✗ FAILED

[14/50] Evaluating task_315.json
  ✗ FAILED

[15/50] Evaluating task_322.json
  ✗ FAILED

[16/50] Evaluating task_329.json
  ✗ FAILED

[17/50] Evaluating task_333.json
  ✗ FAILED

[18/50] Evaluating task_342.json
  ✗ FAILED

[19/50] Evaluating task_347.json
  ✗ FAILED

[20/50] Evaluating task_351.json
  ✗ FAILED

[21/50] Evaluating

In [None]:

# ============================================================================
# Cell 9: Save and Display Results
# ============================================================================

# Persist the full per-test details as JSON
save_evaluation_results(pass_at_1_score, evaluation_results, output_file="evaluation_results.json")

# Write a compact plain-text summary: a header, then one status line per task
with open('baseline_results.txt', 'w') as f:
    f.write(
        "Model: {}\n".format(MODEL_NAME)
        + "Pass@1 Score: {:.4f}\n".format(pass_at_1_score)
        + "\nDetailed Results:\n"
        + "=" * 60 + "\n"
    )
    for result in evaluation_results:
        status = "FAILED" if not result['any_passed'] else "PASSED"
        f.write("Task {}: {}\n".format(result['task_id'], status))

print("✓ Results saved to baseline_results.txt")

# Print the same headline numbers in the notebook output
print(
    "\n" + "=" * 60,
    "FINAL SUMMARY",
    "=" * 60,
    "Model: {}".format(MODEL_NAME),
    "Total Problems: {}".format(len(evaluation_results)),
    "Problems Passed: {}".format(len([r for r in evaluation_results if r['any_passed']])),
    "Pass@1 Score: {:.4f} ({:.2f}%)".format(pass_at_1_score, pass_at_1_score * 100),
    "=" * 60,
    sep="\n",
)
