In [1]:
!pip install datasets transformers torch accelerate wandb

import json
import os
import subprocess
import tempfile
from pathlib import Path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import numpy as np

Defaulting to user installation because normal site-packages is not writeable


In [5]:
# Cell 2 (Fixed): Load Datasets and Inspect Structure
import datasets

print("Loading datasets...")
# Both configurations are fetched from the same Hugging Face repository.
train_ds = datasets.load_dataset("nuprl/engineering-llm-systems", "mbpp-rkt-correct-executions")
test_ds = datasets.load_dataset("nuprl/engineering-llm-systems", "mbpp-rkt-test-problems")

print(f"Training set size: {len(train_ds['train'])}")

# Report which split names each loaded mapping exposes.
print(f"\nAvailable splits in train_ds: {list(train_ds)}")
print(f"Available splits in test_ds: {list(test_ds)}")

# The test problems are read through whichever split is listed first.
test_split_name = list(test_ds)[0]
print(f"\nUsing test split: '{test_split_name}'")
print(f"Test set size: {len(test_ds[test_split_name])}")

# Show one record from each dataset, separated by a horizontal rule.
print("\nSample training example:", train_ds['train'][0], sep="\n")
print(f"\n{'=' * 50}", "Sample test example:", test_ds[test_split_name][0], sep="\n")

Loading datasets...
Training set size: 2646

Available splits in train_ds: ['train']
Available splits in test_ds: ['train']

Using test split: 'train'
Test set size: 50

Sample training example:
{'task_id': 667, 'code': '#lang racket\n\n;; Function to count vowels in a string based on a given set of vowels\n(define (count-vowels input-string vowels-string)\n  (define vowels (string->list vowels-string))\n  (define (is-vowel? char)\n    (member char vowels))\n  \n  (define (count-char char)\n    (if (is-vowel? char) 1 0))\n  \n  (define (count-vowels-in-string str)\n    (foldl (lambda (char count) (+ count (count-char char))) 0 (string->list str)))\n  \n  (count-vowels-in-string input-string))\n\n;; Read input from standard input\n(define input-string (string-downcase (read-line)))\n(define vowels-string (string-downcase (read-line)))\n\n;; Call the function and print the result\n(display (count-vowels input-string vowels-string))\n', 'test_cases': {'input': ['corner\nAaEeIiOoUu', 'vali

In [9]:
# Cell 3 (Fixed): Racket Execution Helper Functions
def execute_racket_code(code, test_input=None, timeout=5):
    """Run a Racket program in a subprocess and report how it went.

    Args:
        code: Racket source text. It is written to a temporary ``.rkt`` file
            that is removed again before this function returns.
        test_input: Text fed to the program's standard input. ``None`` or an
            empty string gives the program an empty stdin (immediate EOF).
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        A ``(ok, stdout, stderr)`` tuple. ``ok`` is True when the process
        exits with code 0, ``stdout`` is stripped of surrounding whitespace
        and ``stderr`` is returned unmodified. A timeout yields
        ``(False, "", "Timeout")``; any other failure (including a missing
        ``racket`` executable) yields ``(False, "", str(exception))``.
    """
    temp_file = None
    try:
        # Create a temporary file for the Racket code. The path is recorded
        # before writing so the `finally` clause can remove the file even
        # when the write itself fails.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.rkt', delete=False,
                                         encoding='utf-8') as f:
            temp_file = f.name
            f.write(code)

        # Run the Racket code. Always supply stdin explicitly: passing
        # input=None would let the child inherit the parent's stdin, so a
        # program that reads input could block until the timeout instead of
        # seeing EOF.
        result = subprocess.run(
            ['racket', temp_file],
            capture_output=True,
            text=True,
            timeout=timeout,
            input=test_input or "",
            check=False
        )

        return result.returncode == 0, result.stdout.strip(), result.stderr

    except subprocess.TimeoutExpired:
        return False, "", "Timeout"
    except Exception as e:
        return False, "", str(e)
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the execution result.
                pass

def clip_racket_completion(text, prompt):
    """Trim generated Racket text down to the first completion.

    A leading copy of ``prompt`` is dropped when present, then the text is
    cut at the earliest stop marker: a blank line followed by a ``;; ``
    comment, or a new ``#lang`` line. Text without any marker is returned
    whole.
    """
    completion = text[len(prompt):] if text.startswith(prompt) else text

    # Positions of every stop marker that actually occurs in the completion.
    hits = [
        position
        for position in (completion.find(marker) for marker in ("\n\n;; ", "\n#lang"))
        if position != -1
    ]

    cut_at = min(hits, default=len(completion))
    return completion[:cut_at]

def construct_prompt_from_problem(problem):
    """Build the Racket source header used as the generation prompt.

    The prompt is a ``#lang racket`` line, a blank line, then three ``;;``
    comment lines carrying the problem's ``description``, ``input_format``
    and ``output_format`` fields, followed by a blank line.
    """
    header_lines = [
        "#lang racket",
        "",
        f";; {problem['description']}",
        f";; Input format: {problem['input_format']}",
        f";; Output format: {problem['output_format']}",
    ]
    return "\n".join(header_lines) + "\n\n"

# Smoke-test the prompt builder on the first test problem
sample_problem = test_ds['train'][0]
test_prompt = construct_prompt_from_problem(sample_problem)
print(
    "Test prompt generated:",
    test_prompt,
    f"Prompt length: {len(test_prompt)}",
    sep="\n",
)

Test prompt generated:
#lang racket

;; Given a list of lists, write a function to find the list with the maximum length using a lambda function. Return a tuple containing the length of the longest list and the list itself.
;; Input format: The first line contains an integer N, the number of lists. This is followed by N lines, each containing space-separated integers representing a list.
;; Output format: The output is the length of the longest list followed by the elements of the longest list, all separated by spaces.


Prompt length: 503


In [10]:
# Cell 4: Load Base Model for Evaluation
print("Loading base model...")
# Prefer the GPU whenever torch can see one.
device = "cpu" if not torch.cuda.is_available() else "cuda"
print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B-Base")
# The weights are only moved when an accelerator was selected above.
if device != "cpu":
    model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B-Base")
# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully!")

Loading base model...
Using device: cuda
Model loaded successfully!


In [None]:
# Cell 5 (Fixed with debugging): Generate Completions for Base Model
def generate_completions_for_dataset(model, tokenizer, dataset, output_dir, 
                                     num_samples=5, temperature=0.2, top_p=0.95):
    """Generate completions for all problems in dataset.

    For every problem a prompt is built with ``construct_prompt_from_problem``,
    ``num_samples`` completions are sampled from ``model`` and the result is
    written to ``<output_dir>/task_<task_id>.json``. A task whose file already
    exists is skipped, so an interrupted run can be resumed.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of problems, each providing ``task_id`` and
            ``tests`` plus the fields the prompt builder reads.
        output_dir: Directory for the per-task JSON files (created if needed).
        num_samples: Number of completions sampled per problem.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        None. Each file holds ``task_id``, ``prompt``, ``completions`` and
        ``tests``. A sample whose generation raised is stored as ``""`` so
        that the list always has ``num_samples`` entries.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Place inputs wherever the supplied model lives rather than relying on
    # a notebook-level `device` global that may not match this model.
    target_device = model.device

    for idx, problem in enumerate(dataset):
        task_id = problem["task_id"]
        output_file = output_dir / f"task_{task_id}.json"

        if output_file.exists():
            print(f"Skipping task {task_id} (already exists)")
            continue

        print(f"Processing {idx+1}/{len(dataset)}: Task {task_id}")

        # Construct prompt from problem description
        prompt = construct_prompt_from_problem(problem)

        # Debug: Check if prompt is valid
        if not prompt:
            print(f"  ERROR: Empty prompt for task {task_id}")
            print(f"  Problem keys: {problem.keys()}")
            continue

        print(f"  Prompt length: {len(prompt)} characters")

        # Tokenize
        try:
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(target_device) for k, v in inputs.items()}
            # Number of prompt tokens actually fed to the model (after any
            # truncation); everything past this index is newly generated.
            prompt_token_count = inputs["input_ids"].shape[1]
        except Exception as e:
            print(f"  ERROR during tokenization: {e}")
            print(f"  Prompt: {prompt[:200]}")
            continue

        # Generate multiple samples
        completions = []
        for sample_idx in range(num_samples):
            print(f"  Generating sample {sample_idx+1}/{num_samples}")

            try:
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=400,
                        temperature=temperature,
                        top_p=top_p,
                        do_sample=True,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens. Decoding the whole
                # sequence and stripping the prompt by string prefix fails
                # silently whenever the decoded text does not reproduce the
                # prompt exactly (e.g. after truncation).
                new_tokens = outputs[0][prompt_token_count:]
                generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
                completion = clip_racket_completion(generated_text, prompt)
                completions.append(completion)
            except Exception as e:
                print(f"  ERROR during generation: {e}")
                completions.append("")  # Add empty completion on error

        # Save results
        result = {
            "task_id": task_id,
            "prompt": prompt,
            "completions": completions,
            "tests": problem["tests"]
        }

        # Write to a sibling file and rename it into place: the existence of
        # `output_file` is the resume marker, so it must never be observable
        # in a half-written state.
        partial_file = output_dir / f"task_{task_id}.json.tmp"
        with open(partial_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        partial_file.replace(output_file)

        print(f"  Saved to {output_file}")

    print("Completion generation done!")

# Sample completions from the base model for every test problem
generate_completions_for_dataset(
    model=model,
    tokenizer=tokenizer,
    dataset=test_ds['train'],
    output_dir="completions_base_model",
    num_samples=5,
    temperature=0.2,
    top_p=0.95,
)

Processing 1/50: Task 393
  Prompt length: 503 characters
  Generating sample 1/5
  Generating sample 2/5
  Generating sample 3/5
  Generating sample 4/5
  Generating sample 5/5
  Saved to completions_base_model/task_393.json
Processing 2/50: Task 71
  Prompt length: 337 characters
  Generating sample 1/5
  Generating sample 2/5
  Generating sample 3/5
  Generating sample 4/5
  Generating sample 5/5
  Saved to completions_base_model/task_71.json
Processing 3/50: Task 97
  Prompt length: 585 characters
  Generating sample 1/5
  Generating sample 2/5
  Generating sample 3/5
  Generating sample 4/5
  Generating sample 5/5
  Saved to completions_base_model/task_97.json
Processing 4/50: Task 353
  Prompt length: 433 characters
  Generating sample 1/5
  Generating sample 2/5
  Generating sample 3/5
  Generating sample 4/5
  Generating sample 5/5
  Saved to completions_base_model/task_353.json
Processing 5/50: Task 307
  Prompt length: 571 characters
  Generating sample 1/5
  Generating sampl