In [1]:
import os
import sys
import tempfile
sys.path.append('../')

import torch
from human_eval.data import stream_jsonl, write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer, AutoModelForCausalLM

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
output_dir = tempfile.gettempdir()

n_samples_per_task = 1
batch_size = 32
n_workers = 8

max_gen_length = 512

use_instruct_model = True
model_size = '1.3b'

In [6]:
def cleanup_code(code: str, instruct_format: bool = False) -> str:
    """
    Cleans up the generated code.
    """
    if instruct_format:
        code = code.replace("\r", "")
        if "```python" in code:
            code_start_idx = code.index("```python")
            code = code[code_start_idx:].replace("```python", "").strip()
            end_idx = code.find("```") if "```" in code else len(code)
            code = code[:end_idx].strip()

    else:
        stop_words = set(["\ndef", "\nclass", "\nif", "\n#", "\nprint"])
        min_stop_idx = len(code)
        for stop_word in stop_words:
            stop_index = code.find(stop_word)
            if 0 <= stop_index < min_stop_idx:
                min_stop_idx = stop_index
        code = code[:min_stop_idx]

    return code

In [8]:
device = 'cuda'

model_type = 'instruct' if use_instruct_model else 'base'
model_name = f'deepseek-ai/deepseek-coder-{model_size}-{model_type}'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
# tokenizer.pad_token = tokenizer.eos_token # to avoid an error
model = AutoModelForCausalLM.from_pretrained(
    model_name, attn_implementation='flash_attention_2',
    torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True,
)
model = torch.compile(model)
model = model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
problems = read_problems()
print(f'# Problems: {len(problems)}')

problem_tuples = [(k, v['prompt']) for k, v in problems.items()]
task_ids, prompts = zip(*problem_tuples)

# Create lists of the input task ids and corresponding GenerateData objects as inputs
input_tasks = [
    task_id
    for task_id in task_ids
    for _ in range(n_samples_per_task)
]
inputs = [
    prompt
    for prompt in prompts
    for _ in range(n_samples_per_task)
]

if use_instruct_model:
    instruct_template = \
        "Below is an instruction that describes a task, paired with an input that provides further context.\n" + \
        "Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to " + \
        "perform the given task.\n\nInput:\n{}\n\n### Response:\n"
    inputs = [instruct_template.format(prompt) for prompt in prompts]

inputs = tokenizer(inputs, padding=True, return_tensors='pt').to(device)

# Problems: 164


In [20]:
completions = []

for i in range(0, len(inputs['input_ids']), batch_size):
    batch_inputs = {k: v[i:i+batch_size] for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(**batch_inputs, max_new_tokens=max_gen_length)
        # generated_ids = model.generate(
        #     **batch_inputs,
        #     max_new_tokens = max_gen_length,
        #     do_sample = False,
        #     eos_token_id = tokenizer.eos_token_id,
        #     pad_token_id = tokenizer.eos_token_id,
        # )
    
    completion_ids = generated_ids[:, batch_inputs['input_ids'].shape[1]:]
    batch_completions = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
    completions.extend(batch_completions)

cleaned_completions = [cleanup_code(c, use_instruct_model) for c in completions]

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


In [21]:
samples = [
    dict(task_id=task_id, completion=completion)
    for task_id, completion in zip(input_tasks, cleaned_completions)
]

# Write the results to a file
filepath = os.path.join(output_dir, 'human_eval_samples.jsonl')
os.makedirs(output_dir, exist_ok=True)
write_jsonl(filepath, samples)

print(evaluate_functional_correctness(filepath, k=[1], n_workers=n_workers, timeout=20))

Reading samples...


164it [00:00, 21024.72it/s]

Running test suites...



100%|██████████| 164/164 [00:25<00:00,  6.37it/s]


Writing results to /tmp/human_eval_samples.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 59762.45it/s]

{'pass@1': 0.6524390243902439}





In [22]:
# Read the results
results = list(stream_jsonl(filepath + '_results.jsonl'))
passed = [r['passed'] for r in results]
passed_frac = sum(passed) / len(passed)
print(f'Passed: {passed_frac:.2f}')

Passed: 0.65
