<a href="https://colab.research.google.com/github/henrykmichalewski/human-eval/blob/master/LLama_evals_gsm8k_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the HumanEval fork and install it in editable mode (-e) from the
# freshly cloned directory; -q keeps pip's output short.
!git clone https://github.com/henrykmichalewski/human-eval
!pip install -e human-eval -q

## The above cell clones the unlocked variant of the OpenAI HumanEval repo.

"Unlocked" means that the correctness-checking code actually executes the model's completions. The repo also contains helper code that simplifies gsm8k_python evaluations. Restart the runtime before moving on so that the newly installed package is picked up.

## Loading gsm8k

In [None]:
!pip install datasets -q

In [None]:
from datasets import load_dataset

In [None]:
# Load the "main" configuration of the gsm8k dataset; the returned object is
# indexed by split name ('train' / 'test') in the next cell.
gsm8k = load_dataset("gsm8k", "main")

In [None]:
# Pull out the two splits by name so later cells can refer to them directly.
gsm8k_train = gsm8k['train']
gsm8k_test = gsm8k['test']

In [None]:
from human_eval.execution import check_gsm8k_correctness

In [None]:
# Sanity-check the checker on test problem 5 with a hand-written completion
# whose body simply returns 63; the result is displayed as the cell output.
check_gsm8k_correctness(gsm8k_test[5], " return 63", task_id=5)

In [None]:
# Same sanity check on test problem 5, this time with a completion returning 64.
check_gsm8k_correctness(gsm8k_test[5], " return 64", task_id=5)

In [None]:
# Same sanity check with a value slightly off from 64, to see how the checker
# treats a near-miss float (its tolerance rules are not visible in this notebook).
check_gsm8k_correctness(gsm8k_test[5], " return 64.000001", task_id=5)

## Install standard HuggingFace dependencies

In [None]:
!pip install git+https://github.com/huggingface/transformers.git@main accelerate

In [None]:
!pip install sentencepiece -q

In [None]:
import torch

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

## Download the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, LlamaTokenizer, LlamaForCausalLM

# Checkpoint to evaluate. Alternative kept for reference:
# model_name = "codellama/CodeLlama-7b-Python-hf" - also a strong competitor
model_name = "codellama/CodeLlama-13b-Python-hf"

# Tokenizer and weights come from the same checkpoint name. The weights are
# loaded in float16, and device placement is left to device_map='auto'.
tok = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.float16,
)

# Tiny tokenized prompt, reused by the smoke-test generation cell below.
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")

In [None]:
# Put the model into evaluation (inference) mode. As the last expression in
# the cell, the call's return value is also displayed.
model.eval()

In [None]:
# Smoke test: stream up to 20 new tokens for the sample prompt as they are produced.
streamer = TextStreamer(tok)
inputs = inputs.to(model.device)  # the inputs must live on the model's device

output = model.generate(max_new_tokens=20, streamer=streamer, **inputs)

In [None]:
output

In [None]:
print(tok.batch_decode(output, skip_special_tokens=True))

In [None]:
def generate_one_completion(prompt, max_new_tokens=256):
    """Generate text for ``prompt`` with the notebook-level ``model`` and ``tok``.

    Args:
        prompt: Text that is tokenized and fed to the model as a batch of one.
        max_new_tokens: Cap on the number of newly generated tokens, forwarded
            to ``model.generate``. Defaults to 256, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The first (and only) string produced by ``tok.batch_decode`` with
        special tokens skipped.
        NOTE(review): the evaluation loop slices ``len(prompt)`` characters off
        the front, so this is presumably prompt + continuation; confirm.
    """
    # Tokenize as a single-element batch and move it to the model's device.
    batch = tok([prompt], return_tensors="pt").to(model.device)

    # Run generation with the caller-controlled token budget.
    generated = model.generate(**batch, max_new_tokens=max_new_tokens)

    # Decode the whole returned sequence, dropping special tokens.
    return tok.batch_decode(generated, skip_special_tokens=True)[0]

In [None]:
generate_one_completion(f"Hey there {model_name}!")

## Loading/Saving samples into gdrive

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/gdrive/ so samples can be stored outside the
# Colab runtime; force_remount=True is passed so the mount is redone on re-run.
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Per-model output directory on the mounted drive. model_name is interpolated
# verbatim, so any '/' inside it produces nested directories.
path = f'/content/gdrive/My Drive/llama/{model_name}'
# exist_ok=True makes this cell safe to re-run.
os.makedirs(path, exist_ok=True)
from human_eval.data import stream_jsonl, write_jsonl

### Read jsonl

In [None]:
# Resume from previously saved completions when possible; otherwise start
# from an empty list so the evaluation loop generates everything.
try:
  samples = list(stream_jsonl(os.path.join(path, "samples_gsm8k.jsonl")))
except Exception as err:
  # Still best-effort (e.g. the file does not exist on the first run), but
  # KeyboardInterrupt/SystemExit are no longer swallowed, and the reason is
  # shown so a corrupt or unreadable file does not go unnoticed.
  print(f"Could not load saved samples ({err!r}); starting with an empty list.")
  samples = []

In [None]:
len(samples)

## Full loop

In [None]:
from data import gsm8k_python_prompt

In [None]:
%%time
all_correct = 0
for task_id, problem in enumerate(gsm8k_test):

    # Check if task_id is less than samples length
    if task_id < len(samples):
        correctness = check_gsm8k_correctness(problem, samples[task_id]['completion'], task_id=task_id)
        correct = correctness['passed']
        all_correct += correct
        continue

    # Print Task ID
    print(f"task_id {task_id}")

    # Formulate and print the full prompt
    full_prompt = (gsm8k_python_prompt.PYTHON_PROMPT + '\n' +
                  gsm8k_python_prompt.PYTHON_TEMPLATE.format(question=problem['question']))
    completion = generate_one_completion(full_prompt)
    completion = completion[len(full_prompt):].split('\n\n')[0]
    print(completion)

    # New lines for visibility
    print("\n")
    print(problem['answer'])

    # Check for correctness of the problem solved
    correctness = check_gsm8k_correctness(problem, completion, task_id=task_id)
    correct = correctness['passed']
    all_correct += correct

    # Print out the correctness
    print(f"Correctnes: {correct}\nAll correct: {all_correct}")
    print("="*40)

    # Append the task results to the list and check if need to write to JSON file
    samples.append(dict(task_id=task_id, completion=completion))
    if task_id % 100 == 0 and task_id:
        write_jsonl(os.path.join(path, "samples_gsm8k.jsonl"), samples)

# Write all samples to JSONL file at the end
write_jsonl(os.path.join(path, "samples_gsm8k.jsonl"), samples)


### Write jsonl

In [None]:
# Manual save: writes the in-memory `samples` list to the same JSONL file the
# loop uses, e.g. after interrupting the loop between checkpoints.
write_jsonl(os.path.join(path, "samples_gsm8k.jsonl"), samples)