In [None]:
# This notebook is a demonstration of how to run HumanEval while taking advantage of LangSmith's visibility and tracing features.

# To use it:
# 1. Update the settings and API keys below
# 2. Run the notebook.
# 3. View results in LangSmith.

# Dependencies:
!pip install -q langchain langsmith openai human-eval

In [None]:
# Settings

# Dataset created/used in LangSmith.
# For the full benchmark use: dataset_name = "humaneval-all" and max_problems = False
dataset_name = "humaneval-small"
description = "HumanEval dataset"
max_problems = 3  # keep only the first N problems; a falsy value keeps all of them

# Chat model used to generate the code (alternative: model_name = "gpt-3.5-turbo")
model_name = "gpt-4"
temperature = 0.2

# Number of times each problem is run
repetitions_per_problem = 5

In [None]:
import os

# API keys:
# Keys already present in the environment are kept as they are. A missing (or empty)
# key is asked for interactively, so secrets never have to be written into the notebook.
from getpass import getpass

for _key_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"Enter {_key_name}: ")

# LangSmith settings:
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [None]:
from human_eval.data import read_problems

# Load every HumanEval problem (a dict keyed by task id)
problems = read_problems()

# While testing, keep only the first `max_problems` entries
if max_problems:
    problems = dict(list(problems.items())[:max_problems])

In [None]:
import langsmith

# Initialize the LangSmith client.
# NOTE(review): presumably configured from the LANGCHAIN_* environment variables — confirm.
client = langsmith.Client()
# Bare last expression so the notebook displays the client.
client

In [None]:
# Create a dataset from the HumanEval problems and solutions.
# Nothing is created when a dataset with this name already exists.

existing_dataset_names = {existing.name for existing in client.list_datasets()}

if dataset_name not in existing_dataset_names:
    dataset = client.create_dataset(dataset_name, description=description)

    # One example per problem: the prompt and task id are the inputs,
    # the reference solution is the expected output.
    for task_id, problem in problems.items():
        client.create_example(
            inputs={"prompt": problem["prompt"], "task_id": task_id},
            outputs={"canonical_solution": problem["canonical_solution"]},
            dataset_id=dataset.id,
        )

In [None]:
# I would like to abstract everything in this code block into a separate library.

from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain, TransformChain, SequentialChain
from langchain.prompts import PromptTemplate
import re

# This is a subclass of TransformChain to support async calls.
# This is needed by arun_on_dataset to run multiple chains at once.
class ATransformChain(TransformChain):
    """TransformChain variant that can also be awaited.

    The asynchronous entry point simply delegates to the synchronous `_call`,
    so the transform itself still runs synchronously inside the coroutine.
    """

    async def _acall(self, inputs, run_manager=None):
        """Run the transform on behalf of an async caller.

        Args:
            inputs: The chain inputs, forwarded unchanged to `_call`.
            run_manager: Optional run manager supplied by the caller, forwarded
                unchanged to `_call`. Defaults to None so the method can also be
                invoked without one.

        Returns:
            Whatever the synchronous `_call` returns for `inputs`.
        """
        return self._call(inputs, run_manager)

# This prompt template has been tested with GPT-3.5 and GPT-4.
# `{prompt}` is its only placeholder; it is filled in through PromptTemplate with the
# incomplete code stored in each example's "prompt" input.
template = """
```
{prompt}
```
The above is an incomplete Python code fragment. Return the complete and correct code with no additional text.
"""

# Define a chain factory producing chains that generate code from chat models

def CodeGenerator(model_name, temperature=0.2):
    """Build a zero-argument factory that creates code-generation chains.

    Args:
        model_name: Model name handed to ChatOpenAI.
        temperature: Sampling temperature handed to ChatOpenAI.

    Returns:
        A callable that constructs a new SequentialChain on every call. The chain
        takes "prompt" and "task_id" as inputs and produces "solution".
    """

    def chain_factory():
        # Step 1: prompt the chat model to complete the code fragment.
        generation_step = LLMChain(
            prompt=PromptTemplate(template=template, input_variables=["prompt"]),
            llm=ChatOpenAI(model_name=model_name, temperature=temperature),
        )

        # Step 2: strip the non-code text from the chat response.
        def extract_code(inputs: dict) -> dict:
            response = inputs["text"]
            fenced = re.search(r'```.*?\n(.*?)\n```', response, re.DOTALL)
            if fenced is None:
                # No fenced code block found: use the whole response as the solution.
                return {"solution": response}
            return {"solution": fenced.group(1)}

        extraction_step = ATransformChain(
            input_variables=["text"],
            output_variables=["solution"],
            transform=extract_code,
        )

        return SequentialChain(
            input_variables=["prompt", "task_id"],
            output_variables=["solution"],
            chains=[generation_step, extraction_step],
        )

    return chain_factory

In [None]:
# I would like to abstract everything in this code block into a separate library.
# Note that this block is not safe—it runs arbitrary Python code through check_correctness.

# Define a Python code evaluator for HumanEval

from typing import Optional
from langsmith.evaluation import RunEvaluator, EvaluationResult
from langsmith.schemas import Run, Example
from human_eval.execution import check_correctness

class PythonEvaluator(RunEvaluator):
    """Scores a generated solution with the HumanEval `check_correctness` harness.

    WARNING: the solution being checked is model-generated code; only run this
    evaluator in an environment where executing untrusted code is acceptable.
    NOTE(review): confirm that the installed human-eval package really executes the
    test program — some distributions reportedly ship with that call disabled.
    """

    # Timeout, in seconds, handed to check_correctness for a single solution.
    timeout = 5.0

    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Evaluate one run and report a "Correctness" score.

        Args:
            run: The run whose outputs hold the generated "solution".
            example: The dataset example the run was produced from. When it is
                missing, the task id is read from the run's own inputs instead
                (presumably the same "task_id" the chain received — verify).

        Returns:
            An EvaluationResult with key "Correctness". The score is True when the
            harness reports the solution as passed, and False otherwise — including
            when the run produced no solution at all.

        Raises:
            KeyError: If no "task_id" is available or it is not a known problem.
        """
        # `example` is optional in the signature, so do not dereference it blindly.
        source_inputs = example.inputs if example is not None else run.inputs
        task_id = source_inputs["task_id"]
        print("Evaluating " + task_id)

        # A run without outputs has nothing to test: score it as incorrect
        # instead of crashing on a missing key.
        solution = (run.outputs or {}).get("solution")
        if solution is None:
            return EvaluationResult(
                key="Correctness",
                score=False,
                comment="The run produced no solution to evaluate.",
            )

        # The HumanEval evaluator, which runs the Python code against unit tests
        result = check_correctness(problems[task_id], solution, self.timeout)

        return EvaluationResult(
            key="Correctness",
            score=bool(result["passed"]),
            # Keep the harness message, if any, visible next to the score.
            comment=result.get("result"),
        )

In [None]:
# Run the generation and evaluation

from langchain.smith import arun_on_dataset, RunEvalConfig
from uuid import uuid4

os.environ["LANGCHAIN_PROJECT"] = f"HumanEval Test - " + uuid4().hex[0:8]

# Run all generations and evaluations
chain_results = await arun_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=CodeGenerator(model_name, temperature),
    num_repetitions=repetitions_per_problem,
    concurrency_level=5,
    verbose=True,
    client=client,
    tags=["HumanEval"],
    evaluation=RunEvalConfig(
        custom_evaluators=[PythonEvaluator()],
        input_key="prompt"
        )
)