Commit: Add Example

vowelparrot committed Jun 2, 2023
1 parent 9a980e4 commit f2590b5
Showing 5 changed files with 244 additions and 107 deletions.
11 changes: 6 additions & 5 deletions langchain/client/models.py
@@ -1,7 +1,8 @@
from datetime import datetime
from enum import Enum
from typing import Any, ClassVar, Dict, List, Mapping, Optional, Sequence, Union
from typing import Any, Dict, List, Optional, Sequence, Union
from uuid import UUID, uuid4
from typing_extensions import Literal

from pydantic import BaseModel, Field, root_validator

@@ -119,7 +120,7 @@ def validate_time_range(cls, values: Dict[str, Any]) -> Dict[str, Any]:


class FeedbackSourceBase(BaseModel):
type: ClassVar[str]
type: str
metadata: Optional[Dict[str, Any]] = None

class Config:
@@ -129,13 +130,13 @@ class Config:
class APIFeedbackSource(FeedbackSourceBase):
"""API feedback source."""

type: ClassVar[str] = "api"
type: Literal["api"] = "api"


class ModelFeedbackSource(FeedbackSourceBase):
"""Model feedback source."""

type: ClassVar[str] = "model"
type: Literal["model"] = "model"


class FeedbackSourceType(Enum):
@@ -186,7 +187,7 @@ class Feedback(FeedbackBase):
"""Schema for getting feedback."""

id: UUID
feedback_source: Optional[Dict] = None
feedback_source: FeedbackSourceBase
"""The source of the feedback. In this case"""


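Note on the models.py change above: switching `type` from a `ClassVar` to a `Literal` field makes the source type a real pydantic field, so it is carried in serialized feedback payloads. A minimal sketch of the observable difference (not part of the commit; assumes pydantic v1, which this code targets):

from typing import Any, Dict, Optional

from pydantic import BaseModel
from typing_extensions import Literal


class FeedbackSourceBase(BaseModel):
    type: str
    metadata: Optional[Dict[str, Any]] = None


class APIFeedbackSource(FeedbackSourceBase):
    type: Literal["api"] = "api"


source = APIFeedbackSource(metadata={"user_id": "abc"})
# With a ClassVar, pydantic excluded `type` from the model's fields; as a
# Literal field it now appears in the serialized output.
print(source.dict())  # {'type': 'api', 'metadata': {'user_id': 'abc'}}
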
29 changes: 25 additions & 4 deletions langchain/evaluation/run_evaluators/base.py
@@ -6,9 +6,9 @@
from langchainplus_sdk.schemas import Example, Run
from pydantic import BaseModel
from pyparsing import abstractmethod

from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.base import Chain

from langchain.chains.llm import LLMChain
from langchain.schema import BaseOutputParser

@@ -60,6 +60,13 @@ def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
"""Parse the output of a run."""

eval_chain_output_key: str = "text"

def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
text = output[self.eval_chain_output_key]
return self.parse(text)


class ChoicesOutputParser(RunEvaluatorOutputParser):
"""Parse a feedback run with optional choices."""
@@ -70,7 +77,7 @@ class ChoicesOutputParser(RunEvaluatorOutputParser):
def parse(self, text: str) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
lines = text.strip().split()
value = lines[-1]
value = lines[-1].strip()
score = self.choices_map.get(value, 0) if self.choices_map else None
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
return EvaluationResult(
@@ -81,6 +88,20 @@ def parse(self, text: str) -> EvaluationResult:
)


class LabelingOutputParser(RunEvaluatorOutputParser):
"""Simple labeling parser that doesn't interpret the results."""

def parse(self, text: str) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
lines = text.strip().split()
value = lines[-1].strip()
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
return EvaluationResult(
key=value,
comment=comment,
)


T = TypeVar("T", bound="RunEvaluator")


@@ -113,8 +134,8 @@ def _call(
chain_input = self.input_mapper.map(run, example)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
chain_output = self.eval_chain(chain_input, callbacks=_run_manager.get_child())
feedback = chain_output["text"]
return {"feedback": self.output_parser.parse(feedback)}
feedback = self.output_parser.parse_chain_output(chain_output)
return {"feedback": feedback}

def evaluate_run(
self, run: Run, example: Optional[Example] = None
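Note on the base.py change above: `parse_chain_output` decouples `RunEvaluator._call` from the evaluation chain's output schema — text-producing chains are read through `eval_chain_output_key`, while parsers for structured output (like the trajectory handler in run_evaluators.py below) can override the hook directly. A self-contained sketch of the flow, not part of the commit; `EvaluationResult` is stubbed here for illustration:

from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class EvaluationResult:  # stand-in for langchainplus_sdk's EvaluationResult
    key: str
    score: Optional[int] = None
    comment: Optional[str] = None


class ChoicesParserSketch:
    """Mirrors ChoicesOutputParser: score the final token via a choices map."""

    eval_chain_output_key = "text"
    evaluation_name = "correctness"
    choices_map = {"CORRECT": 1, "INCORRECT": 0}

    def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
        # The evaluator passes the whole chain output dict; the parser picks
        # the configured key instead of hard-coding "text".
        return self.parse(output[self.eval_chain_output_key])

    def parse(self, text: str) -> EvaluationResult:
        tokens = text.strip().split()
        value = tokens[-1].strip()
        return EvaluationResult(
            key=self.evaluation_name,
            score=self.choices_map.get(value, 0),
            comment=" ".join(tokens[:-1]) if len(tokens) > 1 else None,
        )


parser = ChoicesParserSketch()
result = parser.parse_chain_output({"text": "Matches the reference answer. CORRECT"})
print(result.score)  # 1
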
19 changes: 19 additions & 0 deletions langchain/evaluation/run_evaluators/labeler_prompt.py
@@ -0,0 +1,19 @@
# flake8: noqa

from langchain.prompts import PromptTemplate

template = """You are labeling a submitted answer on a given task or input based on a set of labels. Here is the data:
[BEGIN DATA]
***
[Task]: {input}
***
[Submission]: {output}
***
[Labels]: {labels}
***
[END DATA]
Please analyze the submission carefully considering the task it was supposed to accomplish. Compare it with the provided labels. Your task is to choose the most fitting label for the submission. Avoid simply stating the correct label at the outset. Write out in a step by step manner your reasoning about the label choice to be sure that your conclusion is correct. At the end, print the label that you believe is most appropriate for the submission on its own line. Repeat the label again by itself on a new line."""

PROMPT = PromptTemplate(
input_variables=["input", "output", "labels"], template=template
)
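Usage sketch for the new labeler prompt (not part of the commit): `get_run_labeler` below fills the `labels` variable via `PromptTemplate.partial`, leaving `input` and `output` to be supplied from the run. Filling it by hand looks like this; the module path is the one added in this commit and may differ in later releases:

from langchain.evaluation.run_evaluators.labeler_prompt import PROMPT

labeled_prompt = PROMPT.partial(labels="positive, negative, neutral")
print(labeled_prompt.format(
    input="Summarize the customer's sentiment.",
    output="The customer sounded delighted with the support they received.",
))
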
96 changes: 95 additions & 1 deletion langchain/evaluation/run_evaluators/run_evaluators.py
@@ -1,19 +1,26 @@
from typing import Any, Mapping, Optional, Sequence, Union
from typing import Any, Dict, Mapping, Optional, Sequence, Union

from langchainplus_sdk.evaluation.evaluator import EvaluationResult

from langchain.base_language import BaseLanguageModel
from langchain.chains.llm import LLMChain
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.qa.eval_chain import QAEvalChain
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
from langchain.evaluation.run_evaluators.base import (
ChoicesOutputParser,
LabelingOutputParser,
RunEvaluator,
RunEvaluatorOutputParser,
StringRunEvalInputMapper,
)
from langchain.evaluation.run_evaluators.criteria_prompt import (
PROMPT as CRITERIA_PROMPT,
)
from langchain.evaluation.run_evaluators.labeler_prompt import PROMPT as LABELER_PROMPT
from langchain.prompts.prompt import PromptTemplate
from langchain.tools.base import BaseTool

_QA_PROMPTS = {
"qa": QA_DEFAULT_PROMPT,
@@ -138,3 +145,90 @@ def get_criteria_evaluator(
output_parser=parser,
**kwargs,
)


class RunTrajectoryOutputHandler(RunEvaluatorOutputParser):
"""Parse the output of a run."""

evaluation_name: str = "Trajectory"

def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
return EvaluationResult(
key=self.evaluation_name,
score=output["score"],
comment=output.get("reasoning"),
)


def get_run_trajectory_evaluator(
llm: BaseLanguageModel,
*,
agent_tools: Optional[Sequence[BaseTool]] = None,
input_key: str = "input",
trajectory_key: str = "intermediate_steps",
prediction_key: str = "output",
evaluation_name: str = "Trajectory",
**kwargs: Any,
) -> RunEvaluator:
"""Get a RunEvaluator for grading the effectiveness of tool usage of an agent."""
# TODO: Load from serialized run
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvalInputMapper(
input_map={input_key: "input"},
prediction_map={
trajectory_key: "agent_trajectory",
prediction_key: "output",
},
),
)
parser = kwargs.pop(
"output_parser", RunTrajectoryOutputHandler(evaluation_name=evaluation_name)
)
tools = agent_tools or []
eval_chain = kwargs.pop(
"eval_chain",
TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=tools, return_reasoning=True, **kwargs
),
)
return RunEvaluator(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
**kwargs,
)
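# Hypothetical usage of get_run_trajectory_evaluator (not part of the commit):
# it wires a TrajectoryEvalChain to an agent run whose outputs contain
# "intermediate_steps" and "output". The model below requires an OpenAI API
# key, and my_agent_tools / agent_run / dataset_example are placeholders for
# the agent's tools and for objects fetched via the LangChainPlus client.
#
# from langchain.chat_models import ChatOpenAI
# from langchain.evaluation.run_evaluators.run_evaluators import (
#     get_run_trajectory_evaluator,
# )
#
# llm = ChatOpenAI(temperature=0)
# evaluator = get_run_trajectory_evaluator(llm, agent_tools=my_agent_tools)
# feedback = evaluator.evaluate_run(agent_run, dataset_example)
# print(feedback.score, feedback.comment)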


def get_run_labeler(
llm: BaseLanguageModel,
labels: Union[Mapping[str, str], Sequence[str]],
*,
input_key: str = "input",
prediction_key: str = "output",
prompt: PromptTemplate = LABELER_PROMPT,
**kwargs: Any,
) -> RunEvaluator:
"""Get a RunEvaluator for grading a model's response against a map of criteria."""
labels_str = (
", ".join(labels)
if isinstance(labels, Sequence)
else "\n".join(f"{k}: {v}" for k, v in labels.items())
)
prompt_ = prompt.partial(labels=labels_str)
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvalInputMapper(
input_map={input_key: "input"},
prediction_map={prediction_key: "output"},
),
)
parser = kwargs.pop("output_parser", LabelingOutputParser())
eval_chain = LLMChain(llm=llm, prompt=prompt_, **kwargs)
return RunEvaluator(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
**kwargs,
)
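
Hypothetical usage of `get_run_labeler` (not part of the commit): `labels` can be a sequence of names or a mapping of name to description, and the chosen label becomes the feedback key via `LabelingOutputParser`. `agent_run` is a placeholder for a run fetched via the LangChainPlus client, and the model requires an OpenAI API key:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators.run_evaluators import get_run_labeler

llm = ChatOpenAI(temperature=0)
labeler = get_run_labeler(
    llm,
    labels={
        "resolved": "the assistant fully answered the user's question",
        "partial": "the assistant answered only part of the question",
        "unresolved": "the assistant did not answer the question",
    },
)

feedback = labeler.evaluate_run(agent_run)  # agent_run: placeholder Run
print(feedback.key)  # e.g. "resolved"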
