Skip to content

Commit

Permalink
community[patch]: Add param "task" to Databricks LLM to work around s…
Browse files Browse the repository at this point in the history
…erialization of transform_output_fn (#14933)

**What is the reproduction code?**

```python
from langchain.chains import LLMChain, load_chain
from langchain.llms import Databricks
from langchain.prompts import PromptTemplate

def transform_output(response):
    """Unwrap the generated answer from the endpoint response.

    The answer is read from the ``text`` field of the first entry in
    ``response["candidates"]`` and coerced to ``str``.
    """
    first_candidate = response["candidates"][0]
    answer = first_candidate["text"]
    return str(answer)

def transform_input(**request):
    """Append a "Be Concise." instruction to the request's prompt.

    The keyword arguments are collected into ``request``; its ``"prompt"``
    entry is replaced by the extended prompt and the same dict is returned.
    All other entries are passed through untouched.
    """
    original_prompt = request["prompt"]
    request["prompt"] = f"""{original_prompt}
    Be Concise.
    """
    return request

# Build the Databricks LLM with both transform hooks attached.
chat_model = Databricks(
    endpoint_name="llama2-13B-chat-Brambles",
    transform_input_fn=transform_input,
    transform_output_fn=transform_output,
    verbose=True,
)
# Calling the freshly constructed model directly works.
print(f"Test chat model: {chat_model('What is Apache Spark')}")

# Running the model through a chain also works before it is saved.
llm_chain = LLMChain(llm=chat_model, prompt=PromptTemplate.from_template("{chat_input}"))
llm_chain("colorful socks")
# Saving does NOT serialize transform_input_fn / transform_output_fn into
# the model yaml file.
llm_chain.save("databricks_llm_chain.yaml")
# On load, the Databricks LLM is recreated with transform_input_fn=None and
# transform_output_fn=None.
loaded_chain = load_chain("databricks_llm_chain.yaml")
# This call therefore errors: transform_output_fn is needed to produce the
# correct (str) output.
loaded_chain("colorful socks")
```


Error:
```
 File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-6c34afab-3473-421d-877f-1ef18930ef4d/lib/python3.10/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for Generation
text
  str type expected (type=type_error.str)
 request payload: {'query': 'What is a databricks notebook?'}'}
```

**What does the error mean?**

When the LLM generates an answer, it is represented by a Generation data
object. The Generation data object takes a str field called text, e.g.
Generation(text="blah"). However, the Databricks LLM tried to put a
non-str value into text, e.g. Generation(text={"candidates": [{"text": "blah"}]}).
Thus, pydantic raises a validation error.

**Why does the output format become incorrect after saving and loading the
Databricks LLM?**

The Databricks LLM does not support serializing transform_input_fn and
transform_output_fn, so they are not serialized into the model yaml
file. When the Databricks LLM is loaded, it is recreated with
transform_input_fn=None, transform_output_fn=None. Without
transform_output_fn, the output text is not unwrapped, which causes the error.

Missing transform_output_fn causes this error.
Missing transform_input_fn causes the additional prompt “Be Concise.” to
be lost after saving and loading.
<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:
https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
  • Loading branch information
liangz1 and baskaryan committed Dec 20, 2023
1 parent 1ea6d83 commit 6479aab
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions libs/community/langchain_community/llms/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ def _transform_completions(response: Dict[str, Any]) -> str:
return response["choices"][0]["text"]


def _transform_llama2_chat(response: Dict[str, Any]) -> str:
return response["candidates"][0]["text"]


def _transform_chat(response: Dict[str, Any]) -> str:
return response["choices"][0]["message"]["content"]

Expand Down Expand Up @@ -87,11 +91,12 @@ def __init__(self, **data: Any):
"external_model",
"foundation_model_api",
)
self.task = endpoint.get("task")
if self.task is None:
self.task = endpoint.get("task")

@property
def llm(self) -> bool:
return self.task in ("llm/v1/chat", "llm/v1/completions")
return self.task in ("llm/v1/chat", "llm/v1/completions", "llama2/chat")

@root_validator(pre=True)
def set_api_url(cls, values: Dict[str, Any]) -> Dict[str, Any]:
Expand Down Expand Up @@ -125,6 +130,8 @@ def post(
preds = response["predictions"]
# For a single-record query, the result is not a list.
pred = preds[0] if isinstance(preds, list) else preds
if self.task == "llama2/chat":
return _transform_llama2_chat(pred)
return transform_output_fn(pred) if transform_output_fn else pred


Expand Down Expand Up @@ -325,6 +332,10 @@ class Databricks(LLM):
"""The maximum number of tokens to generate."""
extra_params: Dict[str, Any] = Field(default_factory=dict)
"""Any extra parameters to pass to the endpoint."""
task: Optional[str] = None
"""The task of the endpoint. Only used when using a serving endpoint.
If not provided, the task is automatically inferred from the endpoint.
"""

_client: _DatabricksClientBase = PrivateAttr()

Expand Down Expand Up @@ -401,6 +412,7 @@ def __init__(self, **data: Any):
api_token=self.api_token,
endpoint_name=self.endpoint_name,
databricks_uri=self.databricks_uri,
task=self.task,
)
elif self.cluster_id and self.cluster_driver_port:
self._client = _DatabricksClusterDriverProxyClient(
Expand Down Expand Up @@ -430,6 +442,7 @@ def _default_params(self) -> Dict[str, Any]:
"stop": self.stop,
"max_tokens": self.max_tokens,
"extra_params": self.extra_params,
"task": self.task,
# TODO: Support saving transform_input_fn and transform_output_fn
# "transform_input_fn": self.transform_input_fn,
# "transform_output_fn": self.transform_output_fn,
Expand Down

0 comments on commit 6479aab

Please sign in to comment.