langchain-ai · mpskex · Apr 17, 2023 · Apr 17, 2023 · Apr 18, 2023 · Apr 19, 2023
diff --git a/libs/experimental/langchain_experimental/retrievers/sql_database.py b/libs/experimental/langchain_experimental/retrievers/sql_database.py
@@ -0,0 +1,55 @@
+"""SQL Database Chain Retriever"""
+from typing import Any, Dict, List
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForRetrieverRun,
+    CallbackManagerForRetrieverRun,
+)
+from langchain.schema import BaseRetriever, Document
+from pydantic import validator
+
+from langchain_experimental.sql.base import SQLDatabaseChain
+
+
+class SQLDatabaseChainRetriever(BaseRetriever):
+    """Retriever that turns the rows returned by a SQLDatabaseChain into Documents.
+
+    The wrapped chain is called with the query, and the value stored under the
+    ``"result"`` key of its output is read as a list of row dictionaries. Each
+    row becomes one ``Document`` whose text is the row's ``page_content_key``
+    entry and whose metadata is the row itself.
+    """
+
+    sql_db_chain: SQLDatabaseChain
+    """SQL Database Chain"""
+    page_content_key: str = "content"
+    """column name for page content of documents"""
+
+    @validator("sql_db_chain")
+    def sql_db_chain_must_in_native_format(
+        cls, sql_db_chain: SQLDatabaseChain
+    ) -> SQLDatabaseChain:
+        """Reject chains whose ``native_format`` flag is not set.
+
+        Args:
+            sql_db_chain: the chain being assigned to the field.
+
+        Returns:
+            ``sql_db_chain`` unchanged.
+
+        Raises:
+            TypeError: if ``sql_db_chain.native_format`` is falsy.
+        """
+        if not sql_db_chain.native_format:
+            # Adjacent literals rather than a backslash continuation inside the
+            # string: the latter embeds the next line's indentation in the text.
+            raise TypeError(
+                "SQL Database Chain must return in native format. "
+                "Try to turn `native_format` in this chain to `True`."
+            )
+        return sql_db_chain
+
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Run the chain on ``query`` and wrap each returned row in a Document.
+
+        Args:
+            query: passed to the chain as its input.
+            run_manager: its child callback manager is handed to the chain call.
+            **kwargs: forwarded unchanged to the chain call.
+
+        Returns:
+            One ``Document`` per row found under the chain's ``"result"`` key.
+
+        Raises:
+            TypeError: if ``native_format`` is off when the method is called.
+            KeyError: if a row has no ``page_content_key`` entry.
+        """
+        # Same condition as the field validator, re-checked at call time.
+        if not self.sql_db_chain.native_format:
+            raise TypeError(
+                "SQL Database Chain must return in native format. "
+                "Try to turn `native_format` in this chain to `True`."
+            )
+        rows: List[Dict[str, Any]] = self.sql_db_chain(
+            query, callbacks=run_manager.get_child(), **kwargs
+        )["result"]
+        # The whole row (including the content column) is kept as metadata.
+        return [
+            Document(page_content=row[self.page_content_key], metadata=row)
+            for row in rows
+        ]
+
+    async def _aget_relevant_documents(
+        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Asynchronous retrieval is not implemented.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        raise NotImplementedError
diff --git a/libs/experimental/langchain_experimental/sql/base.py b/libs/experimental/langchain_experimental/sql/base.py
@@ -2,7 +2,8 @@
 from __future__ import annotations
 
 import warnings
-from typing import Any, Dict, List, Optional
+from inspect import signature
+from typing import Any, Dict, List, Optional, cast
 
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.base import Chain
@@ -12,12 +13,21 @@
 from langchain.schema import BasePromptTemplate
 from langchain.schema.language_model import BaseLanguageModel
 from langchain.tools.sql_database.prompt import QUERY_CHECKER
-from langchain.utilities.sql_database import SQLDatabase
-from pydantic import Extra, Field, root_validator
+from pydantic import BaseModel, Extra, Field, root_validator, validator
+
+from langchain_experimental.sql.parser import SQLCommandOutputParser
+from langchain_experimental.utilities.sql_database import SQLDatabase
 
 INTERMEDIATE_STEPS_KEY = "intermediate_steps"
 
 
+class SQLCommand(BaseModel):
+    """Model holding the raw LLM output next to the ready-to-run SQL command."""
+
+    # NOTE(review): the field names match the `llm_out` / `sql_cmd` keys read
+    # from the parsed output in `_call`, but this class is not referenced in
+    # the visible hunks - confirm whether it is used elsewhere.
+    llm_out: str
+    """Raw output from LLM"""
+    sql_cmd: str
+    """SQL command parsed and ready to run"""
+
+
 class SQLDatabaseChain(Chain):
     """Chain for interacting with SQL Database.
 
@@ -52,13 +62,33 @@ class SQLDatabaseChain(Chain):
     to fix the initial SQL from the LLM."""
     query_checker_prompt: Optional[BasePromptTemplate] = None
     """The prompt template that should be used by the query checker"""
+    native_format: bool = False
+    """If return_direct, controls whether to return in python native format"""
 
     class Config:
         """Configuration for this pydantic object."""
 
         extra = Extra.forbid
         arbitrary_types_allowed = True
 
+    @validator("llm_chain")
+    def check_outputparser_type(cls, llm_chain: LLMChain) -> LLMChain:
+        """Validate that ``llm_chain``'s output parser yields SQL command dicts.
+
+        ``SQLCommandOutputParser`` instances (subclasses included) are accepted
+        as they are. Any other parser is accepted, with a warning, only when
+        its ``parse`` method is annotated as returning ``Dict[str, Any]``.
+
+        Args:
+            llm_chain: the chain being assigned to the ``llm_chain`` field.
+
+        Returns:
+            ``llm_chain`` unchanged.
+
+        Raises:
+            TypeError: if the parser satisfies neither condition.
+        """
+        parser = llm_chain.output_parser
+        # Checked before the annotation so that a subclass overriding ``parse``
+        # without repeating the return annotation is still accepted.
+        if isinstance(parser, SQLCommandOutputParser):
+            return llm_chain
+        sig = signature(parser.parse)  # type: ignore
+        if sig.return_annotation == Dict[str, Any]:
+            # The annotation only promises a dict; its keys cannot be verified
+            # here, hence a warning rather than silent acceptance.
+            warnings.warn(
+                "Accepting output parser that returns Dict[str, Any]. "
+                "Make sure the output must contain `sql_cmd`, `llm_out`."
+            )
+            return llm_chain
+        raise TypeError(
+            "SQLDatabaseChain only works with LLMChains with "
+            "parsers that returns `{'sql_cmd': '<SQL>', 'llm_out': '<SQL>'}` "
+            "or `langchain_experimental.sql.parser.SQLCommandOutputParser`!"
+        )
+
     @root_validator(pre=True)
     def raise_deprecation(cls, values: Dict) -> Dict:
         if "llm" in values:
@@ -115,57 +145,73 @@ def _call(
         intermediate_steps: List = []
         try:
             intermediate_steps.append(llm_inputs)  # input: sql generation
-            sql_cmd = self.llm_chain.predict(
-                callbacks=_run_manager.get_child(),
-                **llm_inputs,
-            ).strip()
-            if self.return_sql:
-                return {self.output_key: sql_cmd}
+            sql_cmd = cast(
+                Dict[str, Any],
+                self.llm_chain.predict_and_parse(
+                    callbacks=_run_manager.get_child(),
+                    **llm_inputs,
+                ),
+            )
             if not self.use_query_checker:
-                _run_manager.on_text(sql_cmd, color="green", verbose=self.verbose)
+                _run_manager.on_text(
+                    sql_cmd["llm_out"], color="green", verbose=self.verbose
+                )
                 intermediate_steps.append(
-                    sql_cmd
+                    sql_cmd["llm_out"]
                 )  # output: sql generation (no checker)
-                intermediate_steps.append({"sql_cmd": sql_cmd})  # input: sql exec
-                result = self.database.run(sql_cmd)
+                intermediate_steps.append(
+                    {"sql_cmd": sql_cmd["llm_out"]}
+                )  # input: sql exec
+                result = self.database.run(
+                    sql_cmd["sql_cmd"],
+                    native_format=self.native_format if self.return_direct else False,
+                )
                 intermediate_steps.append(str(result))  # output: sql exec
             else:
                 query_checker_prompt = self.query_checker_prompt or PromptTemplate(
                     template=QUERY_CHECKER, input_variables=["query", "dialect"]
                 )
                 query_checker_chain = LLMChain(
-                    llm=self.llm_chain.llm, prompt=query_checker_prompt
+                    llm=self.llm_chain.llm,
+                    prompt=query_checker_prompt,
+                    output_parser=self.llm_chain.output_parser,
                 )
                 query_checker_inputs = {
-                    "query": sql_cmd,
+                    "query": sql_cmd["llm_out"],
                     "dialect": self.database.dialect,
                 }
-                checked_sql_command: str = query_checker_chain.predict(
-                    callbacks=_run_manager.get_child(), **query_checker_inputs
-                ).strip()
+                checked_sql_command = cast(
+                    Dict[str, Any],
+                    query_checker_chain.predict_and_parse(
+                        callbacks=_run_manager.get_child(), **query_checker_inputs
+                    ),
+                )
                 intermediate_steps.append(
-                    checked_sql_command
+                    checked_sql_command["llm_out"]
                 )  # output: sql generation (checker)
                 _run_manager.on_text(
-                    checked_sql_command, color="green", verbose=self.verbose
+                    checked_sql_command["llm_out"], color="green", verbose=self.verbose
                 )
                 intermediate_steps.append(
-                    {"sql_cmd": checked_sql_command}
+                    {"sql_cmd": checked_sql_command["llm_out"]}
                 )  # input: sql exec
-                result = self.database.run(checked_sql_command)
+                result = self.database.run(
+                    checked_sql_command["sql_cmd"],
+                    native_format=self.native_format if self.return_direct else False,
+                )
                 intermediate_steps.append(str(result))  # output: sql exec
                 sql_cmd = checked_sql_command
 
             _run_manager.on_text("\nSQLResult: ", verbose=self.verbose)
-            _run_manager.on_text(result, color="yellow", verbose=self.verbose)
+            _run_manager.on_text(str(result), color="yellow", verbose=self.verbose)
             # If return direct, we just set the final result equal to
             # the result of the sql query result, otherwise try to get a human readable
             # final answer
             if self.return_direct:
                 final_result = result
             else:
                 _run_manager.on_text("\nAnswer:", verbose=self.verbose)
-                input_text += f"{sql_cmd}\nSQLResult: {result}\nAnswer:"
+                input_text += f"{sql_cmd['llm_out']}\nSQLResult: {result}\nAnswer:"
                 llm_inputs["input"] = input_text
                 intermediate_steps.append(llm_inputs)  # input: final answer
                 final_result = self.llm_chain.predict(
@@ -194,10 +240,13 @@ def from_llm(
         llm: BaseLanguageModel,
         db: SQLDatabase,
         prompt: Optional[BasePromptTemplate] = None,
+        sql_cmd_parser: Optional[SQLCommandOutputParser] = None,
         **kwargs: Any,
     ) -> SQLDatabaseChain:
+        if not sql_cmd_parser:
+            sql_cmd_parser = SQLCommandOutputParser()
         prompt = prompt or SQL_PROMPTS.get(db.dialect, PROMPT)
-        llm_chain = LLMChain(llm=llm, prompt=prompt)
+        llm_chain = LLMChain(llm=llm, prompt=prompt, output_parser=sql_cmd_parser)
         return cls(llm_chain=llm_chain, database=db, **kwargs)
 
 

diff --git a/libs/experimental/langchain_experimental/sql/parser.py b/libs/experimental/langchain_experimental/sql/parser.py
@@ -0,0 +1,74 @@
+import re
+from typing import Any, Dict
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import BaseOutputParser
+
+
+class SQLCommandOutputParser(BaseOutputParser[Dict[str, Any]]):
+    """Pass-through parser: the stripped LLM text is used as the SQL command."""
+
+    @property
+    def _type(self) -> str:
+        """Identifier string of this parser."""
+        return "sql_bypass"
+
+    def parse(self, text: str) -> Dict[str, Any]:
+        """Return the whitespace-stripped ``text`` under both output keys.
+
+        Args:
+            text: raw text to parse.
+
+        Returns:
+            A dict whose ``"llm_out"`` and ``"sql_cmd"`` entries are the same
+            stripped string.
+        """
+        stripped = text.strip()
+        return dict(llm_out=stripped, sql_cmd=stripped)
+
+
+class VectorSQLOutputParser(SQLCommandOutputParser):
+    """Output Parser for Vector SQL
+    1. finds every `NeuralArray(entity)` call and replaces it with the
+       embedding of `entity`
+    2. finds `DISTANCE` and replaces it with the distance name in backend SQL
+    """
+
+    model: Embeddings
+    """Embedding model to extract embedding for entity"""
+    distance_func_name: str = "distance"
+    """Distance name for vector SQL"""
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @property
+    def _type(self) -> str:
+        """Identifier string of this parser."""
+        return "vector_sql"
+
+    @classmethod
+    def from_embeddings(
+        cls, model: Embeddings, distance_func_name: str = "distance", **kwargs: Any
+    ) -> BaseOutputParser:
+        """Build a parser from an embedding model.
+
+        Args:
+            model: embedding model used to embed `NeuralArray` entities.
+            distance_func_name: name substituted for `DISTANCE` in the SQL.
+            **kwargs: extra keyword arguments passed to the constructor.
+
+        Returns:
+            A new instance of ``cls``.
+        """
+        return cls(model=model, distance_func_name=distance_func_name, **kwargs)
+
+    def parse(self, text: str) -> Dict[str, Any]:
+        """Expand the vector placeholders in ``text`` into executable SQL.
+
+        Each `NeuralArray(entity)` call is replaced by a bracketed,
+        comma-separated literal of the values returned by
+        ``self.model.embed_query(entity)``; the entity is everything up to the
+        first `)` after the opening parenthesis. When at least one call was
+        replaced, `DISTANCE` is renamed to ``distance_func_name`` and a single
+        trailing `;` is removed. Text without a complete `NeuralArray(...)`
+        call is passed through untouched.
+
+        Args:
+            text: raw text to parse.
+
+        Returns:
+            A dict with the stripped input under ``"llm_out"`` and the rewritten
+            command under ``"sql_cmd"``.
+        """
+        text = text.strip()
+        marker = "NeuralArray("
+        sql = text
+        # entity text -> vector literal, so a repeated entity is embedded once.
+        literals: Dict[str, str] = {}
+        # ``find`` reports "absent" as -1; index 0 is a valid match.
+        cursor = sql.find(marker)
+        while cursor >= 0:
+            close = sql.find(")", cursor + len(marker))
+            if close < 0:
+                # Unterminated call: leave the remainder as it is.
+                break
+            entity = sql[cursor + len(marker) : close]
+            if entity not in literals:
+                vector = self.model.embed_query(entity)
+                literals[entity] = "[" + ",".join(map(str, vector)) + "]"
+            literal = literals[entity]
+            sql = sql[:cursor] + literal + sql[close + 1 :]
+            # Resume after the inserted literal so it is never re-scanned.
+            cursor = sql.find(marker, cursor + len(literal))
+        if literals:
+            # Renamed after the substitution so that an entity containing
+            # "DISTANCE" is embedded from its original text.
+            sql = sql.replace("DISTANCE", self.distance_func_name)
+            if sql.endswith(";"):
+                sql = sql[:-1]
+        return {"llm_out": text, "sql_cmd": sql}
+
+
+class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser):
+    """Based on VectorSQLOutputParser
+    It also modifies the SQL to get all columns
+    """
+
+    @property
+    def _type(self) -> str:
+        """Identifier string of this parser."""
+        return "vector_sql_retrieve_all"
+
+    def parse(self, text: str) -> Dict[str, Any]:
+        """Replace the first select list with `*`, then parse as the parent does.
+
+        The span between the first `SELECT` keyword and the next `FROM`
+        keyword (matched case-insensitively, as whole words) is replaced by
+        `*`. Text without such a pair is handed to the parent parser
+        unchanged.
+
+        Args:
+            text: raw text to parse.
+
+        Returns:
+            The parent parser's result for the rewritten text.
+        """
+        text = text.strip()
+        select_list = re.search(
+            r"\bSELECT\b(.*?)\bFROM\b", text, re.IGNORECASE | re.DOTALL
+        )
+        if select_list:
+            # Spliced by position: a substring ``replace`` would also rewrite
+            # every later occurrence of the column-list text in the query.
+            text = text[: select_list.start(1)] + " * " + text[select_list.end(1) :]
+        return super().parse(text)
diff --git a/libs/experimental/langchain_experimental/sql/prompt.py b/libs/experimental/langchain_experimental/sql/prompt.py
@@ -0,0 +1,39 @@
+# flake8: noqa
+from langchain.output_parsers.list import CommaSeparatedListOutputParser
+from langchain.prompts.prompt import PromptTemplate
+
+
+# Tail shared by the prompt template below; it carries the `table_info` and
+# `input` template variables.
+PROMPT_SUFFIX = """Only use the following tables:
+{table_info}
+
+Question: {input}"""
+
+
+# Instruction text for the MyScale prompt. It uses the `top_k` template
+# variable and describes the `DISTANCE(column, array)` / `NeuralArray(entity)`
+# conventions the model is asked to follow.
+_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
+MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. 
+When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
+
+*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. 
+
+Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function.
+Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
+Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
+
+Use the following format:
+
+Question: "Question here"
+SQLQuery: "SQL Query to run"
+SQLResult: "Result of the SQLQuery"
+Answer: "Final answer here"
+"""
+
+# Complete MyScale prompt: the instruction text followed by PROMPT_SUFFIX.
+MYSCALE_PROMPT = PromptTemplate(
+    input_variables=["input", "table_info", "top_k"],
+    template=_myscale_prompt + PROMPT_SUFFIX,
+)
+
+
+# Prompt templates looked up by name.
+# NOTE(review): presumably keyed by the SQL dialect name - confirm against
+# the callers of this mapping.
+SQL_PROMPTS = {
+    "myscale": MYSCALE_PROMPT,
+}