langchain-ai · baskaryan · Sep 7, 2023 · Sep 4, 2023 · Sep 5, 2023
diff --git a/docs/extras/modules/data_connection/retrievers/sql_database/myscale_vector_sql.ipynb b/docs/extras/modules/data_connection/retrievers/sql_database/myscale_vector_sql.ipynb
@@ -0,0 +1,200 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "245065c6",
+   "metadata": {},
+   "source": [
+    "# Vector SQL Retriever with MyScale\n",
+    "\n",
+    ">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database in SQL and also from here, LangChain. MyScale can make use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). It will boost your LLM app whether you are scaling up your data or expanding your system to a broader application."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0246c5bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7585d2c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from os import environ\n",
+    "import getpass\n",
+    "from typing import Dict, Any\n",
+    "from langchain import OpenAI, SQLDatabase, LLMChain\n",
+    "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
+    "from sqlalchemy import create_engine, Column, MetaData\n",
+    "from langchain import PromptTemplate\n",
+    "\n",
+    "\n",
+    "from sqlalchemy import create_engine\n",
+    "\n",
+    "MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n",
+    "MYSCALE_PORT = 443\n",
+    "MYSCALE_USER = \"chatdata\"\n",
+    "MYSCALE_PASSWORD = \"myscale_rocks\"\n",
+    "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n",
+    "\n",
+    "engine = create_engine(\n",
+    "    f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n",
+    ")\n",
+    "metadata = MetaData(bind=engine)\n",
+    "environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e08d9ddc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import HuggingFaceInstructEmbeddings\n",
+    "from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n",
+    "\n",
+    "output_parser = VectorSQLOutputParser.from_embeddings(\n",
+    "    model=HuggingFaceInstructEmbeddings(\n",
+    "        model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84b705b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from langchain.llms import OpenAI\n",
+    "from langchain.callbacks import StdOutCallbackHandler\n",
+    "\n",
+    "from langchain.utilities.sql_database import SQLDatabase\n",
+    "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
+    "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
+    "\n",
+    "chain = VectorSQLDatabaseChain(\n",
+    "    llm_chain=LLMChain(\n",
+    "        llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
+    "        prompt=MYSCALE_PROMPT,\n",
+    "    ),\n",
+    "    top_k=10,\n",
+    "    return_direct=True,\n",
+    "    sql_cmd_parser=output_parser,\n",
+    "    database=SQLDatabase(engine, None, metadata),\n",
+    ")\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "pd.DataFrame(\n",
+    "    chain.run(\n",
+    "        \"Please give me 10 papers to ask what is PageRank?\",\n",
+    "        callbacks=[StdOutCallbackHandler()],\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c09cda0",
+   "metadata": {},
+   "source": [
+    "## SQL Database as Retriever"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "734d7ff5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n",
+    "\n",
+    "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
+    "from langchain_experimental.retrievers.vector_sql_database \\\n",
+    "    import VectorSQLDatabaseChainRetriever\n",
+    "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
+    "from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n",
+    "\n",
+    "output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n",
+    "    output_parser.model\n",
+    ")\n",
+    "\n",
+    "chain = VectorSQLDatabaseChain.from_llm(\n",
+    "    llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
+    "    prompt=MYSCALE_PROMPT,\n",
+    "    top_k=10,\n",
+    "    return_direct=True,\n",
+    "    db=SQLDatabase(engine, None, metadata),\n",
+    "    sql_cmd_parser=output_parser_retrieve_all,\n",
+    "    native_format=True,\n",
+    ")\n",
+    "\n",
+    "# You need all those keys to get docs\n",
+    "retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n",
+    "\n",
+    "document_with_metadata_prompt = PromptTemplate(\n",
+    "    input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n",
+    "    template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n",
+    ")\n",
+    "\n",
+    "chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
+    "    ChatOpenAI(\n",
+    "        model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n",
+    "    ),\n",
+    "    retriever=retriever,\n",
+    "    chain_type=\"stuff\",\n",
+    "    chain_type_kwargs={\n",
+    "        \"document_prompt\": document_with_metadata_prompt,\n",
+    "    },\n",
+    "    return_source_documents=True,\n",
+    ")\n",
+    "ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n",
+    "            callbacks=[StdOutCallbackHandler()])\n",
+    "print(ans[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4948ff25",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py b/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py
@@ -0,0 +1,38 @@
+"""Vector SQL Database Chain Retriever"""
+from typing import Any, Dict, List
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForRetrieverRun,
+    CallbackManagerForRetrieverRun,
+)
+from langchain.schema import BaseRetriever, Document
+
+from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
+
+
+class VectorSQLDatabaseChainRetriever(BaseRetriever):
+    """Retriever that uses SQLDatabase as Retriever"""
+
+    sql_db_chain: VectorSQLDatabaseChain
+    """SQL Database Chain"""
+    page_content_key: str = "content"
+    """column name for page content of documents"""
+
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun,
+        **kwargs: Any,
+    ) -> List[Document]:
+        ret: List[Dict[str, Any]] = self.sql_db_chain(
+            query, callbacks=run_manager.get_child(), **kwargs
+        )["result"]
+        return [
+            Document(page_content=r[self.page_content_key], metadata=r) for r in ret
+        ]
+
+    async def _aget_relevant_documents(
+        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        raise NotImplementedError
diff --git a/libs/experimental/langchain_experimental/sql/prompt.py b/libs/experimental/langchain_experimental/sql/prompt.py
@@ -0,0 +1,85 @@
+# flake8: noqa
+from langchain.prompts.prompt import PromptTemplate
+
+# Shared prompt tail: lists the permitted tables and states the user's question.
+PROMPT_SUFFIX = """Only use the following tables:
+{table_info}
+
+Question: {input}"""
+# Dialect-generic instruction text; uses the {dialect} and {top_k} placeholders.
+_VECTOR_SQL_DEFAULT_TEMPLATE = """You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question.
+{dialect} queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. 
+When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
+
+*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. 
+
+Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per {dialect}. You should only order according to the distance function.
+Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
+Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
+
+Use the following format:
+
+Question: "Question here"
+SQLQuery: "SQL Query to run"
+SQLResult: "Result of the SQLQuery"
+Answer: "Final answer here"
+"""
+# Generic vector-SQL prompt: the instruction text above followed by PROMPT_SUFFIX.
+VECTOR_SQL_PROMPT = PromptTemplate(
+    input_variables=["input", "table_info", "dialect", "top_k"],
+    template=_VECTOR_SQL_DEFAULT_TEMPLATE + PROMPT_SUFFIX,
+)
+
+
+_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
+MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. 
+When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
+
+*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. 
+
+Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function.
+Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
+Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
+
+Use the following format:
+
+======== table info ========
+<some table infos>
+
+Question: "Question here"
+SQLQuery: "SQL Query to run"
+
+
+Here are some examples:
+
+======== table info ========
+CREATE TABLE "ChatPaper" (
+	abstract String, 
+	id String, 
+	vector Array(Float32), 
+) ENGINE = ReplicatedReplacingMergeTree()
+ ORDER BY id
+ PRIMARY KEY id
+
+Question: What is Feartue Pyramid Network?
+SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT {top_k}
+
+
+Let's begin:
+======== table info ========
+{table_info}
+
+Question: {input}
+SQLQuery: """
+
+MYSCALE_PROMPT = PromptTemplate(
+    input_variables=["input", "table_info", "top_k"],
+    template=_myscale_prompt + PROMPT_SUFFIX,
+)
+
+
+VECTOR_SQL_PROMPTS = {
+    "myscale": MYSCALE_PROMPT,
+}