partner-upstage[patch]: embeddings empty list bug (#22057)

Fixed an error in `embed_documents` when the input was given as an empty list. And I have revised the document.
langchain-ai · May 23, 2024 · d9eff44 · d9eff44
1 parent 2df8ac4
commit d9eff44
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 10 deletions.
diff --git a/cookbook/rag_upstage_layout_analysis_groundedness_check.ipynb b/cookbook/rag_upstage_layout_analysis_groundedness_check.ipynb
@@ -36,7 +36,9 @@
     "\n",
     "docs = loader.load()\n",
     "\n",
-    "vectorstore = DocArrayInMemorySearch.from_documents(docs, embedding=UpstageEmbeddings())\n",
+    "vectorstore = DocArrayInMemorySearch.from_documents(\n",
+    "    docs, embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\")\n",
+    ")\n",
     "retriever = vectorstore.as_retriever()\n",
     "\n",
     "template = \"\"\"Answer the question based only on the following context:\n",

diff --git a/docs/docs/integrations/providers/upstage.ipynb b/docs/docs/integrations/providers/upstage.ipynb
@@ -115,13 +115,13 @@
    "source": [
     "from langchain_upstage import UpstageEmbeddings\n",
     "\n",
-    "embeddings = UpstageEmbeddings()\n",
+    "embeddings = UpstageEmbeddings(model=\"solar-embedding-1-large\")\n",
     "doc_result = embeddings.embed_documents(\n",
-    "    [\"Sam is a teacher.\", \"This is another document\"]\n",
+    "    [\"Sung is a professor.\", \"This is another document\"]\n",
     ")\n",
     "print(doc_result)\n",
     "\n",
-    "query_result = embeddings.embed_query(\"What does Sam do?\")\n",
+    "query_result = embeddings.embed_query(\"What does Sung do?\")\n",
     "print(query_result)"
    ]
   },

diff --git a/docs/docs/integrations/text_embedding/upstage.ipynb b/docs/docs/integrations/text_embedding/upstage.ipynb
@@ -80,7 +80,7 @@
    "source": [
     "from langchain_upstage import UpstageEmbeddings\n",
     "\n",
-    "embeddings = UpstageEmbeddings()"
+    "embeddings = UpstageEmbeddings(model=\"solar-embedding-1-large\")"
    ]
   },
   {
@@ -101,7 +101,7 @@
    "outputs": [],
    "source": [
     "doc_result = embeddings.embed_documents(\n",
-    "    [\"Sam is a teacher.\", \"This is another document\"]\n",
+    "    [\"Sung is a professor.\", \"This is another document\"]\n",
     ")\n",
     "print(doc_result)"
    ]
@@ -123,7 +123,7 @@
    },
    "outputs": [],
    "source": [
-    "query_result = embeddings.embed_query(\"What does Sam do?\")\n",
+    "query_result = embeddings.embed_query(\"What does Sung do?\")\n",
     "print(query_result)"
    ]
   },
@@ -184,7 +184,7 @@
     "\n",
     "vectorstore = DocArrayInMemorySearch.from_texts(\n",
     "    [\"harrison worked at kensho\", \"bears like to eat honey\"],\n",
-    "    embedding=UpstageEmbeddings(),\n",
+    "    embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
     ")\n",
     "retriever = vectorstore.as_retriever()\n",
     "docs = retriever.invoke(\"Where did Harrison work?\")\n",

diff --git a/libs/partners/upstage/README.md b/libs/partners/upstage/README.md
@@ -21,5 +21,5 @@ See a [usage example](https://python.langchain.com/docs/integrations/chat/upstag
 
 See a [usage example](https://python.langchain.com/docs/integrations/text_embedding/upstage)
 
-Use `solar-1-mini-embedding` as the default model for embeddings. Do not add suffixes such as `-query` or `-passage` to the model name.
+Use `solar-embedding-1-large` model for embeddings. Do not add suffixes such as `-query` or `-passage` to the model name.
 `UpstageEmbeddings` will automatically add the suffixes based on the method called.
diff --git a/libs/partners/upstage/langchain_upstage/embeddings.py b/libs/partners/upstage/langchain_upstage/embeddings.py
@@ -46,7 +46,7 @@ class UpstageEmbeddings(BaseModel, Embeddings):
 
             from langchain_upstage import UpstageEmbeddings
 
-            model = UpstageEmbeddings()
+            model = UpstageEmbeddings(model='solar-embedding-1-large')
     """
 
     client: Any = Field(default=None, exclude=True)  #: :meta private:
@@ -200,6 +200,8 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         assert (
             self.embed_batch_size <= MAX_EMBED_BATCH_SIZE
         ), f"The embed_batch_size should not be larger than {MAX_EMBED_BATCH_SIZE}."
+        if not texts:
+            return []
         params = self._invocation_params
         params["model"] = params["model"] + "-passage"
         embeddings = []
@@ -242,6 +244,8 @@ async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
         assert (
             self.embed_batch_size <= MAX_EMBED_BATCH_SIZE
         ), f"The embed_batch_size should not be larger than {MAX_EMBED_BATCH_SIZE}."
+        if not texts:
+            return []
         params = self._invocation_params
         params["model"] = params["model"] + "-passage"
         embeddings = []

diff --git a/libs/partners/upstage/tests/integration_tests/test_embeddings.py b/libs/partners/upstage/tests/integration_tests/test_embeddings.py
@@ -35,3 +35,17 @@ async def test_langchain_upstage_aembed_query() -> None:
     embedding = UpstageEmbeddings(model="solar-embedding-1-large")
     output = await embedding.aembed_query(query)
     assert len(output) > 0
+
+
+def test_langchain_upstage_embed_documents_with_empty_list() -> None:
+    """Test Upstage embeddings with empty list."""
+    embedding = UpstageEmbeddings(model="solar-embedding-1-large")
+    output = embedding.embed_documents([])
+    assert len(output) == 0
+
+
+async def test_langchain_upstage_aembed_documents_with_empty_list() -> None:
+    """Test Upstage embeddings asynchronous with empty list."""
+    embedding = UpstageEmbeddings(model="solar-embedding-1-large")
+    output = await embedding.aembed_documents([])
+    assert len(output) == 0