
Update Hologres vector store: use hologres-vector (#13767)
Hi,
I made some code changes to the Hologres vector store to improve data
insertion performance.
This version of the code also uses the `hologres-vector` library, which
is more convenient for us to maintain and delivers better performance.
The code has passed the format/lint/spell checks, and I have run the
Hologres unit tests against my own database.
Please review this PR again and let me know if anything needs to change.

Best,
Changgeng,
Developer @ Alibaba Cloud
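
For reviewers who want to try the change locally, end-to-end usage after this PR looks roughly like the sketch below. It is only an illustration: the connection parameters and table name are placeholders, `OpenAIEmbeddings` is just one possible embedding model, and it assumes the existing `Hologres.from_texts` and `connection_string_from_db_params` helpers keep their current signatures.

```python
# Rough usage sketch (not part of this PR's diff). Assumes
# `pip install hologres-vector` and a reachable Hologres instance;
# host/port/database/user/password and the table name are placeholders.
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Hologres

# Either pass connection_string= directly or set the environment variable
# the store reads (HOLOGRES_CONNECTION_STRING).
os.environ["HOLOGRES_CONNECTION_STRING"] = Hologres.connection_string_from_db_params(
    host="<hologres-host>",
    port=80,
    database="<database>",
    user="<user>",
    password="<password>",
)

vectorstore = Hologres.from_texts(
    texts=["Hologres is a real-time interactive analytics service."],
    embedding=OpenAIEmbeddings(),
    table_name="langchain_example",
    pre_delete_table=True,  # start from a clean table for the test run
)

docs = vectorstore.similarity_search("What is Hologres?", k=1)
print(docs[0].page_content)
```

The only installation-facing change for users is `pip install hologres-vector` instead of `psycopg2`, as the docs diffs below show.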

Co-authored-by: Changgeng Zhao <zhaochanggeng.zcg@alibaba-inc.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
3 people committed Dec 3, 2023
1 parent 0de7cf8 commit 9b59bde
Showing 5 changed files with 72 additions and 130 deletions.
2 changes: 1 addition & 1 deletion docs/docs/integrations/providers/hologres.mdx
@@ -11,7 +11,7 @@
Click [here](https://www.alibabacloud.com/zh/product/hologres) to fast deploy a Hologres cloud instance.

```bash
pip install psycopg2
pip install hologres-vector
```

## Vector Store
2 changes: 1 addition & 1 deletion docs/docs/integrations/vectorstores/hologres.ipynb
@@ -22,7 +22,7 @@
"metadata": {},
"outputs": [],
"source": [
"#!pip install psycopg2"
"!pip install hologres-vector"
]
},
{
152 changes: 27 additions & 125 deletions libs/langchain/langchain/vectorstores/hologres.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import json
import logging
import uuid
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
@@ -15,104 +14,6 @@
_LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"


class HologresWrapper:
"""`Hologres API` wrapper."""

def __init__(self, connection_string: str, ndims: int, table_name: str) -> None:
"""Initialize the wrapper.
Args:
connection_string: Hologres connection string.
ndims: Number of dimensions of the embedding output.
table_name: Name of the table to store embeddings and data.
"""

import psycopg2

self.table_name = table_name
self.conn = psycopg2.connect(connection_string)
self.cursor = self.conn.cursor()
self.conn.autocommit = False
self.ndims = ndims

def create_vector_extension(self) -> None:
self.cursor.execute("create extension if not exists proxima")
self.conn.commit()

def create_table(self, drop_if_exist: bool = True) -> None:
if drop_if_exist:
self.cursor.execute(f"drop table if exists {self.table_name}")
self.conn.commit()

self.cursor.execute(
f"""create table if not exists {self.table_name} (
id text,
embedding float4[] check(array_ndims(embedding) = 1 and \
array_length(embedding, 1) = {self.ndims}),
metadata json,
document text);"""
)
self.cursor.execute(
f"call set_table_property('{self.table_name}'"
+ """, 'proxima_vectors',
'{"embedding":{"algorithm":"Graph",
"distance_method":"SquaredEuclidean",
"build_params":{"min_flush_proxima_row_count" : 1,
"min_compaction_proxima_row_count" : 1,
"max_total_size_to_merge_mb" : 2000}}}');"""
)
self.conn.commit()

def get_by_id(self, id: str) -> List[Tuple]:
statement = (
f"select id, embedding, metadata, "
f"document from {self.table_name} where id = %s;"
)
self.cursor.execute(
statement,
(id),
)
self.conn.commit()
return self.cursor.fetchall()

def insert(
self,
embedding: List[float],
metadata: dict,
document: str,
id: Optional[str] = None,
) -> None:
self.cursor.execute(
f'insert into "{self.table_name}" '
f"values (%s, array{json.dumps(embedding)}::float4[], %s, %s)",
(id if id is not None else "null", json.dumps(metadata), document),
)
self.conn.commit()

def query_nearest_neighbours(
self, embedding: List[float], k: int, filter: Optional[Dict[str, str]] = None
) -> List[Tuple[str, str, float]]:
params = []
filter_clause = ""
if filter is not None:
conjuncts = []
for key, val in filter.items():
conjuncts.append("metadata->>%s=%s")
params.append(key)
params.append(val)
filter_clause = "where " + " and ".join(conjuncts)

sql = (
f"select document, metadata::text, "
f"pm_approx_squared_euclidean_distance(array{json.dumps(embedding)}"
f"::float4[], embedding) as distance from"
f" {self.table_name} {filter_clause} order by distance asc limit {k};"
)
self.cursor.execute(sql, tuple(params))
self.conn.commit()
return self.cursor.fetchall()


class Hologres(VectorStore):
"""`Hologres API` vector store.
@@ -152,26 +53,20 @@ def __post_init__(
"""
Initialize the store.
"""
self.storage = HologresWrapper(
self.connection_string, self.ndims, self.table_name
from hologres_vector import HologresVector

self.storage = HologresVector(
self.connection_string,
ndims=self.ndims,
table_name=self.table_name,
table_schema={"document": "text"},
pre_delete_table=self.pre_delete_table,
)
self.create_vector_extension()
self.create_table()

@property
def embeddings(self) -> Embeddings:
return self.embedding_function

def create_vector_extension(self) -> None:
try:
self.storage.create_vector_extension()
except Exception as e:
self.logger.exception(e)
raise e

def create_table(self) -> None:
self.storage.create_table(self.pre_delete_table)

@classmethod
def __from(
cls,
@@ -224,11 +119,10 @@ def add_embeddings(
kwargs: vectorstore specific parameters
"""
try:
for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
self.storage.insert(embedding, metadata, text, id)
schema_datas = [{"document": t} for t in texts]
self.storage.upsert_vectors(embeddings, ids, metadatas, schema_datas)
except Exception as e:
self.logger.exception(e)
self.storage.conn.commit()

def add_texts(
self,
@@ -333,17 +227,17 @@ def similarity_search_with_score_by_vector(
k: int = 4,
filter: Optional[dict] = None,
) -> List[Tuple[Document, float]]:
results: List[Tuple[str, str, float]] = self.storage.query_nearest_neighbours(
embedding, k, filter
results: List[dict[str, Any]] = self.storage.search(
embedding, k=k, select_columns=["document"], metadata_filters=filter
)

docs = [
(
Document(
page_content=result[0],
metadata=json.loads(result[1]),
page_content=result["document"],
metadata=result["metadata"],
),
result[2],
result["distance"],
)
for result in results
]
@@ -363,9 +257,11 @@ def from_texts(
) -> Hologres:
"""
Return VectorStore initialized from texts and embeddings.
Postgres connection string is required
Hologres connection string is required
"Either pass it as a parameter
or set the HOLOGRES_CONNECTION_STRING environment variable.
Create the connection string by calling
HologresVector.connection_string_from_db_params
"""
embeddings = embedding.embed_documents(list(texts))

@@ -397,9 +293,11 @@ def from_embeddings(
generated embeddings.
Return VectorStore initialized from documents and embeddings.
Postgres connection string is required
Hologres connection string is required
"Either pass it as a parameter
or set the HOLOGRES_CONNECTION_STRING environment variable.
Create the connection string by calling
HologresVector.connection_string_from_db_params
Example:
.. code-block:: python
@@ -463,9 +361,11 @@ def get_connection_string(cls, kwargs: Dict[str, Any]) -> str:

if not connection_string:
raise ValueError(
"Postgres connection string is required"
"Hologres connection string is required"
"Either pass it as a parameter"
"or set the HOLOGRES_CONNECTION_STRING environment variable."
"Create the connection string by calling"
"HologresVector.connection_string_from_db_params"
)

return connection_string
@@ -483,9 +383,11 @@ def from_documents(
) -> Hologres:
"""
Return VectorStore initialized from documents and embeddings.
Postgres connection string is required
Hologres connection string is required
"Either pass it as a parameter
or set the HOLOGRES_CONNECTION_STRING environment variable.
Create the connection string by calling
HologresVector.connection_string_from_db_params
"""

texts = [d.page_content for d in documents]
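
Taken together, the `hologres.py` changes mean the vector store now delegates storage and search to `hologres-vector` roughly as in the condensed sketch below. This mirrors the calls visible in the diff above; the connection string, vectors, ids, and metadata values are placeholders.

```python
# Condensed sketch of how the updated store drives hologres_vector,
# following the calls shown in the diff above; all values are placeholders.
from hologres_vector import HologresVector

storage = HologresVector(
    "<connection-string>",
    ndims=3,
    table_name="langchain_pg_embedding",
    table_schema={"document": "text"},   # extra column holding the raw text
    pre_delete_table=True,
)

# add_embeddings(): one bulk upsert instead of a row-by-row INSERT loop
storage.upsert_vectors(
    [[0.1, 0.2, 0.3]],                   # embeddings
    ["doc-1"],                           # ids
    [{"source": "example"}],             # metadata dicts
    [{"document": "hello hologres"}],    # values for the extra schema columns
)

# similarity_search_with_score_by_vector(): each hit comes back as a dict
# carrying the selected columns plus metadata and distance
hits = storage.search(
    [0.1, 0.2, 0.3],
    k=1,
    select_columns=["document"],
    metadata_filters={"source": "example"},
)
for hit in hits:
    print(hit["document"], hit["metadata"], hit["distance"])
```

Compared with the removed `HologresWrapper`, this drops the manual `proxima` extension and table setup and replaces the per-row `insert()` loop with a single bulk upsert, which is the main source of the insertion-performance improvement described above.
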
43 changes: 40 additions & 3 deletions libs/langchain/poetry.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions libs/langchain/pyproject.toml
@@ -143,6 +143,7 @@ azure-ai-textanalytics = {version = "^5.3.0", optional = true}
google-cloud-documentai = {version = "^2.20.1", optional = true}
fireworks-ai = {version = "^0.6.0", optional = true, python = ">=3.9,<4.0"}
javelin-sdk = {version = "^0.1.8", optional = true}
hologres-vector = {version = "^0.0.6", optional = true}
praw = {version = "^7.7.1", optional = true}
msal = {version = "^1.25.0", optional = true}
databricks-vectorsearch = {version = "^0.21", optional = true}
@@ -315,6 +316,7 @@ all = [
"amadeus",
"librosa",
"python-arango",
"hologres-vector",
"dgml-utils",
]

@@ -386,6 +388,7 @@ extended_testing = [
"rspace_client",
"fireworks-ai",
"javelin-sdk",
"hologres-vector",
"praw",
"databricks-vectorsearch",
"dgml-utils",
