Skip to content

Commit

Permalink
WIP: support both row-by-row inserts and bulk inserts from a pandas DataFrame
Browse files (browse the repository at this point in the history)
  • Loading branch information
jaceksan committed May 22, 2024
1 parent 4ff3079 commit dce52a8
Showing 1 changed file with 28 additions and 20 deletions.
48 changes: 28 additions & 20 deletions libs/community/langchain_community/vectorstores/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import uuid
from typing import Any, Iterable, List, Optional, Type
import warnings

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
Expand Down Expand Up @@ -106,8 +107,6 @@ def __init__(
raise ValueError("An embedding function or model must be provided.")

if connection is None:
import warnings

warnings.warn(
"No DuckDB connection provided. A new connection will be created."
"This connection is running in memory and no data will be persisted."
Expand Down Expand Up @@ -144,14 +143,15 @@ def add_texts(
Returns:
List of ids of the added texts.
"""

have_pandas = False
try:
import pandas as pd
except ImportError as e:
raise ImportError(
"Unable to import pandas, please install it with "
"`pip install -U pandas`"
) from e
have_pandas = True
except ImportError:
warnings.warn(
"Unable to import pandas. "
"Please install it with `pip install -U pandas` to improve performance of add_texts()."

Check failure on line 153 in libs/community/langchain_community/vectorstores/duckdb.py

View workflow job for this annotation

GitHub Actions / cd libs/community / make lint #3.8

Ruff (E501)

langchain_community/vectorstores/duckdb.py:153:89: E501 Line too long (103 > 88)
)

# Extract ids from kwargs or generate new ones if not provided
ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
Expand All @@ -168,19 +168,27 @@ def add_texts(
if metadatas and idx < len(metadatas)
else None
)
data.append(
{
self._id_key: ids[idx],
self._text_key: text,
self._vector_key: embedding,
"metadata": metadata,
}
if have_pandas:
data.append(
{
self._id_key: ids[idx],
self._text_key: text,
self._vector_key: embedding,
"metadata": metadata,
}
)
else:
self._connection.execute(
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
[ids[idx], text, embedding, metadata],
)

if have_pandas:
# noinspection PyUnusedLocal
df = pd.DataFrame.from_dict(data) # noqa: F841
self._connection.execute(
f"INSERT INTO {self._table_name} SELECT * FROM df",
)
# noinspection PyUnusedLocal
df = pd.DataFrame.from_dict(data) # noqa: F841
self._connection.execute(
f"INSERT INTO {self._table_name} SELECT * FROM df",
)
return ids

def similarity_search(
Expand Down

0 comments on commit dce52a8

Please sign in to comment.