In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

In [2]:
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str

settings = Settings()
print(settings.embedding_model)

baai/bge-m3


# Prepare Embedder

In [3]:
import os
from langchain_openai import OpenAIEmbeddings

os.environ["OPENAI_API_BASE"] = "{}/v1/".format(settings.embedding_base_url)
embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key
)
vectors = embeddings.embed_documents(["hello", "goodbye"])
len(vectors[0])

1024

# Prepare DB

In [4]:
from langchain_chroma import Chroma

In [5]:
vector_store = Chroma(
    collection_name="test2",
    embedding_function=embeddings,
    persist_directory="./database/chroma_langchain/local_storage"
)

# Test Docs

In [6]:
from uuid import uuid4
from langchain_core.documents import Document

In [7]:
document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)
documents = [
    document_1,
    document_2
]
uuids = [str(uuid4()) for _ in range(len(documents))]

In [8]:
vector_store.add_documents(documents=documents, ids=uuids)

['31f19867-7926-4547-8137-49b449ae9510',
 '2a87642f-dad1-4182-8ce0-a7679fe1948a']

In [9]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy", k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [10]:
print("FILTERING WITH TWEET")
results = vector_store.similarity_search(
    "kitty", k=10, filter={"source": {"$in": ["tweet"]}}
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")
    
print("FILTERING WITH NEWS")
results = vector_store.similarity_search(
    "kitty", k=10, filter={"source": {"$in": ["news"]}}
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

Number of requested results 10 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 10 is greater than number of elements in index 2, updating n_results = 2


FILTERING WITH TWEET
* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
FILTERING WITH NEWS
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [12]:
import sqlite3
import sqlite_vec

## Print Schemas
connection = sqlite3.connect("./database/chroma_langchain/local_storage/chroma.sqlite3")
connection.enable_load_extension(True)
sqlite_vec.load(connection)
connection.enable_load_extension(False)

cursor = connection.cursor()

try:
    # Retrieve the CREATE TABLE statements for all tables in the database
    cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Print the SQL CREATE TABLE statements
    for table_name, create_statement in tables:
        print(f"-- Schema for table: {table_name}")
        print(f"{create_statement};\n")
finally:
    # Close the connection
    connection.close()

-- Schema for table: migrations
CREATE TABLE migrations (
                        dir TEXT NOT NULL,
                        version INTEGER NOT NULL,
                        filename TEXT NOT NULL,
                        sql TEXT NOT NULL,
                        hash TEXT NOT NULL,
                        PRIMARY KEY (dir, version)
                    );

-- Schema for table: embeddings_queue
CREATE TABLE embeddings_queue (
    seq_id INTEGER PRIMARY KEY,
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    operation INTEGER NOT NULL,
    topic TEXT NOT NULL,
    id TEXT NOT NULL,
    vector BLOB,
    encoding TEXT,
    metadata TEXT
);

-- Schema for table: embeddings_queue_config
CREATE TABLE embeddings_queue_config (
    id INTEGER PRIMARY KEY,
    config_json_str TEXT
);

-- Schema for table: collection_metadata
CREATE TABLE collection_metadata (
    collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE,
    key TEXT NOT NULL,
    str_value TEXT,
    int_va