In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

In [2]:
class Settings(BaseSettings):
    """Connection settings for the embedding service.

    Values are loaded by pydantic-settings, with ``../.env`` configured as the
    dotenv file. Keys that are not declared below are ignored
    (``extra="ignore"``). All three fields are required (no defaults).
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Base URL of the embedding server (the notebook appends "/v1/" to it).
    embedding_base_url: str
    # API key sent to the embedding server.
    embedding_api_key: str
    # Name of the embedding model to request.
    embedding_model: str

# Instantiate the settings once; pydantic raises a validation error here if a
# required field is missing.
settings = Settings()
# Sanity check: show which embedding model was picked up from the configuration.
print(settings.embedding_model)

baai/bge-m3


# Prepare Embedder

In [3]:
import os
from langchain_openai import OpenAIEmbeddings

# Point the client at the OpenAI-compatible embedding server explicitly via
# `base_url` instead of exporting OPENAI_API_BASE: mutating os.environ would
# silently redirect every other OpenAI client created later in this kernel.
# rstrip("/") keeps the URL well-formed when the configured base URL already
# ends with a slash (otherwise the result would contain "//v1/").
embedding_api_base = "{}/v1/".format(settings.embedding_base_url.rstrip("/"))

embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key,
    base_url=embedding_api_base,
)

In [4]:
# Smoke test: embed two short strings and display the length of the first
# returned vector (i.e. the embedding dimension reported by the server).
sample_texts = ["hello", "goodbye"]
vectors = embeddings.embed_documents(sample_texts)
len(vectors[0])

1024

# Prepare DB

In [5]:
from langchain_community.vectorstores import LanceDB

In [6]:
# Relative path of the LanceDB storage folder used by this notebook.
uri = "database/lancedb_langchain/local_storage"

# Constructor reference:
# https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.lancedb.LanceDB.html
#
# Notes:
# - api_key / region are passed as None (presumably only needed for a hosted
#   LanceDB deployment -- TODO confirm).
# - vector_key is left at the library default; the `vector_key="embedding"`
#   override is kept disabled.
vector_store = LanceDB(
    embedding=embeddings,
    uri=uri,
    table_name="langchain_test",
    api_key=None,
    region=None,
)

# Test Docs

In [7]:
from uuid import uuid4
from langchain_core.documents import Document

In [8]:
# Two small sample documents; `metadata["source"]` records where each text
# came from.
document_1 = Document(
    metadata={"source": "tweet"},
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
)
document_2 = Document(
    metadata={"source": "news"},
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
)
documents = [document_1, document_2]

# One freshly generated random UUID string per document, used as its id.
uuids = [str(uuid4()) for _ in documents]

In [9]:
# Store the sample documents under the pre-generated ids. The bare last
# expression displays whatever add_documents returned.
inserted_ids = vector_store.add_documents(documents, ids=uuids)
inserted_ids

['14a3e82c-46d7-4001-9cb1-98ad21200af5',
 'd9d65e2b-38a1-4897-910c-4b1f88c7c866']

Contents of the `uri` folder after insertion:
* LanceDB creates a `{table_name}.lance` folder for the table
```
└── langchain_test.lance
    ├── _transactions
    │   ├── 0-55d84f7a-7218-458a-b47f-039237898766.txn
    │   └── 1-53e55a83-192e-4ac2-9736-b49436cc3445.txn
    ├── _versions
    │   ├── 1.manifest
    │   └── 2.manifest
    └── data
        └── ced84e32-b53c-4caf-a4d2-0a51c9ccf181.lance
```

# Search

In [10]:
# Ask the vector store for the two most similar stored documents.
query = "LangChain provides abstractions to make working with LLMs easy"
results = vector_store.similarity_search(query, k=2)

# Print each hit as a bullet: the page text followed by its metadata dict.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]
