In [None]:
from pydantic_settings import BaseSettings

import os
os.environ['setting'] = 'a'


class Test(BaseSettings):
    setting: str


Test()  # type: ignore

Test(setting='a')

In [1]:
from pydantic import BaseModel, ConfigDict

In [None]:
from enum import Enum
from typing import Literal, Optional

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class BasePayloadModel(BaseModel):
    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
    )


class ModelWithInvitationCode:
    invitation_code1: str = ''


class ModelWithResponse:
    invitation_code: str = ''


class ValidateInvitationRequestPayload(ModelWithInvitationCode, ModelWithResponse, BasePayloadModel):
    chat_history: str = ''


ValidateInvitationRequestPayload(invitation_code='a')

ValidateInvitationRequestPayload(invitation_code='a', invitation_code1='', chat_history='')

In [2]:
pwd

'/workspaces/hsswiki-azureswa/docs/notebooks'

In [3]:
cd ../../api/

/workspaces/hsswiki-azureswa/api


## Config

In [14]:
import logging as log
from enum import Enum

from src.utils.pydantic_utils import FrozenBaseSettings


class EmbeddingModels:
    AZURE_TEXT_EMBEDDING_3_LARGE = "azure.text-embedding-3-large"
    AZURE_TEXT_EMBEDDING_3_SMALL = "azure.text-embedding-3-small"
    AZURE_TEXT_EMBEDDING_ADA_002 = "azure.text-embedding-ada-002"


class CompletionModels:
    AZURE_GPT_4O_MINI = "azure.gpt-4o-mini"
    AZURE_O1 = "azure.o1"


COMPLETION_MODEL_CONFIG = {
    CompletionModels.AZURE_GPT_4O_MINI: {
        "max_input_tokens": 128000,
        "max_output_tokens": 16384,
        "temperature": 0,
    },
    CompletionModels.AZURE_O1: {
        "max_input_tokens": 128000,
        "max_output_tokens": 32768,
        "temperature": 1,  # Must be 1 for o1. Otherwise will raise error.
    },
}


class VectorStoreType(str, Enum):
    AZURE_AI_SEARCH = "AZURE_AI_SEARCH"
    CHROMA_DB = "CHROMA_DB"


class Config(FrozenBaseSettings):
    vector_input_folder: str = "./vector_data/input/"
    invitation_codes_csv: str
    max_tokens: int = None

    # TODO
    # For testing. `data/` folder is gitignored.
    # chromadb_path: str = "./data/chromadb"

    chunk_size: int = 1000
    chunk_overlap: int = 100
    llm_max_try_count: int = 3

    vector_store_table_name: str = "wiki"
    # Collection of ChromaDB or Index of Azure AI Search

    litellm_api_key: str = ""
    litellm_api_base: str = ""

    vector_store_type: VectorStoreType = VectorStoreType.CHROMA_DB

    azure_ai_search_endpoint: str = ""
    azure_ai_search_key: str = ""

    embedding_model_name: str = EmbeddingModels.AZURE_TEXT_EMBEDDING_3_SMALL
    completion_model_name: str = CompletionModels.AZURE_O1
    max_input_tokens: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "max_input_tokens"
    ]
    max_output_tokens: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "max_output_tokens"
    ]
    temperature: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "temperature"
    ]

    def _init_adder(self):
        log.info(f"{self.vector_store_type=}")


config = Config()

In [None]:
config.invitation_codes_csv

'33,44'

## embedding with SentenceTransformer

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

text = "This is a test document."
query_result = embeddings.embed_query(text)

# show only the first 100 characters of the stringified vector
print(str(query_result)[:100] + "...")

[-0.03833850100636482, 0.1234646886587143, -0.028642931953072548, 0.05365271866321564, 0.00884538888...


In [None]:
len(query_result)

384

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_core.documents import Document

local_embedding_model = SentenceTransformer(
    'all-MiniLM-L6-v2'
)  # Lightweight model

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [5]:
# from sentence_transformers import SentenceTransformer
# from langchain_core.documents import Document

# # pip install sentence-transformers langchain-huggingface
# # pip install -qU langchain-huggingface

# # Initialize the embedding model
# local_embedding_model = SentenceTransformer(
#     'all-MiniLM-L6-v2'
# )  # Lightweight model


def embed_documents_locally(documents: list[Document]) -> list[list[float]]:
    """
    Embeds a list of LangChain Document objects using a local embedding model.

    Args:
        documents (list[Document]): List of LangChain Document objects.

    Returns:
        list[list[float]]: List of embeddings (vectors) for each document.
    """
    # Extract text content from each Document
    texts = [doc.page_content for doc in documents]

    # Generate embeddings
    embeddings = local_embedding_model.encode(
        texts, convert_to_numpy=True).tolist()

    return embeddings


# Example usage
documents = [
    Document(page_content="This is the first document."),
    Document(page_content="This is the second document."),
]
embeddings = embed_documents_locally(documents)
print(embeddings)

[[-0.04903821274638176, 0.07802308350801468, 0.02861848473548889, 0.024448907002806664, -0.002161254407837987, 0.008988065645098686, -0.05157139524817467, 0.0013845919165760279, 0.014908031560480595, 0.03507004678249359, 0.013822662644088268, 0.11790087819099426, -0.004221329931169748, -0.01593811623752117, -0.09778235852718353, 0.02438424527645111, -0.018930917605757713, -0.051611121743917465, 0.07603345811367035, 0.08705620467662811, 0.043991509824991226, 0.1023361012339592, 0.0432649664580822, -0.039265818893909454, 0.030956320464611053, 0.037268638610839844, -0.08601121604442596, -0.007963895797729492, 0.03792817145586014, -0.09579125046730042, 0.011275500059127808, -0.007409264799207449, 0.08646606653928757, 0.05231975018978119, 0.06710892915725708, 0.005854431074112654, 0.12016648799180984, -0.02835702709853649, 0.022932633757591248, 0.07027052342891693, 0.01882941462099552, -0.09840216487646103, -0.025092992931604385, 0.0072495415806770325, 0.006786780431866646, 0.03713012114167

In [7]:
len(embeddings[0])

384

## Chroma

In [None]:
import chromadb
chroma_client = chromadb.PersistentClient("./data/output/chromadb")
if "wiki" in chroma_client.list_collections():
    chroma_client.delete_collection(name="wiki")
collection = chroma_client.get_or_create_collection(name="wiki")

text = "This is a test document."
query_result = embeddings.embed_query(text)

collection.add()

# show only the first 100 characters of the stringified vector
print(str(query_result)[:100] + "...")

In [None]:
from langchain_chroma import Chroma
from uuid import uuid4

from langchain_core.documents import Document

vector_store = Chroma(
    # Doc: https://python.langchain.com/docs/integrations/vectorstores/chroma/
    collection_name="example_collection",
    persist_directory="./data/output/chromadb",
)

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)
document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)
documents = [
    document_1,
    document_2,
]
uuids = [str(uuid4()) for _ in range(len(documents))]
# vector_store.add_documents(documents=documents, ids=uuids, embeddings=)
vector_store.add_embeddings(zip(texts, embeddings), metadatas)

# vector_store.delete(ids=uuids[-1])

results = vector_store.similarity_search_by_vector(
    embedding=[1], k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

ValueError: You must provide an embedding function to compute embeddings.https://docs.trychroma.com/guides/embeddings in upsert.