In [1]:
class Test:
    """Empty class used only to inspect what ``type()`` reports for a class."""


# Display the type of the class object itself.
type(Test)

type

In [1]:
from pydantic import BaseModel, ConfigDict

In [14]:
from enum import Enum
from typing import Literal, Optional

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class BasePayloadModel(BaseModel):
    """Base payload model: aliases come from ``to_camel`` and population by field name is enabled."""

    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)


class ModelWithInvitationCode:
    """Plain mixin (no pydantic base) declaring ``invitation_code1`` as a string defaulting to empty."""

    invitation_code1: str = ""


class ModelWithResponse:
    """Plain mixin (no pydantic base) declaring ``invitation_code`` as a string defaulting to empty."""

    invitation_code: str = ""


class ValidateInvitationRequestPayload(
    ModelWithInvitationCode,
    ModelWithResponse,
    BasePayloadModel,
):
    """Payload model combining both plain mixins with ``BasePayloadModel`` and adding ``chat_history``."""

    chat_history: str = ""


# Build an instance passing only ``invitation_code`` and display it.
ValidateInvitationRequestPayload(invitation_code="a")

ValidateInvitationRequestPayload(invitation_code='a', invitation_code1='', chat_history='')

## Config

In [None]:
import logging as log
from enum import Enum

from src.utils.pydantic_utils import FrozenBaseSettings


class EmbeddingModels:
    """Embedding model identifiers, kept as plain string class attributes."""

    AZURE_TEXT_EMBEDDING_3_LARGE = "azure.text-embedding-3-large"
    AZURE_TEXT_EMBEDDING_3_SMALL = "azure.text-embedding-3-small"
    AZURE_TEXT_EMBEDDING_ADA_002 = "azure.text-embedding-ada-002"


class CompletionModels:
    """Completion model identifiers, kept as plain string class attributes."""

    AZURE_GPT_4O_MINI = "azure.gpt-4o-mini"
    AZURE_O1 = "azure.o1"


# Per-model completion settings, keyed by the identifiers in ``CompletionModels``.
COMPLETION_MODEL_CONFIG = {
    CompletionModels.AZURE_GPT_4O_MINI: dict(
        max_input_tokens=128000,
        max_output_tokens=16384,
        temperature=0,
    ),
    CompletionModels.AZURE_O1: dict(
        max_input_tokens=128000,
        max_output_tokens=32768,
        temperature=1,  # Must be 1 for o1. Otherwise will raise error.
    ),
}


class VectorStoreType(str, Enum):
    """Vector store choices; every member is also a ``str`` whose value equals its name."""

    AZURE_AI_SEARCH = "AZURE_AI_SEARCH"
    CHROMA_DB = "CHROMA_DB"


class Config(FrozenBaseSettings):
    """
    Settings for this notebook: chunking, vector store selection and model choices.

    NOTE(review): ``FrozenBaseSettings`` comes from ``src.utils.pydantic_utils``
    and is not visible here; how values are loaded or overridden, and whether
    instances are actually immutable, should be confirmed there.
    """

    vector_input_folder: str = "./vector_data/input/"

    # TODO
    # For testing. `data/` folder is gitignored.
    # chromadb_path: str = "./data/chromadb"

    chunk_size: int = 1000
    chunk_overlap: int = 100
    llm_max_try_count: int = 3

    # Name of the ChromaDB collection or of the Azure AI Search index.
    vector_store_table_name: str = "wiki"

    litellm_api_key: str = ""
    litellm_api_base: str = ""

    vector_store_type: VectorStoreType = VectorStoreType.CHROMA_DB

    azure_ai_search_endpoint: str = ""
    azure_ai_search_key: str = ""

    embedding_model_name: str = EmbeddingModels.AZURE_TEXT_EMBEDDING_3_SMALL
    completion_model_name: str = CompletionModels.AZURE_O1
    # The three defaults below are looked up once, while the class body runs,
    # using the default `completion_model_name` assigned just above.
    # NOTE(review): overriding `completion_model_name` presumably does not
    # update them to the other model's limits — confirm this is intended.
    max_input_tokens: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "max_input_tokens"
    ]
    max_output_tokens: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "max_output_tokens"
    ]
    # NOTE(review): typed as int; the configured values are 0 and 1, but a
    # fractional temperature would need float — confirm intent.
    temperature: int = COMPLETION_MODEL_CONFIG[completion_model_name][
        "temperature"
    ]

    def _init_adder(self) -> None:
        """
        Log the configured vector store type at INFO level.

        NOTE(review): nothing in the visible code calls this method;
        presumably it is a hook invoked by ``FrozenBaseSettings`` — confirm.
        """
        log.info(f"{self.vector_store_type=}")

# Instantiate with no explicit arguments, so the class defaults above apply
# unless the base class pulls values from elsewhere (not visible here).
config = Config()

In [7]:
# Show which vector store type the config instance ended up with.
config.vector_store_type

<VectorStoreType.CHROMA_DB: 'CHROMA_DB'>

## Chroma

In [1]:
# Sample chat-history text pasted in its quoted, escaped form. Because this is
# a regular (non-raw) string, each `\n` becomes a real line break, so printing
# it shows the conversation one turn per paragraph.
test = """
'  assistant: Ask me anything about Shen! To continue our chat, may I have your invitation code please? Typically, it can be found on the resumes Shen sents out 😊\n\n  user: 33\n\n  assistant: Thank you. How may I assist you with my knowledge of Shen?\n\n  user: hi\n\n  assistant: hello'
"""
print(test)


'  assistant: Ask me anything about Shen! To continue our chat, may I have your invitation code please? Typically, it can be found on the resumes Shen sents out 😊

  user: 33

  assistant: Thank you. How may I assist you with my knowledge of Shen?

  user: hi

  assistant: hello'



In [None]:
import chromadb
# Rebuild the "wiki" collection from scratch so every run starts with an empty one.
chroma_client = chromadb.PersistentClient("./data/output/chromadb")

# Ensure the collection exists, then drop it unconditionally. Creating it first
# means the delete cannot fail on a fresh store, and it avoids testing
# `"wiki" in chroma_client.list_collections()`: depending on the installed
# chromadb release that call yields collection names or Collection objects, so
# the membership test could silently skip the reset.
chroma_client.get_or_create_collection(name="wiki")
chroma_client.delete_collection(name="wiki")

collection = chroma_client.get_or_create_collection(name="wiki")

In [9]:
from langchain_chroma import Chroma
from uuid import uuid4

from langchain_core.documents import Document

# Scratch experiment: create a Chroma store, define two sample documents,
# insert embeddings and run a similarity search by vector.
# NOTE(review): no embedding function is passed to Chroma here; the error
# recorded for this cell complains about a missing embedding function.
vector_store = Chroma(
    # Doc: https://python.langchain.com/docs/integrations/vectorstores/chroma/
    collection_name="example_collection",
    persist_directory="./data/output/chromadb",
)

# NOTE(review): the ids below are ints; presumably Document ids are expected
# to be strings — confirm against the langchain_core Document API.
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)
document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)
documents = [
    document_1,
    document_2,
]
# One random UUID string per document; only the commented-out calls below use them.
uuids = [str(uuid4()) for _ in range(len(documents))]
# vector_store.add_documents(documents=documents, ids=uuids, embeddings=)
# NOTE(review): `texts`, `embeddings` and `metadatas` are not assigned in this
# cell or in any cell above it, so on a fresh kernel this line presumably
# raises NameError. Also confirm that this Chroma class exposes `add_embeddings`.
vector_store.add_embeddings(zip(texts, embeddings), metadatas)

# vector_store.delete(ids=uuids[-1])

# NOTE(review): the query vector has a single element; presumably its length
# must match the dimension of the stored embeddings — confirm.
results = vector_store.similarity_search_by_vector(
    embedding=[1], k=1
)
# Print each hit's text followed by its metadata.
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

ValueError: You must provide an embedding function to compute embeddings.https://docs.trychroma.com/guides/embeddings in upsert.

## Embedding with SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_core.documents import Document

# Load the local embedding model once for this cell (lightweight model).
local_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def embed_documents_locally(documents: list[Document]) -> list[list[float]]:
    """
    Encode LangChain documents with the module-level ``local_embedding_model``.

    Args:
        documents (list[Document]): Documents whose ``page_content`` is encoded.

    Returns:
        list[list[float]]: The encoder output converted to nested Python
        lists via ``.tolist()``.
    """
    # Collect the raw text of every document.
    page_texts = []
    for document in documents:
        page_texts.append(document.page_content)

    # Encode to a numpy array, then convert to plain lists.
    vectors = local_embedding_model.encode(page_texts, convert_to_numpy=True)
    return vectors.tolist()


# Example usage: embed two short sample documents and print the resulting vectors.
documents = [
    Document(page_content=text)
    for text in (
        "This is the first document.",
        "This is the second document.",
    )
]
embeddings = embed_documents_locally(documents)
print(embeddings)

In [None]:
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
# Load the local embedding model (example model, replace as needed).
local_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


# NOTE(review): an earlier cell in this notebook already defines a function
# with this name; once this cell runs, this definition is the one in effect.
def embed_documents_locally(documents: list[Document]) -> list[list[float]]:
    """
    Encode the ``page_content`` of each document with ``local_embedding_model``.

    Args:
        documents (list[Document]): LangChain documents to encode.

    Returns:
        list[list[float]]: The encoder's numpy output converted to nested
        Python lists.
    """
    encoded = local_embedding_model.encode(
        list(item.page_content for item in documents),
        convert_to_numpy=True,
    )
    return encoded.tolist()

# Example usage
# documents = [Document(page_content="Example text 1"), Document(page_content="Example text 2")]
# embeddings = embed_documents_locally(documents)
# print(embeddings)