In [7]:
from DataIngestTransform import SmartBatchLoader
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from typing import Any, Optional




In [8]:

class EmbeddingLoadVectorDB:
    """
    Embed pre-split documents and load them into a vector store.

    Embedding backends: "openai", "huggingface", "ollama".
    Vector stores: "chroma" and "faiss".
    Backend and store names are matched case-insensitively and with
    surrounding whitespace ignored.

    Args:
        splits: Document chunks to embed; passed unchanged as ``documents``
            to the vector store's ``from_documents``.
        embeddingType: Name of the embedding backend.
        vectorDB: Name of the vector store.
        embedding_model: Model name for the chosen backend. When omitted, a
            per-backend default is used: "text-embedding-3-small" (OpenAI),
            "sentence-transformers/all-MiniLM-L6-v2" (HuggingFace),
            "nomic-embed-text" (Ollama).
        persist_directory: Passed to Chroma as ``persist_directory``.
            Not used by the FAISS branch.

    Raises:
        ValueError: If ``embeddingType`` or ``vectorDB`` is not supported.
    """

    # Store names accepted by embeddAndLoadVectorDB (already normalized).
    SUPPORTED_VECTOR_DBS = ("chroma", "faiss")

    def __init__(
        self,
        splits: list,
        embeddingType: str = "OpenAI",
        vectorDB: str = "chroma",
        embedding_model: Optional[str] = None,
        persist_directory: Optional[str] = "./chroma_store",
    ):
        self.embeddingType = embeddingType.strip().lower()
        self.vectorDB = vectorDB.strip().lower()
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory
        self.splits = splits
        # Fail fast: reject a bad store name before constructing the embedding
        # backend, so a typo is reported without paying for backend setup.
        self._check_vector_db()
        self.embedding = self._select_embedding()

    def _check_vector_db(self) -> None:
        """Raise ValueError if ``self.vectorDB`` is not a supported store."""
        if self.vectorDB not in self.SUPPORTED_VECTOR_DBS:
            raise ValueError(f"❌ Unsupported vector DB type: {self.vectorDB}")

    def _select_embedding(self):
        """
        Build the embedding object for ``self.embeddingType``.

        Uses ``self.embedding_model`` when set, otherwise the backend's
        default model name.

        Raises:
            ValueError: If ``self.embeddingType`` is not a supported backend.
        """
        if self.embeddingType == "openai":
            model_name = self.embedding_model or "text-embedding-3-small"
            print(f"[INFO] Using OpenAI Embeddings ({model_name})")
            return OpenAIEmbeddings(model=model_name)

        elif self.embeddingType == "huggingface":
            model_name = self.embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
            print(f"[INFO] Using HuggingFace Embeddings ({model_name})")
            return HuggingFaceEmbeddings(model_name=model_name)

        elif self.embeddingType == "ollama":
            model_name = self.embedding_model or "nomic-embed-text"
            print(f"[INFO] Using Ollama Embeddings ({model_name})")
            return OllamaEmbeddings(model=model_name)

        else:
            raise ValueError(f"❌ Unsupported embedding type: {self.embeddingType}")

    def embeddAndLoadVectorDB(self) -> Any:
        """
        Embed ``self.splits`` and load them into the configured vector store.

        Returns:
            The vector store object returned by ``Chroma.from_documents`` or
            ``FAISS.from_documents``.

        Raises:
            ValueError: If ``self.vectorDB`` is unsupported, or if
                ``self.splits`` is empty or None.
        """
        # Re-checked here because the attribute may have been reassigned
        # after construction.
        self._check_vector_db()

        # An empty batch would otherwise surface as a backend-specific error
        # from inside the store builder; report it clearly instead.
        if self.splits is None or len(self.splits) == 0:
            raise ValueError("No document splits to embed: `splits` is empty.")

        print(f"[INFO] Creating vector store: {self.vectorDB}")

        if self.vectorDB == "chroma":
            vdb = Chroma.from_documents(
                documents=self.splits,
                embedding=self.embedding,
                persist_directory=self.persist_directory,
            )
            if self.persist_directory is None:
                # Avoid the misleading "stored at None" message.
                print("[INFO] Chroma DB created in memory (no persist_directory)")
            else:
                print(f"[INFO] Chroma DB created and stored at {self.persist_directory}")

        else:  # "faiss" — the only other name _check_vector_db lets through
            vdb = FAISS.from_documents(
                documents=self.splits,
                embedding=self.embedding,
            )
            print("[INFO] FAISS DB created in memory")

        return vdb




In [11]:
# Load and split the source PDF with the project-local SmartBatchLoader.
pdf_loader = SmartBatchLoader(["Space.pdf"])
splits = pdf_loader.load_and_split_all()

# Configure the pipeline: OpenAI embeddings feeding a Chroma store.
# embedding_model and persist_directory are left at the class defaults.
embedder = EmbeddingLoadVectorDB(splits=splits, embeddingType="openai", vectorDB="chroma")

# Embed the splits and build the vector store.
vector_db = embedder.embeddAndLoadVectorDB()


[INFO] Using OpenAI Embeddings (text-embedding-3-small)
[INFO] Creating vector store: chroma
[INFO] Chroma DB created and stored at ./chroma_store
