<a href="https://colab.research.google.com/github/khokhakhokha/chatbot-for-business-website/blob/main/rag_chatbot_on_custom_website_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [31]:
!pip install langchain
!pip install langchain-community
!pip install langchain-google-genai
!pip install langchain-openai
!pip install  langchain-chroma
!pip install chromadb
!pip install huggingface_hub
!pip install streamlit
!pip install numpy

Collecting langchain-chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_chroma-0.2.6-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-0.2.6


In [32]:
from pathlib import Path
from google.colab import drive
import os, tempfile, glob, random
# documents loaders
from langchain_community.document_loaders import DirectoryLoader, TextLoader
# text splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
# Chroma: vectorstore
from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma
# # LLM: openai and google_genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [33]:
# Make Google Drive available to this runtime
drive.mount('/content/drive')

# Root data folder on Drive (change the 'MyDrive' part of the path if needed)
BASE_DIR = Path("/content/drive/MyDrive/data")

# Working folders: the .txt inputs are read from TMP_DIR, and Chroma stores
# are opened from LOCAL_VECTOR_STORE_DIR
TMP_DIR = BASE_DIR / "tmp"
LOCAL_VECTOR_STORE_DIR = BASE_DIR / "vector_stores"

# Create both folders (and any missing parents); no error if they already exist
TMP_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## API Keys

In [34]:
# Read credentials from the environment; each value is None when its variable is unset
google_api_key = os.getenv("GOOGLE_API_KEY")
HF_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

## Document Loaders

In [35]:
def langchain_document_loader(TMP_DIR):
    """Load the ``.txt`` files matched by ``**/*.txt`` under ``TMP_DIR``.

    Args:
        TMP_DIR: Folder to scan. It must provide ``as_posix()`` (for example a
            ``pathlib.Path``), because the path is converted that way before
            being handed to ``DirectoryLoader``.

    Returns:
        list: A new list containing the documents returned by
        ``DirectoryLoader.load()``, each file being read with ``TextLoader``.
    """
    text_file_loader = DirectoryLoader(
        TMP_DIR.as_posix(),
        glob="**/*.txt",
        loader_cls=TextLoader,
        show_progress=True,
    )
    return list(text_file_loader.load())

In [36]:
# Load the .txt documents found under TMP_DIR and report how many were read
documents = langchain_document_loader(TMP_DIR)
print(f"\nNumber of documents: {len(documents)}")

100%|██████████| 5/5 [00:00<00:00, 211.96it/s]


Number of documents: 5





## Text Splitters


In [37]:
# Splitter configuration: chunk size 1600 with an overlap of 200, using the
# separators below (listed from paragraph break down to the empty string)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)

## Vectorstores and Embeddings




### Embeddings

In [38]:
# Google Generative AI embedding client, configured with google_api_key
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=google_api_key,
    model="models/embedding-001",
)

### Vectorstores

In [39]:
def create_vectorstore(embeddings, documents, vectorstore_name):
    """Create a Chroma vector store from ``documents`` and persist it to disk.

    The store is written to ``LOCAL_VECTOR_STORE_DIR / vectorstore_name``, the
    same location the notebook opens existing stores from, so a store created
    here can be found again when it is loaded by name.

    Args:
        embeddings: Embedding function passed to ``Chroma.from_documents``.
        documents: Documents to index.
        vectorstore_name (str): Name of the sub-directory that holds the store.

    Returns:
        tuple: ``(vector_store, persist_directory)`` where ``persist_directory``
        is the store's directory as a string.
    """
    # Persist where the loading code looks, not in a CWD-relative
    # "./data/vector_stores" folder that nothing else in the notebook reads.
    persist_directory = LOCAL_VECTOR_STORE_DIR / vectorstore_name
    persist_directory.mkdir(parents=True, exist_ok=True)

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=str(persist_directory),
    )

    # The notebook imports Chroma twice; the later import (langchain_chroma)
    # wins, and that class has no persist() method. Only the legacy
    # langchain_community class needs the explicit call, so make it optional.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()

    return vector_store, str(persist_directory)

### Load Chroma vectorstore

In [41]:
# Open the "Vit_All_Google_Embeddings" store located under LOCAL_VECTOR_STORE_DIR
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=(LOCAL_VECTOR_STORE_DIR / "Vit_All_Google_Embeddings").as_posix(),
)



## Retrievers

In [43]:
def vectorestore_retriver(store=None):
    """Return a retriever built from a vector store.

    Args:
        store: Vector store to wrap; it must provide ``as_retriever()``.
            When omitted, the notebook-level ``vector_store`` is used, which
            is what this function did before the parameter existed.

    Returns:
        Whatever ``as_retriever()`` returns for the chosen store.
    """
    if store is None:
        # Backward-compatible default: the store held in the global `vector_store`.
        store = vector_store
    return store.as_retriever()


## Retrieval: the full pipeline

In [44]:
def setup_retrieval_pipeline(
    create_new_vectorstore=True,
    vectorstore_name="Vit_All_Google_Embeddings",
    chunk_size=1600,
    chunk_overlap=200,
    google_api_key="***",
):
    """
    Set up the retrieval pipeline and return a retriever.

    Either builds a new Chroma vectorstore (load documents from TMP_DIR →
    split into chunks → embed → create_vectorstore) or opens an existing one,
    then builds the retriever from that same vectorstore.

    Parameters:
        create_new_vectorstore (bool):
            If True, creates a new Chroma vectorstore from the source documents.
            If False, loads an existing vectorstore from disk.

        vectorstore_name (str):
            Name of the vectorstore. When loading, it is the directory inside
            LOCAL_VECTOR_STORE_DIR that is opened; when creating, it is
            forwarded to create_vectorstore.

        chunk_size (int):
            Maximum number of characters per text chunk when splitting documents.

        chunk_overlap (int):
            Number of overlapping characters between consecutive chunks.

        google_api_key (str):
            Google API key for the embeddings. If left at the "***" placeholder
            (or empty/None), the GOOGLE_API_KEY environment variable is used.

    Returns:
        retriever:
            The result of ``as_retriever()`` on the vectorstore that was
            created or loaded by this call.

    Raises:
        Exception: any error raised along the way is printed and re-raised.
    """

    try:
        # "***" is a placeholder, not a usable key: fall back to the same
        # environment variable the notebook reads for its module-level key.
        if not google_api_key or google_api_key == "***":
            google_api_key = os.environ.get("GOOGLE_API_KEY")

        # --- Initialize Embeddings ---
        print("🧠 Initializing Google Generative AI Embeddings ...")
        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=google_api_key
        )

        # --- Create or Load Vectorstore ---
        if create_new_vectorstore:
            print(f"🔧 Creating new Chroma vectorstore: {vectorstore_name} ...")

            # 1. Load documents
            documents = langchain_document_loader(TMP_DIR)

            # 2. Split into chunks
            splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", " ", ""],
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            chunks = splitter.split_documents(documents)

            # 3. Create and persist the vectorstore. create_vectorstore returns
            #    a (vector_store, persist_directory) tuple, so unpack it; binding
            #    the whole tuple would break the `_collection` access below.
            vector_store, _persist_directory = create_vectorstore(
                embeddings=embeddings,
                documents=chunks,
                vectorstore_name=vectorstore_name,
            )

        else:
            print(f"📦 Loading existing Chroma vectorstore: {vectorstore_name} ...")
            vector_store = Chroma(
                persist_directory=(LOCAL_VECTOR_STORE_DIR / vectorstore_name).as_posix(),
                embedding_function=embeddings
            )

        # --- Build Retriever ---
        # Use the store created/loaded above, not a module-level `vector_store`
        # that may point at a different (or not yet defined) store.
        retriever = vector_store.as_retriever()

        # --- Summary Output ---
        chunk_count = vector_store._collection.count()
        print("\n✅ Retrieval pipeline ready!")
        print(f"   ➤ Vectorstore: {vectorstore_name}")
        print("   ➤ Embeddings: Google Generative AI")
        print(f"   ➤ Chunks available: {chunk_count}")

        return retriever

    except Exception as e:
        # Report the failure for the notebook reader, then let it propagate.
        print(f"⚠️ Error in setup_retrieval_pipeline: {e}")
        raise
