<a href="https://colab.research.google.com/github/khokhakhokha/chatbot-for-business-website/blob/main/Fixed_LangChain_Notebook_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ============================================
# ðŸ“¦ INSTALL ALL REQUIRED PACKAGES (STABLE SETUP)
# ============================================
# !pip uninstall -y langchain langchain-core langchain-community langchain-text-splitters
# !pip install -q langchain==0.0.350 langchain-google-genai chromadb unstructured tiktoken nbformat streamlit
!pip install langchain==0.1.4
!pip install langchain-google-genai==0.0.6
!pip install chromadb
!pip install unstructured
!pip install tiktoken
!pip install nbformat
!pip install streamlit==1.28.0
print("âœ… Installation complete. Please restart runtime before continuing.")


## 2. Import All Dependencies

Import all necessary libraries and define helper functions.

In [14]:

# ============================================
# CORE PYTHON LIBRARIES
# ============================================
from pathlib import Path
import os
from typing import List, Optional, Dict, Any

# ============================================
# LANGCHAIN CORE COMPONENTS
# ============================================
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import HumanMessage, AIMessage, BaseMessage

from langchain.memory import ConversationBufferMemory

# ============================================
# LANGCHAIN CHAIN COMPONENTS
# ============================================
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

# ============================================
# GOOGLE GEMINI INTEGRATION
# ============================================
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI



# ============================================
# HELPER FUNCTIONS
# ============================================
def require_env_var(name: str):
    val = os.environ.get(name)
    if not val:
        raise EnvironmentError(f"Environment variable {name} is required. Set it and re-run the cell.")
    return val

print("âœ… All dependencies imported successfully and ready to use!")


âœ… All dependencies imported successfully and ready to use!


In [4]:
import os
os.environ['GOOGLE_API_KEY'] = 'AIzaSyC4frdIQu2rZ2BDutQZuOayDii8ioXCoSw'

## 3. Document Loading and Splitting

Functions to load documents from various formats and split them into manageable chunks.

In [5]:
def load_documents(directory: str = './docs') -> List[Any]:
    p = Path(directory)
    if not p.exists():
        raise FileNotFoundError(f"Documents directory not found: {p.resolve()}")

    docs = []
    for f in p.rglob('*'):
        if f.is_dir():
            continue

        ext = f.suffix.lower()
        try:
            if ext in ('.txt', '.md', '.csv', '.json'):
                loader = TextLoader(str(f), encoding='utf-8')
                docs.extend(loader.load())
            elif ext == '.pdf':
                loader = UnstructuredPDFLoader(str(f))
                docs.extend(loader.load())
            else:
                try:
                    loader = TextLoader(str(f), encoding='utf-8')
                    docs.extend(loader.load())
                except Exception:
                    print(f'Skipping unsupported file: {f.name}')
        except Exception as e:
            print(f'Failed to load {f.name}: {e}')

    print(f'Loaded {len(docs)} documents from {directory}')
    return docs

def split_documents(documents: List[Any], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Any]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = splitter.split_documents(documents)
    print(f'Split into {len(chunks)} chunks')
    return chunks

from langchain_community.embeddings import HuggingFaceEmbeddings

def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [6]:

def build_chroma(documents, persist_directory="./chroma_db", embedding=None):
    vectordb = Chroma.from_documents(
        documents,
        embedding,
        persist_directory=persist_directory
    )
    vectordb.persist()
    return vectordb

from pathlib import Path
from typing import List, Optional, Any
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Function to get Hugging Face embeddings
def get_hf_embeddings(model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> HuggingFaceEmbeddings:
    return HuggingFaceEmbeddings(model_name=model_name)

# Build Chroma vectorstore using HF embeddings
def build_chroma(texts: List[Any], persist_directory: str = './chroma_db', embedding: Optional[HuggingFaceEmbeddings] = None) -> Chroma:
    embedding = embedding or get_hf_embeddings()
    vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)
    print(f'Chroma vectorstore ready at {persist_directory}')
    return vectordb

# Load persisted Chroma vectorstore using HF embeddings
def load_chroma(persist_directory: str = './chroma_db', embedding: Optional[HuggingFaceEmbeddings] = None) -> Chroma:
    embedding = embedding or get_hf_embeddings()
    p = Path(persist_directory)
    if not p.exists() or not any(p.iterdir()):
        raise FileNotFoundError(f'Chroma persistence directory not found at {persist_directory}. Run document ingestion first.')
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    print(f'Loaded persisted Chroma from {persist_directory}')
    return vectordb



## 5. Google Gemini Chat Model Configuration

Initialize the Google Gemini language model for conversational AI.

In [7]:
def get_google_chat_model(temperature: float = 0.2, model_name: Optional[str] = None) -> ChatGoogleGenerativeAI:
    require_env_var('GOOGLE_API_KEY')
    model_name = model_name or os.environ.get('GOOGLE_MODEL', 'gemini-2.5-flash')
    return ChatGoogleGenerativeAI(temperature=temperature, model=model_name,convert_system_message_to_human=True)

## 6. RAG Chain Construction

Build the complete RAG chain with history-aware retrieval and document combination.

In [8]:
def build_rag_chain(vectordb: Chroma, llm: Optional[ChatGoogleGenerativeAI] = None, k: int = 4):
    llm = llm or get_google_chat_model()
    retriever = vectordb.as_retriever(search_kwargs={'k': k})
    rag = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True)
    print('RAG chain built successfully using ConversationalRetrievalChain')
    return {'chain': rag}


## 7. Document Ingestion and Indexing

Load documents, split them into chunks, and create the vector store index.

In [9]:
docs_dir = './docs'
persist_dir = './chroma_db'

print('Loading documents...')
docs = load_documents(docs_dir)

print('\nSplitting documents...')
chunks = split_documents(docs, chunk_size=1000, chunk_overlap=200)

print('\nCreating embeddings and building vector store...')
emb = get_hf_embeddings()   # <-- use HF embeddings here
vectordb = build_chroma(chunks, persist_directory=persist_dir, embedding=emb)

print('\nDocument ingestion complete!')

Loading documents...
Loaded 6 documents from ./docs

Splitting documents...
Split into 13 chunks

Creating embeddings and building vector store...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Chroma vectorstore ready at ./chroma_db

Document ingestion complete!


## Chat history

In [12]:
from langchain.schema import HumanMessage, AIMessage

class ChatHistory:
    def __init__(self):
        self.messages = []

    def add_user_message(self, text):
        self.messages.append(HumanMessage(content=text))

    def add_ai_message(self, text):
        self.messages.append(AIMessage(content=text))

    def get_messages(self):
        # returns list of BaseMessage objects
        return self.messages



## 8. Interactive Chat Interface

Test the RAG chatbot with an interactive terminal-based chat loop.

In [13]:
print('Initializing chatbot...')

try:
    vectordb = load_chroma(persist_dir)
except:
    print('Could not load vector store. Run the ingestion cell first.')
    raise

llm = get_google_chat_model()
rag_bundle = build_rag_chain(vectordb, llm=llm, k=4)
rag_chain = rag_bundle['chain']

chat_history = ChatHistory()

print('\nChatbot ready! Type exit or quit to stop.\n')
print('=' * 60)

while True:
    user_input = input('\nYou: ').strip()

    if user_input.lower() in ('exit', 'quit', 'q'):
        print('\nGoodbye!')
        break

    if not user_input:
        continue

    result = rag_chain.invoke({'question': user_input, 'chat_history': chat_history.get_messages()})
    answer = result.get('answer', 'No answer generated.')

    print(f'\nAssistant: {answer}')

    chat_history.add_user_message(user_input)
    chat_history.add_ai_message(answer)

    context_docs = result.get('context', [])
    if context_docs:
        print('\nSources:')
        for i, doc in enumerate(context_docs[:3], 1):
            source = doc.metadata.get('source', 'unknown')
            preview = doc.page_content[:150].replace('\n', ' ')
            print(f'  {i}. {source}: {preview}...')

    print('\n' + '-' * 60)

Initializing chatbot...
Loaded persisted Chroma from ./chroma_db
RAG chain built successfully using ConversationalRetrievalChain

Chatbot ready! Type exit or quit to stop.


You: what

Assistant: I don't know the answer because your question is incomplete. Please tell me what information you are looking for.

------------------------------------------------------------

You: the company u are working for

Assistant: I am working for PureSkin.

------------------------------------------------------------


KeyboardInterrupt: Interrupted by user