Local RAG Chatbot with Ollama

A complete retrieval-augmented generation system running locally with Ollama LLMs.

In [1]:
# !pip install pydantic==2.11.0
# !pip install gradio
# !pip install openai # For interacting with Ollama's OpenAI-compatible API
# !pip install langchain-core langchain-community langchain-text-splitters langchain-chroma # Core LangChain components
# !pip install chromadb sentence-transformers huggingface-hub
# !pip install scikit-learn plotly numpy python-dotenv # For visualization and utilities
# !pip install ipykernel ipywidgets # For Jupyter environment

In [2]:
# Core imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from IPython.display import Markdown
import shutil
import uuid

# OpenAI Library
from openai import OpenAI

# LangChain components needed
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Visualization imports
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go


In [3]:
# Load environment variables from .env file
load_dotenv(override=True)

True

In [4]:
# Simple dictionary to store chat histories as lists of messages ---
chat_histories = {}

In [5]:
def setup_ollama_client():
    """
    Initialize and configure the OpenAI client to point to the Ollama server.
    Ensures the correct base URL (/v1) is used.
    """
    # URL must end in /v1 for Ollama's OpenAI-compatible API
    ollama_url = os.getenv('OLLAMA_URL', 'http://localhost:11434/v1')
    ollama_api_key = os.getenv('OLLAMA_API_KEY', 'ollama')
    ollama_model = os.getenv('OLLAMA_MODEL', 'llama3.2:latest')

    print(f"🔧 Configuring OpenAI client for Ollama...")
    print(f"   URL: {ollama_url}")
    print(f"   Model: {ollama_model}")

    # Create the OpenAI client, pointing it to Ollama
    try:
        client = OpenAI(
            base_url=ollama_url,
            api_key=ollama_api_key,
        )
        print("   Client configured successfully.")
        return client, ollama_model
    except Exception as e:
        print(f"     Error configuring client: {e}")
        print("      Ensure Ollama is running and the URL is correct (including /v1).")
        return None, None

In [6]:
def test_direct_ollama(client, model_name):
    """
    Sends a simple test prompt directly to Ollama using the configured
    OpenAI client to verify the connection.
    Uses the correct client.chat.completions.create() method.
    """
    if not client: return False
    print("🧪 Testing Ollama connection...")
    try:
        # FIXED: Use the standard OpenAI library method create()
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": "What is 2+2? Answer in one word."}],
            max_tokens=10
        )
        # FIXED: Access the response content correctly
        answer = response.choices[0].message.content
        print(f"   Ollama test response: {answer.strip()}")
        return True
    except Exception as e:
        print(f"   Ollama connection failed: {e}")
        print("     Make sure Ollama is running and accessible at the configured URL (including /v1).")
        return False

In [7]:
def load_and_process_documents(knowledge_base_path="knowledge-base"):
    """
    Loads documents from the specified directory, adds metadata,
    and splits them into manageable chunks using RecursiveCharacterTextSplitter.
    Uses TextLoader via loader_cls.
    """

    print("Loading documents from knowledge base...")

    folders = glob.glob(f"{knowledge_base_path}/*")
    documents = []
    text_loader_kwargs = {'encoding': 'utf-8'} # Standard encoding

    for folder in folders:
        doc_type = os.path.basename(folder)
        try:
            loader = DirectoryLoader(
                folder,
                glob="**/*.md",
                loader_cls=TextLoader,
                loader_kwargs=text_loader_kwargs
            )
            folder_docs = loader.load()
            if folder_docs: # Check if loader returned any documents
                 for doc in folder_docs: # Add metadata to each loaded doc
                     doc.metadata["doc_type"] = doc_type
                 documents.extend(folder_docs)
            else:
                 print(f"   Warning: No documents loaded from folder '{folder}' with glob '**/*.md'")
        except Exception as e:
            print(f"   Error loading documents from folder '{folder}': {e}")

    # Check if any documents were loaded overall
    if not documents:
         print("   CRITICAL: No documents were successfully loaded. Check paths, glob pattern, and file types.")
         return [] # Return empty list if no documents loaded

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""], # Define separators for splitting
    )
    
    # Split the loaded documents into chunks
    chunks = text_splitter.split_documents(documents)

    print(f"   Total documents loaded: {len(documents)}")
    print(f"   Total chunks created: {len(chunks)}")
    print(f"   ℹDocument types found: {set(doc.metadata.get('doc_type', 'N/A') for doc in documents)}")

    return chunks

In [8]:
def create_vector_store(chunks, db_name="vector_db"):
    """
    Creates a Chroma vector store from document chunks using
    HuggingFace embeddings. Deletes any existing database first.
    """
    
    if not chunks:
        print("   Skipping vector store creation: No document chunks provided.")
        return None
    
    print("Creating vector store...")

    # Initialize embeddings model
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True} # Normalize embeddings for better similarity search
    )
    # Remove old database directory if it exists
    if os.path.exists(db_name):
        print("   Removing existing vector database...")
        shutil.rmtree(db_name, ignore_errors=True)

    # Create and persist the vector store
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=db_name
    )
    
    print(f"   Vector store created with {vectorstore._collection.count()} documents")
    return vectorstore

In [9]:
def visualize_vector_store(vectorstore):
    """
    Creates 2D and 3D t-SNE visualizations of the document chunk embeddings
    in the vector store using Plotly. Uses 'max_iter' instead of 'n_iter'.
    """
    if not vectorstore:
        print("   Skipping visualization: Vector store not available.")
        return
    print("Creating vector store visualizations...")

    try:
        # Get embeddings, documents, and metadata from the Chroma collection
        collection = vectorstore._collection
        result = collection.get(include=['embeddings', 'documents', 'metadatas'])

        vectors = np.array(result['embeddings'])
        if vectors.shape[0] < 2: # Need at least 2 points for TSNE
             print("   Skipping visualization: Not enough documents for TSNE.")
             return

        # Handle perplexity: must be less than the number of samples
        perplexity_value = min(30, vectors.shape[0] - 1)
        if perplexity_value <= 0:
             print("   Skipping visualization: Perplexity must be positive.")
             return


        documents_text = result['documents']
        metadatas = result['metadatas']
        doc_types = [metadata.get('doc_type', 'unknown') for metadata in metadatas]

        # Define colors for different document types
        color_map = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
        colors = [color_map.get(t, 'gray') for t in doc_types]

        # --- 2D Visualization using t-SNE ---
        print("   Generating 2D plot...")
        
        tsne_2d = TSNE(n_components=2, random_state=42, perplexity=perplexity_value, max_iter=300) 
        reduced_vectors_2d = tsne_2d.fit_transform(vectors)

        fig_2d = go.Figure(data=[go.Scatter(
            x=reduced_vectors_2d[:, 0],
            y=reduced_vectors_2d[:, 1],
            mode='markers',
            marker=dict(size=5, color=colors, opacity=0.8),
            text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents_text)], # Tooltip text
            hoverinfo='text'
        )])
        fig_2d.update_layout(title='2D Vector Store Visualization (t-SNE)', width=800, height=600)
        fig_2d.show()

        # --- 3D Visualization using t-SNE ---
        if vectors.shape[0] >= 4: # Need enough points for 3D
            print("   Generating 3D plot...")
            # FIXED: Changed n_iter=300 to max_iter=300
            tsne_3d = TSNE(n_components=3, random_state=42, perplexity=perplexity_value, max_iter=300) 
            reduced_vectors_3d = tsne_3d.fit_transform(vectors)

            fig_3d = go.Figure(data=[go.Scatter3d(
                x=reduced_vectors_3d[:, 0],
                y=reduced_vectors_3d[:, 1],
                z=reduced_vectors_3d[:, 2],
                mode='markers',
                marker=dict(size=4, color=colors, opacity=0.8),
                text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents_text)], # Tooltip text
                hoverinfo='text'
            )])
            fig_3d.update_layout(title='3D Vector Store Visualization (t-SNE)', width=900, height=700)
            fig_3d.show()
        else:
            print("   Skipping 3D plot: Need at least 4 documents.")

    except ImportError:
        print("   Visualization skipped: Please install scikit-learn and plotly.")
    except Exception as e:
        print(f"   Visualization failed: {e}")

In [10]:
def format_docs(docs):
    """ Helper function to combine document page contents into a single string. """
    return "\n\n".join(doc.page_content for doc in docs)

In [11]:
def run_rag_with_history(client, model_name, retriever, question, session_id):
    """
    Core RAG logic using the direct openai client:
    1. Retrieves relevant documents based on the question.
    2. Fetches the chat history for the current session.
    3. Constructs a prompt including system instructions, retrieved context, and chat history.
    4. Calls the Ollama model via the OpenAI client.
    5. Updates the chat history with the new question and answer.
    """

    print(f"\n--- Processing Query for session {session_id} ---")
    print(f"   User Question: {question}")

    # 1. Retrieve relevant documents
    try:
        retrieved_docs = retriever.invoke(question)
        context = format_docs(retrieved_docs)
        print(f"   Retrieved {len(retrieved_docs)} documents.")
    except Exception as e:
        print(f"   Error during document retrieval: {e}")

        context = "" # Proceed without context if retrieval fails
        retrieved_docs = []

    # 2. Get chat history for the session (simple list of dicts)
    history = chat_histories.get(session_id, [])

    # 3. Construct the prompt messages for the OpenAI client
    system_prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces "
        "of retrieved context to answer the question. If the context doesn't provide the answer, "
        "say that you don't have enough information based on the provided documents. "
        "Do not make up information. Keep the answer concise and relevant."
    )

    messages = [{"role": "system", "content": system_prompt}]

    # Add formatted context if available
    if context:
        messages.append({"role": "system", "content": f"Context information based on provided documents:\n{context}"})
    else:
         messages.append({"role": "system", "content": "No relevant context documents were found."})

    # Add past history (if any)
    messages.extend(history)

    # Add current user question
    messages.append({"role": "user", "content": question})

    # 4. Call Ollama using the OpenAI client
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.3, # Lower temperature for more focused answers
            max_tokens=500,  # Limit response length
        )

        answer = response.choices[0].message.content
        print(f"   LLM Answer: {answer[:100].strip()}...")

        # 5. Update history (simple list append)
        current_history = chat_histories.get(session_id, [])
        current_history.append({"role": "user", "content": question})
        current_history.append({"role": "assistant", "content": answer})

        # Optional: Limit history length to prevent excessive token usage
        MAX_HISTORY_TURNS = 5 # Keep last 5 Q/A pairs (10 messages total)
        if len(current_history) > MAX_HISTORY_TURNS * 2:
            current_history = current_history[-(MAX_HISTORY_TURNS * 2):]

        chat_histories[session_id] = current_history # Store updated history

        return answer

    except Exception as e:
        error_msg = f"Error during Ollama call: {e}"
        print(f"   {error_msg}")
        # Consider more specific error handling if needed
        return "Sorry, I encountered an error trying to contact the AI model. Please ensure Ollama is running."

In [12]:
def create_chat_interface(client, model_name, retriever):
    """
    Sets up and launches the Gradio chat interface.
    Manages a unique session ID for history tracking within the app instance.
    """

    print("Setting up chat interface...")

    # Generate a unique session_id for this Gradio instance run
    session_id = str(uuid.uuid4())
    print(f"   Using Session ID for this run: {session_id}")

    def chat_function_gradio(message, history):
        """
        Wrapper function for Gradio. It calls the main RAG logic function,
        passing the message and the unique session_id for this instance.
        Gradio's 'history' parameter is not used for our internal memory management.
        """
        return run_rag_with_history(client, model_name, retriever, message, session_id)

    # Create the Gradio interface
    interface = gr.ChatInterface(
        fn=chat_function_gradio, # The function Gradio calls for each chat turn
        title="Local RAG Chatbot",
        description="Ask questions about your documents. Powered by local Ollama LLM.",
        
        examples=[ # Example prompts for the user
            "What is Insurellm?",
            "When was it founded?",
            "What services does Insurellm offer?",
        ],
        cache_examples=False # Disable example caching if content changes
    )
    return interface

In [15]:
def main():
    """
    Main function to set up and run the RAG chatbot application.
    Orchestrates client setup, document loading, vector store creation,
    testing, and launching the Gradio UI.
    """
    print("Starting Local RAG Chatbot Setup...")
    print("=" * 50)

    # Step 1: Setup Ollama client using OpenAI library
    client, model_name = setup_ollama_client()
    if not client:
        print("Exiting due to client setup failure.")
        return # Stop if client setup fails

    # Step 2: Test Ollama connection
    if not test_direct_ollama(client, model_name):
        print("Exiting due to connection test failure.")
        return # Stop if connection fails

    # Step 3: Load and process documents
    chunks = load_and_process_documents()
    if not chunks:
        print("Exiting because no documents were processed.")
        return # Stop if no documents

    # Step 4: Create vector store
    vectorstore = create_vector_store(chunks)
    if not vectorstore:
        print("Exiting due to vector store creation failure.")
        return # Stop if vector store fails
    
    # Create the retriever from the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) # Retrieve top 5 chunks

    # Step 5: Visualize vector store (Optional, requires Plotly/Sklearn)
    visualize_vector_store(vectorstore)

    # Step 6: Launch chat interface
    print("Launching chat interface...")
    interface = create_chat_interface(client, model_name, retriever)
    interface.launch(
        inbrowser=True,
        share=False,
        server_name="127.0.0.1",
        server_port=7860,
        debug=True
    )
    print("Chatbot interface launched. Check your browser.")

In [16]:
main()

Starting Local RAG Chatbot Setup...
🔧 Configuring OpenAI client for Ollama...
   URL: http://localhost:11434/v1
   Model: llama3.2:latest
   Client configured successfully.
🧪 Testing Ollama connection...
   Ollama test response: Four.
Loading documents from knowledge base...
   Total documents loaded: 31
   Total chunks created: 152
   ℹDocument types found: {'company', 'products', 'employees', 'contracts'}
Creating vector store...
   Removing existing vector database...
   Vector store created with 304 documents
Creating vector store visualizations...
   Generating 2D plot...


   Generating 3D plot...


Launching chat interface...
Setting up chat interface...
   Using Session ID for this run: 93e27a20-21b5-4dfa-8839-0da264e18f0d



The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.
Chatbot interface launched. Check your browser.
