In [1]:
from dotenv import load_dotenv
import os
import glob
import math
import google.generativeai as genai

# Load secrets from the project's .env file so the key never appears in the notebook.
load_dotenv("/home/harish/Desktop/chatbot/.env")
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # Fail fast here with an actionable message instead of passing None to
    # genai.configure and hitting a less obvious error later on.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to the .env file or export it in the environment."
    )
genai.configure(api_key=api_key)
# Chat model identifier used when constructing the GenerativeModel below.
model_name = "gemini-1.5-flash"

def chunk_text(text, max_length=500):
    """Split a text string into consecutive chunks of at most ``max_length`` characters.

    Chunks are cut at fixed character offsets (no overlap, no regard for word
    boundaries); only the last chunk may be shorter than ``max_length``.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` is zero or negative.
    """
    # Without this guard, 0 fails inside range() with an unrelated-looking
    # message and a negative value silently produces no chunks at all.
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

# 1. Find every candidate path under the docs directory (recursive glob).
doc_path = "/home/harish/Desktop/chatbot/docs/**/*"

# Keep only regular files with a supported extension. The glob also yields
# directories, and a directory whose name ends in ".txt" would break open().
# Sorting makes the chunk order reproducible, since glob order is arbitrary.
doc_files = sorted(
    f for f in glob.glob(doc_path, recursive=True)
    if os.path.isfile(f) and f.endswith((".md", ".txt", ".html"))
)

if not doc_files:
    raise ValueError("❌ No documents found. Check your docs path and file extensions!")

# 2. Read, chunk and embed each document. Every entry in `chunks` pairs the
#    raw chunk text with its embedding vector.
chunks = []
for f in doc_files:
    with open(f, "r", encoding="utf-8") as file:
        text = file.read()
    for chunk in chunk_text(text):
        # Whitespace-only chunks carry nothing to retrieve; skip the API call.
        if not chunk.strip():
            continue
        embedding = genai.embed_content(
            model="models/text-embedding-004",
            content=chunk
        )
        chunks.append({"text": chunk, "vector": embedding["embedding"]})

if not chunks:
    raise ValueError("❌ No chunks created. Are your documents empty?")

print(f"✅ Loaded {len(chunks)} chunks from {len(doc_files)} documents.")

# Cosine similarity helper
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two numeric vectors.

    Args:
        v1: First vector, a sequence of numbers.
        v2: Second vector, a sequence of numbers.

    Returns:
        The dot product divided by the product of the two magnitudes, or
        ``0.0`` when either vector has zero magnitude (the similarity is
        undefined there, and 0.0 ranks such a vector as unrelated).
    """
    # zip stops at the shorter input, so extra trailing components of the
    # longer vector do not contribute to the dot product.
    dot = sum(a * b for a, b in zip(v1, v2))
    mag1 = math.sqrt(sum(x ** 2 for x in v1))
    mag2 = math.sqrt(sum(x ** 2 for x in v2))
    denom = mag1 * mag2
    if denom == 0:
        # Avoid ZeroDivisionError for all-zero (or empty) vectors.
        return 0.0
    return dot / denom

# Set up the chat model that ask_bot uses to generate answers; the model id
# comes from `model_name` defined in the configuration at the top of the cell.
chat_model = genai.GenerativeModel(model_name)

# Ask Bot function
def ask_bot(question: str, top_k: int = 3) -> str:
    """Answer a question using the most relevant indexed document chunks.

    The question is embedded, every entry of the module-level ``chunks`` list
    is scored against it with ``cosine_similarity``, and the ``top_k``
    best-scoring excerpts are sent to ``chat_model`` as the only context.

    Args:
        question: The user's question; must contain non-whitespace text.
        top_k: Number of highest-scoring chunks to include in the prompt.
            Defaults to 3.

    Returns:
        The text of the chat model's reply.

    Raises:
        ValueError: If ``question`` is empty or whitespace-only, or if
            ``top_k`` is less than 1.

    Example:
        ask_bot("your question")
    """
    if not question or not question.strip():
        raise ValueError("question must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Embed the question with the same embedding model used for the chunks.
    q_embed = genai.embed_content(
        model="models/text-embedding-004",
        content=question
    )["embedding"]

    # Score every chunk, then keep the top_k most similar. The sort is stable,
    # so equally scored chunks keep their original relative order.
    scored = [(c, cosine_similarity(q_embed, c["vector"])) for c in chunks]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    ranked = scored[:top_k]

    # Build prompt
    context = "\n---\n".join(c["text"] for c, _ in ranked)
    prompt = f"""
    You are an assistant that answers questions based only on the following document excerpts.
    If the answer isn't clearly in the documents, reply with "I don't know."
    DOCUMENTS:
    {context}
    QUESTION: {question}
    """

    # Ask Gemini
    # NOTE(review): response.text may raise if the response has no usable
    # content (e.g. a blocked reply) - confirm whether a fallback is wanted.
    response = chat_model.generate_content([{"role": "user", "parts": [prompt]}])
    return response.text


✅ Loaded 855 chunks from 83 documents.


In [2]:
# Example query; the returned answer string is displayed as the cell output.
ask_bot("what is aws")

'Based on the provided text, AWS refers to Amazon Web Services.  The document describes setting up Auto Scaling for a Rails application on AWS, utilizing services like EC2, Auto Scaling, and Load Balancing.\n'