<a href="https://colab.research.google.com/github/jayshri/AIAgents/blob/main/ChatBotFromWebpageNov21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install chromadb langchain openai langchain-community langchain-text-splitters langchain-openai



In [None]:
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
import os
from langchain_openai import OpenAIEmbeddings
from google.colab import userdata
import getpass
from langchain_openai import ChatOpenAI

def create_langchain_documents(df: pd.DataFrame) -> list[Document]:
  """Turn every row of the chunk dataframe into a LangChain ``Document``.

  The ``content`` column becomes the page content (converted with ``str``).
  ``chunk_id`` and ``length`` are cast to ``int`` and stored, together with
  ``source`` and ``title``, as the document metadata.
  """
  documents = []
  for _, record in df.iterrows():
    text = str(record["content"])
    metadata = {
        "chunk_id": int(record["chunk_id"]),
        "length": int(record["length"]),
        "source": record["source"],
        "title": record["title"],
    }
    documents.append(Document(page_content=text, metadata=metadata))
  return documents


def build_vectorstore(df: pd.DataFrame, persist_dir: str = "chroma_db") -> Chroma:
  """Embed the chunk dataframe and store it in a persistent Chroma collection.

  Steps:
    1. Convert the dataframe rows into LangChain documents.
    2. Create the embedding model.
    3. Build the Chroma vector store from the documents.

  Args:
    df: Chunk dataframe with the columns read by
      ``create_langchain_documents``.
    persist_dir: Directory in which Chroma keeps its on-disk data.

  Returns:
    The Chroma vector store holding the embedded documents.
  """
  docs = create_langchain_documents(df)
  embeddings = create_embedding()

  vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    # Honour the caller's directory; it used to be hard-coded, which
    # silently ignored the ``persist_dir`` argument.
    persist_directory=persist_dir,
    collection_name="my_openai_embeddings",
  )

  # NOTE(review): Chroma 0.4+ persists automatically and the wrapper's
  # ``persist()`` is deprecated, so only call it when the installed version
  # still provides it.
  persist = getattr(vectorstore, "persist", None)
  if callable(persist):
    persist()
  return vectorstore

def create_embedding():
  """Return the OpenAI embedding model used to index the chunks.

  Reads the ``OPENAI_API_KEY`` Colab secret and exports it through the
  environment before the ``text-embedding-3-small`` model is created.
  """
  os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
  return OpenAIEmbeddings(model="text-embedding-3-small")

def create_dataFrame(url):
  """Load a web page, split it into chunks and return them as a dataframe.

  The page is fetched with ``WebBaseLoader`` and split into chunks of 1000
  characters with an overlap of 200. Each chunk becomes one row with the
  columns ``chunk_id``, ``content``, ``length``, ``source`` and ``title``.
  """
  # Fetch the page content.
  pages = WebBaseLoader(url).load()

  # Cut the loaded document(s) into overlapping chunks.
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
  pieces = splitter.split_documents(pages)

  # One record per chunk; the position in the split order is the chunk id.
  records = [
    {
      'chunk_id': index,
      'content': piece.page_content,
      'length': len(piece.page_content),
      'source': piece.metadata.get('source'),
      'title': piece.metadata.get('title'),
    }
    for index, piece in enumerate(pieces)
  ]
  return pd.DataFrame(records)


# Page whose content backs the chatbot's answers.
_RAG_SOURCE_URL = "https://www.nasa.gov/what-is-artificial-intelligence/"

# Process-wide cache for the expensive RAG resources (vector store + LLM).
_rag_resources = {}


def _get_rag_resources():
  """Build the vector store and the LLM on first use, then reuse them.

  Previously the page was downloaded, chunked and embedded for every chat
  message, which is slow and costs one embedding run per message.
  NOTE(review): each rebuild also appears to add the same chunks again to
  the persisted Chroma collection, duplicating search results — confirm.
  """
  if not _rag_resources:
    df = create_dataFrame(_RAG_SOURCE_URL)
    # Built before the LLM: this step exports OPENAI_API_KEY via
    # create_embedding(), which ChatOpenAI relies on.
    vector_store = build_vectorstore(df, "chroma_db")
    llm = ChatOpenAI(
      model="gpt-4.1-mini",
      temperature=0
    )
    # Only publish to the cache once everything was created successfully.
    _rag_resources["vector_store"] = vector_store
    _rag_resources["llm"] = llm
  return _rag_resources["vector_store"], _rag_resources["llm"]


def create_ai_agent(query, history):
  """Chat callback: answer ``query`` with the RAG pipeline.

  Args:
    query: The user's new message.
    history: Previous conversation turns, forwarded to ``create_rag_agent``.

  Returns:
    ``"No query provided."`` for a missing or whitespace-only query,
    otherwise the model's answer prefixed with ``"Answer: "``.
  """
  # Reject empty input before doing any network or embedding work.
  if not query or not query.strip():
    return "No query provided."

  vector_store, llm = _get_rag_resources()
  response = create_rag_agent(query, llm, vector_store, history)
  return "Answer: " + response

def _format_history(history, max_turns=5):
  """Render the most recent chat turns as ``User:``/``Assistant:`` text.

  Supports both history shapes a chat UI may hand over:
    * pairs: ``[[user_message, bot_message], ...]``
    * messages: ``[{"role": ..., "content": ...}, ...]``

  Returns an empty string when there is no history.
  """
  if not history:
    return ""

  if all(isinstance(item, dict) for item in history):
    rendered = []
    # One turn is a user message plus an assistant message.
    for message in history[-2 * max_turns:]:
      content = message.get("content")
      if message.get("role") == "user":
        rendered.append(f"User: {content}\n")
      else:
        rendered.append(f"Assistant: {content}\n\n")
    return "".join(rendered)

  return "".join(
    f"User: {user}\nAssistant: {bot}\n\n" for user, bot in history[-max_turns:]
  )


def create_rag_agent(query, llm, vector_store, history):
  """Answer ``query`` from retrieved context plus recent conversation history.

  Args:
    query: The user's new question.
    llm: Chat model; its ``invoke(prompt)`` result must expose ``.content``.
    vector_store: Store queried with ``similarity_search(query, k=4)``.
    history: Previous turns (pair or role/content-dict format); may be
      empty or ``None``. Only the last five turns are used.

  Returns:
    The model's answer text, or a fixed message when the search returns
    no documents.
  """
  docs = vector_store.similarity_search(query, k=4)

  if not docs:
    return "No documents found for the given query."

  # Build context string
  context = "\n\n---\n\n".join(d.page_content for d in docs)

  # Build conversation history
  history_text = _format_history(history)

  # Build prompt
  prompt = f"""
  You are a helpful assistant chatting with a user.
You have:
- Conversation history
- Retrieved context from a vector database

Use BOTH the history and the context below to answer.
If the answer is not supported by the context, say you don't know.

Conversation so far:
{history_text}

Retrieved context:
{context}

User's new question:
{query}

Provide a clear, concise answer:
  """
  response = llm.invoke(prompt)
  return response.content


# Gradio provides the chat user interface: it passes each user message (and
# the chat history) to create_ai_agent and displays the returned response.
chatbot = gr.ChatInterface(
    create_ai_agent,
    title="My Simple RAG App",
    description="This app uses a webpage database + OpenAI to answer questions from your custom data.",
)
chatbot.launch(debug=True)


  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d33aaf7093154789c5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
