## **1. Install and import the necessary libraries**

In [None]:
!pip install pypdf openai gradio langchain
!pip install langchain_openai
!pip install PyPDF2
!pip install -U langchain-community
!pip install chromadb

In [2]:
import requests
import zipfile
import os

# Replace with the raw file download link
zip_file_url = "https://github.com/genaiatutd/Spring-2025-Chatbot/raw/main/Gen-AI-session-20250210T210052Z-001.zip"

# Local name of the downloaded archive and the folder it is unpacked into.
output_file = "chatbot_files.zip"
extracted_folder = "chatbot_content"

# Download the ZIP file.
# - timeout: without it a stalled connection would hang the cell forever.
# - `with`: a streamed response keeps its connection open until it is closed,
#   so the context manager releases it even if writing the file fails.
with requests.get(zip_file_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        # Write the body in 8 KiB chunks so the archive is never held in memory whole.
        with open(output_file, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {output_file}")

        # Extract the ZIP file
        with zipfile.ZipFile(output_file, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder)
        print(f"Extraction complete. Contents are in '{extracted_folder}/'")

    else:
        print(f"Failed to download file. HTTP Status Code: {response.status_code}")

Downloaded: chatbot_files.zip
Extraction complete. Contents are in 'chatbot_content/'


In [3]:
import os
import PyPDF2
import gradio as gr
from PIL import Image
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
import openai

# 🔹 Set up OpenAI API key
# Never paste the key into the notebook: it would be saved with the file and
# leak when the notebook is shared. Reuse OPENAI_API_KEY if it is already set
# in the environment; otherwise prompt for it without echoing the input.
import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")
# Export the key through the environment as well, for code that reads it from there.
os.environ["OPENAI_API_KEY"] = openai.api_key

## **2. Dataset setup**

In [4]:
# 🔹 Load the robot image
image_path = "/content/chatbot_content/Gen-AI-session/my_robot.png"
# Open the file in a context manager so its handle is released right away;
# resize() returns an independent in-memory copy that outlives the `with`.
with Image.open(image_path) as source_image:
    robot_image = source_image.resize((900, 300))

# 🔹 Set directories
pdf_directory = "/content/chatbot_content/Gen-AI-session"
persist_directory = "/content/vectorstore"

# 🔹 Ensure persist directory exists (no error if it is already there)
os.makedirs(persist_directory, exist_ok=True)

# 🔹 Load PDFs into LangChain Document Loader
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the chunk order) the same on every run.
all_docs = []
for file in sorted(os.listdir(pdf_directory)):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(pdf_directory, file))
        docs = loader.load()
        all_docs.extend(docs)

print(f"📄 Loaded {len(all_docs)} pages from PDFs.")

📄 Loaded 32 pages from PDFs.


## **3. Split the PDFs into data chunks and store them in Chroma**

In [5]:
# 🔹 Break the loaded pages into overlapping chunks (chunk_size=500, chunk_overlap=50)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(all_docs)

print(f"📑 Split into {len(chunks)} document chunks.")

# 🔹 Embed the chunks with OpenAI and store them in a Chroma vector store
# that is persisted under `persist_directory`.
# NOTE(review): re-running this cell against an existing persist directory may
# add the same chunks again — confirm against the Chroma docs before re-running.
embedding_function = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    chunks,
    embedding=embedding_function,
    persist_directory=persist_directory,
)

print(f"✅ Successfully stored {len(vectorstore.get()['documents'])} document chunks in Chroma!")

📑 Split into 131 document chunks.


  embedding_function = OpenAIEmbeddings()


✅ Successfully stored 131 document chunks in Chroma!


## **4. Initiate RAG**

In [6]:
# 🔹 Building blocks of the RAG pipeline
# Retriever: similarity search over the vector store, returning the top 3 chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
# Prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")
# Chat model that writes the final answer; temperature is set to 0.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# 🔹 Function to format retrieved documents
def format_docs(docs):
    """Join the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document whose text is not blank,
        in the original order, separated by one empty line.
    """
    non_blank_texts = []
    for document in docs:
        text = document.page_content
        # Skip documents that contain nothing but whitespace.
        if text.strip():
            non_blank_texts.append(text)
    return "\n\n".join(non_blank_texts)

# 🔹 Assemble the RAG chain
# The chain input is used twice: piped through the retriever and format_docs to
# build "context", and passed through unchanged as "question". The resulting
# mapping fills the prompt, the LLM answers it, and the parser returns a string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# 🔹 Start with an empty conversation history
conversation_history = list()



## **5. Response function to provide contextually rich answers**

In [7]:
# 🔹 Function to generate chatbot responses
# Small-talk phrases answered without calling the retriever or the LLM.
# Entries are lowercase and unpunctuated because the question is normalized
# the same way before the lookup.
_GREETINGS = frozenset({"hi", "hello", "hey", "hola"})
_THANKS = frozenset({"thank you", "thanks", "thx", "ty"})


def generate_response(question):
    """Answer a user question using the RAG chain.

    Args:
        question: The text typed by the user.

    Returns:
        A string: a canned reply for invalid input, greetings and thanks;
        a fallback message when no context could be retrieved; otherwise
        the answer produced by ``rag_chain``.

    Side effects:
        Appends the question/answer pair to ``conversation_history`` when
        the RAG chain was used.
    """
    import string

    # Reject non-strings and empty / whitespace-only input.
    if not isinstance(question, str) or not question.strip():
        return "Invalid input. Please ask a relevant question."

    question = question.strip()

    # Normalize for the small-talk lookup: lowercase and drop surrounding
    # punctuation so "Thank you!" and "hi!!" match the entries above.
    normalized = question.lower().strip(string.punctuation + string.whitespace)

    if normalized in _GREETINGS:
        return "Hello! How can I assist you today?"
    if normalized in _THANKS:
        return "You're welcome! Let me know if you need more help."

    # Retrieve context from Chroma up front so an empty result can be
    # reported without calling the LLM.
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)

    # If no context is found, return a relevant response
    if not formatted_context.strip():
        return "I couldn't find relevant information in my database. Can you clarify or provide more details?"

    # Pass the bare question: rag_chain retrieves and formats its own context.
    # Sending a pre-built "Context: ... Question: ..." string would make the
    # chain search with that whole string and put the context in the prompt twice.
    response = rag_chain.invoke(question)

    # Append conversation history (the list is mutated in place, so no
    # `global` statement is needed).
    conversation_history.append(f"User: {question}\nBot: {response}")

    return response

## **6. User Interface**

In [8]:
# 🔹 Define Gradio Chatbot UI
# Builds the interface inside a gr.Blocks context: a title, a tagline, the
# robot image, a question box, Submit/Clear buttons and a read-only answer box.
with gr.Blocks() as iface:
    # Page title, centered via inline HTML.
    with gr.Row():
        gr.Markdown("<h1 style='text-align: center;'>UTD Chatbot</h1>")

    # Tagline under the title.
    with gr.Row():
        gr.Markdown("<p style='text-align: center;'><b>Ask any question about UTD!</b></p>")

    # Banner image (the resized robot picture loaded earlier), shown without its label.
    with gr.Row():
        gr.Image(value=robot_image, label="UTD Chatbot", show_label=False)

    # Text box where the user types the question.
    with gr.Column(scale=1, min_width=300):
        input_box = gr.Textbox(label="Ask your question", placeholder="Type here...")

    # Action buttons.
    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Non-editable box that displays the chatbot's answer.
    output_box = gr.Textbox(label="Response", interactive=False)

    # Submit: pass the question to generate_response and show its return value.
    submit_btn.click(fn=generate_response, inputs=input_box, outputs=output_box)
    # Clear: return two empty strings, one for each of the two listed outputs.
    clear_btn.click(lambda: ("", ""), inputs=[], outputs=[input_box, output_box])

# Per Gradio's own notice in Colab, debug=True keeps this cell running until it
# is interrupted (set debug=False to turn that off); share=True serves the app
# on a temporary public gradio.live URL.
iface.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a4a12748ca4f3e3ba6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a4a12748ca4f3e3ba6.gradio.live


