<a href="https://colab.research.google.com/github/kmalhotra18/RAG/blob/main/Conversational_Personal_AI_Chatbot_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
#!pip install -q langchain chromadb unstructured openai tiktoken sentence-transformers

!pip install -q OpenAI
!pip install -q google-generativeai
!pip install -q python-dotenv
!pip install -q anthropic
!pip install -q gradio
!pip install -q langchain-community # Install the langchain-community package
!pip install -q langchain-openai
!pip install -q chromadb
!pip install -q langchain-chroma
!pip install unstructured
!pip install "unstructured[doc]"
!pip install "unstructured[pdf]"

In [None]:
# imports for langchain and Chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE                                     # To visualize
import plotly.graph_objects as go                                     # To plot
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import glob
from pathlib import Path
from langchain.document_loaders import UnstructuredFileLoader

In [None]:
# Price is a factor for our company, so we use a low-cost model.
# "gpt-4o-mini" is the low-cost GPT-4o variant ("gpt-4o-turbo" is not a valid model ID).

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [None]:
# Load environment variables in a file called .env

from dotenv import load_dotenv
import os
import glob

# Read .env (values in the file override variables that are already set), then
# make sure OPENAI_API_KEY exists, falling back to a placeholder when it is missing.
load_dotenv(override=True)
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Refresh the apt package index, then install LibreOffice without prompting (-y).
# NOTE(review): presumably required by `unstructured` to convert legacy Office files (e.g. .doc) — confirm.
!apt-get update && apt-get install -y libreoffice

In [None]:

# Folder (on Google Drive) whose files are loaded; Drive must already be mounted.
root_directory = "/content/drive/MyDrive/Important Documents/My Resumes"

# Fail fast: rglob() on a missing folder yields nothing, which would otherwise
# look like a successful load of zero documents.
if not Path(root_directory).is_dir():
    raise FileNotFoundError(
        f"Document folder not found: {root_directory}. Is Google Drive mounted?"
    )

# Google Docs/Sheets/Slides entries are skipped rather than handed to the loader.
ignored_suffixes = (".gdoc", ".gsheet", ".gslides")

documents = []      # everything the loader returned, across all files
skipped_files = []  # paths of files that could not be loaded

# Walk through all files manually
for filepath in Path(root_directory).rglob("*"):
    if not filepath.is_file() or filepath.suffix.lower() in ignored_suffixes:
        continue
    try:
        documents.extend(UnstructuredFileLoader(str(filepath)).load())
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole load.
        print(f"❌ Skipping {filepath.name}: {e}")
        skipped_files.append(filepath)

# Feedback
print(f"\n✅ Loaded {len(documents)} documents.")
if skipped_files:
    print(f"⚠️ Skipped {len(skipped_files)} files due to errors.")

In [None]:
# Split the loaded documents into chunks of up to 1000 characters,
# with 200 characters of overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Collect the 'doc_type' metadata values across all chunks; chunks without that
# key are counted as 'unknown'.
# The values are converted with str() and sorted before printing so that the
# output is stable between runs (set order is not) and a non-string value
# cannot break the join.
doc_types = set(chunk.metadata.get('doc_type', 'unknown') for chunk in chunks)
print(f"Document types found: {', '.join(sorted(map(str, doc_types)))}")

In [None]:
from langchain.vectorstores import Chroma

import shutil
import os

db_name = "/content/chroma_db"  # path to where your DB will be saved

from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Reset any previous store so re-running this cell does not add duplicates.
# The collection is dropped through Chroma instead of deleting the folder:
# removing the directory while this kernel still has the store open can leave
# chromadb working against a database file that no longer exists.
if os.path.exists(db_name):
    print("🧹 Removing existing Chroma DB...")
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Build vectorstore from document chunks
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

print(f"✅ Vectorstore created with {vectorstore._collection.count()} documents")

Personal AI Chatbot

In [None]:
# ✅ Step 1: Install required libraries
!pip install -q langchain chromadb unstructured openai tiktoken sentence-transformers
!apt install poppler-utils  # for PDF parsing
!pip install -q pypdf # for PDF parsing
!pip install -q python-dotenv
!pip install -q gradio
!pip install -q pdfminer.six
!pip install -q pi_heif
!pip install -q unstructured-inference
!pip install numpy
!pip install pdf2image
!pip install -q python-docx
!pip install -q "unstructured[local-inference,ocr,pytesseract]"
!apt install -y poppler-utils tesseract-ocr

In [None]:
# ✅ Step 2: Mount Google Drive and locate the folder
from google.colab import drive
import os

# Actually mount Drive so that paths under /content/drive exist.
drive.mount('/content/drive')

In [None]:
# Location of the documents to index; stop right here if the path is missing.
DOC_FOLDER = "/content/drive/MyDrive/Important Documents/My Resumes"
if not os.path.exists(DOC_FOLDER):
    raise AssertionError("Important Documents folder not found. Check path.")

In [None]:
# ✅ Step 3: Load and process documents using Unstructured
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.partition.text import partition_text
from pathlib import Path

def load_documents(folder_path):
    """Recursively read every .pdf, .docx and .txt file under ``folder_path``.

    Each supported file is run through the matching ``partition_*`` function
    and its elements are joined with newlines into a single string. Entries
    with any other extension are ignored. A file whose processing raises is
    reported on stdout and left out of the result.

    Args:
        folder_path: Directory to search (recursively).

    Returns:
        A list of ``{"path": <file path as str>, "text": <joined element text>}``
        dicts, one per file that was processed without error.
    """
    parsed = []
    for entry in Path(folder_path).rglob("*"):
        extension = entry.suffix.lower()
        if extension not in (".pdf", ".docx", ".txt"):
            continue
        location = str(entry)
        try:
            if extension == ".pdf":
                pieces = partition_pdf(filename=location)
            elif extension == ".docx":
                pieces = partition_docx(filename=location)
            else:
                pieces = partition_text(filename=location)
            joined = "\n".join(str(piece) for piece in pieces)
            parsed.append({"path": location, "text": joined})
        except Exception as err:
            print(f"❌ Skipping {entry.name}: {err}")
    return parsed


In [None]:
# ✅ Step 4: Embed and store in Chroma vector store
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document


In [None]:
# Load HF embedding model
# This embedding object is what the Chroma store below uses to embed the documents.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Parse the files under DOC_FOLDER, then wrap each parsed file as a LangChain
# Document whose metadata records the path it came from.
raw_docs = load_documents(DOC_FOLDER)
langchain_docs = [
    Document(metadata={"source": entry['path']}, page_content=entry['text'])
    for entry in raw_docs
]

In [None]:
# Create Chroma DB
persist_dir = "/content/chroma_store"

# Drop the collection left by a previous run; otherwise from_documents() adds
# the same documents to the existing store again and every entry is duplicated.
if os.path.exists(persist_dir):
    Chroma(persist_directory=persist_dir, embedding_function=embedding_model).delete_collection()

vectordb = Chroma.from_documents(documents=langchain_docs, embedding=embedding_model, persist_directory=persist_dir)
# NOTE(review): recent chromadb versions persist automatically and persist() is
# deprecated there — confirm whether this call is still needed.
vectordb.persist()
print("✅ Vector store created and saved.")

In [None]:
# ✅ Step 5: Build RAG Q&A pipeline
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os
from dotenv import load_dotenv
import gradio as gr
import glob

In [None]:
!pip install -q langchain-anthropic

In [None]:
from langchain_anthropic import ChatAnthropic
from langchain.chains import RetrievalQA

In [None]:
# Build the RAG question-answering chain: retriever + LLM.

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Load .env BEFORE creating the LLM client: the client takes its API key from
# the environment when it is constructed, so a key that lives only in .env
# must already be loaded at that point.
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# OpenAI alternative (uses OPENAI_API_KEY):
#llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
llm = ChatAnthropic(model="claude-3-haiku-20240307", temperature=0)

retriever = vectordb.as_retriever(search_kwargs={"k": 2})  # Limit number of chunks

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="map_reduce"
)

In [None]:

# ✅ Step 6: Ask a question
# Send one sample question through the QA chain and show both question and answer.
query = "What is the contract duration in the consulting agreement?"
response = qa.run(query)
print(f"\n❓ Question: {query}")
print(f"\n💡 Answer: {response}")


In [None]:
# ✅ Step 7: Gradio Interface

def answer_question(question):
    """Run ``question`` through the ``qa`` chain and return the chain's answer."""
    answer = qa.run(question)
    return answer

# Single text box in, plain text out; launch() starts the UI.
gr.Interface(
    title="RAG Chatbot",
    description="Ask questions about the documents in your 'Important Documents' folder",
    fn=answer_question,
    inputs=gr.Textbox(placeholder="Ask a question about your documents..."),
    outputs="text",
).launch()
