In [2]:
import os
# Create the working data folder, relative to the current directory;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("content/data", exist_ok=True)

In [3]:
from google.colab import files

# files.upload() stores the chosen files in the current working directory and
# returns a {filename: bytes} dict. The loader reads from content/data, so the
# uploads are written there as well; otherwise a fresh Restart & Run All finds
# an empty data folder.
os.makedirs("content/data", exist_ok=True)
uploaded = files.upload()
for uploaded_name, uploaded_bytes in uploaded.items():
    with open(os.path.join("content/data", uploaded_name), "wb") as data_file:
        data_file.write(uploaded_bytes)

Saving employee_handbook.md to employee_handbook.md
Saving engineering_master_doc.md to engineering_master_doc.md
Saving financial_summary.md to financial_summary.md
Saving hr_data.csv to hr_data.csv
Saving market_report_q4_2024.md to market_report_q4_2024.md
Saving marketing_report_2024.md to marketing_report_2024.md
Saving marketing_report_q1_2024.md to marketing_report_q1_2024.md
Saving marketing_report_q2_2024.md to marketing_report_q2_2024.md
Saving marketing_report_q3_2024.md to marketing_report_q3_2024.md
Saving quarterly_financial_report.md to quarterly_financial_report.md


In [15]:
!pip install -q -U langchain google-generativeai chromadb markdown2 pandas langchain-google-genai langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m2.1/2.5 MB[0m [31m63.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
from google.colab import userdata

# Look up the key through google.colab's userdata helper, keep it in a module
# variable, and publish it under the same name in the process environment.
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')

In [6]:
import os
import pandas as pd
import markdown2
from langchain.docstore.document import Document  # ✅ correct import

from bs4 import BeautifulSoup

def clean_html(text):
    """Strip markup from an HTML string and return its text, pieces joined by newlines."""
    return BeautifulSoup(text, "html.parser").get_text(separator="\n")


def load_documents(path="/content/data"):
    """Load every supported file directly under ``path`` into ``Document`` objects.

    * ``.md`` files are rendered to HTML with markdown2, stripped back to plain
      text with ``clean_html``, and stored as one document per file.
    * ``.csv`` files are read with pandas and stored as one document per row,
      formatted as ``column: value`` lines.
    * Any other entry is reported with a printed warning and skipped.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of ``Document`` objects whose ``metadata["source"]`` is the
        name of the file each one came from.
    """
    docs = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and the chunk order built from it) stable between runs.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)

        # Load Markdown files
        if file.endswith(".md"):
            # Read as UTF-8 explicitly: the platform default encoding may not
            # be able to decode non-ASCII characters in the reports.
            with open(file_path, "r", encoding="utf-8") as f:
                markdown_text = f.read()
            html_text = markdown2.markdown(markdown_text)
            clean_text = clean_html(html_text)
            docs.append(Document(page_content=clean_text, metadata={"source": file}))

        # Load CSV (HR data)
        elif file.endswith(".csv"):
            df = pd.read_csv(file_path)
            for _, row in df.iterrows():
                row_text = "\n".join(f"{col}: {row[col]}" for col in df.columns)
                docs.append(Document(page_content=row_text, metadata={"source": file}))

        else:
            print(f"⚠️ Skipping unsupported file: {file}")

    return docs


In [7]:
# Read everything in the working data folder and report how many documents came back.
documents = load_documents(path="content/data")
print(f"Loaded {len(documents)} documents.")

Loaded 109 documents.


In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma

# Cut the loaded documents into 500-character chunks with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

print(f"Split into {len(split_docs)} chunks.")

os.makedirs("content/vector_store", exist_ok=True)

embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Position-based ids make this cell safe to re-run: the store is written with
# the same ids each time, so chunks are overwritten instead of being appended
# again under fresh random ids (which duplicated every chunk per run).
# NOTE(review): entries written by earlier runs (random ids) are not removed;
# delete content/vector_store once to start from a clean store.
chunk_ids = [f"chunk-{position}" for position in range(len(split_docs))]
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory="content/vector_store",
)
# No explicit vectorstore.persist() call: with Chroma >= 0.4 a store created
# with persist_directory is saved automatically and persist() only emits a
# deprecation warning.

Split into 334 chunks.


  vectorstore.persist()


In [21]:
# Simulated user directory: each username maps to a record holding its role.
USER_DB = {
    username: {"role": role}
    for username, role in (
        ("finance_user", "finance"),
        ("hr_user", "hr"),
        ("eng_user", "engineering"),
        ("marketing_user", "marketing"),
        ("staff_user", "general"),
    )
}

def get_role(username: str) -> str:
    """Return the role recorded for ``username`` in USER_DB.

    Falls back to "general" when the username is unknown or its record has
    no "role" entry.
    """
    try:
        record = USER_DB[username]
    except KeyError:
        return "general"
    return record.get("role", "general")

# Extra filename keywords for roles whose own name does not occur in the files
# they should see: "finance" is not a substring of "financial_summary.md" or
# "quarterly_financial_report.md", "marketing" is not a substring of
# "market_report_q4_2024.md", and no uploaded file name contains "general".
# NOTE(review): the marketing and general assignments are inferred from the
# file names alone - confirm them against the intended access policy.
_ROLE_SOURCE_KEYWORDS = {
    "finance": ("financial",),
    "marketing": ("market_report",),
    "general": ("employee_handbook",),
}


def retrieve_docs_by_role(query, role, vectordb, k=10):
    """Return the search hits for ``query`` that ``role`` is allowed to read.

    A hit is kept when its ``metadata["source"]`` (compared case-insensitively)
    contains the role name or one of the role's extra keywords from
    ``_ROLE_SOURCE_KEYWORDS``.

    Args:
        query: Text passed to ``vectordb.similarity_search``.
        role: Role name, e.g. as returned by ``get_role``.
        vectordb: Object exposing ``similarity_search(query, k=...)`` that
            returns items with a ``metadata`` dict.
        k: Number of hits requested from the store before filtering.

    Returns:
        The permitted hits, in the order the store returned them. Filtering
        happens after the search, so fewer than ``k`` (possibly zero) results
        can come back even when permitted documents exist further down the
        ranking.
    """
    role_key = role.lower()
    keywords = (role_key,) + _ROLE_SOURCE_KEYWORDS.get(role_key, ())

    candidates = vectordb.similarity_search(query, k=k)

    role_docs = []
    for doc in candidates:
        # A missing or None source is treated as an empty name.
        source = (doc.metadata.get("source") or "").lower()
        if any(keyword in source for keyword in keywords):
            role_docs.append(doc)

    return role_docs


In [28]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model client used by get_rag_response to answer the assembled prompt.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

def get_rag_response(query, username, vectordb):
    """Answer ``query`` for ``username`` using only documents their role can see.

    Resolves the user's role, retrieves the role-filtered search hits, and
    sends a prompt built from their text to ``llm``. Returns a formatted
    string with the role and the model's answer, or a "no accessible data"
    message when the filtered retrieval is empty (the model is not called then).
    """
    role = get_role(username)
    permitted_hits = retrieve_docs_by_role(query, role, vectordb)

    # Nothing the role may read matched: stop before building a prompt.
    if not permitted_hits:
        return f"🔒 No accessible data found for role: {role}"

    # Concatenate the hit texts, separated by blank lines, as the prompt context.
    context = "\n\n".join(hit.page_content for hit in permitted_hits)
    prompt = f"""You are an internal company chatbot for FinSolve Technologies.

Context:
{context}

Answer the following question clearly and concisely:
Q: {query}
"""

    response = llm.invoke(prompt)
    return f"🧑‍💼 Role: {role}\n📌 Answer:\n{response.content}"

In [30]:
# Demo: run one question per user and print each formatted response in turn.
for demo_query, demo_user in (
    ("What is the system architecture?", "eng_user"),
    ("Attendance of Aadhya Patel?", "hr_user"),
    ("What are marketing KPIs?", "finance_user"),
):
    print(get_rag_response(demo_query, demo_user, vectorstore))

🧑‍💼 Role: engineering
📌 Answer:
FinSolve's system architecture is a microservices-based, cloud-native system built on AWS, Kubernetes, and Cloudflare.  It uses a modular design with client applications (iOS, Android, and React web app) communicating via REST and GraphQL APIs to a microservices layer (e.g., Authentication, Payment Processing, Wealth Management).  Data is stored in PostgreSQL, MongoDB, Redis, and Amazon S3.  The architecture prioritizes scalability, resilience, and security.
🧑‍💼 Role: hr
📌 Answer:
Aadhya Patel's attendance is 99.31%.
🔒 No accessible data found for role: finance
