# RAG-based AI App About Ana

## Setup

In [1]:
# !pip install -r requirements.txt

In [1]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
import shutil
from langchain_chroma import Chroma

from langchain.prompts import ChatPromptTemplate

from langchain_google_genai import GoogleGenerativeAI

load_dotenv()

True

### Configure Gemini LLM

In [2]:
# prompt="do you know ana?"

In [3]:
# genai.configure(api_key=os.environ['GEMINI_API_KEY'])
# model = genai.GenerativeModel("gemini-1.5-flash")

# response = model.generate_content(
#     prompt,
#     generation_config=genai.types.GenerationConfig(
#         # max_output_tokens=140,
#         # temperature=0.7,
#     )
# )

# LangChain wrapper around Gemini, used by the retrieval and QA chains built
# in later cells. The API key is read from the GEMINI_API_KEY environment
# variable (load_dotenv() ran in the import cell); a missing variable raises
# KeyError here.
llm = GoogleGenerativeAI(model="gemini-1.5-flash",
                         google_api_key=os.environ['GEMINI_API_KEY'])

In [4]:
# print(response.text)

### Create vector DB from personal PDF files

In [5]:
# Directory scanned for source PDFs, and directory where the Chroma index
# is persisted.
DATA_PATH = "data_sources"
CHROMA_PATH = "chroma"

# Embedding model file handed to GPT4All.
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
# Use a real boolean: the previous string 'True' only worked because any
# non-empty string is truthy, so 'False' would have enabled downloads too.
gpt4all_kwargs = {'allow_download': True}
gpt4all_embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)

def load_documents():
    """Load every ``*.pdf`` file found directly under ``DATA_PATH``.

    Returns:
        Whatever ``DirectoryLoader.load()`` yields for the matched files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.pdf").load()

def split_text(
    documents: list[Document],
    chunk_size: int = 300,
    chunk_overlap: int = 100,
) -> list[Document]:
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
            Defaults to 300, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 100, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``; the
        splitter is created with ``add_start_index=True``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is removed first, so the store
    ends up containing only the given chunks.

    Args:
        chunks: Document chunks to embed with ``gpt4all_embeddings`` and
            persist.

    Raises:
        ValueError: If ``chunks`` is empty. The check runs before anything is
            deleted, so a failed or empty load cannot wipe an existing index.
    """
    if not chunks:
        raise ValueError(
            "No chunks to save; refusing to clear the existing Chroma DB."
        )

    # Clear the previous DB so stale chunks do not linger alongside new ones.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create the DB; the returned handle is not needed here because the
    # retrieval cell reopens the store from CHROMA_PATH.
    Chroma.from_documents(
        chunks, gpt4all_embeddings, persist_directory=CHROMA_PATH
    )

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
# Ingestion pipeline: load the PDFs, split them into chunks, then rebuild
# the Chroma store from those chunks (save_to_chroma deletes any existing
# store at CHROMA_PATH first).
documents = load_documents()
doc_chunks = split_text(documents)
save_to_chroma(doc_chunks)

Split 3 documents into 50 chunks.
Saved 50 chunks to chroma.


### Retrieve the vectors most similar to the prompt from the DB

In [7]:
# Open the Chroma store persisted at CHROMA_PATH, using the same embedding
# function that was used when the chunks were saved.
db = Chroma(
    embedding_function=gpt4all_embeddings,
    persist_directory=CHROMA_PATH,
)

# Retriever over that store, configured for similarity search.
retriever = db.as_retriever(search_type="similarity")

# r_docs = retriever.invoke("what are ana's qualifications?")

In [8]:
# prompt_embedding = gpt4all_embeddings.embed_query("who is ana?")

In [9]:
# r_docs

[Document(metadata={'source': 'data_sources\\Personal Profile.pdf', 'start_index': 0}, page_content='Name: Ana Vasquez\n\nPersonal Goal:\n\nHealth: Keep a healthy lifestyle so much that I am still fit by 65 y.o. - Career: Become an NLP specialist in the Philippines.\n\nHobbies: -'),
 Document(metadata={'source': 'data_sources\\PROFILE DS AI - Ana Vasquez.pdf', 'start_index': 0}, page_content='ANA VASQUEZ avasquez.msds2024@aim.edu | linkedin.com/in/ana-p-vasquez\n\ngithub.com/helloanavee | +63 919 001 5061'),
 Document(metadata={'source': 'data_sources\\LinkedIn Profile.pdf', 'start_index': 182}, page_content="Certifications\n\nData Analytics\n\nHonors-Awards\n\nSpecial Award: Awesome Attitude Award\n\nAna Vasquez\n\nData Science | NLP | Gen AI | MS Data Science at AIM Metro Manila, National Capital Region, Philippines\n\nSummary\n\nI'm a Data Scientist passionate about Natural Language Processing,\n\nGen AI, and LLMs."),
 Document(metadata={'source': 'data_sources\\Personal Profile.pdf

### Reformulate the question to give context

In [10]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

### Contextualize question ###
# System instruction for the question-rewriting step: turn a follow-up
# question into a standalone one, without answering it.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message layout: system instruction, prior turns, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever that is given the LLM and the rewriting prompt along with the
# underlying vector-store retriever.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

### Answer the question

In [14]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory

# System instruction for the answering step; `{context}` is the slot for the
# retrieved documents.
qa_system_prompt = """
Use the following pieces of retrieved context to answer the question. \
Use three to five sentences maximum and keep the answer concise.\

{context}"""

# Message layout: system instruction (with context), prior turns, then the
# user's question.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that combines the documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full RAG pipeline: history-aware retrieval followed by answering.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [15]:
from typing import Sequence

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict


# We define a dict representing the state of the application.
# This state has the same input and output keys as `rag_chain`.
class State(TypedDict):
    """State dict for the graph; also passed as-is to `rag_chain.invoke`."""

    # The user's question for the current turn.
    input: str
    # Conversation so far. Annotated with `add_messages`, which langgraph is
    # given as the merge function for updates to this key (presumably it
    # appends the new messages — confirm against the langgraph docs).
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    # NOTE(review): call_model stores response["context"] here; confirm that
    # value is really a str and not a list of retrieved documents.
    context: str
    # The answer taken from response["answer"] for the current turn.
    answer: str


# We then define a simple node that runs the `rag_chain`.
# The `return` values of the node update the graph state, so here we just
# update the chat history with the input message and response.
def call_model(state: State):
    """Run `rag_chain` on the current state and return the state updates.

    Args:
        state: Current graph state; passed unchanged to `rag_chain.invoke`.

    Returns:
        A dict with the new human/AI message pair under ``"chat_history"``,
        plus the chain's ``"context"`` and ``"answer"`` values.
    """
    rag_output = rag_chain.invoke(state)
    answer = rag_output["answer"]

    # Only this turn's two messages are returned for the history key.
    new_turn = [HumanMessage(state["input"]), AIMessage(answer)]

    return {
        "chat_history": new_turn,
        "context": rag_output["context"],
        "answer": answer,
    }


# Single-node graph: register the "model" node, then route the graph's
# entry point (START) to it.
workflow = StateGraph(state_schema=State)
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Compile with a checkpointer so the state is persisted -- here, in memory.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

In [21]:
# Run configuration passed to the compiled graph; it carries the thread id
# for this conversation.
config = {"configurable": {"thread_id": "abc123"}}

# Ask a follow-up question and print only the answer from the final state.
result = app.invoke(
    {"input": "Can you explain more on her role in data science?"},
    config=config,
)
print(result["answer"])

Ana is a Data Science Intern at Netopia AI, where she collaborates with clients to create data projects using SQL and Looker Studio. Her skills include Python, SQL, NLP tools, and a focus on end-to-end machine learning solutions. She has experience in various industries, including marketing, HR, real estate, health, and finance. 



In [22]:
# Read the stored chat history for this config from the graph state and
# print each message.
chat_history = app.get_state(config).values["chat_history"]
for message in chat_history:
    message.pretty_print()


And what else?

The individual has a strong background in software engineering and data science, demonstrated by their experience at Magis Solutions and their current internship at Netopia AI. They have expertise in machine learning, data analysis, and software development, with a proven track record of delivering business value through their projects.  Their skills include Python, SQL, NLP tools, and a focus on end-to-end machine learning solutions.

What is ana good at?

Ana is skilled in data science, machine learning, and software engineering. She has a strong background in Python, SQL, and NLP tools. Her experience includes working as a data science consultant, data analyst, and software engineer, focusing on delivering business value through end-to-end machine learning solutions.

Can you explain more on her role in data science?

Ana is a Data Science Intern at Netopia AI, where she collaborates with clients to create data projects using SQL and Looker Studio. Her skills includ

In [18]:
# def get_session_history(session_id: str) -> BaseChatMessageHistory:
#     if session_id not in store:
#         store[session_id] = load_session_history(session_id)
#     return store[session_id]

# # Function to load chat history
# def load_session_history(session_id: str) -> BaseChatMessageHistory:
#     db = next(get_db())
#     chat_history = ChatMessageHistory()
#     try:
#         session = db.query(Session).filter(Session.session_id == session_id).first()
#         if session:
#             for message in session.messages:
#                 chat_history.add_message({"role": message.role, "content": message.content})
#     except SQLAlchemyError:
#         pass
#     finally:
#         db.close()

#     return chat_history

# ### Statefully manage chat history ###
# store = {}

# conversational_rag_chain = RunnableWithMessageHistory(
#     rag_chain,
#     get_session_history,
#     input_messages_key="input",
#     history_messages_key="chat_history",
#     output_messages_key="answer",
# )

### Prompt the LLM

In [11]:
# Template for a one-shot RAG prompt: retrieved context plus the question.
PROMPT_TEMPLATE = """
Answer the question based mostly only on the following context:

CONTEXT: {context}

QUESTION: {question}
"""

query_text = "Should I hire ana as an AI Engineer?"

# Retrieve the chunks for this question here. Previously `r_docs` was only
# assigned in a commented-out line, so this cell raised NameError on a fresh
# kernel.
r_docs = retriever.invoke(query_text)

# Join the retrieved chunks with a visible separator so the boundaries are
# clear in the prompt.
context_text = "\n\n---\n\n".join(r_doc.page_content for r_doc in r_docs)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)

In [12]:
print(prompt)

Human: 
Answer the question based mostly only on the following context:

CONTEXT: Name: Ana Vasquez

Personal Goal:

Health: Keep a healthy lifestyle so much that I am still fit by 65 y.o. - Career: Become an NLP specialist in the Philippines.

Hobbies: -

---

provides a normalized measure of performance, allowing for comparison

across different assignments with varying point values. Only valid score ratio

between 0 and 1 is considered. This ensures that scores exceeding 1, possibly

---

through regular code reviews.

Education

Asian Institute of Management

Master of Science - MS, Data Science · (July 2023 - September 2024)

De La Salle University

Bachelor’s Degree, Psychology

Eskwelabs

Certificate, Data Analytics Bootcamp · (October 2021 - December 2021)

---

sources, ensuring a cohesive dataset for evaluation. Data anonymization is conducted

to protect privacy, followed by comprehensive preprocessing, which includes

removing non-English rows, converting text to lowercase,

In [13]:
# Use the `llm` configured in the "Configure Gemini LLM" section. The
# `model = genai.GenerativeModel(...)` assignment is commented out, so the
# previous `model.generate_content(prompt)` call raised NameError on a fresh
# kernel. `llm.invoke` returns the generated text directly.
response = llm.invoke(prompt)
print(response)

The provided context doesn't offer enough information to make a decision about hiring Ana as an AI Engineer. 

Here's why:

* **No AI Engineering Experience:** The context highlights Ana's background in data science and psychology, but there's no mention of any experience or skills related to AI engineering.
* **Goal is NLP:** While NLP (Natural Language Processing) is a branch of AI, Ana's goal is to become an NLP specialist, not an AI engineer in general. 
* **Context Focuses on Education:** The provided information focuses on Ana's education and a random snippet about normalized score ratios. This doesn't give a clear picture of her technical abilities or suitability for an AI engineering role.

**To make a hiring decision, you'd need to:**

* **Assess Ana's technical skills:** Evaluate her programming abilities, knowledge of AI algorithms, and experience with AI development tools.
* **Determine her AI engineering experience:** Look for past projects or roles related to AI developme