In [None]:
#!pip install chromadb openai
#!pip install -U langchain-openai

In [13]:
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
import chromadb
from chromadb.config import Settings
import os
import openai
from dotenv import load_dotenv


In [15]:
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai_api_key = openai.api_key

1. **PDF document intake**
2. **Break PDF into pages and chunks**

In [18]:
pdf_loader = PyPDFLoader("the_nestle_hr_policy_pdf_2012.pdf")

In [20]:
pdf_pages = pdf_loader.load_and_split()
# print(pdf_pages[0])

In [22]:
doc_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
chunks = doc_splitter.split_documents(pdf_pages)
print(len(chunks))  # Prints the number of chunks the PDF has been split into
type(chunks)

35


list

3. **Generate embeddings for each chunk**
4. **Store chunks and embeddings in ChromaDB**

In [25]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Create or retrieve a collection named "my_collection"
collection = chroma_client.get_or_create_collection("nestle_hr")


In [27]:
embed = OpenAIEmbeddings(
    model = 'text-embedding-ada-002',
    openai_api_key= openai_api_key)

  embed = OpenAIEmbeddings(


In [29]:
# Extract raw text from chunks
context_array = [chunk.page_content for chunk in chunks]
# print(context_array)

# Submit array to OpenAI, which will return a list of embeddings 
# calculated from the input array of text chunks
emb_vectors = embed.embed_documents(context_array) 


In [30]:
import psutil
print(f"RAM Usage Before Query: {psutil.virtual_memory().percent}%")
print(collection.count()) 

RAM Usage Before Query: 52.2%
15


In [33]:
N=15
existing_ids = set(collection.get()["ids"])  # Fetch existing IDs
for i, (text,embedding) in enumerate(zip(context_array[:N], emb_vectors[:N])):
    id_str = str(i)
    if id_str in existing_ids:
        print(f"Skipping duplicate embedding ID: {id_str}")
        continue  # Skip adding duplicate IDs
    
    collection.add(
        ids=[id_str],
        embeddings=[embedding],
        metadatas=[{"text": text}]
    )
    print(f"Added embedding ID: {id_str}")

Skipping duplicate embedding ID: 0
Skipping duplicate embedding ID: 1
Skipping duplicate embedding ID: 2
Skipping duplicate embedding ID: 3
Skipping duplicate embedding ID: 4
Skipping duplicate embedding ID: 5
Skipping duplicate embedding ID: 6
Skipping duplicate embedding ID: 7
Skipping duplicate embedding ID: 8
Skipping duplicate embedding ID: 9
Skipping duplicate embedding ID: 10
Skipping duplicate embedding ID: 11
Skipping duplicate embedding ID: 12
Skipping duplicate embedding ID: 13
Skipping duplicate embedding ID: 14


In [35]:
import psutil
print(f"RAM Usage Before Query: {psutil.virtual_memory().percent}%")
print(collection.count()) 
# db = Chroma(persist_directory="./chroma_db", embedding_function=embed)
# all_docs = db.get()
# count = len(all_docs.get('ids', []))
# print(f"Total documents in collection: {count}")

result = collection.get(ids=["10"])
# print(f"Document 0 text: {result['metadatas'][0]['text']}")

RAM Usage Before Query: 51.0%
15
Document 0 text: The Nestlé Human Resources Policy4
Learning is part of the Company culture.
Employees at all levels are systematically 
encouraged to consider how they upgrade their 
knowledge and skills.
The Company determines training and deve-
lopment priorities. The responsibility for turning 
these into actions is shared between employees, 
line managers and the Human Resources. 
Experience and on-the-job training are the 
primary source of learning. Managers are 
responsible for guiding and coaching employees 
to succeed in their current positions.  
Nestlé employees understand the importance 
of continuous improvement, as well as sharing knowledge and ideas freely with others. Practices such as lateral professional development, extension of responsibilities, and cross functional teams are encouraged to acquire additional skills, enrich job content and widen accountability.
Nestlé also offers a comprehensive range of 
training activities and meth

In [37]:
query_text = "what is the Total Rewards program about"
query_embedding = embed.embed_query(query_text)

query_result = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # Retrieve top 5 similar chunks
)

# Extracting text from metadatas
# texts = [entry['text'] for entry in query_result['metadatas'][0]]

combined_text = "\n".join(entry['text'] for entry in query_result['metadatas'][0])
# print(combined_text)


Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Ad

The Nestlé Human Resources Policy3
 Total rewards
Attracting new hires and keeping current 
employees engaged is not only about 
remuneration and benefits based on solid 
performance. It is also about the hard earned value and trust that our name brings to those who work with us; the relationships with our line 
managers and fellow workers; recognition and 
experiences enjoyed while working for a diverse global company; and possibilities to learn and grow. These are as a whole, the Total Rewards we 
receive.
Nestlé, therefore, focuses on Fixed Pay, 
Variable Pay, Benefits, Personal Growth and Development and Work Life Environment as the 
key elements that define Total Rewards. In the spirit of developing a high performance culture, those elements need to correspond to what is 
valued by employees in each and every market, 
and which demonstrate how Nestlé is committed to giving each employee the opportunity to grow, evolve and contribute.Nestlé Total Rewards programmes must be
establis

In [39]:
from chromadb.utils import embedding_functions

openai_api_key= openai_api_key
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-ada-002"  # This model produces 1536-dimensional embeddings
)

# Get or create the collection with the right embedding function
collection = chroma_client.get_or_create_collection(
    name="nestle_hr",
    embedding_function=openai_ef
)

In [45]:
import chromadb
from openai import OpenAI

# Initialize clients
client = OpenAI(api_key=openai_api_key)

def answer_question(query, top_k=5):
    # 1. Retrieve relevant chunks from ChromaDB
    results = collection.query(
        query_texts=[query],
        n_results=top_k
        )
    # print(results)
    # retrieved_chunks = results["documents"][0]  # First (and only) query result
    retrieved_chunks = results['metadatas'][0]
    
    # 2. Format the prompt with retrieved chunks and query
    context = "\n\n".join([f"[Chunk {i+1}]: {chunk}" for i, chunk in enumerate(retrieved_chunks)])
    print(context)
    user_prompt = f"CONTEXT:\n{context}\n\nQUESTION: {query}"
    
    # 3. The system message (from our prompt template)
    system_message = """You are a helpful assistant that answers questions based on the provided text chunks. Your goal is to give accurate, concise, and relevant answers using only the information available in the chunks.

Guidelines:
1. If the answer to a question is contained in the text chunks, provide the answer directly and cite the source chunk number.
2. If the answer is not contained in the text chunks, respond with "Based on the provided information, I cannot answer this question."
3. Always maintain a helpful, informative tone.
4. Do not make up or hallucinate information that isn't in the provided text chunks.
5. If relevant information is spread across multiple chunks, synthesize a complete answer and cite all relevant chunks.
6. When appropriate, use direct quotes from the text chunks to support your answer.
7. If the question is ambiguous, request clarification before providing an answer.
8. Always prioritize accuracy over completeness. It's better to provide a partial answer that's correct than a complete answer that includes hallucinated information.

Remember: Your only source of information is the text chunks provided with each query."""
    
    # 4. Call GPT-3.5 Turbo with our formatted prompt
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.7
        )
    
    return response.choices[0].message.content

# Usage example
user_question = "What is the main thesis of this document?"
answer = answer_question(user_question)
print(answer)

[Chunk 1]: {'text': 'account the specific context. Its spirit should be \nrespected under all circumstances and could be summarised in one sentence: At Nestlé we put people at the centre of everything we do.\nJean-Marc Duvoisin\nDeputy Executive Vice President Introduction'}

[Chunk 2]: {'text': 'The Nestlé Human Resources Policy1\nAt Nestlé, we recognize that our employees \nare the key to our success and nothing can be \nachieved without their engagement. \nThis document encompasses the guidelines \nwhich constitute a solid basis for effective Human \nResources Management throughout the Nestlé \nGroup around the world. It explains to all Nestlé \nemployees the vision and mission of the Human Resources function and illustrates every aspect of the Nestlé employee lifecycle. \nThe Nestlé Management and Leadership \nPrinciples inspire all the Nestlé employees in their actions and in their dealings with others. The \nCorporate Business Principles refer to all the basic \nprinciples which 

In [46]:
import gradio as gr


# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
    outputs=gr.Textbox(lines=30),
    title="Q&A Chatbot",
    description="Ask me anything!"
)

# Launch inside Jupyter Notebook
iface.launch(inline=True)

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


