<REDACTED: what appears to be a LangSmith API key was pasted here. Keep secrets in .env only, and rotate this key.>

In [113]:
import os, warnings, tiktoken, faiss
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

In [1]:
# Set KMP_DUPLICATE_LIB_OK (presumably to work around the duplicate-OpenMP
# runtime error when FAISS is loaded -- TODO confirm) and silence every
# Python warning for the rest of the session.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
warnings.simplefilter("ignore")

# Read variables from the local .env file into os.environ; the notebook
# displays the call's return value.
load_dotenv()

True

In [2]:
# Show the project name read from the environment (presumably the LangSmith
# tracing project). Raises KeyError if LANGCHAIN_PROJECT is not set.
os.environ['LANGCHAIN_PROJECT']

'ChatPDF'

# Load the Documents

In [21]:
# Load one PDF first as a quick check that the loader works.
loader = PyMuPDFLoader(
    "./RAG_data/gym supplements/1. Analysis of Actual Fitness Supplement.pdf"
)
docs = loader.load()

# D:\PROJECT\LLM\Llama-3.2\ChatPDF\RAG_data\gym supplements\1. Analysis of Actual Fitness Supplement.pdf

In [30]:
# doc = docs[0]
# print(doc.metadata)
# print(doc.page_content)

In [37]:
# Recursively collect the path of every file under RAG_data whose name ends
# with '.pdf'. The suffix match is case-sensitive, so '.PDF' is not picked up.
pdfs = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('RAG_data')
    for filename in filenames
    if filename.endswith('.pdf')
]

In [38]:
# Display the collected PDF paths.
pdfs

['RAG_data\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf',
 'RAG_data\\gym supplements\\2. High Prevalence of Supplement Intake.pdf',
 'RAG_data\\health supplements\\1. dietary supplements - for whom.pdf',
 'RAG_data\\health supplements\\2. Nutraceuticals research.pdf',
 'RAG_data\\health supplements\\3.health_supplements_side_effects.pdf']

In [39]:
# Load every discovered PDF and pool whatever each loader returns into one
# flat list (presumably one Document per PDF page -- TODO confirm).
docs = [
    page
    for pdf_path in pdfs
    for page in PyMuPDFLoader(pdf_path).load()
]

In [41]:
# Total number of loaded documents across all PDFs.
len(docs)

64

# Document Chunking

In [47]:

# Split the loaded documents into chunks of at most 1000 units with an overlap
# of 100 between neighbours (units are presumably characters, the splitter's
# default length function -- TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
chunks = text_splitter.split_documents(docs)

# Compare the number of chunks with the number of source documents.
(len(chunks), len(docs))

(321, 64)

In [53]:
# Character length of the first chunk versus the first loaded document.
len(chunks[0].page_content), len(docs[0].page_content)

(981, 4340)

In [56]:
# Token counts for the first chunk and the first document, measured with the
# gpt-4o-mini tokenizer. NOTE(review): the models used later in this notebook
# are Ollama models, so these counts are presumably only an approximation.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
tuple(len(encoding.encode(item.page_content)) for item in (chunks[0], docs[0]))

(294, 969)

In [58]:
# encoding.encode(chunks[0].page_content)

# Vector Embedding

In [67]:
# Embedding model served by the local Ollama instance.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model='nomic-embed-text',
)

# Embed one throwaway query to discover the embedding dimensionality,
# which is used to size the FAISS index.
single_vector = embeddings.embed_query("Does she love me?")
len(single_vector)

768

In [69]:
# Flat L2-distance FAISS index sized to the embedding dimensionality.
index = faiss.IndexFlatL2(len(single_vector))

# (number of stored vectors, vector dimensionality)
(index.ntotal, index.d)

(0, 768)

In [70]:
# Wrap the raw FAISS index in LangChain's vector store, starting with an empty
# in-memory docstore and an empty index-position -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [71]:
# Display the vector store object's repr.
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1c3b3fc9490>

docs -> doc -> chunks -> embeddings -> vector store

In [75]:
# Embed every chunk and add it to the vector store, keeping the returned ids.
# NOTE(review): re-running this cell presumably adds the same chunks again.
ids = vector_store.add_documents(chunks)

In [79]:
# Number of ids returned by add_documents versus the number of chunks.
len(ids), len(chunks)
# vector_store.index_to_docstore_id

(321, 321)

## store & load vector database

In [86]:
# Run this cell once; after the database has been created, comment this cell out again.

# db_name="health&gym_supplements"
# vector_store.save_local("health&gym_supplements")

# new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)

# len(new_vector_store.index_to_docstore_id)

# Retrieval

In [88]:
question = "what is used to gain muscle mass?"

# Plain similarity search. Note that `docs` is rebound here from the loaded
# PDF documents to the search results.
docs = vector_store.search(question, 'similarity')

# Each hit is a chunk; print its text followed by a blank-line separator.
for hit in docs:
    print(hit.page_content, "\n\n", sep="\n")

acids than traditional protein sources. Its numerous benefits have made it a popular choice
for snacks and drinks among consumers [3]. Another widely embraced supplement is
caffeine, which is found in many sports and food supplements. Caffeine reduces perceived
effort, minimizes fatigue and pain, and proves to be effective for endurance and high-
intensity activities, which is the choice of consumers [4].
Creatine monohydrate is another well-known supplement used to gain muscle mass
and support performance and recovery. It is known not to increase fat mass and remains
effective even when taken in recommended doses [5]. Despite its popularity in the fitness
Foods 2024, 13, 1424. https://doi.org/10.3390/foods13091424
https://www.mdpi.com/journal/foods



and strength gain among men. We detected more prevalent protein and creatine supplementation
among younger compared to older ﬁtness center users, whereas the opposite was found for vitamin
supplementation. Other authors made similar obse

In [90]:
# MMR retriever: consider the 100 nearest chunks (fetch_k) and return 3 (k).
# NOTE(review): lambda_mult=1 is presumably the "minimum diversity" end of the
# MMR scale, i.e. close to plain similarity ranking -- confirm this is intended.
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=3, fetch_k=100, lambda_mult=1),
    search_type="mmr",
)

In [92]:
# Fetch chunks through the MMR retriever and print them.
# The result must be bound to `docs`, the name the loop iterates. Binding it
# to `doc` made the loop re-print the results of the earlier similarity
# search and discard the retriever's output.
docs = retriever.invoke(question)

for doc in docs:
    # Each result is a chunk.
    print(doc.page_content)
    print("\n\n")

acids than traditional protein sources. Its numerous benefits have made it a popular choice
for snacks and drinks among consumers [3]. Another widely embraced supplement is
caffeine, which is found in many sports and food supplements. Caffeine reduces perceived
effort, minimizes fatigue and pain, and proves to be effective for endurance and high-
intensity activities, which is the choice of consumers [4].
Creatine monohydrate is another well-known supplement used to gain muscle mass
and support performance and recovery. It is known not to increase fat mass and remains
effective even when taken in recommended doses [5]. Despite its popularity in the fitness
Foods 2024, 13, 1424. https://doi.org/10.3390/foods13091424
https://www.mdpi.com/journal/foods



and strength gain among men. We detected more prevalent protein and creatine supplementation
among younger compared to older ﬁtness center users, whereas the opposite was found for vitamin
supplementation. Other authors made similar obse

# RAG with LLAMA 3.2 on OLLAMA

In [95]:
# Load the chat model from the local Ollama instance.
model = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.2:1b",
)

# Smoke test: confirm the model responds.
model.invoke('who are you?')

AIMessage(content='I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', additional_kwargs={}, response_metadata={'model': 'llama3.2:1b', 'created_at': '2024-11-18T07:58:03.2840941Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2257922700, 'load_duration': 24112100, 'prompt_eval_count': 29, 'prompt_eval_duration': 376979000, 'eval_count': 23, 'eval_duration': 1855505000}, id='run-2db848df-247d-42d6-be21-f72a177ac4ae-0', usage_metadata={'input_tokens': 29, 'output_tokens': 23, 'total_tokens': 52})

In [103]:
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub (presumably needs
# network access). `prompt` is rebound to a custom template in a later cell.
prompt = hub.pull("rlm/rag-prompt")

In [104]:
# Display the current prompt template.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [117]:
# Custom RAG prompt: asks for bullet-point answers drawn only from the
# retrieved context. It takes two input variables, {question} and {context}.
prompt = ChatPromptTemplate.from_template("""
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
    Question: {question} 
    Context: {context} 
    Answer:
""")


In [118]:
def format_docs(docs):
    """Join the ``page_content`` of each item in *docs* with blank lines.

    Returns a single string with the texts in their original order,
    separated by two newlines; an empty input yields an empty string.
    """
    page_texts = (item.page_content for item in docs)
    return "\n\n".join(page_texts)

# print(format_docs(docs))

In [119]:
# RAG pipeline: the incoming question is passed through unchanged as
# "question" and also sent to the retriever, whose results are flattened to
# text by format_docs as "context". Both fill the prompt, the model answers,
# and the output parser turns the reply into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [120]:
# An in-domain question for the supplements corpus.
question = "what is used to gain muscle mass?"

# Run the whole chain (retrieve -> prompt -> model -> string) and print the answer.
output = rag_chain.invoke(question)
print(output)

Here are the relevant points to answer the question "What is used to gain muscle mass?"

* Caffeine is a supplement that reduces perceived effort, minimizes fatigue and pain, and proves to be effective for endurance and high-intensity activities.
* Creatine monohydrate is another well-known supplement used to gain muscle mass and support performance and recovery.

These two supplements are both commonly used to aid in gaining muscle mass.


In [115]:
# Alternative in-domain questions; uncomment one to try it.
# question = "what is used to gain muscle mass?"
# question = "what is used to reduce weight?"
# question = "what are side effects of supplements?"
# question = "what are the benefits of supplements?"
# question = "what are the benefits of BCAA supplements?"

# Off-topic question, presumably a check that the model says it does not know
# when the retrieved context cannot answer.
question = "what is used to increase mass of the Earth?"

# NOTE(review): this cell's execution count (115) is lower than the chain
# cell's (119), so the recorded output may come from an earlier chain/prompt --
# re-run to confirm.
output = rag_chain.invoke(question)
print(output)

Here are the relevance scores I assigned to each retrieved document:

* **Keyword extraction**: Relevant (1) - The document contains keywords related to the user question "what is used to increase mass of the Earth?"
	+ e.g. DMAA
	+ OxyELITE Pro
	+ Bauxhinia purpurea
	+ Bacopa monniera
	+ Cirsium oligophyllum
	+ Rauwolscine (Yohimbe)
* **Semantic meaning**: Relevant (1) - The document provides semantic meaning related to the user question, discussing weight loss supplements and their potential effects on metabolism.
* **Contextual relevance**: Relevant (1) - The context of the document is relevant to the user question, as it discusses weight loss supplements and their effectiveness in promoting weight reduction.
