In [12]:
# Import necessary libraries
import os
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager
from langfuse import Langfuse
from langfuse.llama_index import LlamaIndexCallbackHandler
from langfuse.openai import openai

# Load environment variables
load_dotenv()

langfuse_callback_handler = LlamaIndexCallbackHandler()
Settings.callback_manager = CallbackManager([langfuse_callback_handler])
langfuse = Langfuse()

# Load documents
documents = SimpleDirectoryReader("data").load_data()

# Debug: Print the number of documents loaded
print(f"Number of documents loaded: {len(documents)}")

# Create an index from the documents
index = VectorStoreIndex.from_documents(documents)

# Create a retriever to fetch relevant documents
retriever = index.as_retriever(retrieval_mode='similarity', k=3)

# Define your query
query = "Give me some suggestions on what to eat to obtain 10g of protein?"

# Retrieve relevant documents
relevant_docs = retriever.retrieve(query)

context_str = ""
# Debug: print info related to relevant_docs
print(f"Number of relevant documents: {len(relevant_docs)}")
print("\n" + "="*50 + "\n")

for i, doc in enumerate(relevant_docs):
    context_str += f"{doc.node.metadata.get('page_label', 'N/A')}\n"
    context_str += f"{doc.node.metadata.get('file_path', 'N/A')}\n"
    context_str += f"{doc.node.get_content()}\n\n"

    print(f"Document {i+1}:")
    print(f"Text sample: {doc.node.get_content()[:200]}...")  # Print first 200 characters
    print(f"Metadata: {doc.node.metadata}")
    print(f"Score: {doc.score}")
    print("\n" + "="*50 + "\n")

# Craft prompt that combines the documents with the query
# Follow the example from Langfuse trace of LLamaIndex_query
system_prompt = f"""
You're an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' 
or anything along those lines.
"""

context_prompt = f"""
Context information is below.\n{'-'*50}\n{context_str}\n{'-'*50}
"""

query_prompt = f"""
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer: 
"""

user_prompt = context_prompt + "\n" + query_prompt
print(f"user prompt: {user_prompt}")

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    temperature=0.2
)

result = response.choices[0].message.content
print(f"Answer: {result}")

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)


Number of documents loaded: 59
Number of relevant documents: 2


Document 1:
Text sample: Dr. Chloe Scheel, ND, Lac Kwan-Yin Healing Arts Center 2330 NW Flanders, Suite 101 Portland, Oregon, 97210 (503) 701-8766 ext. 314 CScheel@kwanyinhealingarts.com www.simplynaturalmedicine.org 
Page 1 ...
Metadata: {'page_label': '1', 'file_name': 'SNM Protein Compilation.pdf', 'file_path': '/Users/itasari/Documents/BootcampAILLM/week-2-project/data/SNM Protein Compilation.pdf', 'file_type': 'application/pdf', 'file_size': 447854, 'creation_date': '2024-09-21', 'last_modified_date': '2024-09-21'}
Score: 0.8340497972099924


Document 2:
Text sample: Optimize Your Brain 
              Copyright ©  2012 Kristen Allott, ND, L.Ac.   
www.dynamicpaths.com                                                                                                  ...
Metadata: {'page_label': '4', 'file_name': 'SNM Protein Compilation.pdf', 'file_path': '/Users/itasari/Documents/BootcampAILLM/week-2-project/data/SNM P