In [1]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# load_dotenv() already places the key in os.environ, so copying it back onto
# itself is a no-op when the key exists -- and `os.environ[...] = None` raises
# an opaque "str expected, not NoneType" TypeError when it does not.
# Fail fast with an actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Add it to a .env file next to this "
        "notebook or export it in your shell before running."
    )

### Step 1: Load documents from Markdown files

In [3]:

# Loader import uses the langchain_community package (already a dependency of
# this notebook); the `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Directory containing the Markdown source files.
# NOTE: the variable name is historical -- only '.md' files are read from it.
pdf_directory = './docs'

# Collect every Markdown file in the directory. os.listdir() returns entries in
# arbitrary order, so sort to keep the document/chunk order reproducible
# between runs.
file_paths = sorted(
    os.path.join(pdf_directory, file)
    for file in os.listdir(pdf_directory)
    if file.endswith('.md')
)

# Stop here with a clear message rather than letting an empty corpus fail
# later, far from the real cause.
if not file_paths:
    raise FileNotFoundError(f"No '.md' files found in {pdf_directory!r}")

# List to hold loaded documents (reset on every run so the cell is idempotent)
documents = []

# Load each Markdown file and accumulate the resulting documents
for file_path in file_paths:
    loader = UnstructuredMarkdownLoader(file_path)
    documents.extend(loader.load())

### Step 2: Split the documents into chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitting is required because LLMs have limited (and differing) context
# sizes; storing documents as chunks is also good practice for a vector database.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200  # amount shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Break every loaded document into chunks for embedding
text_splitted_document = text_splitter.split_documents(documents)

### Step 3: Create embeddings and store in vector database

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise each chunk
embeddings = OpenAIEmbeddings()

# Build a FAISS index from the chunked documents
vectorstore = FAISS.from_documents(
    documents=text_splitted_document,
    embedding=embeddings,
)

### Step 4: Query Processing 

In [6]:
# Quick retrieval check: fetch the 3 most similar chunks for a sample query
query = "GlideRecord"

# The same retriever object is reused by the QA chain later in the notebook
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=3))

result = retriever.invoke(query)
print(len(result))

# Show the text of the top-ranked chunk as the cell output
result[0].page_content

3


'MOBILE APPLICATION REDESIGN\n\nManaged the redesign of a mobile application, improving user engagement by 45% and increasing daily active users by 30%.\n\nDATA VISUALIZATION DASHBOARD\n\nDeveloped a data visualization dashboard that enabled customers to gain insights from complex datasets, increasing customer satisfaction scores by 22%.'

In [7]:
# Create QA chain
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI
# temperature=0 asks the model for the least-random completions
llm = OpenAI(temperature=0)

# Retrieval-augmented QA chain built on the retriever defined above.
# return_source_documents=True makes the chain's output include the retrieved
# chunks under "source_documents" alongside the answer under "result".
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)

In [8]:
# Interactive question loop: read a question, run it through the QA chain and
# print the answer together with the files the supporting chunks came from.
print("Ready! Ask questions about your documents.")
while True:
    try:
        # Strip so that inputs such as "exit " or " exit" still quit
        query = input("\nQuestion (type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        # instead of leaving a traceback in the notebook.
        break

    if query.lower() == 'exit':
        break
    if not query:
        # Don't spend an API call on an empty question
        continue

    result = qa_chain.invoke({"query": query})
    print("\nAnswer:", result["result"])
    print("\nSources:")
    for i, doc in enumerate(result["source_documents"], start=1):
        source = doc.metadata.get('source', 'Unknown')
        # Only mention a page when the loader actually recorded one; a document
        # without 'page' metadata would otherwise always print "Page Unknown".
        page = doc.metadata.get('page')
        location = f", Page {page}" if page is not None else ""
        print(f"Source {i}: {source}{location}")

Ready! Ask questions about your documents.

Answer:  Based on the information provided, your financial health appears to be in good shape. You have a solid budget in place and are consistently meeting your monthly expenses. Your investment portfolio is also performing well, with positive growth in all accounts. However, it may be beneficial to discuss rebalancing your 401(k) and exploring tax-efficient investing strategies with your financial advisor. Additionally, it may be worth considering increasing contributions to your college fund for your children to catch up on your savings goals. Overall, it seems like you are on track to meet your long-term goals, but it may be helpful to review your insurance coverage and explore options for refinancing your car loan to potentially save money in the long run.

Sources:
Source 1: ./docs/personal_financial_record.md, Page Unknown
Source 2: ./docs/personal_financial_record.md, Page Unknown
Source 3: ./docs/personal_financial_record.md, Page Un