In [3]:
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Step 1: Load documents using PyMuPDFLoader
# The PDF location can be overridden with the PTLO_10K_PDF environment variable so the
# notebook is not tied to a single machine; the original local path stays the default.
pdf_path = os.environ.get(
    "PTLO_10K_PDF",
    "/Users/jiaweixu/Documents/UCHICAGO/MSADS/ADSP 34003_Capstone II/PTLO 2023 Q4 10K.pdf",
)
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Fail early with a readable message instead of an opaque error from FAISS on an empty corpus.
if not documents:
    raise ValueError(f"No pages were loaded from {pdf_path!r}; cannot build the vector store.")

# Step 2: Extract document content
# Pull the raw text out of each loaded Document (the loader's metadata carries a 'page' field,
# i.e. the Documents are per-page).
texts = [doc.page_content for doc in documents]

# Keep the loader's full metadata (source, page, total_pages, ...) rather than only "source",
# so a retrieved chunk can be traced back to the page it came from.
metadatas = [dict(doc.metadata) for doc in documents]

# Step 3: Use a sentence-transformers model for embeddings
# Converts the text into numeric vectors (embeddings) that can be stored and searched quickly.
# NOTE(review): the all-MiniLM-L6-v2 model card states that input beyond 256 word pieces is
# truncated, so long pages are presumably only partially embedded -- consider splitting pages
# into smaller chunks before indexing. TODO confirm against retrieval quality.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 4: Create the FAISS vector store using the sentence-transformers model
# FAISS stores the embeddings and retrieves them efficiently; the metadata is stored alongside
# each embedding so the original document and page can be identified on retrieval.
vector_store = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)

  from tqdm.autonotebook import tqdm, trange


Step 3: The HuggingFaceEmbeddings model is initialized using the sentence-transformers model 'all-MiniLM-L6-v2'. This model takes the text from the documents and converts it into embeddings: numeric vectors that represent the meaning of the text in a form that FAISS can index and search.

Step 4: FAISS.from_texts() takes the raw document text (texts), creates embeddings using the embedding_model, and stores those embeddings in a FAISS index.
Additionally, metadata (such as the document source) is stored alongside the embeddings to help identify the original documents when they are retrieved.

In [4]:
# Preview only the first loaded page instead of echoing every Document into the notebook
# output; the total count is reported alongside it for a quick sanity check.
print(f"Loaded {len(documents)} page documents")
documents[:1]

[Document(metadata={'source': '/Users/jiaweixu/Documents/UCHICAGO/MSADS/ADSP 34003_Capstone II/PTLO 2023 Q4 10K.pdf', 'file_path': '/Users/jiaweixu/Documents/UCHICAGO/MSADS/ADSP 34003_Capstone II/PTLO 2023 Q4 10K.pdf', 'page': 0, 'total_pages': 123, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': '', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWASHINGTON, D.C. 20549\nFORM 10-K\n☒      ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE FISCAL YEAR ENDED December 31 , 2023\nOR\n☐     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR\nTHE TRANSITION PERIOD FROM         TO\nCOMMISSION FILE NUMBER: 001-40951\nPORTILLO\'S INC.\n(Exact name of registrant as specified in its charter)\nDelaware\n87-1104304\n(State or other jurisdiction of incorporation or organization)\n(I.R.S. Empl

Retrieval-Augmented Generation (RAG)

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Step 1: Initialize the LLM
# Read the API key from the environment instead of embedding a (blank) credential in the
# notebook, and fail with a clear message when it is missing.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

# Step 2: Set up the RetrievalQA chain with the FAISS retriever and the LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can also use other types like "map_reduce", "refine" based on your needs
    retriever=vector_store.as_retriever()
)

# Step 3: Example query from an investor
# NOTE(review): the indexed filing is a Form 10-K (annual report), so a Q3-specific figure may
# not be present in the retrieved context -- verify the answer against the source document.
query = "What is the net income for Q3 2023?"

# Step 4: Get the result from the QA chain
# invoke() replaces the deprecated Chain.run(); RetrievalQA takes its input under "query"
# and returns the answer text under "result".
result = qa_chain.invoke({"query": query})["result"]
print(result)
