In [28]:
# Install necessary libraries
# !python3 -m pip install langchain openai pypdf chromadb sentence-transformers
# !python3 -m pip list

!python3 -m pip install --quiet --upgrade bs4




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [None]:
from langchain import hub
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from IPython.display import display, Markdown

In [13]:
from dotenv import load_dotenv
import os

# Read variables from the .env file via python-dotenv.
load_dotenv()

# Look up the OpenAI key and report whether it was found.
# Only a status message is printed; the key itself is never echoed.
api_key = os.getenv("OPENAI_API_KEY")
print(
    "API key loaded successfully."
    if api_key
    else "Failed to load API key. Please check your .env file."
)

API key loaded successfully.


## Load our Model + Embeddings

In [32]:

from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings


# Embedding model handed to the vector store below.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Chat model invoked by the `generate` step of the graph.
llm = init_chat_model(
    "gpt-4o-mini",
    model_provider="openai",
)


## Create Vector Store

In [26]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory store that uses the `embeddings` model defined earlier.
# Re-running this cell creates a new, empty store object.
vector_store = InMemoryVectorStore(embedding=embeddings)

## Populate Vector Store

In [None]:
# Load My PDFs
loader = PyPDFLoader(
    file_path="../data/JustinFosterResume2025.2.pdf"
)

# for now only 1 document, add more later
pages = []
async for page in loader.alazy_load():
    pages.append(page)

# Index chunks
_ = vector_store.add_documents(documents=pages)


Ignoring wrong pointing object 86 0 (offset 0)


## Define State Graph, Retrieval, and Generation

In [57]:
# Define prompt for question-answering.
# Loads the prompt identified by "rlm/rag-prompt" through `hub.pull`; the
# `generate` step invokes it with the keys "question" and "context".
# NOTE(review): presumably this fetches from the LangChain Hub over the network
# on every run — confirm, and consider pinning a specific prompt revision.
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State schema handed to ``StateGraph`` and passed to each graph step.

    ``retrieve`` reads ``question`` and returns ``context``; ``generate``
    reads ``question`` and ``context`` and returns ``answer``.
    """

    # The user's question; supplied by the caller of ``graph.invoke``.
    question: str
    # Documents returned by the vector-store similarity search in ``retrieve``.
    context: List[Document]
    # Content of the LLM response produced by ``generate``.
    answer: str


# Define application steps
def retrieve(state: State):
    """Run a similarity search for the question in ``state``.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update ``{"context": ...}`` holding whatever
        ``vector_store.similarity_search`` returned for the question.
    """
    query = state["question"]
    return {"context": vector_store.similarity_search(query)}


def generate(state: State):
    """Answer the question using the retrieved documents as context.

    Args:
        state: Graph state; ``state["question"]`` and ``state["context"]``
            are read.

    Returns:
        A partial state update ``{"answer": ...}`` holding the ``content``
        attribute of the LLM response.
    """
    # Concatenate the text of every retrieved document, separated by blank lines.
    page_texts = [doc.page_content for doc in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(page_texts),
    }
    llm_reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {"answer": llm_reply.content}


# Build the two-step pipeline (retrieve -> generate) and compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
# Wire the graph entry point to the first step of the sequence.
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()



In [58]:
# Run the compiled graph once; the result is the final state dict.
response = graph.invoke(
    {
        "question": "In markdown table format, list all companies I have worked for, their locations, the dates I worked for them and a summary of the work I completed at each company."
    }
)

In [59]:
def _strip_code_fence(text):
    """Return ``text`` without a single enclosing Markdown code fence.

    If the whole text is wrapped in one fenced block (an opening line starting
    with three backticks, optionally followed by a language tag, and a closing
    line of three backticks), the inner lines are returned. Anything else,
    including non-string input and text containing further fence lines, is
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return text
    opens_with_fence = lines[0].startswith("```")
    closes_with_fence = lines[-1].strip() == "```"
    inner = lines[1:-1]
    # Only unwrap when the fence encloses the entire answer; inner fence lines
    # mean the answer legitimately contains code blocks of its own.
    has_inner_fence = any(line.strip().startswith("```") for line in inner)
    if opens_with_fence and closes_with_fence and not has_inner_fence:
        return "\n".join(inner)
    return text


# The model tends to wrap its table in a ```markdown fence, which Markdown()
# renders as a literal code block instead of a table; unwrap it before rendering.
display(Markdown(_strip_code_fence(response["answer"])))

```markdown
| Company                        | Location               | Dates                | Summary of Work                                                                                                             |
|-------------------------------|-----------------------|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| Lockheed Martin               | Ft. Worth, TX        | June 2023 - Present  | Led a team developing a machine learning anomaly detection algorithm and co-authored a whitepaper on advanced data correlation. |
| Lockheed Martin               | Ft. Worth, TX        | June 2022 - August 2022 | Integrated synthetic data processes in machine learning algorithms for training recommender systems.                          |
| Texas Spacecraft Laboratory    | Austin, TX           | Feb 2021 - May 2023 | Led a research team focused on developing a CubeSat Mobile-Pose machine learning model and optimized data preparation processes. |
```