In [16]:
!pip install --upgrade pip
!pip install --upgrade pypdf langchain-text-splitters langchain-community langgraph faiss-cpu langchain-openai python-dotenv rapidocr-onnxruntime langsmith
!pip install -qU "langchain[openai]"

Collecting pypdf
  Using cached pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langgraph
  Using cached langgraph-0.2.73-py3-none-any.whl.metadata (17 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp313-cp313-macosx_10_14_x86_64.whl.metadata (4.4 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.6-py3-none-any.whl.metadata (2.3 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting rapidocr-onnxruntime
  Using cached rapidocr_onnxruntime-1.2.3-py3-none-any.whl.metadata (4.8 kB)
Collecting langsmith
  Downloading langsmith-0.3.8-py3-none-any.whl.metadata (14 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain-text-splitters)
  Downloading langchai

In [17]:
import os
from dotenv import load_dotenv
from langchain import hub
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain.chat_models import init_chat_model
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [18]:
# Populate the process environment (load_dotenv) before reading settings from it.
load_dotenv()

# Settings read from the environment; the LangSmith values fall back to
# placeholder defaults when the variables are not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGSMITH_TRACING = os.environ.get("LANGSMITH_TRACING", "false")
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY", "not provided")

In [19]:
def load_pdfs_recursively(directory) -> list:
    """Load every PDF found under ``directory`` (searched recursively).

    Files are matched by a case-insensitive ``.pdf`` suffix and opened with
    ``PyPDFLoader(..., extract_images=True)``.

    Args:
        directory: Root folder to walk. A folder that does not exist or that
            contains no PDFs yields an empty list.

    Returns:
        A flat list of the documents produced by each loader's ``load()``,
        ordered by sorted directory name and then sorted file name.
    """
    all_pages = []
    for root, dirs, files in os.walk(directory):
        # Sorting `dirs` in place fixes the order in which os.walk descends, and
        # sorting `files` fixes the order within a folder, so the resulting
        # document list does not depend on filesystem enumeration order.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            loader = PyPDFLoader(os.path.join(root, file), extract_images=True)
            # Use load() instead of load_and_split(): the latter chunks the text
            # with a default splitter, which would pre-split the documents
            # before the caller applies its own text splitter.
            all_pages.extend(loader.load())
    return all_pages

pdf_directory = "pdfs"  # path to dir with PDFs

# Load every PDF under pdf_directory, then cut the text into overlapping
# chunks (1000 characters, 200 overlap) that will be embedded individually.
docs = load_pdfs_recursively(pdf_directory)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Stop with a readable message when there is nothing to index, rather than
# letting index construction fail on an empty document list.
if not all_splits:
    raise ValueError(f"No PDF text found under {pdf_directory!r}; nothing to index.")

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Build the index from the chunks only. Seeding the store with the unsplit
# `docs` and then adding `all_splits` would embed the same text twice and let
# near-duplicate passages crowd the similarity-search results.
vector_store = FAISS.from_documents(all_splits, embeddings)

print("index created!")

index created!


In [20]:
# Create the chat model used to generate answers
llm = init_chat_model(model="gpt-4o-mini", model_provider="openai")

# Pull the question-answering prompt template from the hub
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """Data passed between the steps of the question-answering graph."""
    # The user's question; read by both retrieve() and generate().
    question: str
    # Documents returned by retrieve() and consumed by generate().
    context: List[Document]
    # Text of the model's reply, written by generate().
    answer: str


# Define application steps
def retrieve(state: State):
    """Search ``vector_store`` for documents similar to the question.

    Returns a partial state update holding the matches under ``"context"``.
    """
    query = state["question"]
    matches = vector_store.similarity_search(query)
    return {"context": matches}


def generate(state: State):
    """Produce an answer to the question from the retrieved context.

    The page contents of the context documents are joined with blank lines,
    substituted into ``prompt`` together with the question, and sent to
    ``llm``. Returns a partial state update holding the reply text under
    ``"answer"``.
    """
    context_text = "\n\n".join([doc.page_content for doc in state["context"]])
    prompt_input = {"question": state["question"], "context": context_text}
    reply = llm.invoke(prompt.invoke(prompt_input))
    return {"answer": reply.content}


# Wire the steps into a linear graph (START -> retrieve -> generate) and compile it
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [21]:
# Question passed to the graph in the next cell; edit this to ask something else.
question = "What should I highlight on an Academic Resume?"

In [22]:
# Run the graph for the question, then show the generated answer
response = graph.invoke({"question": question})

print(response["answer"])

On an academic resume, you should highlight your academic and administrative experience, including committee work and any positions held in academic organizations. Include a section for publications and conferences, detailing peer-reviewed and non-peer-reviewed works, as well as ongoing projects. Additionally, emphasize professional development, relevant professional affiliations, and any honors and awards related to your discipline.
