In [1]:
!pip install --upgrade pip
!pip install --upgrade pypdf langchain-text-splitters langchain-community langgraph faiss-cpu langchain-openai python-dotenv rapidocr-onnxruntime langsmith
!pip install -qU "langchain[openai]"



In [2]:
import os
from dotenv import load_dotenv
from langchain import hub
from pypdf import PdfReader
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain.chat_models import init_chat_model
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Load variables from a .env file (python-dotenv) before reading the settings below.
load_dotenv()

# Settings read from the process environment; the LangSmith ones fall back to
# placeholder defaults when they are not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGSMITH_TRACING = os.environ.get("LANGSMITH_TRACING", "false")
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY", "not provided")

In [4]:
def load_pdfs_recursively(directory):
    """Load every readable PDF under ``directory`` into one flat list of pages.

    The tree is walked top-down with sub-folders and file names sorted, so the
    returned pages come out in the same order on every run regardless of the
    order in which the filesystem lists directory entries.

    Args:
        directory: Root folder to search; sub-folders are searched as well.
            Files are matched by a case-insensitive ``.pdf`` extension.

    Returns:
        A list holding whatever ``PyPDFLoader(...).load()`` returned for each
        PDF that could be processed. A file that raises during validation or
        loading is reported with ``print`` and skipped. A missing directory is
        reported with ``print`` and yields an empty list.
    """
    if not os.path.isdir(directory):
        # os.walk() silently yields nothing for a missing path; report it so an
        # empty result is not mistaken for "the folder contains no PDFs".
        print(f"PDF directory not found: {directory}")
        return []

    all_pages = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends into sub-folders.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            try:
                with open(pdf_path, 'rb') as f:
                    PdfReader(f)  # will raise an exception if the PDF is invalid

                loader = PyPDFLoader(pdf_path, extract_images=False)  # set to False unless you NEED images
                pages = loader.load()
                all_pages.extend(pages)
            except Exception as e:
                # Best effort: one bad file must not abort the whole ingestion run.
                print(f"Error processing {pdf_path}: {e}")
    return all_pages

# Load all PDF pages found below the source folder.
pdf_directory = "pdfs"  # path to dir with PDFs
docs = load_pdfs_recursively(pdf_directory)

# Split the pages into chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Fail early with an actionable message: with no chunks there is nothing to embed,
# and building the index from an empty list would only fail later with a much less
# helpful error (after an unnecessary call to the embeddings API).
if not all_splits:
    raise ValueError(
        f"No text chunks were produced from '{pdf_directory}'; "
        "check that the directory exists and contains readable PDFs."
    )

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed every chunk and store the vectors in an in-memory FAISS index.
vector_store = FAISS.from_documents(all_splits, embeddings)

print("Index created!")

Index created!


In [5]:
# Initialise the chat model used to write answers (nothing is sent to it yet)
llm = init_chat_model(model="gpt-4o-mini", model_provider="openai")

# Fetch the question-answering prompt from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State dictionary handed to the graph steps (`retrieve`, `generate`)."""

    # The user's question; read by both `retrieve` and `generate`.
    question: str
    # Documents returned by `retrieve`; read by `generate`.
    context: List[Document]
    # Answer text returned by `generate`.
    answer: str


# Define application steps
def retrieve(state: State):
    """Graph step: find documents similar to the question.

    Runs a similarity search on the module-level ``vector_store`` using
    ``state["question"]`` and returns the hits as a state update under the
    ``"context"`` key.
    """
    query = state["question"]
    return {"context": vector_store.similarity_search(query)}


def generate(state: State):
    """Graph step: answer the question from the retrieved context.

    Joins the ``page_content`` of every document in ``state["context"]`` with
    blank lines, fills the module-level ``prompt`` with that text and the
    question, sends the result to the module-level ``llm`` and returns the
    reply's ``content`` as a state update under the ``"answer"`` key.
    """
    passages = [doc.page_content for doc in state["context"]]
    joined_context = "\n\n".join(passages)
    prompt_input = {"question": state["question"], "context": joined_context}
    messages = prompt.invoke(prompt_input)
    reply = llm.invoke(messages)
    return {"answer": reply.content}


# Wire the steps into a linear graph (START -> retrieve -> generate) and compile it
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [6]:
# Question to put to the RAG pipeline
question: str = "What should I highlight on an Academic Resume?"

In [7]:
# Run the compiled graph for the question and show only the generated answer
response = graph.invoke({"question": question})

print(response["answer"])

On an academic resume, you should highlight your education, relevant experience (including teaching, research, and administrative roles), and any publications or conferences you've participated in. Additionally, include professional development, honors and awards, and professional associations to showcase your qualifications and contributions to your field. Tailor these sections based on the specific position you are applying for to demonstrate your fit for the role.
