# 02-rag.ipynb

In [9]:
from dotenv import load_dotenv

# .env 파일에 있는 환경변수들을 불러오기
load_dotenv()

True

In [9]:
# 1. Document Load (PDF)
# 지원하는 문서 
from langchain_community.document_loaders import PyPDFLoader
# 불러올 파일 위치
file_path = './nke-10k-2023.pdf'
# pdf를 변환해줄 폴더
loader = PyPDFLoader(file_path)
# 로더가 pdffmf python에서 쓸 수 있도록 변환(pdf 1page -> 1 Document.)
docs = loader.load()
print(len(docs))

107


In [13]:
# 2. Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Document를 잘라줄 스플리터
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap = 200, add_start_index =True
)

# 쪼개기
chunks = text_splitter.split_documents(docs)

print(len(chunks)) # 전체 chunk 개수
print(chunks[0].page_content) # 첫번째 청크의 원본 텍스트 내용

516
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)


In [16]:
# 3. Embedding (숫자로 바꾸기)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# 아래는 테스트용 (실제 텍스트 -> 벡터로 바뀌는 과정)
v1 = embeddings.embed_query(chunks[0].page_content) # 청크1 벡터로 변환
v2 = embeddings.embed_query(chunks[1].page_content) # 청크2 벡터로 변환

# 차원수는 같아야 한다.
print(len(v1) == len(v2))
print(v1[:10]) # 벡터 눈으로 확인하기

True
[0.054528214037418365, 0.048848193138837814, 0.019134575501084328, -0.0062835244461894035, 0.024802763015031815, -0.008969701826572418, -0.006224357523024082, 0.024258429184556007, 0.0007244247244670987, 0.009809872135519981]


In [None]:
# 4. Vector Store에 저장하기
from langchain_core.vectorstores import InMemoryVectorStore

# 테스트/개발용 메모리 벡터스토어
vector_store = InMemoryVectorStore(embeddings)

# pdf 쪼개놓은 chunks를 벡터스토어에 저장 (저장 후 id들이 나옴)
ids = vector_store.add_documents(documents=chunks)


In [20]:
len(ids)

516

In [21]:
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3}
)

retriever.invoke('나이키의 미국 영업점 개수?')

[Document(id='0493e956-734a-4a05-bd40-a20527ec6cf1', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further informati

# PDF RAG를 Agent에 통합

In [37]:
# 검색기(retriever)를 Tool(함수)로 만들기

# 검색어를 인자로 받음
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about Nike"""
    # 검색기 대신 벡터스토어 바로 활용하기 (chunk 2개만 검색)
    docs = vector_store.similarity_search(query, k=2)
    result = ''

    for doc in docs:
        result += doc.page_content + '\n\n'

    print(docs[0].page_content)
    return result

search_vectorstore('나이키 영업점 개수')

direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2


'direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.\n2023 FORM 10-K 2\n\nTable of Contents\nITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.\nITEM 2. PROPERTIES\nThe following is a summary of principal properties owned or leased by NIKE:\nThe NIKE World Campus, owned by NIKE and located near Beaverton, Oregon, USA, is an approximately 400-acre site consisting of over 40 buildings which, together\nwith adjacent leased properties, functions as our world headquarters and is occupied by approximately 11,400 employees engaged in management, research, design,\ndevelopment, marketing, finance and other administrative functions serving ne

In [39]:
from langchain.agents import create_agent

prompt = """너는 2023 나이키 10k 보고서를 검색하는 도구를 다룰 수 있어. 
사용자 질문에 답변하기 위해 필요하면 사용해. 경제분석 전문가처럼 답변해"""

agent = create_agent(
    model="openai:gpt-4.1-mini",
    tools=[search_vectorstore],
    system_prompt=prompt
)

In [40]:
content = "나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함."

agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

the following requirements have been met: (1) the store has been open at least one year, (2) square footage has not changed by more than 15% within the past year and
(3) the store has not been permanently repositioned within the past year. Comparable store sales includes revenues from stores that were temporarily closed during the
period as a result of COVID-19. Comparable store sales represents a performance metric that we believe is useful information for management and investors in
understanding the performance of our established NIKE-owned in-line and factory stores. Management considers this metric when making financial and operating
decisions. The method of calculating comparable store sales varies across the retail industry. As a result, our calculation of this metric may not be comparable to similarly
titled metrics used by other companies.
(1)
(2)
2023 FORM 10-K 30


{'messages': [HumanMessage(content='나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함.', additional_kwargs={}, response_metadata={}, id='5adb8f54-5e10-41af-966b-905b66fa3400'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 22, 'prompt_tokens': 117, 'total_tokens': 139, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_84a52ff059', 'id': 'chatcmpl-DCfzk4QEOuc2fDEFLShCRAszEKbms', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8e49-f0f3-7633-ac17-ac733025f37c-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': 'Nike store count and average sales per store'}, 'id': 'call_d2F6WiqNZn0EOesiLpbiC4rd', 'type': 'tool_call'}], invalid_tool_ca

# Web문서(Html) RAG + Agent

In [4]:
# HTML은 문서 본문외에 필요하지 않은 내용이 많다. 전처리가 필요하다.
import bs4
from langchain_community.document_loaders import WebBaseLoader

# 전처리기
bs4_strainer = bs4.SoupStrainer(class_=('post-title', 'post-header', 'post-content'))
# 로더
loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/"),
    bs_kwargs={'parse_only': bs4_strainer} #처리기 넣기
    
)

docs = loader.load()
# 문서 페이지 수
print(len(docs), len(docs[0].page_content))

USER_AGENT environment variable not set, consider setting it to identify your requests.


1 43047


In [6]:
# Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 스플리터
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index = True
)

chunks = text_splitter.split_documents(docs)

print(len(chunks))
print(chunks[0].page_content)


63
LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:

Planning

Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.
Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.


Memory


In [12]:
# 3. Embedding (숫자로 바꾸기)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# 아래는 테스트용 (실제 텍스트 -> 벡터로 바뀌는 과정)
v1 = embeddings.embed_query(chunks[0].page_content)  # 청크1 벡터로 변환
v2 = embeddings.embed_query(chunks[1].page_content)  # 청크2 벡터로 변환

# 차원수는 같아야 한다.
print(len(v1) == len(v2))
print(v1[:10])  # 벡터 눈으로 확인하기

True
[0.009648381732404232, 0.020312383770942688, 0.041626472026109695, -0.0041494304314255714, 0.00354423257522285, 0.0019164594123139977, -0.014427357353270054, 0.03675706684589386, -0.019032424315810204, 0.058878086507320404]


In [13]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

ids = vector_store.add_documents(documents=chunks)
print(ids)

['9a0917b9-c8de-4a40-8228-b1b4858369e6', '9a1fc3e5-71e8-45e1-a066-adefac6f8347', '79fc8c7a-e8e4-4cc2-bb06-3696e80a2028', '57749f59-8523-4a00-8c0f-5f0a980cef21', '991c11aa-d625-4c7a-b345-c6cdba72b754', '573d3774-8453-4e19-8da1-ea6b7990a292', 'd80e986a-4851-4ab9-8e5d-63273e17f4dc', '7b2dc47a-682e-48b6-81ac-4362e65ed713', '149c7b6c-44ad-4932-bdda-a5ebf134baa5', '53923613-30c0-4827-b353-9aec6a9fb5b3', '0b9efce3-1015-4293-9c98-39f606df717e', '939803b2-6ab5-4c05-bf5a-d02c9dba53df', '024a6796-00d6-42de-a363-01205cd5e9b5', '232f19b5-6472-47bf-ab50-047ae089af71', '74b4e714-e0ed-4014-9eab-e1926a658d1f', '86f9ced2-8363-445b-8f95-d7ce48db8a6e', '97f1e849-275b-403b-ab8a-b504ba7ce89e', '70dc2d7a-82a4-444e-a73a-650b50fc0764', '08b290b3-63e7-406c-a306-e03910bd5fdd', '2cb57c6a-33c7-4b27-971c-9943874dfb58', 'd333dffa-6908-4113-b09e-721d447b3a47', '2a691398-4744-4662-b9f0-c759fac676e9', 'd1f55c6d-7b2d-4a34-8bfa-aeed18d230de', '66716832-940c-46ba-914e-7d72bf77efc4', '89e5e48f-447c-4777-9daf-90b4217bc99a',

In [None]:
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3}
)

retriever.invoke('RAG 기초 개념')

[Document(id='15340918-7047-406c-89fa-2e7eab21a0df', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 34657}, page_content='You always add a comment briefly describing the purpose of the function definition.\nYou try to add comments explaining very complex bits of logic.\nYou always follow the best practices for the requested languages in terms of describing the code written as a defined\npackage/project.\nPython toolbelt preferences:'),
 Document(id='9696f563-9948-4585-9f45-3ac5421718f1', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 37124}, page_content='},\n #  … same conversation as earlier, ended with "Make your own assumptions and state them explicitly before starting".\n  {\n    "role": "assistant",\n    "content": "Assumptions:\\n1. Model: The model will contain the game\'s data, such as level information, character states, and enemy positions.\\n2. View: The view will handle the game\'s visu

In [21]:
def search_query(query: str) -> str:
    """Retrieve info to help answer a query about RAG"""
    docs = vector_store.similarity_search(query, k=2)
    result = ''

    for doc in docs:
        result += doc.page_content + '\n\n'
    return result

print(search_query('RAG 기초 개념'))

Memory stream: is a long-term memory module (external database) that records a comprehensive list of agents’ experience in natural language.

Each element is an observation, an event directly provided by the agent.
- Inter-agent communication can trigger new natural language statements.


Retrieval model: surfaces the context to inform the agent’s behavior, according to relevance, recency and importance.

Recency: recent events have higher scores
Importance: distinguish mundane from core memories. Ask LM directly.
Relevance: based on how related it is to the current situation / query.


Reflection mechanism: synthesizes memories into higher level inferences over time and guides the agent’s future behavior. They are higher-level summaries of past events (<- note that this is a bit different from self-reflection above)

Component One: Planning#
A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
Task Decomposition#
Chain of thought (CoT; W

In [29]:
from langchain.agents import create_agent

prompt = """너는 RAG 전문가야. 랭체인 랭그래프에 대해 적힌 문서를 검색하는 도구를 다룰 수 있어.
초보자인 사용자가 질문하면 답변하기 위해 사용해. 초등학생에게 설명하듯이 친절하게 답변해.
주어진 tool을 이용해서만 답변하고, 네가 원래 알고있던 지식이나 웹 검색을 통한 답변은 하지마.
당연히 환각도 안돼."""

agent = create_agent(
    model="openai:gpt-4.1-mini",
    tools=[search_query],
    system_prompt=prompt
)

In [33]:
content = "MIPS가 뭐야?"

agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

{'messages': [HumanMessage(content='MIPS가 뭐야?', additional_kwargs={}, response_metadata={}, id='08f4dcbd-5d38-41bd-9a08-1f4f3128e8cc'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 155, 'total_tokens': 170, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_a391f2cee0', 'id': 'chatcmpl-DChHmKwKJZIp1BiSJqsShYxnMFmaU', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8e95-a8bc-7e82-9ac7-9d21c49116c2-0', tool_calls=[{'name': 'search_query', 'args': {'query': 'MIPS'}, 'id': 'call_OHptmcX0XzASh4cuq6iZGGoR', 'type': 'tool_call'}], invalid_tool_calls=[], usage_metadata={'input_tokens': 155, 'output_tokens': 15, '