# 02-rag.ipynb

In [41]:
from dotenv import load_dotenv

# Load the environment variables defined in the .env file.
load_dotenv()

True

In [42]:
# 1. Document Load (PDF)
# Loader class for the supported document type (PDF).
from langchain_community.document_loaders import PyPDFLoader
# Location of the file to load.
file_path = './nke-10k-2023.pdf'
# Loader that will convert the PDF.
loader = PyPDFLoader(file_path)
# The loader turns the PDF into objects usable from Python (1 PDF page -> 1 Document).
docs = loader.load()
# Number of Documents that were loaded.
print(len(docs))

107


In [43]:
# 2. Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded Documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded Document.
chunks = text_splitter.split_documents(docs)

# Total number of chunks, followed by the raw text of the first chunk.
print(len(chunks))
print(chunks[0].page_content)

516
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)


In [44]:
# 3. Embedding (turn text into numbers)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Test only from here on (shows real text being converted into vectors).
v1 = embeddings.embed_query(chunks[0].page_content) # convert chunk 1 into a vector
v2 = embeddings.embed_query(chunks[1].page_content) # convert chunk 2 into a vector

# Both vectors must have the same number of dimensions.
print(len(v1) == len(v2))
print(v1[:10]) # eyeball the first 10 components of the vector

True
[0.054528214037418365, 0.048848193138837814, 0.019134575501084328, -0.0062835244461894035, 0.024802763015031815, -0.008969701826572418, -0.006224357523024082, 0.024258429184556007, 0.0007244247244670987, 0.009809872135519981]


In [45]:
# 4. Save into a vector store
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store, meant for testing/development.
vector_store = InMemoryVectorStore(embeddings)

# Store the chunks split from the PDF in the vector store (the call returns their ids).
ids = vector_store.add_documents(documents=chunks)


In [46]:
# Number of ids returned by add_documents, shown as the cell's output.
len(ids)

516

In [47]:
# Expose the vector store as a retriever: similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3}
)

# Try the retriever with a Korean query asking for the number of Nike stores in the US.
retriever.invoke('나이키의 미국 영업점 개수?')

[Document(id='f4c5ea77-6f30-4db2-9b86-d0ae8e2e8958', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further informati

# PDF RAG를 Agent에 통합

In [48]:
# 검색기(retriever)를 Tool(함수)로 만들기

# 검색어를 인자로 받음
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about Nike"""
    # The docstring above is deliberately left unchanged: this function is handed
    # to the agent as a tool, and LangChain derives the tool description from it.
    #
    # query:   search text forwarded to the vector store.
    # returns: page_content of the matching chunks, each followed by a blank
    #          line; '' when nothing matched.
    #
    # NOTE: `vector_store` is a module-level name resolved at call time. A later
    # cell in this notebook rebinds it to a different store, so re-run the cell
    # that builds the Nike store before calling this tool again.

    # Query the vector store directly instead of going through the retriever
    # (only the 2 best chunks).
    matches = vector_store.similarity_search(query, k=2)

    # No hit: hand back an empty context instead of raising IndexError on
    # matches[0] below.
    if not matches:
        return ''

    # Echo the best hit so the retrieval step is visible in the cell output.
    print(matches[0].page_content)

    return ''.join(match.page_content + '\n\n' for match in matches)

# Call the tool function directly with a Korean query about the number of Nike stores.
search_vectorstore('나이키 영업점 개수')

direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2


'direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.\n2023 FORM 10-K 2\n\nTable of Contents\nITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.\nITEM 2. PROPERTIES\nThe following is a summary of principal properties owned or leased by NIKE:\nThe NIKE World Campus, owned by NIKE and located near Beaverton, Oregon, USA, is an approximately 400-acre site consisting of over 40 buildings which, together\nwith adjacent leased properties, functions as our world headquarters and is occupied by approximately 11,400 employees engaged in management, research, design,\ndevelopment, marketing, finance and other administrative functions serving ne

In [49]:
from langchain.agents import create_agent

# System prompt (Korean): tells the model it has a tool for searching the 2023
# Nike 10-K report, to use it when needed, and to answer like an economic analyst.
prompt = """너는 2023 나이키 10k 보고서를 검색하는 도구를 다룰 수 있어. 
사용자 질문에 답변하기 위해 필요하면 사용해. 경제분석 전문가처럼 답변해"""

# Agent whose only tool is the search_vectorstore function.
agent = create_agent(
    model="openai:gpt-4.1-mini",
    tools=[search_vectorstore],
    system_prompt=prompt
)

In [50]:
# User question (Korean): number of Nike stores and the average revenue per store.
content = "나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함."

# Send one user message to the agent; the returned value is the cell's output.
agent.invoke({"messages": [dict(role="user", content=content)]})

direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2
Table of Contents
NORTH AMERICA
(Dollars in millions) FISCAL 2023 FISCAL 2022 % CHANGE
% CHANGEEXCLUDINGCURRENCYCHANGESFISCAL 2021 % CHANGE
% CHANGEEXCLUDINGCURRENCYCHANGES
Revenues by:
Footwear $ 14,897 $ 12,228 22 % 22 %$ 11,644 5 % 5 %
Apparel 5,947 5,492 8 % 9 % 5,028 9 % 9 %
Equipment 764 633 21 % 21 % 507 25 % 25 %
TOTAL REVENUES $ 21,608 $ 18,353 18 % 18 %$ 17,179 7 % 7 %
Revenues by:    
Sales to Wholesale Customers $ 11,273 $ 9,621 17 % 18 %$ 10,186 -6 % -6 %
Sales through NIKE Direct 10,335 8,732 18 % 18 % 6,993 25 % 25 %
TOTAL REVENUES $ 21,608 $ 18,35

{'messages': [HumanMessage(content='나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함.', additional_kwargs={}, response_metadata={}, id='1c96d945-a3ea-406f-b521-0b29fa7ff03c'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 117, 'total_tokens': 143, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_d94e893512', 'id': 'chatcmpl-DCx7H0Hfay9WMAHzoJJ6VVblSWE5V', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c9236-30ff-7e60-8359-7be54a84c9ce-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': 'Nike number of stores and average sales per store 2023'}, 'id': 'call_4CaWjUx6IE88J3EDC8ZkrRKl', 'type': 'tool_call'}], inval

# Web 문서(HTML) RAG + Agent

In [51]:
# HTML contains a lot of content besides the article body, so it needs preprocessing.
# NOTE: this cell rebinds `loader` and `docs`, which held the PDF loader/documents above.
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Preprocessor: restrict parsing to elements with these CSS classes.
bs4_strainer = bs4.SoupStrainer(class_=('post-title', 'post-header', 'post-content'))
# Loader
loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/"),
    bs_kwargs={'parse_only': bs4_strainer} # plug the preprocessor into the parser
    
)

docs = loader.load()
# Number of loaded documents and the character length of the first one.
print(len(docs), len(docs[0].page_content))

1 43047


In [52]:
# Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded web document into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split the web document.
chunks = text_splitter.split_documents(docs)

# Total number of chunks, followed by the raw text of the first chunk.
print(len(chunks))
print(chunks[0].page_content)


63
LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:

Planning

Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.
Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.


Memory


In [53]:
# 3. Embedding (turn text into numbers)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Test only from here on (shows real text being converted into vectors).
v1 = embeddings.embed_query(chunks[0].page_content)  # convert chunk 1 into a vector
v2 = embeddings.embed_query(chunks[1].page_content)  # convert chunk 2 into a vector

# Both vectors must have the same number of dimensions.
print(len(v1) == len(v2))
print(v1[:10])  # eyeball the first 10 components of the vector

True
[0.009648381732404232, 0.020312383770942688, 0.041626472026109695, -0.0041494304314255714, 0.00354423257522285, 0.0019164594123139977, -0.014427357353270054, 0.03675706684589386, -0.019032424315810204, 0.058878086507320404]


In [54]:
from langchain_core.vectorstores import InMemoryVectorStore

# New in-memory store for the web chunks.
# NOTE: this rebinds the module-level name `vector_store`; search_vectorstore
# (the Nike tool defined earlier) reads that name at call time, so from here on
# it would search this store.
vector_store = InMemoryVectorStore(embeddings)

# Add the web chunks and print every id that was returned.
ids = vector_store.add_documents(documents=chunks)
print(ids)

['621bede9-368f-4859-9561-ceb3a1faefa2', '03390c0f-5d77-4fb6-994a-7a3319c84441', '0edcac50-72be-40d2-a01a-35429aa81f25', 'bce99b7d-0786-4dac-b4bf-19eb7aeaff00', 'f8f3d586-274c-44ef-ab5d-2e49753c1070', 'd3e92548-772d-4cd9-9dc7-c36c79d12dfe', '4deccd35-df6a-4075-913a-5862dd7c31ac', '7851cfac-f815-4680-876b-3c1de8a466c2', '861c6a4a-2dbc-461e-8464-f20a65f98ebf', '81c36b5a-6ac8-4109-abe1-2570b9c35bc2', 'f49e4c10-5a21-4e76-a2ec-31baa91e5633', '6efc759b-6b7e-4499-975b-3d06c591dbf3', '97f26766-0d09-4e9b-b3fd-ac8bd08d85be', 'e3743cbe-e758-4cee-9d25-e2dc80aba52c', '84780d8c-18cd-4cfe-86b2-b10816d4347b', '6ea97593-c1cb-4c5e-8ef2-0f80ee954442', '7ad6598c-989d-47b7-91e8-f0bba5c7c903', 'e93687ab-a56f-4ac5-bba0-c67845ef8d55', '2681c7cc-4386-4c89-bedc-a2a2226443df', 'c01a19d7-7be8-4ddd-bc59-e2b78c3a6771', '028f3ff9-8b47-4509-9ff4-f78dacc6decf', '0d00d70c-0ef7-47fd-b644-a8ddfba98046', 'c0f8e13d-de8e-49ce-9f56-8bd0360f12b1', 'f031b0a0-add1-4b04-9502-38b20ae8b4fe', '48803f43-6a9c-4e3f-839b-33598f184b20',

In [55]:
# Expose the web vector store as a retriever: similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3}
)

# Try the retriever with a Korean query ("basic concepts of RAG").
retriever.invoke('RAG 기초 개념')

[Document(id='bf11a751-5303-4a2d-b95c-fb1f86db169d', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 25125}, page_content='Memory stream: is a long-term memory module (external database) that records a comprehensive list of agents’ experience in natural language.\n\nEach element is an observation, an event directly provided by the agent.\n- Inter-agent communication can trigger new natural language statements.\n\n\nRetrieval model: surfaces the context to inform the agent’s behavior, according to relevance, recency and importance.\n\nRecency: recent events have higher scores\nImportance: distinguish mundane from core memories. Ask LM directly.\nRelevance: based on how related it is to the current situation / query.\n\n\nReflection mechanism: synthesizes memories into higher level inferences over time and guides the agent’s future behavior. They are higher-level summaries of past events (<- note that this is a bit different from self-reflection

In [56]:
def search_query(query: str) -> str:
    """Retrieve info to help answer a query about RAG"""
    # The docstring is kept verbatim: the function is passed to the agent as a
    # tool, and LangChain derives the tool description from it.
    #
    # query:   search text forwarded to the module-level vector store.
    # returns: page_content of the 2 best chunks, each followed by a blank line.
    hits = vector_store.similarity_search(query, k=2)
    pieces = [hit.page_content + '\n\n' for hit in hits]
    return ''.join(pieces)

# Call the tool function directly with a Korean query ("basic concepts of RAG") and print the result.
print(search_query('RAG 기초 개념'))

Memory stream: is a long-term memory module (external database) that records a comprehensive list of agents’ experience in natural language.

Each element is an observation, an event directly provided by the agent.
- Inter-agent communication can trigger new natural language statements.


Retrieval model: surfaces the context to inform the agent’s behavior, according to relevance, recency and importance.

Recency: recent events have higher scores
Importance: distinguish mundane from core memories. Ask LM directly.
Relevance: based on how related it is to the current situation / query.


Reflection mechanism: synthesizes memories into higher level inferences over time and guides the agent’s future behavior. They are higher-level summaries of past events (<- note that this is a bit different from self-reflection above)

Component One: Planning#
A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
Task Decomposition#
Chain of thought (CoT; W

In [57]:
from langchain.agents import create_agent

# System prompt (Korean): the model is a RAG expert with a document-search tool,
# answers beginners as kindly as if explaining to an elementary-school student,
# must answer only through the given tool (no prior knowledge, no web search),
# and must not hallucinate.
# NOTE(review): the prompt says the tool searches documents about LangChain /
# LangGraph, while the page loaded above is the "LLM Powered Autonomous Agents"
# post - confirm the wording is intended.
prompt = """너는 RAG 전문가야. 랭체인 랭그래프에 대해 적힌 문서를 검색하는 도구를 다룰 수 있어.
초보자인 사용자가 질문하면 답변하기 위해 사용해. 초등학생에게 설명하듯이 친절하게 답변해.
주어진 tool을 이용해서만 답변하고, 네가 원래 알고있던 지식이나 웹 검색을 통한 답변은 하지마.
당연히 환각도 안돼."""

# Rebinds `agent`: this agent's only tool is the search_query function.
agent = create_agent(
    model="openai:gpt-4.1-mini",
    tools=[search_query],
    system_prompt=prompt
)

In [58]:
# User question (Korean): "What is MIPS?"
content = "MIPS가 뭐야?"

# Stream the run with stream_mode='values'. Each streamed item is bound to the
# loop variable `event`; it exposes a 'messages' list whose last entry is
# pretty-printed.
for event in agent.stream(
    {"messages": [dict(role="user", content=content)]},
    stream_mode='values',
):
    event['messages'][-1].pretty_print()


MIPS가 뭐야?
Tool Calls:
  search_query (call_q50HiunqbcQk46DAUmsHftkX)
 Call ID: call_q50HiunqbcQk46DAUmsHftkX
  Args:
    query: MIPS
Name: search_query

Maximum Inner Product Search (MIPS)#
The external memory can alleviate the restriction of finite attention span.  A standard practice is to save the embedding representation of information into a vector store database that can support fast maximum inner-product search (MIPS). To optimize the retrieval speed, the common choice is the approximate nearest neighbors (ANN)​ algorithm to return approximately top k nearest neighbors to trade off a little accuracy lost for a huge speedup.
A couple common choices of ANN algorithms for fast MIPS:

Comparison of MIPS algorithms, measured in recall@10. (Image source: Google Blog, 2020)

Check more MIPS algorithms and performance comparison in ann-benchmarks.com.
Component Three: Tool Use#
Tool use is a remarkable and distinguishing characteristic of human beings. We create, modify and utilize ext