In [1]:
%pip install langchain langchain-community langchain-openai chromadb
# https://python.langchain.com/docs/integrations/document_loaders/web_base/

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Naive RAG pipeline, step 1: load a web page and split it into chunks.
import os

# The web loader warns "USER_AGENT environment variable not set" when this
# variable is missing. It is looked up while the loader module is imported,
# so it has to be set before the import below. setdefault keeps any value
# that the surrounding environment already provides.
os.environ.setdefault("USER_AGENT", "naive-rag-notebook/0.1")

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source article (Korea Economic Daily / Hankyung):
# "After Musk and Bezos... Bill Gates joins the 'space war'"
url = "https://www.hankyung.com/article/2025012081901"

# --- Document loading ---
# Fetch the page and wrap its text in LangChain Document objects.
loader = WebBaseLoader(url)
docs = loader.load()

# --- Text splitting ---
# Chunks of up to 1000 characters; neighbouring chunks share up to 200
# characters so that text cut at a boundary still appears whole in one chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
splits = text_splitter.split_documents(docs)

# Fail here with a clear message instead of later inside the vector store.
if not splits:
    raise ValueError(f"No text chunks were produced from {url}")

# Preview the first 100 characters of the first five chunks.
for split in splits[:5]:
    display(split.page_content[:100])

USER_AGENT environment variable not set, consider setting it to identify your requests.


'머스크·베이조스 이어…우주전쟁 뛰어든 빌 게이츠 | 한국경제\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

'TESAT\n\n\n29초영화제\n\n\n교육센터\n\n\n블루밍비트\n\n\n키즈맘\n\n\n포천힐스\n\n\n\n\n\n\n\n빠른 메뉴 찾기\n\n\n\n\n\n검색어 삭제\n\n\n\n\n\n\n\n자동완성창 닫기\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

'수정2025.01.21 00:58\n\n\n                                            지면A2\n\n\n\n\n\n\n\n글자크기 조절\n\n\n\n\n\n\n\n\n\n\n\n\n기사 '

"'10년뒤 2611兆' 우주 시장 겨냥\n스토크스페이스에 2.6억弗 베팅\n중형 재사용 로켓 개발 투입예정\n                                    \n\n\n\n\n\n"

'스토크스페이스는 재사용이 가능한 중형 로켓을 개발하는 민간 우주기업이다. 나스닥시장 상장사 로켓랩과 비슷하다. 최근 워싱턴주 모세스레이크에 있는 발사 시험대에서 1단 ‘제니스’ 엔'

In [3]:
# Naive RAG pipeline, step 2: embed the chunks and index them in Chroma.
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Dedicated collection for this notebook, instead of Chroma's shared default.
COLLECTION_NAME = "naive_rag_hankyung"
# Number of most relevant chunks the retriever returns per query.
TOP_K = 4

# Embeddings are computed by the llama3.1 model served through Ollama.
# NOTE(review): llama3.1 is a chat model; a dedicated embedding model may
# retrieve better - confirm before reusing this setup elsewhere.
embeddings = OllamaEmbeddings(model="llama3.1")

# Make the cell idempotent: the in-memory Chroma collection outlives a single
# cell run, and from_documents adds to an existing collection, so re-running
# this cell would otherwise index every chunk again and make the retriever
# return duplicates. Drop any previous copy of the collection first.
Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
).delete_collection()

# Build the vector store from the chunks produced by the splitting step.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
)

# Retriever returning the TOP_K chunks most similar to the query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": TOP_K}
)

In [4]:
# RAG chain creation
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Chat model: llama3.1 through Ollama, temperature 0, verbose mode on.
llm = ChatOllama(
    model="llama3.1",
    temperature=0,
    verbose=True,
)

# Prompt text with two placeholders, {question} and {context}, that are
# filled in when the chain runs.
rag_template = """
You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Question: {question} 
Context: {context} 
Answer:"""

prompt = ChatPromptTemplate.from_template(rag_template)

# Create RAG chain
def format_docs(docs):
    """Join the ``page_content`` of every document into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in iteration order, separated by one blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Context branch: retrieve the matching chunks, then merge them into one string.
context_branch = retriever | format_docs

# The value passed to invoke() feeds both keys: it is the retriever query for
# "context" and is passed through unchanged as "question".
chain_inputs = {"context": context_branch, "question": RunnablePassthrough()}

# Fill the prompt, call the model, and reduce the reply to a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Query the RAG system with a first question.
question = "빌 게이츠가 한 일을 설명해주세요"
response = rag_chain.invoke(question)
print(response)

빌 게이츠가 한 일은 우주기업 스토크스페이스에 투자하여 중형 로켓 '노바' 개발과 플로리다주 발사 시설 건설을 지원한 것입니다.


In [5]:
# Follow-up query: the same chain, this time asking for a more specific answer.
question = "빌 게이츠가 한 일을 구체적으로 설명해주세요"
response = rag_chain.invoke(input=question)
print(response)

빌 게이츠가 한 일은 우주기업 스토크스페이스에 투자하여 중형 로켓 '노바' 개발과 플로리다주 발사 시설 건설을 지원한 것입니다.
