In [None]:
# LangChain core
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# LangChain OpenAI
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# LangChain Community
from langchain_community.vectorstores import FAISS

# Standard libraries
import requests
from bs4 import BeautifulSoup


import os
from dotenv import load_dotenv

from supabase import create_client, Client

# Task Procedure 
## 1.Pydandic docs llm.txt(https://docs.pydantic.dev/latest/llms.txt)에서 URL과 description을 추출한 후, Supabase 데이터 베이스에 저장

## 2. 문서 리스트 조회  
 (1) URL과 description을 추출출한 것을 가져와 Documet 객체로 변환한다. 
 (2) 임베딩 후 사용자 query와 simiarity check를 통해 URL을 선별한다.
 (3) 각각의 URL들을 하나의 chunk로 만든다. 
## 3. 웹 페이지 Fetch
 (1) 선별된 URL을 fetch를 한 후 각각의 URL에 해당하는 본문을 가져온다. 
## 4. Context Filter 
- 마지막으로 각각의 URL 본문들 중에 사용자 query와의 similarity check 를 통해 context filter를 진행

# Task Procedure

## 1. Extract URL and Description from Pydantic Docs
- Extract the URL and description from `llms.txt` (https://docs.pydantic.dev/latest/llms.txt).
- Save the extracted data to Supabase.

## 2. Retrieve Document List
1. Load the extracted URLs and descriptions, and convert them into `Document` objects.
2. Generate embeddings and select relevant URLs by performing a similarity check with the user query.
3. Chunk each selected URL into manageable pieces.

## 3. Fetch Web Pages
- Fetch the content of each selected URL and retrieve the main body text for each.

## 4. Context Filtering
- Finally, perform a similarity check between the user query and the content of each URL, filtering the contexts to retain only the most relevant ones.

## Load data

In [13]:
# load env variables 
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

# Set supabase client 
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


# load data from supabase database
response = supabase.table("pydantic_docs_llms").select("*").execute()
data = response.data
print(data)

[{'id': 1, 'source_url': 'https://docs.pydantic.dev/latest/llms.txt', 'url': 'https://docs.pydantic.dev/latest/concepts/alias/index.md', 'description': 'Alias', 'time': '2025-07-01T19:49:46.868847'}, {'id': 2, 'source_url': 'https://docs.pydantic.dev/latest/llms.txt', 'url': 'https://docs.pydantic.dev/latest/concepts/config/index.md', 'description': 'Configuration', 'time': '2025-07-01T19:49:47.655498'}, {'id': 3, 'source_url': 'https://docs.pydantic.dev/latest/llms.txt', 'url': 'https://docs.pydantic.dev/latest/concepts/conversion_table/index.md', 'description': 'Conversion Table', 'time': '2025-07-01T19:49:47.731734'}, {'id': 4, 'source_url': 'https://docs.pydantic.dev/latest/llms.txt', 'url': 'https://docs.pydantic.dev/latest/concepts/dataclasses/index.md', 'description': 'Dataclasses', 'time': '2025-07-01T19:49:47.819167'}, {'id': 5, 'source_url': 'https://docs.pydantic.dev/latest/llms.txt', 'url': 'https://docs.pydantic.dev/latest/concepts/experimental/index.md', 'description': 'E

In [14]:
data

[{'id': 1,
  'source_url': 'https://docs.pydantic.dev/latest/llms.txt',
  'url': 'https://docs.pydantic.dev/latest/concepts/alias/index.md',
  'description': 'Alias',
  'time': '2025-07-01T19:49:46.868847'},
 {'id': 2,
  'source_url': 'https://docs.pydantic.dev/latest/llms.txt',
  'url': 'https://docs.pydantic.dev/latest/concepts/config/index.md',
  'description': 'Configuration',
  'time': '2025-07-01T19:49:47.655498'},
 {'id': 3,
  'source_url': 'https://docs.pydantic.dev/latest/llms.txt',
  'url': 'https://docs.pydantic.dev/latest/concepts/conversion_table/index.md',
  'description': 'Conversion Table',
  'time': '2025-07-01T19:49:47.731734'},
 {'id': 4,
  'source_url': 'https://docs.pydantic.dev/latest/llms.txt',
  'url': 'https://docs.pydantic.dev/latest/concepts/dataclasses/index.md',
  'description': 'Dataclasses',
  'time': '2025-07-01T19:49:47.819167'},
 {'id': 5,
  'source_url': 'https://docs.pydantic.dev/latest/llms.txt',
  'url': 'https://docs.pydantic.dev/latest/concepts/e

## 2. Retrieve Document List


In [82]:
# Create a list of Document objects from data
document = []
for i in data:
    doc = Document(page_content=i["url"], metadata={"description": i["description"]})
    document.append(doc)

In [83]:
document

[Document(metadata={'description': 'Alias'}, page_content='https://docs.pydantic.dev/latest/concepts/alias/index.md'),
 Document(metadata={'description': 'Configuration'}, page_content='https://docs.pydantic.dev/latest/concepts/config/index.md'),
 Document(metadata={'description': 'Conversion Table'}, page_content='https://docs.pydantic.dev/latest/concepts/conversion_table/index.md'),
 Document(metadata={'description': 'Dataclasses'}, page_content='https://docs.pydantic.dev/latest/concepts/dataclasses/index.md'),
 Document(metadata={'description': 'Experimental'}, page_content='https://docs.pydantic.dev/latest/concepts/experimental/index.md'),
 Document(metadata={'description': 'Fields'}, page_content='https://docs.pydantic.dev/latest/concepts/fields/index.md'),
 Document(metadata={'description': 'Forward Annotations'}, page_content='https://docs.pydantic.dev/latest/concepts/forward_annotations/index.md'),
 Document(metadata={'description': 'JSON'}, page_content='https://docs.pydantic.

## 문서 리스트 조회
- 

In [84]:
# Embedding
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Create VectorDB
vectorstore = FAISS.from_documents(documents=document, embedding=embeddings)

# Set Retriever

retriever = vectorstore.as_retriever()

In [85]:
# vector similarity search
filtered_urls = []
for doc in vectorstore.similarity_search("What is the Fields class?"):
    filtered_urls.append(doc.page_content)
    print(doc.page_content)

https://docs.pydantic.dev/latest/api/fields/index.md
https://docs.pydantic.dev/latest/concepts/fields/index.md
https://docs.pydantic.dev/latest/concepts/dataclasses/index.md
https://docs.pydantic.dev/latest/api/dataclasses/index.md


## 3. Fetch Web Pages

In [23]:
def fetch_page_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # 본문만 추출 (사이트 구조에 따라 조정 필요)
    text = soup.get_text(separator="\n")
    return text

In [87]:
## Extract text from filtered URLs
page_texts = []
for url in filtered_urls:
    text = fetch_page_text(url)
    page_texts.append({"url": url, "text": text})

In [88]:
page_texts

[{'url': 'https://docs.pydantic.dev/latest/api/fields/index.md',
 {'url': 'https://docs.pydantic.dev/latest/concepts/fields/index.md',
 {'url': 'https://docs.pydantic.dev/latest/concepts/dataclasses/index.md',
  'text': 'API Documentation\n\npydantic.dataclasses.dataclass\n\nIf you don\'t want to use Pydantic\'s BaseModel you can instead get the same data validation on standard dataclasses.\n\n```python\nfrom datetime import datetime\nfrom typing import Optional\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: Optional[datetime] = None\n\n\nuser = User(id=\'42\', signup_ts=\'2032-06-21T12:00\')\nprint(user)\n"""\nUser(id=42, name=\'John Doe\', signup_ts=datetime.datetime(2032, 6, 21, 12, 0))\n"""\n\n```\n\n```python\nfrom datetime import datetime\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: datetime | None = None\n

In [89]:
## Wrap the extracted texts in a document object.
context_document = []
for i in page_texts:
    doc = Document(page_content=i["text"], metadata={"url": i["url"]})
    context_document.append(doc)

In [90]:
context_document

 Document(metadata={'url': 'https://docs.pydantic.dev/latest/concepts/dataclasses/index.md'}, page_content='API Documentation\n\npydantic.dataclasses.dataclass\n\nIf you don\'t want to use Pydantic\'s BaseModel you can instead get the same data validation on standard dataclasses.\n\n```python\nfrom datetime import datetime\nfrom typing import Optional\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: Optional[datetime] = None\n\n\nuser = User(id=\'42\', signup_ts=\'2032-06-21T12:00\')\nprint(user)\n"""\nUser(id=42, name=\'John Doe\', signup_ts=datetime.datetime(2032, 6, 21, 12, 0))\n"""\n\n```\n\n```python\nfrom datetime import datetime\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: datetime | None = None\n\n\nuser = User(id=\'42\', signup_ts=\'2032-06-21T12:00\')\nprint(user)\n"""\nUser(id=42, name=\'John Doe\', sign

In [91]:
# Embed the extracted text and store it in a vectorDB
embeddings_context = OpenAIEmbeddings(model="text-embedding-3-small")

context_vectorstore = FAISS.from_documents(
    documents=context_document, embedding=embeddings_context
)

In [92]:
# Set context retriever
retriever = context_vectorstore.as_retriever()

## 4. Context Filtering

In [93]:
filtered_context = []
for doc in context_vectorstore.similarity_search("What is the Fields class?"):
    filtered_txt = doc.page_content
    filtered_url = doc.metadata["url"]
    filtered_context.append({"context": filtered_txt, "url": url})

In [94]:
filtered_context

  'url': 'https://docs.pydantic.dev/latest/api/dataclasses/index.md'},
  'url': 'https://docs.pydantic.dev/latest/api/dataclasses/index.md'},
 {'context': 'API Documentation\n\npydantic.dataclasses.dataclass\n\nIf you don\'t want to use Pydantic\'s BaseModel you can instead get the same data validation on standard dataclasses.\n\n```python\nfrom datetime import datetime\nfrom typing import Optional\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: Optional[datetime] = None\n\n\nuser = User(id=\'42\', signup_ts=\'2032-06-21T12:00\')\nprint(user)\n"""\nUser(id=42, name=\'John Doe\', signup_ts=datetime.datetime(2032, 6, 21, 12, 0))\n"""\n\n```\n\n```python\nfrom datetime import datetime\n\nfrom pydantic.dataclasses import dataclass\n\n\n@dataclass\nclass User:\n    id: int\n    name: str = \'John Doe\'\n    signup_ts: datetime | None = None\n\n\nuser = User(id=\'42\', signup_ts=\'2032-06-21T12:00\')\nprint

In [None]:
# all_context = "\n\n".join([c["context"] for c in filtered_context])

## Build RAG Chain 

In [99]:
# Set prompt
prompt = PromptTemplate.from_template(
    """
You are an AI assistant for question-answering tasks.

Use the retrieved context provided below to answer the user's question.
Your response **must** include any relevant URLs mentioned in the context.
Please exclude "/index.md" at the end of URLs
If you don't know the answer, just say that you don't know. 
Answer in **ENGLISH**.
Please follow the format below:
# FORMAT

Answer

[Related Links]
- URL1
- URL2
...

#Context: 
{context}

#Question:
{question}

#Answer:"""
)


# Set llm
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Build the chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [100]:
question = "What is the Fields class and give the example code?"
response = chain.invoke(question)
print(response)

The `Field` class in Pydantic is used to define fields on models, providing extra information about a field for the model schema or complex validation. It allows you to specify various parameters such as default values, aliases, validation constraints, and more.

Here is an example code snippet using the `Field` class:

```python
from pydantic import BaseModel, Field

class User(BaseModel):
    name: str = Field(default='John Doe', title='Name', description='The name of the user')
    age: int = Field(default=20, ge=0, title='Age', description='The age of the user, must be non-negative')

user = User()
print(user)
```

In this example, the `Field` class is used to define the `name` and `age` fields with default values, titles, descriptions, and a validation constraint for the `age` field to ensure it is non-negative.

[Related Links]
- https://docs.pydantic.dev/latest/api/fields
- https://docs.pydantic.dev/latest/concepts/fields
