In [65]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector
from langchain_ollama import OllamaEmbeddings
from langchain import hub
from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langchain_ollama import ChatOllama
from langgraph.graph import START, StateGraph
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [16]:
# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

In [66]:
class Utf8TextLoader(TextLoader):
    def __init__(self, *args, **kwargs):
        kwargs['encoding'] = 'utf-8'
        super().__init__(*args, **kwargs)

loader = DirectoryLoader(
    "./DATA", glob="**/*.txt", loader_cls=Utf8TextLoader, show_progress=True
)
docs = loader.load()

100%|██████████| 5/5 [00:00<00:00, 1250.09it/s]


In [67]:
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

100%|██████████| 5/5 [00:00<00:00, 4993.22it/s]


In [68]:
embedding_function = OllamaEmbeddings(model="nomic-embed-text")
CONNECTION_STRING = "postgresql+psycopg2://admin:admin@127.0.0.1:5433/vectordb"
COLLECTION_NAME = "test_lc"

vectorstore = PGVector(
    connection=CONNECTION_STRING,
    embeddings=embedding_function,
    collection_name=COLLECTION_NAME,
)
# Index chunks
vectorstore.add_documents(documents=all_splits)

['ca8866da-4f01-41ab-a938-77689e0d5d28',
 '2cd13323-3b07-4e28-8042-4104f5c04131',
 'b3bbaa8a-3787-4a90-8b24-d0f413401e76',
 '796368d8-f74f-44ec-8d81-5fa650bdfc6a',
 '88de68b1-00e3-4ffe-b4d0-30ee61258095',
 '3d6856fd-7bd5-49e3-a42c-a6e387f653f5',
 'b8045ee3-26e6-4f70-ba9a-6e27128b872b',
 '1adec41d-1972-492c-99fe-f11d2e0f211a',
 '96613fe4-95d7-4e6a-a6cb-21f54cfdf789',
 '0d834f4d-0424-4d5f-81a1-2595bf409199',
 'd94e836e-fe64-4ac0-8490-34cfd7e712be',
 '247fdc86-0429-4e13-ad8f-b13ebd181a9d',
 '58316d1d-ab30-4730-b1c7-144ba8dfca90',
 '4b1f9573-3562-4856-b45a-83afc41aa84e',
 'a737dbe5-20d1-47ef-92da-3158a6325c1e',
 '4ea43363-6d9f-4a32-b528-c072fab284d6']

In [69]:
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
Don't include the thinking process in the answer, just the final answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

In [70]:
llm = ChatOllama(model="deepseek-r1:7b",temperature=0,stream=True )

In [71]:
# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Define application steps
def retrieve(state: State):
    retrieved_docs = vectorstore.similarity_search(state["question"])
    return {"context": retrieved_docs}

def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = custom_rag_prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [72]:
# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [76]:
response = graph.invoke({"question": "en que consite el bono de matrimonio?"})
print(response["answer"])

KeyboardInterrupt: 