# Setup

In [1]:
import os

from langchain_ollama import ChatOllama

# Ollama endpoint and chat model. Both can be overridden through environment
# variables so the notebook is not tied to the Docker-specific host name used
# as the default; with the variables unset the values are unchanged.
base_url = os.environ.get("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
model_name = os.environ.get("OLLAMA_CHAT_MODEL", "gemma3:27b")

# Chat model used both for rewriting the user question and for the final answer.
llm = ChatOllama(
    model=model_name,
    base_url=base_url,
)

In [2]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings

# Embedding model, served from the same Ollama endpoint as the chat model.
embeddings = OllamaEmbeddings(base_url=base_url, model="mxbai-embed-large")

# In-memory vector store that embeds documents and queries with `embeddings`.
vector_store = InMemoryVectorStore(embedding=embeddings)

# Load documents

In [3]:
# import bs4
# from langchain import hub
# from langchain_community.document_loaders import WebBaseLoader, Docx2txtLoader
# from langchain_core.documents import Document
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from typing_extensions import List, TypedDict

# # Load and chunk contents of the blog
# loader = WebBaseLoader(
#     web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer(
#             class_=("post-content", "post-title", "post-header")
#         )
#     ),
# )
# docs = loader.load()

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # OpenAI file search: chunk_size=800, chunk_overlap=400
# all_splits = text_splitter.split_documents(docs)

# # Index chunks
# _ = vector_store.add_documents(documents=all_splits)

In [8]:
from glob import glob
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load every file in the sample directory. sorted() makes the load (and hence
# indexing) order deterministic; glob() itself returns paths in arbitrary order.
docs = []
for file_path in sorted(glob("sample_data/salamat/*")):
    loader = Docx2txtLoader(file_path)
    # load() returns a list of Documents. extend() accepts any list length,
    # whereas append(*...) raises TypeError unless there is exactly one item.
    docs.extend(loader.load())

# Split the documents into overlapping chunks before embedding.
# (For comparison: OpenAI file search chunk_size=800, chunk_overlap=400.)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Index the chunks; the return value is not needed here.
_ = vector_store.add_documents(documents=all_splits)

# Chain

In [9]:
from typing import Optional

from pydantic import BaseModel, Field

# Structured output the LLM fills in for each incoming user message.
# NOTE(review): pydantic exports the class docstring and the Field descriptions
# into the model's JSON schema, so they likely reach the LLM as prompt text —
# edit them with the same care as a prompt.
class Query(BaseModel):
    """Optimized query for information retrieval"""
    # The question as the user asked it; the chain forwards it to the answer prompt.
    original_question: str = Field(description="user's original question")
    # Rewritten search text; the chain passes it to retrieve().
    optimized_query: str = Field(description="optimized query for embedding")


# Variant of `llm` that is asked to return a Query object; the chain reads
# `.optimized_query` and `.original_question` from its result.
structured_llm = llm.with_structured_output(Query)

In [10]:
def retrieve(query: str, k: int = 2) -> str:
    """Fetch the chunks most similar to ``query`` and serialize them as context.

    Args:
        query: Search text handed to ``vector_store.similarity_search``.
        k: Number of chunks to request. Defaults to 2, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        One ``Source: ...`` / ``Content: ...`` entry per retrieved chunk,
        separated by blank lines; an empty string when nothing is retrieved.
    """
    banner = "=" * 40
    print(banner)
    print("retrieve")
    print(query)
    retrieved_docs = vector_store.similarity_search(query, k=k)
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    # Echo the retrieved context so each run shows what the answer was based on.
    print(serialized)
    print("End of retrieve")
    print(banner)
    return serialized

In [11]:
from langchain.prompts import PromptTemplate

# Answer prompt with two placeholders, {context} and {question}. It tells the
# model to answer only from the supplied context and to say it doesn't know
# when the context is insufficient.
prompt_template = PromptTemplate.from_template("""
Use the context below to answer the question.

Context:
{context}

Question:
{question}

Answer only using the context. If the context does not contain enough information, say you don't know.
""")


# prompt_template = PromptTemplate.from_template("""

# You are an assistant for question-answering tasks.
# Use the following pieces of retrieved context to answer
# the question. If you don't know the answer, say that you
# don't know. Use three sentences maximum and keep the
# answer concise

# Context:
# {context}

# Question:
# {question}

# """)

In [12]:
def _to_prompt_inputs(query):
    """Turn the structured LLM output into the variables the answer prompt expects."""
    return {
        "context": retrieve(query.optimized_query),
        "question": query.original_question,
    }


# Pipeline: user question -> structured query -> retrieved context + original
# question -> filled-in prompt -> final answer from the chat model.
chain = structured_llm | _to_prompt_inputs | prompt_template | llm

# Usage

In [14]:
# Persian question, roughly "What is the coverage for IVF?" (translation approximate).
# NOTE(review): in the saved output the rewritten query asks about IVF cost and
# steps instead, yet the top hit still contains the coverage answer.
chain.invoke("تعهدات ivf چجوریه؟")

retrieve
هزینه IVF چقدر است؟ مراحل انجام IVF چیه؟
Source: {'source': 'sample_data/salamat/بستری.docx'}
Content: بستری

پرسش:

تعهدات IVF در صندوق های بیمه سلامت در مراکز طرف قرارداد و غیر طرف قرارداد به چه صورت می باشد ؟

پاسخ:

1-در مراکز دولتی با90% تعرفه مصوب بخش دولتی 2- در مراکز عمومی غیردولتی با 90%تعرفه مصوب بخش عمومی غیردولتی 3- درمراکز خصوصی وخیریه با 70% تعرفه مصوب بخش خصوصی وخیریه می توانند از 4 بسته خدمتی استفاده نمایند.

پرسش:

 فرانشیز خدمات بستری جهت مددجویان بهزیستی و کمیته امداد و همچنین روستاییان با رعایت نظام ارجاع صفر می باشد آیا این موضوع صرفا مربوط به مراکز دولتی و دانشگاهی بوده یا اینکه در مراکز دولتی غیر دانشگاهی نیز اعمال گردیده است؟

پاسخ:

  بر اساس مصوبه هیأت محترم وزیران ، فرانشیز مددجویان بهزیستی و کمیته امداد و همچنین روستاییان در صورت دریافت خدمات بستری و بستری فوریت های پزشکی(اورژانس)(دارای پرونده) در مراکز دولتی دانشگاهی در قالب  نظام ارجاع صفر می باشد.

پرسش:

هزینه اقامت (هتلینگ) اعلام شده در خصوص خدمات بستری به ازای هر شب بستری شدن بیمه شده می باشد 

AIMessage(content='1-در مراکز دولتی با90% تعرفه مصوب بخش دولتی 2- در مراکز عمومی غیردولتی با 90%تعرفه مصوب بخش عمومی غیردولتی 3- درمراکز خصوصی وخیریه با 70% تعرفه مصوب بخش خصوصی وخیریه می توانند از 4 بسته خدمتی استفاده نمایند.', additional_kwargs={}, response_metadata={'model': 'gemma3:27b', 'created_at': '2025-05-24T05:59:29.11883426Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2519052211, 'load_duration': 31839107, 'prompt_eval_count': 496, 'prompt_eval_duration': 381083953, 'eval_count': 79, 'eval_duration': 2105746603, 'model_name': 'gemma3:27b'}, id='run--6604e778-4702-4537-b264-52d2693926da-0', usage_metadata={'input_tokens': 496, 'output_tokens': 79, 'total_tokens': 575})

In [42]:
# Small talk followed by a real question; presumably a check that the rewrite
# step keeps only the searchable part.
# NOTE(review): the saved output for this cell cites the lilianweng blog post,
# which is only indexed by the commented-out web-loader cell — rerun on a fresh
# kernel to refresh it.
chain.invoke("Hi there how are you? What is task decomposition")

retrieve
What is task decomposition?
Source: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}
Content: Component One: Planning#
A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
Task Decomposition#
Chain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.
Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each stat

AIMessage(content='Task decomposition is breaking down a complicated task into multiple steps. It can be done by an LLM with simple prompting (like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?"), by using task-specific instructions (e.g. "Write a story outline."), or with human inputs. It’s also achieved through methods like Chain of Thought (CoT) and Tree of Thoughts, which explore reasoning possibilities step-by-step.\n', additional_kwargs={}, response_metadata={'model': 'gemma3:27b', 'created_at': '2025-05-24T05:37:18.435044908Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3096222717, 'load_duration': 32482283, 'prompt_eval_count': 561, 'prompt_eval_duration': 452622052, 'eval_count': 98, 'eval_duration': 2610587915, 'model_name': 'gemma3:27b'}, id='run--c7bf6555-eb78-4531-8f4a-76214f51a7f0-0', usage_metadata={'input_tokens': 561, 'output_tokens': 98, 'total_tokens': 659})

In [43]:
# Question the indexed documents do not cover; in the saved run the model
# answers "I don't know.", as the answer prompt instructs.
chain.invoke("Hi there how are you? When the first man landed on the moon?")

retrieve
When did the first man land on the moon?
Source: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}
Content: GOALS:

1. {{user-provided goal 1}}
2. {{user-provided goal 2}}
3. ...
4. ...
5. ...

Constraints:
1. ~4000 word limit for short term memory. Your short term memory is short, so immediately save important information to files.
2. If you are unsure how you previously did something or want to recall past events, thinking about similar events will help you remember.
3. No user assistance
4. Exclusively use the commands listed in double quotes e.g. "command name"
5. Use subprocesses for commands that will not terminate within a few minutes

Source: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}
Content: Illustration of the Reflexion framework. (Image source: Shinn & Labash, 2023)

The heuristic function determines when the trajectory is inefficient or contains hallucination and should be stopped. Inefficient planning refers to trajectorie

AIMessage(content="I don't know.\n", additional_kwargs={}, response_metadata={'model': 'gemma3:27b', 'created_at': '2025-05-24T05:37:21.960683911Z', 'done': True, 'done_reason': 'stop', 'total_duration': 609872536, 'load_duration': 45695989, 'prompt_eval_count': 460, 'prompt_eval_duration': 371242166, 'eval_count': 8, 'eval_duration': 192429105, 'model_name': 'gemma3:27b'}, id='run--1f07c650-cc64-43c9-8885-8c29975c813d-0', usage_metadata={'input_tokens': 460, 'output_tokens': 8, 'total_tokens': 468})