In [None]:
import os
from dataclasses import dataclass
from pathlib import Path

from langchain_classic.retrievers import ParentDocumentRetriever
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.storage import SQLStore
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import frontmatter

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if any), then read the Azure OpenAI
# settings the rest of the notebook needs. Missing variables come back as None,
# except the API version, which falls back to "2024-06-01".
load_dotenv()

endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
deployment = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT")
embeddings_deployment = os.environ.get("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
api_version = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-06-01")

In [None]:
# Connection strings for the two backing stores used later in the notebook.
# POSTGRES_URL has no fallback (None when unset); Qdrant defaults to a local instance.
POSTGRES_URL = os.environ.get("POSTGRES_URL")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")

In [None]:
# Embedding client shared by every vector store created in this notebook.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embeddings_deployment,
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version=api_version,
)

In [None]:
# Chat model used for the smoke test, tool binding and query rewriting below.
llm = AzureChatOpenAI(
    azure_deployment=deployment,
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version=api_version,
)

In [None]:
# Smoke test: one round trip to confirm the chat deployment and credentials work.
llm.invoke(input="hello")

In [None]:
# Folder holding the markdown articles, relative to the notebook's working directory.
ARTICLES_DIR = Path("..") / "data" / "articles"
# Reference question for the retrieval experiments.
DEFAULT_QUERY = "How do cancellations impact SaaS growth?"

In [None]:
@dataclass(frozen=True)
class ArticleMetadata:
    article_id: str
    title: str
    source_path: str
    url: str | None
    author: str | None
    date: str | None

In [None]:
# Every "*.md" entry of ARTICLES_DIR, as full paths, in name-sorted order so the
# article order is stable between runs.
article_files = [
    ARTICLES_DIR / filename
    for filename in sorted(
        entry for entry in os.listdir(ARTICLES_DIR) if entry.endswith(".md")
    )
]
len(article_files)

In [None]:
# Parse every article file into one Document: the body becomes page_content and a
# fixed set of keys becomes metadata.
articles: list[Document] = []
for path in article_files:
    post = frontmatter.load(path)
    # Normalise front-matter values to strings, but drop null values first:
    # str(None) would produce the literal text "None", which would then win over
    # the title fallback below and end up as a fake url/author/date.
    metadata = {
        key: str(value)
        for key, value in post.metadata.items()
        if value is not None
    }

    stem = path.stem
    # The article id is whatever precedes the first underscore in the file name.
    article_id = stem.split("_", 1)[0]
    # Fall back to a title derived from the file name when front matter has none.
    title = metadata.get("title") or stem.replace("_", " ")

    articles.append(
        Document(
            page_content=post.content.lstrip(),
            metadata={
                "article_id": article_id,
                "title": title,
                "source_path": str(path),
                "url": metadata.get("url"),
                "author": metadata.get("author"),
                "date": metadata.get("date"),
            },
        )
    )


In [50]:
def preview(text: str, limit: int = 220) -> str:
    """Return ``text`` on one line, shortened to at most ``limit`` characters.

    All runs of whitespace (including newlines) are collapsed to single spaces.
    When the collapsed text is longer than ``limit`` it is cut and ends with
    ``"..."``. For limits below 3 there is no room for the ellipsis, so the text
    is simply hard-cut; a zero or negative limit yields an empty string.

    Args:
        text: Text to summarise.
        limit: Maximum length of the returned string.

    Returns:
        The collapsed, possibly truncated text.
    """
    compact = " ".join(text.split())
    if len(compact) <= limit:
        return compact
    if limit < 3:
        # A negative slice end would cut from the right instead and return a
        # string longer than `limit`, so hard-cut without an ellipsis.
        return compact[: max(limit, 0)]
    return compact[: limit - 3] + "..."


def show_chunks(chunks: list[Document], title: str) -> None:
    """Print a heading followed by one numbered summary line per chunk.

    Each line shows the chunk's ``article_id`` and ``chunk_index`` metadata
    (``None`` when absent) and a shortened preview of its content.
    """
    print("\n" + title)
    for rank, doc in enumerate(chunks, 1):
        info = doc.metadata
        article = info.get("article_id")
        index = info.get("chunk_index")
        snippet = preview(doc.page_content)
        print(f"{rank}. Article {article} | chunk {index}: {snippet}")

def group_by_article(chunks: list[Document]) -> dict[str, list[int]]:
    """Map each article id to the sorted chunk indices found in ``chunks``.

    Chunks without an ``article_id`` are collected under ``"?"`` and a missing
    ``chunk_index`` counts as 0. Keys keep the order in which articles are first
    seen.
    """
    by_article: dict[str, list[int]] = {}
    for doc in chunks:
        info = doc.metadata
        article = info.get("article_id", "?")
        if article not in by_article:
            by_article[article] = []
        by_article[article].append(int(info.get("chunk_index", 0)))
    return {article: sorted(positions) for article, positions in by_article.items()}

def print_queries_markdown_table(queries: list[str]) -> None:
    """Print ``queries`` as a markdown table with position, text and size columns.

    Pipe characters inside a query are backslash-escaped so they do not break the
    table; the word and character counts are taken from the unescaped query.
    """
    rows = ["| pos | content | words | chars |", "|---:|---|---:|---:|"]
    for position, query in enumerate(queries, start=1):
        escaped = query.replace("|", "\\|")
        rows.append(f"| {position} | {escaped} | {len(query.split())} | {len(query)} |")
    print("\n".join(rows))

## Tools to tokens

In [None]:
import json
from langchain_core.tools import tool

from pydantic import BaseModel, Field
from typing import Literal

# Argument schema for the `get_weather` tool (passed to it via
# `@tool(args_schema=WeatherInput)`).
# NOTE(review): the docstring and the Field descriptions below presumably end up
# in the tool schema that gets printed/sent to the model - treat them as runtime
# text, not as free-form comments.
class WeatherInput(BaseModel):
    """Input for weather queries."""
    # Required: where to look up the weather.
    location: str = Field(description="City name or coordinates")
    # Optional: restricted to the two literal values, defaults to "celsius".
    units: Literal["celsius", "fahrenheit"] = Field(
        default="celsius",
        description="Temperature unit preference"
    )
    # Optional flag, off by default.
    include_forecast: bool = Field(
        default=False,
        description="Include 5-day forecast"
    )

@tool(args_schema=WeatherInput)
def get_weather(location: str, units: str = "celsius", include_forecast: bool = False) -> str:
    """Get current weather and optional forecast."""
    # Demo stub: returns canned values, no weather service is called.
    if units == "celsius":
        degrees = 22
    else:
        degrees = 72
    # The unit is reported by its capitalised first letter.
    unit_letter = units[0].upper()
    lines = [f"Current weather in {location}: {degrees} degrees {unit_letter}"]
    if include_forecast:
        lines.append("Next 5 days: Sunny")
    return "\n".join(lines)

# Attach the tool to the chat model; the result carries the tool definition
# alongside the model.
llm_with_tools = llm.bind_tools(tools=[get_weather])

# Show exactly what was stored on the binding, pretty-printed as JSON.
print(json.dumps(obj=llm_with_tools.kwargs, indent=2))

## Naive RAG - 5

In [None]:
# Naive chunking: 400-character pieces with no overlap.
static_splitter = CharacterTextSplitter(separator="", chunk_size=400, chunk_overlap=0)

# Each piece becomes its own Document that inherits the article metadata plus a
# 1-based position within the article and an "<article_id>:<position>" uid.
static_chunks: list[Document] = []
for doc in articles:
    static_chunks.extend(
        Document(
            page_content=piece,
            metadata={
                **doc.metadata,
                "chunk_index": position,
                "chunk_uid": f"{doc.metadata['article_id']}:{position}",
            },
        )
        for position, piece in enumerate(
            static_splitter.split_text(doc.page_content), start=1
        )
    )

len(static_chunks)

### Connecting vector store

If your collection is empty, start with the commented-out code below, which creates a new Qdrant collection from our chunks using `from_documents()`. If you already have a collection, skip ahead and connect to it with `from_existing_collection()`.

In [None]:
# client = QdrantClient(url=QDRANT_URL)

# vectorstore = QdrantVectorStore.from_documents(
#     static_chunks,
#     collection_name="step1_chunks",
#     embedding=embeddings,
# )

In [None]:
# Connect to the already-populated "step1_chunks" collection.
# The server address is passed explicitly so this cell honours QDRANT_URL (as the
# QdrantClient created later in the notebook does) instead of silently falling
# back to the client's default host.
vectorstore = QdrantVectorStore.from_existing_collection(
    collection_name="step1_chunks",
    embedding=embeddings,
    url=QDRANT_URL,
)

In [None]:
# Use the shared DEFAULT_QUERY constant rather than repeating the literal, so every
# retrieval step is guaranteed to be compared on the same question.
step1_results = vectorstore.similarity_search(DEFAULT_QUERY, k=5)
show_chunks(step1_results, "Step 1: Naive RAG (chunk_size=400, k=5)")
print("\nProblem: chunks are mid-article and missing context.")

In [None]:
# Raw Document objects behind the summary printed by the step 1 cell. The stored
# results are displayed instead of repeating the identical search, which saves an
# embedding call and keeps this output tied to the step 1 collection.
step1_results

## Naive RAG - 10

In this example, we reuse the previous collection but retrieve the 10 most similar chunks instead of just 5.

In [None]:
# Same question as step 1 (shared DEFAULT_QUERY constant), but twice as many hits.
step2_results = vectorstore.similarity_search(DEFAULT_QUERY, k=10)
print("\nStep 2: Increase k to 10")

# Show which chunk positions were retrieved from each article.
for article_id, indices in group_by_article(step2_results).items():
    print(f"Article {article_id}: chunks {indices}")

## Multi-Query



In [51]:
# Start from the notebook-wide question instead of a second copy of the literal,
# so the multi-query step cannot drift from the other retrieval steps.
ORIGINAL_QUERY = DEFAULT_QUERY
rewrite_prompt = (
    "Generate exactly 3 diverse query rewrites for semantic search. Each query should be longer than 1 sentence to optimize the cosine similarity. Return each query on its own line. No numbering"
)
rewrites = llm.invoke(
    [
        {"role": "system", "content": rewrite_prompt},
        {"role": "user", "content": f"Original query: {ORIGINAL_QUERY}"},
    ]
).content
# One rewrite per non-empty line; keep at most three in case the model returns more.
queries = [line.strip() for line in rewrites.splitlines() if line.strip()][:3]
# The original question is always searched as well.
queries.append(ORIGINAL_QUERY)

print_queries_markdown_table(queries)

| pos | content | words | chars |
|---:|---|---:|---:|
| 1 | How do customer cancellations and churn rates directly affect SaaS growth in terms of MRR and ARR, and how should I calculate their impact on customer lifetime value and unit economics? What models can I use to translate cancellation activity into projected revenue loss and slowed growth trajectories? | 48 | 302 |
| 2 | In a subscription SaaS business, what is the relationship between cancellation behavior and growth velocity, and which retention and cohort metrics best quantify the drag caused by cancellations? How can I track and report net revenue retention, gross churn, and expansion MRR to understand the real effect on growth? | 49 | 317 |
| 3 | What operational and strategic changes reduce the negative impact of cancellations on SaaS growth, such as improvements in onboarding, pricing, product-market fit, or customer success programs? How do cancellations influence CAC payback period, churn-adjusted LTV, and long-term g

In [None]:
# Top-10 hits for every query variation, concatenated in query order.
multi_chunks: list[Document] = [
    hit for q in queries for hit in vectorstore.similarity_search(q, k=10)
]

# De-duplicate by chunk_uid, keeping the first occurrence; hits without a uid are dropped.
unique: dict[str, Document] = {}
for chunk in multi_chunks:
    uid = chunk.metadata.get("chunk_uid")
    if uid:
        unique.setdefault(uid, chunk)

multi_results = list(unique.values())

In [None]:
# Summarise which chunk positions the combined queries pulled from each article.
print("\nStep 4: Multi-query (4 variations x 10) grouped by article")
for article_id, indices in group_by_article(multi_results).items():
    print("Article {}: chunks {}".format(article_id, indices))

## Big Chunks

In this example, we split the articles into chunks of up to 2000 characters so that each chunk carries more context.

In [None]:
# Larger chunks: up to 2000 characters each, no overlap.
big_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=2000)

# Same Document layout as the small chunks: article metadata plus a 1-based
# position and an "<article_id>:<position>" uid.
big_chunks: list[Document] = []
for doc in articles:
    big_chunks.extend(
        Document(
            page_content=piece,
            metadata={
                **doc.metadata,
                "chunk_index": position,
                "chunk_uid": f"{doc.metadata['article_id']}:{position}",
            },
        )
        for position, piece in enumerate(
            big_splitter.split_text(doc.page_content), start=1
        )
    )

len(big_chunks)

### Create new vector store

If you have not yet ingested the big chunks into the vector store, proceed with `from_documents()`. If this collection already contains the documents, use `from_existing_collection()` instead.

In [None]:
# Ingest the big chunks only when "step5_chunks" does not exist yet. Calling
# from_documents() unconditionally writes the same chunks again on every
# "Run All", so the collection fills with duplicates and the top-k results
# degrade into repeated hits. When the collection is already there, just connect.
if not QdrantClient(url=QDRANT_URL).collection_exists(collection_name="step5_chunks"):
    vectorstore = QdrantVectorStore.from_documents(
        big_chunks,
        collection_name="step5_chunks",
        embedding=embeddings,
        url=QDRANT_URL,  # honour the configured server instead of the default host
    )
else:
    vectorstore = QdrantVectorStore.from_existing_collection(
        collection_name="step5_chunks",
        embedding=embeddings,
        url=QDRANT_URL,
    )

In [None]:
# Connect to the existing "step5_chunks" collection. The server address is passed
# explicitly so the cell honours QDRANT_URL instead of the client's default host.
vectorstore = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    collection_name="step5_chunks",
    url=QDRANT_URL,
)

### Execute retrieval

In [None]:
# Same question as the earlier steps (shared DEFAULT_QUERY constant), now against
# the 2000-character chunks.
step5_results = vectorstore.similarity_search(DEFAULT_QUERY, k=5)
show_chunks(step5_results, "Step 5: Naive fix (chunk_size=2000)")
print("\nEmbedding signal is diluted; relevance drops.")

## Parent Document Retriever

We now connect big documents (in this case we do not chunk the articles at all, but return them at full length) with small chunks that are used only for retrieval. We use `ParentDocumentRetriever`, which stores the full documents in a Postgres database and the small chunks in the Qdrant vector store.

In [None]:
# Postgres-backed store for the parent (full-length) documents.
byte_store = SQLStore(db_url=POSTGRES_URL, namespace="pdr_parent_docs")
# Make sure the backing schema exists before anything is written to the store.
byte_store.create_schema()

In [None]:
# No parent splitter: parents are stored as whole articles.
parent_splitter = None
# Children are the small retrieval chunks: 500 characters with a 100-character overlap.
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

### Creating vector store

`ParentDocumentRetriever` requires an already initialized vector store. Because we use Qdrant, we first need to make sure the collection exists: we check for it with `collection_exists()` and, if it has not been created yet, create it with `create_collection()`, sizing the vectors to match our embedding model.

In [66]:
client = QdrantClient(url=QDRANT_URL)
is_collection_existing = client.collection_exists(collection_name="pdr_child_chunks")

# Create the child-chunk collection on first use only.
if not is_collection_existing:
    # Embed a throwaway string to learn the vector size of the embedding model.
    vector_dim = len(embeddings.embed_query("dimension probe"))

    client.create_collection(
        collection_name="pdr_child_chunks",
        vectors_config=VectorParams(distance=Distance.COSINE, size=vector_dim),
    )

In [67]:
# Vector store over the child-chunk collection, reusing the client created above.
vectorstore = QdrantVectorStore(
    collection_name="pdr_child_chunks",
    embedding=embeddings,
    client=client,
)

### Creating and ingesting retriever

In [None]:
# Searches the small child chunks in Qdrant (top 10) and returns the parent
# documents kept in the Postgres byte store.
retriever = ParentDocumentRetriever(
    byte_store=byte_store,
    vectorstore=vectorstore,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    # Only these metadata keys are copied from each article onto its child chunks.
    child_metadata_fields=["article_id", "title", "source_path", "url"],
    search_kwargs={"k": 10},
)

In [None]:
# Ingest only while the child collection is still empty. add_documents() stores
# the parents and their child chunks again on every call, so re-running the
# notebook would otherwise pile up duplicates and the retriever would return the
# same article several times.
if client.count(collection_name="pdr_child_chunks", exact=True).count == 0:
    retriever.add_documents(articles)

### Using the ParentDocument Retriever

In [None]:
# Same question as the earlier steps (shared DEFAULT_QUERY constant); the hits are
# whole parent documents rather than chunks.
parent_results = retriever.invoke(DEFAULT_QUERY)

print("\nStep 6: Parent Document Retriever (Qdrant + Postgres)")
for i, doc in enumerate(parent_results, start=1):
    meta = doc.metadata
    # Report the document size to show that full articles come back.
    print(
        f"{i}. Article {meta.get('article_id')} | {meta.get('title')} | chars: {len(doc.page_content)}"
    )
