In [0]:
%pip install -U langchain langchain-community langchain-databricks faiss-cpu tiktoken langchain_text_splitters langchain_experimental

In [0]:
# Restart the Python process after the %pip install cell above; this clears all
# notebook state, which is why the imports are (re)run in the cells that follow.
dbutils.library.restartPython()

In [0]:
import json
import re
from typing import Any, Callable, List

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_databricks import ChatDatabricks, DatabricksEmbeddings
from langchain_text_splitters import TokenTextSplitter


### RecursiveCharacterTextSplitter

In [0]:
import re
from typing import List, Callable
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import TokenTextSplitter
from langchain_databricks import ChatDatabricks, DatabricksEmbeddings


# -----------------------
# 1) Databricks LLM + Embeddings
# -----------------------
# Make sure your Databricks auth is configured (e.g., DATABRICKS_HOST + DATABRICKS_TOKEN)
# Names of the Databricks serving endpoints used by the rest of the notebook.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"
EMBEDDING_ENDPOINT_NAME = "databricks-bge-large-en"  # <-- change to your embedding endpoint name

# Shared clients: `llm` drives proposition extraction and question answering,
# `embeddings` is used by SemanticChunker and by the FAISS index further down.
llm = ChatDatabricks(endpoint=LLM_ENDPOINT_NAME, temperature=0.1)
embeddings = DatabricksEmbeddings(endpoint=EMBEDDING_ENDPOINT_NAME)

In [0]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Deliberately tiny chunks so the fragmentation problem described in `text`
# is visible in the printed output.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,  # measured with `length_function`, i.e. characters here
    chunk_overlap=5,  # same unit as chunk_size
    length_function=len,
    separators=["\n", ".", ""],  # newline, period, then the empty string
)

# Sample passage used as the splitter input (it also explains why naive
# chunking hurts retrieval).
text = """
Why does chunking cause context fragmentation?

1. Loss of Coherence
When a complete semantic unit is forcibly split, the information becomes incomplete
(the meaning is broken). For example, if an argument is distributed across two chunks,
neither chunk alone can accurately convey the original meaning. This interferes with
the language model’s understanding and generation, leading to incomplete or even
misleading outputs.

2. Diluted Relevance
If a chunk mixes relevant and irrelevant content, the key information becomes diluted.
This negatively affects the accuracy of vector representations and, in turn, lowers
retrieval ranking performance.

3. Scattered Information
For complex questions that require multi-hop reasoning, relevant information may be
scattered across multiple chunks. If not all of them are retrieved, a RAG system cannot
produce a complete answer.

When these issues compound, they directly lead to a “garbage in, garbage out” effect,
and may even increase the risk of model hallucinations.
"""

# Split the passage; `create_documents` takes a list of texts, hence `[text]`.
documents = text_splitter.create_documents([text])

# Print every resulting chunk so the cut points can be inspected.
for doc in documents:
    print(doc)

### SemanticChunker

In [0]:
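# LangChain equivalent of LlamaIndex SemanticSplitterNodeParser.
# SemanticChunker comes from langchain_experimental (installed by the %pip cell at the top);
# no OpenAI package is needed because the Databricks `embeddings` client defined above is used.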

from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.documents import Document

# Example text containing multiple topics
# Two unrelated topics (healthcare AI, then FinTech) in one paragraph, so a
# semantic splitter has an obvious place to cut.
multi_theme_text = (
    "Artificial intelligence (AI) is fundamentally transforming the healthcare industry. "
    "AI algorithms can analyze medical images and diagnose diseases such as cancer earlier "
    "and more accurately than human radiologists. "
    "In addition, AI plays a critical role in drug discovery by predicting the effectiveness "
    "of chemical compounds, significantly reducing the time required to bring new drugs to market. "
    "Shifting gears, let us talk about financial technology (FinTech). "
    "Mobile payments have become mainstream worldwide, with digital wallets and contactless "
    "payments reshaping consumer behavior. "
    "Blockchain technology provides decentralized solutions for cross-border payments and "
    "asset tokenization, with the potential to reshape the underlying infrastructure of the "
    "entire financial system."
)

# Wrap the text in a single LangChain Document (no metadata).
docs = [Document(page_content=multi_theme_text)]

# Semantic chunker backed by the Databricks `embeddings` client created earlier.
splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90,  # roughly analogous to LlamaIndex's percentile threshold
)

# Perform semantic splitting (this calls the embedding endpoint).
chunks = splitter.split_documents(docs)

# Print each chunk, numbered from 1, followed by a separator line.
print(f"Semantic splitting produced {len(chunks)} chunks:")
for i, chunk in enumerate(chunks, 1):
    print(f"--- Chunk {i} ---")
    print(chunk.page_content)
    print("-" * 20)


### DenseXRetrievalPack

In [0]:
from langchain_core.prompts import PromptTemplate


# -----------------------------
# 1) Prompt (force JSON-only)
# -----------------------------
# Prompt asking the model to rewrite a passage as stand-alone propositions.
# The only template variable is `node_text`; the requested output format is a
# bare JSON array of strings, which `safe_json_list` parses downstream.
PROPOSITIONS_PROMPT = PromptTemplate.from_template(
    """You are a precise information extraction system.

Task: Decompose the given Content into clear, simple propositions that are interpretable out of context.

Rules:
1) Split compound sentences into simple sentences. Keep original phrasing when possible.
2) If a named entity has extra descriptive info, put that info into its own proposition.
3) Decontextualize: replace pronouns with the full entity names they refer to.
4) Output MUST be a JSON array of strings ONLY (no markdown, no extra text).

Content:
{node_text}

Output:
"""
)


# -----------------------------
# 2) Robust parsing helpers
# -----------------------------
def _strip_code_fences(text: str) -> str:
    text = text.strip()
    # Remove opening fence like ```json or ``` (any tag)
    text = re.sub(r"^\s*```[a-zA-Z0-9_-]*\s*", "", text)
    # Remove trailing fence
    text = re.sub(r"\s*```\s*$", "", text)
    return text.strip()


def _extract_first_json_array(text: str) -> str | None:
    """
    Extract the first JSON array from a possibly messy response like:
      - Output: [...]
      - ```json [...] ```
      - Explanation ... [...] trailing text
    """
    if not text:
        return None

    text = _strip_code_fences(text)

    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end == -1 or end <= start:
        return None

    return text[start : end + 1].strip()


def _message_to_text(msg: Any) -> str:
    """
    Databricks/LangChain messages can sometimes be:
      - content: str
      - content: list[dict] (blocks)
    Convert robustly to plain text.
    """
    if msg is None:
        return ""

    content = getattr(msg, "content", "")

    if isinstance(content, str):
        return content

    if isinstance(content, list):
        parts: List[str] = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict):
                # common patterns
                if isinstance(block.get("text"), str):
                    parts.append(block["text"])
                elif isinstance(block.get("content"), str):
                    parts.append(block["content"])
        return "\n".join(parts).strip()

    return str(content).strip()


def safe_json_list(text: str, *, debug: bool = False) -> List[str]:
    """Parse model output into a list of strings.

    The JSON array is located with ``_extract_first_json_array`` and decoded
    with ``json.loads``; non-string elements of the array are dropped.

    Args:
        text: Raw model output.
        debug: When true, print a ``[DEBUG]`` line explaining each failure.

    Returns:
        The string elements of the parsed array, or ``[]`` when no array is
        found, decoding fails, or the decoded value is not a list.
    """

    def _debug(*parts: object) -> None:
        # Diagnostics are opt-in so normal runs stay quiet.
        if debug:
            print(*parts)

    candidate = _extract_first_json_array(text)
    if not candidate:
        _debug("[DEBUG] No JSON array found in output.")
        return []

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as exc:
        _debug("[DEBUG] JSONDecodeError:", exc)
        _debug("[DEBUG] Candidate snippet:", candidate[:800])
        return []

    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, str)]

    _debug("[DEBUG] Parsed JSON is not a list:", type(parsed))
    return []


# -----------------------------
# 3) Core extraction function
# -----------------------------
def extract_propositions(node_text: str, llm: Any, *, debug: bool = True) -> List[str]:
    """Ask the LLM to decompose ``node_text`` into stand-alone propositions.

    Args:
        node_text: Passage substituted into ``PROPOSITIONS_PROMPT``.
        llm: Object exposing ``invoke(prompt)``; its result is converted to
            text with ``_message_to_text``.
        debug: When true, parsing diagnostics are printed, and if nothing
            could be parsed the raw output (first 1200 characters) and any
            ``additional_kwargs`` keys on the response are printed as well.

    Returns:
        The propositions parsed by ``safe_json_list``; ``[]`` if parsing failed.
    """
    response = llm.invoke(PROPOSITIONS_PROMPT.format(node_text=node_text))
    raw_text = _message_to_text(response).strip()
    props = safe_json_list(raw_text, debug=debug)

    # Nothing more to report when parsing worked or diagnostics are off.
    if props or not debug:
        return props

    print("\n[DEBUG] Raw model output (first 1200 chars):")
    print(raw_text[:1200])
    extra = getattr(response, "additional_kwargs", None)
    if isinstance(extra, dict) and extra:
        print("\n[DEBUG] additional_kwargs keys:", list(extra.keys()))

    return props


# -----------------------------
# 5) Test
# -----------------------------
# Smoke test: one compound sentence, with debug output enabled so a parsing
# failure shows the raw model response. This calls the LLM endpoint.
test_text = "The Eiffel Tower is located in Paris and was built in 1889."

propositions = extract_propositions(test_text, llm, debug=True)
print("\nExtracted propositions:")
print(propositions)


In [0]:
# -----------------------------
# 1) In-memory sample documents (replace dir_path)
# -----------------------------
# Two small hand-written documents used as the corpus for the proposition
# index. Each carries a `source` metadata key, which is copied onto every
# proposition derived from it in the indexing cell.
raw_docs = [
    Document(
        page_content=(
            "The Eiffel Tower is located in Paris, France. "
            "It was constructed between 1887 and 1889 and officially opened in 1889 "
            "for the World's Fair (Exposition Universelle). "
            "The tower was designed by Gustave Eiffel's engineering company."
        ),
        metadata={"source": "eiffel_tower"},
    ),
    Document(
        page_content=(
            "The Statue of Liberty is located in New York Harbor. "
            "It was a gift from France to the United States and was dedicated in 1886. "
            "The statue was designed by Frédéric Auguste Bartholdi."
        ),
        metadata={"source": "statue_of_liberty"},
    ),
]

print(f"Loaded {len(raw_docs)} in-memory documents.")

In [0]:
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS

# -----------------------------
# 4) Build proposition documents (DenseX-style indexing step)
# -----------------------------
# One Document per extracted proposition; this list is what gets embedded.
prop_docs: List[Document] = []

for d in raw_docs:
    # NOTE(review): this rebinds the notebook-level `text` variable that the
    # RecursiveCharacterTextSplitter cell also assigns.
    text = d.page_content or ""
    # One LLM call per source document; debug output is suppressed here.
    props = extract_propositions(text, llm, debug=False)

    # Fallback if the model returns non-JSON or empty: index the whole
    # document text as a single entry instead of dropping it.
    if not props:
        props = [text]

    for p in props:
        prop_docs.append(
            Document(
                page_content=p,
                # Keep the parent's metadata and tag the entry as a proposition.
                metadata={**(d.metadata or {}), "source_type": "proposition"},
            )
        )

print(f"Built {len(prop_docs)} proposition documents.")


# -----------------------------
# 5) Vector index (FAISS local) using Databricks embeddings
# -----------------------------
# Embed every proposition with the Databricks embeddings client and build the
# FAISS index from them.
vectorstore = FAISS.from_documents(prop_docs, embeddings)
# Retriever configured with k=6 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})


# -----------------------------
# 6) Retrieval + answer using Databricks chat LLM
# -----------------------------
# Grounded-answer prompt with two template variables: `context` (the formatted
# retrieved propositions) and `question` (the user's query).
ANSWER_PROMPT = PromptTemplate.from_template(
    """Answer the question using ONLY the context below.
If the context is insufficient, say so.

Context:
{context}

Question: {question}
Answer:
"""
)

def format_docs(docs: List[Document]) -> str:
    """Render documents as a bulleted block of text.

    Args:
        docs: Documents whose ``page_content`` should be listed.

    Returns:
        One ``- <page_content>`` entry per document, with entries separated
        by a blank line; ``""`` for an empty list.
    """
    bullets: List[str] = []
    for doc in docs:
        bullets.append(f"- {doc.page_content}")
    return "\n\n".join(bullets)

# Pipeline: the input string is sent to the retriever (whose results are
# rendered by `format_docs`) and also passed through unchanged as the
# question; both fill ANSWER_PROMPT, which is sent to the chat model, and the
# model's reply is parsed by StrOutputParser.
qa_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | ANSWER_PROMPT
    | llm
    | StrOutputParser()
)

# Example query; invoking the chain calls the Databricks endpoints.
query = "When was the Eiffel Tower built?"
answer = qa_chain.invoke(query)

print("\n=== Answer ===")
print(answer)