## Phase 1: Taxonomy Generation


Scratch: fetch recent root runs from the `chat-langchain` LangSmith project to use as raw data.

In [79]:
# "Data"

from datetime import datetime, timedelta

from langsmith import Client

# LangSmith client configured with timeout_ms=30_000.
client = Client(timeout_ms=30_000)

# NOTE: despite the name, `yday` is three days back (timedelta(days=3)), and
# datetime.now() produces a naive local timestamp.
yday = datetime.now() - timedelta(days=3)
# Materialise the listing into a list so later cells can iterate it repeatedly.
runs = list(
    client.list_runs(
        # project_id="f53ccf51-57c1-4c97-afdf-7ca5569945cd",
        project_name="chat-langchain",
        # presumably restricts the listing to root (top-level) runs -- confirm
        # against the LangSmith filter syntax
        filter="eq(is_root, true)",
        start_time=yday,
        # Only these run fields are requested.
        select=["inputs", "outputs", "feedback_stats"],
    )
)

In [111]:
import random


def run_to_doc(run) -> "Doc":
    """Flatten one run into a document holding a tagged transcript.

    Each prior turn in ``run.inputs["chat_history"]`` is rendered as
    ``<role idx=N>...</role>`` (the role is the first key of the turn dict),
    followed by the current question as a ``<human>`` turn and, when the run
    has a truthy ``"output"``, the answer as an ``<ai>`` turn.

    Args:
        run: Object exposing ``id``, ``inputs`` (a mapping with ``question``
            and optionally ``chat_history``) and ``outputs`` (a mapping or
            ``None``).

    Returns:
        A dict with the stringified run id under ``"id"`` and the
        newline-joined transcript under ``"content"``.

    Raises:
        KeyError: If ``run.inputs`` has no ``"question"`` entry.
    """
    # The return annotation is a string so that defining this function does
    # not require ``Doc`` to exist yet (it is declared in a later cell).
    inputs = run.inputs
    turns = []

    # A conversation may carry no history at all: tolerate both a missing key
    # and an explicit None/empty value.
    for turn in inputs.get("chat_history") or []:
        role, text = next(iter(turn.items()))
        turns.append(f"<{role} idx={len(turns)}>\n{text}\n</{role}>")

    # The leading newline is kept from the original rendering so existing
    # transcripts stay byte-identical.
    turns.append(f"\n<human idx={len(turns)}>\n{inputs['question']}\n</human>")

    # Outputs may be missing entirely or lack an "output" entry; in that case
    # the transcript simply ends on the question.
    answer = (run.outputs or {}).get("output")
    if answer:
        turns.append(f"<ai idx={len(turns)}>\n{answer}\n</ai>")

    return {
        "id": str(run.id),
        "content": "\n".join(turns),
    }


# Convert every run that has inputs, then keep a random sample of at most 1000.
docs = list(map(run_to_doc, filter(lambda run: run.inputs, runs)))
docs = random.sample(docs, k=min(len(docs), 1000))

In [261]:
# Natural-language description of what the taxonomy is for; later cells pass it
# to the taxonomy steps through config["configurable"]["use_case"].
use_case = (
    "Generate the taxonomy that can be used both to label the user intent"
    " as well as to identify any required documentation (references, how-tos, etc.)"
    " that would benefit the user."
)

#### 1.a Summarize Docs

In [272]:
import operator
import re
from typing import Annotated, List, Optional, Sequence, TypedDict

from langchain import hub
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    Runnable,
    RunnableConfig,
    RunnableLambda,
    RunnablePassthrough,
)


class Doc(TypedDict):
    """One conversation document as it moves through the pipeline."""

    # Identifier of the document (run_to_doc stores the stringified run id).
    id: str
    # Text of the document (run_to_doc stores the tagged transcript).
    content: str
    # Filled in by reduce_summaries from the parsed summarisation output;
    # run_to_doc does not set these two keys.
    summary: Optional[str]
    explanation: Optional[str]


class TaxonomyGenerationState(TypedDict):
    """State shared by the taxonomy-generation steps."""

    # The documents being clustered.
    documents: List[Doc]
    # Indices
    # Each minibatch is a list of positions into `documents`.
    minibatches: List[List[int]]
    # Candidate Taxonomies
    # One list of cluster dicts per revision. The operator.add annotation is
    # presumably the reducer a graph uses to concatenate node updates onto the
    # existing history -- TODO confirm against the graph library.
    clusters: Annotated[List[List[dict]], operator.add]


# Phase 1.a: Generate summaries + explanations


# Pull the summarisation prompt from the hub and pre-bind its two length
# parameters so callers only have to supply the remaining variables.
summary_prompt = hub.pull("wfh/tnt-llm-summary-generation").partial(
    summary_length=20, explanation_length=30
)


def parse_summary(xml_string: str) -> dict:
    """Extract the ``<summary>`` and ``<explanation>`` bodies from model output.

    Only the first occurrence of each tag is used. The captured text is
    stripped of surrounding whitespace, and a tag that is absent yields an
    empty string.
    """
    tag_patterns = {
        "summary": r"<summary>(.*?)</summary>",
        "explanation": r"<explanation>(.*?)</explanation>",
    }
    parsed = {}
    for field, pattern in tag_patterns.items():
        # DOTALL lets the captured body span several lines.
        found = re.search(pattern, xml_string, re.DOTALL)
        parsed[field] = "" if found is None else found.group(1).strip()
    return parsed


summary_llm_chain = (
    summary_prompt
    | ChatAnthropic(model="claude-3-haiku-20240307")
    | StrOutputParser()
    # Customize the tracing name for easier organization
).with_config(run_name="GenerateSummary")
# Turn the raw model text into {"summary": ..., "explanation": ...}.
# parse_summary is the parser defined in this cell; the previously referenced
# `parse_xml` does not exist anywhere in the notebook.
summary_chain = summary_llm_chain | parse_summary


# Now combine as a "map" operation in a map-reduce chain
# Input: state
# Output: state U summaries
# Processes docs in parallel
def get_content(state: TaxonomyGenerationState):
    """Build one ``{"content": ...}`` prompt input per document in the state."""
    prompt_inputs = []
    for document in state["documents"]:
        prompt_inputs.append({"content": document["content"]})
    return prompt_inputs


# "Map" step: keep the incoming state and add a `summaries` key. That key is
# computed by extracting the per-document prompt inputs (get_content) and
# handing the whole list to summary_chain.batch (abatch when run async).
map_step = RunnablePassthrough.assign(
    summaries=get_content
    | RunnableLambda(func=summary_chain.batch, afunc=summary_chain.abatch)
)


def reduce_summaries(combined: dict) -> TaxonomyGenerationState:
    """Fold the generated summaries back into their source documents.

    ``combined`` carries the original ``documents`` plus a parallel
    ``summaries`` list. The two are paired by position (pairing stops at the
    shorter list) and each pair becomes a new document dict containing only
    ``id``, ``content``, ``summary`` and ``explanation``.
    """
    parsed_summaries, source_docs = combined["summaries"], combined["documents"]
    merged_docs = []
    for source_doc, parsed in zip(source_docs, parsed_summaries):
        merged_docs.append(
            {
                "id": source_doc["id"],
                "content": source_doc["content"],
                "summary": parsed["summary"],
                "explanation": parsed["explanation"],
            }
        )
    return {"documents": merged_docs}


# This is actually the node itself: map (summarise every document) followed by
# reduce (merge each summary back into its document).
map_reduce_chain = map_step | reduce_summaries

In [114]:
# Summarise all sampled documents; the second argument is the run config, here
# setting max_concurrency to 5.
summarized_docs = map_reduce_chain.invoke({"documents": docs}, {"max_concurrency": 5})

Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to post https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2393)\')))"))')
Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to post https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2393)\')))"))')


In [121]:
import json

# Persist the summarised state to docs.json in the working directory
# (overwrites any existing file).
with open("docs.json", "w") as f:
    json.dump(summarized_docs, f)

#### 1.b Split into Minibatches

In [236]:
def get_minibatches(state: TaxonomyGenerationState, config: RunnableConfig):
    """Shuffle the document indices and split them into equal-size minibatches.

    Args:
        state: Must contain ``documents``; only its length is used.
        config: Run config. ``config["configurable"]["batch_size"]`` sets the
            minibatch size and defaults to 200 when absent.

    Returns:
        ``{"minibatches": batches}`` where every batch is a list of positions
        into ``state["documents"]``. With fewer documents than ``batch_size``
        there is a single, unpadded batch. Otherwise every batch has exactly
        ``batch_size`` entries, the last one being topped up with randomly
        chosen indices from the full batches.

    Raises:
        ValueError: If the configured batch size is not positive.
    """
    batch_size = config.get("configurable", {}).get("batch_size", 200)
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    indices = list(range(len(state["documents"])))
    random.shuffle(indices)
    if len(indices) < batch_size:
        # Don't pad needlessly if we can't fill a single batch.
        # Same dict shape as the general path so callers can always merge
        # the result into the state.
        return {"minibatches": [indices]}

    cutoff = (len(indices) // batch_size) * batch_size
    batches = [indices[start : start + batch_size] for start in range(0, cutoff, batch_size)]

    leftovers = indices[cutoff:]
    if leftovers:
        # Top up the final batch from indices that already sit in a full
        # batch, so no index appears twice within the padded batch. The pool
        # holds at least batch_size items, so the sample always succeeds.
        padding = random.sample(indices[:cutoff], batch_size - len(leftovers))
        batches.append(leftovers + padding)

    return {
        "minibatches": batches,
    }

In [243]:
# Compute the minibatches with the default batch size (empty configurable),
# then merge them into the summarised state.
batched_state = get_minibatches(summarized_docs, {"configurable": {}})
batched_state = {**summarized_docs, **batched_state}

#### 1.c Generate Taxonomy

In [273]:
import random
import re
from typing import Any, Dict

# We instruct our LLMs to generate 10
# intent categories and 25 domain categories for taxonomy generation.


def parse_taxa(output_text: str) -> Dict:
    """Pull every ``<id>/<name>/<description>`` triple out of model output.

    Returns ``{"clusters": [...]}`` with one dict per triple, in order of
    appearance, each field stripped of surrounding whitespace. Text that
    contains no complete triple yields an empty list.
    """
    cluster_pattern = re.compile(
        r"\s*<id>(.*?)</id>\s*<name>(.*?)</name>\s*<description>(.*?)</description>\s*",
        re.DOTALL,
    )
    clusters = []
    for match in cluster_pattern.finditer(output_text):
        cluster_id, name, description = (field.strip() for field in match.groups())
        clusters.append({"id": cluster_id, "name": name, "description": description})
    # The explanation is deliberately not parsed: nothing downstream uses it.
    return {"clusters": clusters}


def format_docs(docs: List[Doc]) -> str:
    """Render document summaries as a ``<conversations>`` XML-style table.

    Each document contributes one ``<conv_summ>`` line carrying its id as an
    attribute and its summary as the element text.
    """
    rows = [f'<conv_summ id={doc["id"]}>{doc["summary"]}</conv_summ>\n' for doc in docs]
    return "".join(["<conversations>\n", *rows, "</conversations>"])


def format_taxonomy(clusters):
    """Render cluster dicts as a ``<cluster_table>`` XML-style table.

    Every cluster must provide ``id``, ``name`` and ``description``; each one
    becomes an indented ``<cluster>`` element.
    """
    pieces = ["<cluster_table>\n"]
    for cluster in clusters:
        pieces.extend(
            (
                "  <cluster>\n",
                f'    <id>{cluster["id"]}</id>\n',
                f'    <name>{cluster["name"]}</name>\n',
                f'    <description>{cluster["description"]}</description>\n',
                "  </cluster>\n",
            )
        )
    pieces.append("</cluster_table>")
    return "".join(pieces)


# def generate_taxonomy(
#     state: TaxonomyGenerationState, config: RunnableConfig
# ) -> TaxonomyGenerationState:
#     """Prompt an LLM to generate an initial taxonomy."""
#     configurable = config["configurable"]
#     docs = state["documents"]
#     mb_indices = state["minibatches"][0]
#     first_minibatch = [docs[idx] for idx in mb_indices]
#     data_table_xml = format_docs(first_minibatch)
#     initial_taxonomy = generate_taxonomy_chain.invoke(
#         {
#             "data_xml": data_table_xml,
#             "cluster_name_length": configurable.get("cluster_name_length", 10),
#             "cluster_description_length": configurable.get(
#                 "cluster_description_length", 30
#             ),
#             "explanation_length": configurable.get("explanation_length", 20),
#             "max_num_clusters": configurable.get("max_num_clusters", 25),
#         }
#     )

#     return {
#         "clusters": [initial_taxonomy["clusters"]],
#     }

In [None]:
# Prompt for the initial taxonomy, with a default use case pre-bound.
taxonomy_generation_prompt = hub.pull("wfh/tnt-llm-taxonomy-generation").partial(
    use_case="Generate the taxonomy that can be used to label the user intent in the conversation.",
)

taxonomy_generation_llm = ChatAnthropic(
    model="claude-3-haiku-20240307", max_tokens_to_sample=2000
)
taxa_gen_llm_chain = (
    taxonomy_generation_prompt | taxonomy_generation_llm | StrOutputParser()
).with_config(run_name="GenerateTaxonomy")


# parse_taxa is the cluster parser defined in this notebook and returns the
# {"clusters": [...]} shape the taxonomy steps read; the previously referenced
# `parse_generation_output` is not defined anywhere.
generate_taxonomy_chain = taxa_gen_llm_chain | parse_taxa

In [258]:
# batched_state has no "clusters" entry yet, and the shared invoke helper
# reads "use_case" from the configurable section, so both are supplied here.
taxonomies = generate_taxonomy(
    {**batched_state, "clusters": batched_state.get("clusters", [])},
    {"configurable": {"use_case": use_case, "max_concurrency": 5}},
)
taxonomies = {**batched_state, **taxonomies}

#### 1.c Update taxonomy

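Mostly the same as generation, except that the prompt also receives the previous taxonomy (as a cluster table) to revise.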

In [274]:
taxonomy_update_prompt = hub.pull("wfh/tnt-llm-taxonomy-update")

# No dedicated update model is defined in this notebook, so reuse the one
# configured for taxonomy generation under the name the chain expects.
taxonomy_update_llm = taxonomy_generation_llm

taxa_update_llm_chain = (
    taxonomy_update_prompt | taxonomy_update_llm | StrOutputParser()
).with_config(run_name="UpdateTaxonomy")


update_taxonomy_chain = taxa_update_llm_chain | parse_taxa

# def update_taxonomy(
#     state: TaxonomyGenerationState, config: RunnableConfig
# ) -> TaxonomyGenerationState:
#     """Prompt an LLM to update the taxonomy based on the current state."""
#     configurable = config["configurable"]
#     docs = state["documents"]
#     minibatches = state["minibatches"]
#     previous_taxonomy = state["clusters"][-1]
#     which_mb = len(state["clusters"]) % len(minibatches)
#     mb_indices = minibatches[which_mb]
#     minibatch = [docs[idx] for idx in mb_indices]
#     # The new data we will be using to
#     data_table_xml = format_docs(minibatch)
#     cluster_table_xml = format_taxonomy(previous_taxonomy)
#     updated_taxonomy = update_taxonomy_chain.invoke(
#         {
#             "data_xml": data_table_xml,
#             "use_case": configurable["use_case"],
#             "cluster_table_xml": cluster_table_xml,
#             "suggestion_length": configurable.get("suggestion_length", 30),
#             "cluster_name_length": configurable.get("cluster_name_length", 10),
#             "cluster_description_length": configurable.get(
#                 "cluster_description_length", 30
#             ),
#             "explanation_length": configurable.get("explanation_length", 20),
#             "max_num_clusters": configurable.get("max_num_clusters", 25),
#         }
#     )
#     return {
#         "clusters": [updated_taxonomy["clusters"]],
#     }

In [270]:
# Run one update step on the generated taxonomy, then merge the step's result
# over the previous state.
updated_taxonomies = update_taxonomy(
    taxonomies, {"configurable": {"use_case": use_case}}
)
updated_taxonomies = {**taxonomies, **updated_taxonomies}

In [271]:
# Display the most recent taxonomy revision.
updated_taxonomies["clusters"][-1]

[{'id': '1',
  'name': 'Manage file operations',
  'description': 'Load, read, write, and list files in various formats like JSON, CSV, and Parquet.'},
 {'id': '2',
  'name': 'Resolve code and network issues',
  'description': 'Debug directory errors, network firewall problems, and create AI-powered models.'},
 {'id': '3',
  'name': 'Implement conversational AI',
  'description': 'Handle user queries, provide responses, and manage user-AI interactions.'},
 {'id': '4',
  'name': 'Build chatbots with LangChain',
  'description': 'Develop chatbots that use LangChain for knowledge retrieval, conversation history, and response generation.'},
 {'id': '5',
  'name': 'Integrate LangChain with external services',
  'description': 'Combine LangChain with tools like Ollama, Mistral, and Azure OpenAI for advanced capabilities.'},
 {'id': '6',
  'name': 'Process web content with LangChain',
  'description': 'Load web pages, extract text, and handle various file formats from online sources.'},
 {'id

#### 1.d Review Taxonomy

In [275]:
taxonomy_review_prompt = hub.pull("wfh/tnt-llm-taxonomy-review")

# No dedicated review model is defined in this notebook, so reuse the one
# configured for taxonomy generation under the name the chain expects.
taxonomy_review_llm = taxonomy_generation_llm

# The chain must start from the review prompt; piping one chat model straight
# into another left the pulled prompt unused.
taxa_review_llm_chain = (
    taxonomy_review_prompt | taxonomy_review_llm | StrOutputParser()
).with_config(run_name="ReviewTaxonomy")


review_taxonomy_chain = taxa_review_llm_chain | parse_taxa


# def review_taxonomy(
#     state: TaxonomyGenerationState, config: RunnableConfig
# ) -> TaxonomyGenerationState:
#     """Prompt an LLM to update the taxonomy based on the current state."""
#     configurable = config["configurable"]
#     docs = state["documents"]
#     minibatches = state["minibatches"]
#     previous_taxonomy = state["clusters"][-1]
#     mb_indices = random.sample
#     which_mb = len(state["clusters"]) % len(minibatches)
#     mb_indices = minibatches[which_mb]
#     minibatch = [docs[idx] for idx in mb_indices]
#     # The new data we will be using to
#     data_table_xml = format_docs(minibatch)
#     cluster_table_xml = format_taxonomy(previous_taxonomy)
#     reviewed_taxonomy = review_taxonomy_chain.invoke(
#         {
#             "data_xml": data_table_xml,
#             "use_case": configurable["use_case"],
#             "cluster_table_xml": cluster_table_xml,
#             "suggestion_length": configurable.get("suggestion_length", 30),
#             "cluster_name_length": configurable.get("cluster_name_length", 10),
#             "cluster_description_length": configurable.get(
#                 "cluster_description_length", 30
#             ),
#             "explanation_length": configurable.get("explanation_length", 20),
#             "max_num_clusters": configurable.get("max_num_clusters", 25),
#         }
#     )
#     return {
#         "clusters": [reviewed_taxonomy["clusters"]],
#     }

In [None]:
def invoke_taxonomy_chain(
    chain: Runnable,
    state: TaxonomyGenerationState,
    config: RunnableConfig,
    mb_indices: List[int],
) -> TaxonomyGenerationState:
    """Run one taxonomy chain over a minibatch and append the result.

    Args:
        chain: Object with an ``invoke`` method that returns a mapping with
            a ``"clusters"`` entry.
        state: Current state. ``documents`` is required; ``clusters`` may be
            absent or empty on the first step.
        config: Run config. Prompt parameters are read from
            ``config["configurable"]``, falling back to the defaults below.
        mb_indices: Positions into ``state["documents"]`` forming the
            minibatch shown to the model.

    Returns:
        A copy of ``state`` whose ``clusters`` history has the new taxonomy
        appended.

    NOTE(review): ``clusters`` is annotated with ``operator.add`` in
    ``TaxonomyGenerationState``. If a graph applies that as a reducer,
    returning the full history here would presumably be concatenated onto the
    existing history and duplicate entries -- confirm before using this as a
    graph node.
    """
    configurable = config["configurable"]
    docs = state["documents"]
    minibatch = [docs[idx] for idx in mb_indices]
    data_table_xml = format_docs(minibatch)

    # The very first step runs on a state that has no "clusters" key yet.
    history = state.get("clusters") or []
    previous_taxonomy = history[-1] if history else []
    cluster_table_xml = format_taxonomy(previous_taxonomy)

    chain_inputs = {
        "data_xml": data_table_xml,
        "cluster_table_xml": cluster_table_xml,
        "suggestion_length": configurable.get("suggestion_length", 30),
        "cluster_name_length": configurable.get("cluster_name_length", 10),
        "cluster_description_length": configurable.get(
            "cluster_description_length", 30
        ),
        "explanation_length": configurable.get("explanation_length", 20),
        "max_num_clusters": configurable.get("max_num_clusters", 25),
    }
    # Only forward use_case when it is configured, so a prompt that already
    # has one pre-bound keeps its own value instead of failing on a lookup.
    if "use_case" in configurable:
        chain_inputs["use_case"] = configurable["use_case"]

    updated_taxonomy = chain.invoke(chain_inputs)

    return {
        **state,
        "clusters": history + [updated_taxonomy["clusters"]],
    }


def generate_taxonomy(
    state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    """Produce a taxonomy from the first minibatch via the generation chain."""
    first_minibatch = state["minibatches"][0]
    return invoke_taxonomy_chain(
        chain=generate_taxonomy_chain,
        state=state,
        config=config,
        mb_indices=first_minibatch,
    )


def update_taxonomy(
    state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    """Revise the taxonomy using the next minibatch in round-robin order.

    The minibatch is picked by the number of taxonomy revisions so far,
    modulo the number of minibatches.
    """
    minibatches = state["minibatches"]
    which_mb = len(state["clusters"]) % len(minibatches)
    return invoke_taxonomy_chain(
        chain=update_taxonomy_chain,
        state=state,
        config=config,
        mb_indices=minibatches[which_mb],
    )


def review_taxonomy(
    state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    """Review the taxonomy against one randomly chosen minibatch."""
    sampled_minibatch = random.choice(state["minibatches"])
    return invoke_taxonomy_chain(
        chain=review_taxonomy_chain,
        state=state,
        config=config,
        mb_indices=sampled_minibatch,
    )

In [276]:
# Run the review step on the updated taxonomy, then merge the step's result
# over the previous state.
reviewed_taxonomies = review_taxonomy(
    updated_taxonomies, {"configurable": {"use_case": use_case}}
)
reviewed_taxonomies = {**updated_taxonomies, **reviewed_taxonomies}

In [None]:
# Define the graph
from langgraph.graph import StateGraph


def should_review(state: TaxonomyGenerationState) -> str:
    """Pick the node that follows an update step.

    Keeps looping on ``update_taxonomy`` until there are at least as many
    taxonomy revisions as minibatches, then hands over to
    ``review_taxonomy``.
    """
    num_minibatches = len(state["minibatches"])
    num_revisions = len(state["clusters"])
    if num_revisions < num_minibatches:
        return "update_taxonomy"
    return "review_taxonomy"


graph = StateGraph(TaxonomyGenerationState)
graph.add_node("summarize", map_reduce_chain)
graph.add_node("get_minibatches", get_minibatches)
graph.add_node("generate_taxonomy", generate_taxonomy)
graph.add_node("update_taxonomy", update_taxonomy)
graph.add_node("review_taxonomy", review_taxonomy)

# Edge endpoints must match the node names registered above.
graph.add_edge("summarize", "get_minibatches")
graph.add_edge("get_minibatches", "generate_taxonomy")
graph.add_edge("generate_taxonomy", "update_taxonomy")
graph.add_conditional_edges(
    "update_taxonomy",
    should_review,
    # Explicit mapping from the router's return value to the target node.
    {"update_taxonomy": "update_taxonomy", "review_taxonomy": "review_taxonomy"},
)
graph.set_finish_point("review_taxonomy")

graph.set_entry_point("summarize")
app = graph.compile()

In [None]:
# Phase 2: Text Classification


# Define the state for the text classification graph
class TextClassificationState(TypedDict):
    """State for the text-classification graph."""

    # Both fields are annotated with operator.add, presumably so that node
    # updates are concatenated onto the existing sequences -- TODO confirm.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    labels: Annotated[Sequence[str], operator.add]


def assign_labels(state: TextClassificationState) -> TextClassificationState:
    """Prompt an LLM to assign labels to the given text.

    Wraps every entry of ``state["messages"]`` in a ``HumanMessage``, sends
    them to the model as one batch, and returns the wrapped messages together
    with one ``labels`` value per response.
    """
    # NOTE(review): ChatOpenAI is not imported in any visible cell.
    llm = ChatOpenAI(temperature=0)
    # NOTE(review): the state declares `messages` as BaseMessage objects, yet
    # each entry is used as message *content* here -- confirm the intended
    # input type.
    messages = [HumanMessage(content=text) for text in state["messages"]]
    responses = llm.batch(messages)
    # assumes every response carries a "labels" entry in additional_kwargs --
    # TODO confirm; no visible code configures the model to produce one.
    labels = [response.additional_kwargs["labels"] for response in responses]
    return {"messages": messages, "labels": labels}


# Define the TnT-LLM workflow using LangGraph

# Phase 1: Taxonomy Generation
# Sketch of the taxonomy graph: summarize -> generate -> update (looping) -> review.
# NOTE(review): `summarize_text` is not defined in any visible cell, and `END`
# is not imported in this cell.
taxonomy_graph = StateGraph(TaxonomyGenerationState)
taxonomy_graph.add_node("summarize", summarize_text)
taxonomy_graph.add_node("generate", generate_taxonomy)
taxonomy_graph.add_node("update", update_taxonomy)
taxonomy_graph.add_node("review", review_taxonomy)

taxonomy_graph.add_edge("summarize", "generate")
taxonomy_graph.add_edge("generate", "update")
# Router: keep updating while fewer than 10 summaries exist, else review.
# NOTE(review): TaxonomyGenerationState declares no "summaries" key, so this
# lookup looks like it would fail -- confirm the intended stopping condition.
taxonomy_graph.add_conditional_edges(
    "update",
    lambda state: "update" if len(state["summaries"]) < 10 else "review",
)
taxonomy_graph.add_edge("review", END)

taxonomy_graph.set_entry_point("summarize")
taxonomy_generator = taxonomy_graph.compile()

# Phase 2: Text Classification
# Single-node graph: run assign_labels, then finish.
# NOTE(review): `END` is not imported in this cell.
classification_graph = StateGraph(TextClassificationState)
classification_graph.add_node("assign_labels", assign_labels)

classification_graph.add_edge("assign_labels", END)

classification_graph.set_entry_point("assign_labels")
text_classifier = classification_graph.compile()