In [1]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import LanceDB
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate

import lancedb

## Source data

In [2]:
import pandas as pd
from langchain_openai import OpenAI

# Load the extracted source snippets (one row per snippet) and preview the first rows.
df = pd.read_parquet(
    "./data/parquet/sources-camel-kafka.parquet",
    engine="pyarrow",
)
df.head()

Unnamed: 0,project,path,language,tags,description,source
0,camel,/Users/opiske/code/java/camel/components/camel...,java,,"""Shows how to create a custom check that can d...","if (count.intValue() <= 1) {\n LOG.info(""Coun..."
1,camel,/Users/opiske/code/java/camel/components/camel...,java,,"""Shows how to build a Camel route can pause a ...",return new RouteBuilder(){\n @Override public...
2,camel,/Users/opiske/code/java/camel/components/camel...,java,,"""Shows to create an adapter that is run by Cam...",if (count.intValue() <= 1) {\n return true;\n...
3,camel,/Users/opiske/code/java/camel/components/camel...,java,,"""Shows to create a route that uses the resume ...",return new RouteBuilder(){\n @Override public...


In [3]:
from langchain_community.document_loaders import DataFrameLoader
# Turn every DataFrame row into a Document: the `source` column becomes the
# page content; the `description` column is read back later from doc.metadata.
loader = DataFrameLoader(df, page_content_column="source")
docs = loader.load()
print("Num docs: ", len(docs))

# Plain loop rather than a list comprehension: a comprehension used only for
# its print side effect builds a throwaway list and makes the cell echo
# `[None, None, ...]` as its result.
for description in df["description"]:
    print(description)

Num docs:  4
"Shows how to create a custom check that can determine whether to pause or continue"
"Shows how to build a Camel route can pause a Kafka consumer when using the circuit breaker pattern"
"Shows to create an adapter that is run by Camel when the resume happens "
"Shows to create a route that uses the resume API"


[None, None, None, None]

## Retriever setup

In [4]:
# Chunk the loaded documents (chunk_size=1000, chunk_overlap=200) so that each
# piece stored in the vector table stays reasonably small.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)
print(f"Created {len(splits)} splits")

# Use `description` as lancedb.embedding and `page_content` as lancedb.text.
# This way the lookups are performed against the description values, but the
# actual sources are what gets returned into the LLM context.
embeddings = OpenAIEmbeddings()
db = lancedb.connect("/tmp/lancedb")

# Manually build the rows so we get the embeddings right: the vector comes from
# the description, the text from the chunk itself. All descriptions are
# embedded in one batch instead of one request per split.
descriptions = [doc.metadata["description"] for doc in splits]
vectors = embeddings.embed_documents(descriptions)

lancedb_data = [
    {
        "vector": vector,
        "text": doc.page_content,
        # string ids throughout, so the column has a single consistent type
        "id": str(i),
    }
    for i, (doc, vector) in enumerate(zip(splits, vectors))
]

# Create the table directly from the real rows. Seeding it with a placeholder
# "Hello World" row would leave that row in the table, where it is searchable
# and can take up one of the retriever's result slots.
table = db.create_table(
    "camel",
    data=lancedb_data,
    mode="overwrite",
)

# create a vectorstore from the pre-populated table
vectorstore = LanceDB(table, embeddings)

# Expose the pre-populated vector store as a retriever; the snippets it returns
# are what gets substituted into the prompt's {context} slot.
retriever = vectorstore.as_retriever()

# Question-answering template with two placeholders: {question} and {context}.
qa_template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Please provide end-to-end examples in Java when applicable.

    Question: {question} 

    Context: {context} 

    Answer:
"""
prompt = PromptTemplate.from_template(qa_template)

print(type(vectorstore))

Created 6 splits
<class 'langchain_community.vectorstores.lancedb.LanceDB'>


In [None]:
# let's take a look at how the retriever actually works
# `invoke` is the Runnable entry point (the same call used on the chain below);
# `get_relevant_documents` is deprecated in its favour.
matches = retriever.invoke("How can I pause a Kafka consumer?")

# Plain loop: a list comprehension here would only collect print()'s None
# results and echo them as the cell output.
for i, m in enumerate(matches):
    print(f"{i}:\n {m.page_content}")

## The actual Q&A involving the LLM

In [9]:
# Chat model that generates the answers; temperature=0 keeps the output as
# repeatable as possible between runs.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0,
)

def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# The incoming question is fanned out into the two prompt inputs: one branch
# runs the retriever and flattens the hits with format_docs, the other passes
# the question through untouched.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

questions = [
    "Can you show me how to use the resume API?",
    "How do I determine whether to pause or continue a route? Do you have an example?",
]

from langchain.globals import set_debug
# Debug output is explicitly switched off here; pass True to enable it.
set_debug(False)

# Print each question, run it through the chain, then print the answer.
for idx, question in enumerate(questions):
    print(f"Q{idx}: {question}")
    result = rag_chain.invoke(question)
    print(f"A{idx}: {result} \n\n")



Q0: Can you show me how to use the resume API?
A0: I'm sorry, but based on the provided context, I don't have enough information to show you how to use the resume API. 


Q1: How do I determine whether to pause or continue a route? Do you have an example?
A1: To determine whether to pause or continue a route, you can use a circuit breaker to monitor the downstream availability. If the downstream call succeeds, the route can continue. If there is an error, you can start a thread to simulate checking for the downstream availability. Here's an example in Java using Apache Camel:

```java
CircuitBreaker circuitBreaker = CircuitBreaker.ofDefaults("pausable");
circuitBreaker.getEventPublisher().onSuccess(event -> {
    LOG.info("Downstream call succeeded");
    if (executorService != null) {
        executorService.shutdownNow();
        executorService = null;
    }
}).onError(event -> {
    LOG.info("Downstream call error. Starting a thread to simulate checking for the downstream availabil