## 2.1 - Preparing Data for RAG

In [None]:
# Reduce the Arrow batch size, as our PDFs can be big in memory.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10)

In [None]:
articles_path = f"{DA.paths.datasets}/arxiv-articles/"
table_name = f"{DA.catalog_name}.{DA.schema_name}.pdf_raw_text"

# Read the PDF files with Spark's binary file data source.
df = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(articles_path)
)

# Save the list of files to a table.
df.write.mode("overwrite").saveAsTable(table_name)
display(df)

In [None]:
# Read one sample article through the local /dbfs path and show its extracted text.
with open(articles_path.replace("dbfs:", "/dbfs/") + "2302.06476.pdf", "rb") as pdf:
    doc = extract_doc_text(pdf.read())
print(doc)

In [None]:
import io
import os
import pandas as pd

from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index import Document, set_global_tokenizer
from transformers import AutoTokenizer
from typing import Iterator
from pyspark.sql.functions import col,udf,length,pandas_udf,explode
from unstructured.partition.auto import partition

@pandas_udf("array<string>")
def read_as_chunk(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Extract text from each document and split it into sentence-based chunks.

    Args:
        batch_iter: Iterator of pandas Series; every element is handed to
            ``extract_doc_text`` (presumably raw PDF bytes — TODO confirm
            against that helper).

    Yields:
        One Series per input batch whose values are lists of chunk texts.
    """
    # Set the Llama 2 tokenizer as the global tokenizer.
    set_global_tokenizer(
        AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    )
    # Sentence splitter from llama_index to split on sentences.
    splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)

    def extract_and_split(b):
        txt = extract_doc_text(b)
        # The imported class is `Document`; `Documents` does not exist.
        nodes = splitter.get_nodes_from_documents([Document(text=txt)])
        return [n.text for n in nodes]

    for x in batch_iter:
        yield x.apply(extract_and_split)
    

In [None]:
# One output row per chunk: explode the array returned by read_as_chunk.
df_chunks = (
    df
    .withColumn("content", explode(read_as_chunk("content")))
    .selectExpr('path as pdf_name', 'content')
)
display(df_chunks)

## How to Use Foundation model API

In [None]:
from mlflow.deployments import get_deploy_client

# The bge-large-en foundation model is served at
# /serving-endpoints/databricks-bge-large-en/invocations.
deploy_client = get_deploy_client("databricks")

# NOTE: if you change the embedding model here, change it in the query step too.
embeddings = deploy_client.predict(
    endpoint="databricks-bge-large-en",
    inputs={"input": ["What is Apache Spark?"]},
)
print(embeddings)


## Compute Chunking Embeddings

In [None]:
@pandas_udf("array<float>")
def get_embedding(contents: pd.Series) -> pd.Series:
    """Compute one embedding per text using the databricks-bge-large-en endpoint.

    Args:
        contents: Series of texts to embed.

    Returns:
        Series of embedding vectors, one per input text, in input order.
    """
    import mlflow.deployments
    deploy_client = mlflow.deployments.get_deploy_client("databricks")

    def get_embeddings(batch):
        # Note: this will fail if an exception is thrown during embedding creation
        # (add try/except if needed).
        response = deploy_client.predict(endpoint="databricks-bge-large-en", inputs={"input": batch})
        return [e['embedding'] for e in response.data]

    # Split the contents into batches of 150 items each, since the embedding
    # model takes at most 150 inputs per request.
    max_batch_size = 150
    batches = [contents.iloc[i:i + max_batch_size] for i in range(0, len(contents), max_batch_size)]

    # Process each batch and collect the results.
    all_embeddings = []
    for batch in batches:
        all_embeddings += get_embeddings(batch.tolist())

    return pd.Series(all_embeddings)


In [None]:
import pyspark.sql.functions as F

# Add an embedding column computed from each chunk's text.
df_chunk_emd = (
    df_chunks
    .withColumn("embedding", get_embedding("content"))
    .selectExpr('pdf_name', 'content', 'embedding')
)
display(df_chunk_emd)

## Save Embeddings to a Delta Table

In [None]:
%sql
CREATE TABLE IF NOT EXISTS pdf_text_embeddings (
    id BIGINT GENERATED BY DEFAULT AS IDENTITY,
    pdf_name STRING,
    content STRING,
    embedding ARRAY <FLOAT>
    -- Note: the table must have Change Data Feed enabled because Vector Search uses DLT, which requires the CDC state
) TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [None]:
# Append the chunk embeddings to the embeddings table in the course catalog/schema.
embedding_table_name = f"{DA.catalog_name}.{DA.schema_name}.pdf_text_embeddings"
(
    df_chunk_emd.write
    .mode("append")
    .saveAsTable(embedding_table_name)
)


## Setup a Vector Search Endpoint
Create self-managed vector search index

In [None]:
# Assign the Vector Search endpoint by username.
vs_endpoint_prefix = "vs_endpoint_"
vs_endpoint_fallback = "vs_endpoint_fallback"
vs_endpoint_name = vs_endpoint_prefix+"7"
# vs_endpoint_name = vs_endpoint_prefix+str(get_fixed_integer(DA.unique_name("_")))

# The message must be a single string literal; it was split across two lines.
print(
    f"Vector Endpoint name: {vs_endpoint_name}. In case of any issues, replace variable "
    "'vs_endpoint_name' with 'vs_endpoint_fallback' in demos and labs."
)

In [None]:
from databricks.vector_search.client import VectorSearchClient
from databricks.sdk import WorkspaceClient
import databricks.sdk.service.catalog as c

# Vector Search client used by the cells below.
# NOTE(review): disable_notice=True presumably suppresses the client's startup notice — confirm.
vsc = VectorSearchClient(disable_notice = True)

In [None]:
# Check the status of the endpoint (the variable is `vs_endpoint_name`, not `vsc_endpoint_name`).
wait_for_vs_endpoint_to_be_ready(vsc, vs_endpoint_name)
print(f"Endpoint named {vs_endpoint_name} is ready.")


In [None]:
# The table we'd like to index (three-level name: catalog.schema.table).
source_table_fullname = f"{DA.catalog_name}.{DA.schema_name}.pdf_text_embeddings"

# Where we want to store our index.
vs_index_fullname = f"{DA.catalog_name}.{DA.schema_name}.pdf_text_self_managed_vs_index"

# Create or sync the index.
if not index_exists(vsc, vs_endpoint_name, vs_index_fullname):
    print(f"Creating index {vs_index_fullname} on endpoint {vs_endpoint_name}...")
    vsc.create_delta_sync_index(
        endpoint_name=vs_endpoint_name,
        index_name=vs_index_fullname,
        source_table_name=source_table_fullname,
        pipeline_type="TRIGGERED",  # Sync needs to be manually triggered
        primary_key="id",
        embedding_dimension=1024,  # Match your model embedding size (bge)
        embedding_vector_column="embedding",
    )
else:
    # Trigger a sync to update our index content with the new data saved in the table.
    vsc.get_index(vs_endpoint_name, vs_index_fullname).sync()

# Wait for the index to be ready and all our embeddings to be created and indexed.
wait_for_index_to_be_ready(vsc, vs_endpoint_name, vs_index_fullname)

    


## Search for similar content

In [None]:
import mlflow.deployments

# Embed the question with the same model used for the chunks.
# The client is bound to `deploy_client`, the name used on the predict line.
deploy_client = mlflow.deployments.get_deploy_client("databricks")
question = "How Generative AI impacts humans?"
response = deploy_client.predict(endpoint="databricks-bge-large-en", inputs={"input": [question]})
embeddings = [e['embedding'] for e in response.data]
print(embeddings)

In [None]:
# Get the 5 most similar documents.
results = vsc.get_index(vs_endpoint_name, vs_index_fullname).similarity_search(
    query_vector=embeddings[0],
    columns=["pdf_name", "content"],
    num_results=5,
)

# Format the result rows as {"file", "text"} dicts for the re-ranking step.
passages = []
for doc in results.get('result', {}).get('data_array', []):
    new_doc = {"file": doc[0], "text": doc[1]}
    passages.append(new_doc)
print(passages)
    

## Re-ranking search results

In [None]:
from flashrank import Ranker, RerankRequest

# Re-rank the retrieved passages against the question and show the top 3.
ranker = Ranker(model_name="rank-T5-flan", cache_dir=f"{DA.paths.working_dir.replace('dbfs:/','/dbfs/')}/opt")

rerankrequest = RerankRequest(query=question, passages=passages)
results = ranker.rerank(rerankrequest)
print(*results[:3], sep="\n\n")

## Assembling and evaluating a RAG application

# Set up the Retriever

In [None]:
# Components we created before.
# Assign the Vector Search endpoint by username.
vs_endpoint_prefix = "vs_endpoint_"
vs_endpoint_fallback = "vs_endpoint_fallback"
# vs_endpoint_name = vs_endpoint_prefix+str(get_fixed_integer(DA.unique_name("_")))
vs_endpoint_name = vs_endpoint_prefix+"7"

# Message repaired: balanced quotes around 'vs_endpoint_name' and a final period instead of "/".
print(f"Vector Endpoint name: {vs_endpoint_name}. In case of any issues, replace variable 'vs_endpoint_name' with 'vs_endpoint_fallback' in demos and labs.")
vs_index_fullname = f"{DA.catalog_name}.{DA.schema_name}.pdf_text_self_managed_vs_index"


In [None]:
from databricks.vector_search.client import VectorSearchClient
from langchain.vectorstores import DatabricksVectorSearch
from langchain.embeddings import DatabricksEmbeddings

# Test the LangChain embedding model.
# NOTE: your question embedding model must match the one used for the chunks in the previous step.
embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
print(f"Test embeddings: {embedding_model.embed_query('What is GenerativeAI?')[:20]}...")


def get_retriever(persist_dir: str = None):
    """Build a LangChain retriever on top of the Vector Search index.

    Args:
        persist_dir: Not used by this function; kept so existing callers
            (e.g. a model ``loader_fn``) can still pass it.

    Returns:
        A retriever that returns the top 2 documents for a query.
    """
    # Get the vector search index.
    vsc = VectorSearchClient()
    vs_index = vsc.get_index(
        endpoint_name=vs_endpoint_name,
        index_name=vs_index_fullname,
    )
    # Create the retriever.
    vectorstore = DatabricksVectorSearch(
        vs_index, text_column="content", embedding=embedding_model
    )
    # k defines the top k documents to retrieve.
    return vectorstore.as_retriever(search_kwargs={"k": 2})


# Test our retriever (the name `vectorstore` is kept because later cells use it).
vectorstore = get_retriever()
similar_documents = vectorstore.invoke("How Generative AI impacts humans?")
print(f"Relevant documents: {similar_documents}")
    

## Setup the Foundation model

In [None]:
from langchain.chat_models import ChatDatabricks

# Smoke-test the Databricks Foundation Model chat endpoint.
chat_model = ChatDatabricks(
    endpoint="databricks-llama-2-70b-chat",
    max_tokens=300,
)
print(f"Test chat model: {chat_model.invoke('What is Generative AI?')}")


In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatDatabricks

# Prompt for the RAG chain. The garbled words in the instructions were repaired
# so the model receives readable guidance; the placeholders are unchanged.
TEMPLATE = """You are an assistant for GENAI teaching class. You are answering questions related to Generative AI
and how it impacts human life. If the question is not related to one of these topics, kindly decline to answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.
Use the following pieces of context to answer the question at the end:

<context>
{context}
</context>

Question: {question}

Answer:
"""
prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])

# "stuff" chain: the retrieved documents are placed into {context} of the prompt.
chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=get_retriever(),
    chain_type_kwargs={"prompt": prompt},
)
    

In [None]:
# RetrievalQA takes its input under the "query" key.
question = {"query": "How does Generative AI impact humans?"}
answer = chain.invoke(question)
print(answer)


In [None]:
# Display the class of the chain object built above.
type(chain)

## Evaluating the RAG pipeline
### Prepare the Evaluation Dataset

In [None]:
import pandas as pd
from io import StringIO

# Evaluation set in CSV form. Repairs made to the damaged text:
#   * records that had been merged onto one line are back on their own lines;
#   * two fragments that cannot form a complete record were moved out of the data
#     (TODO: restore them from the original dataset):
#       - question without a ground truth:
#         What is the purpose of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) in automatic evaluation methods?
#       - tail of a ground truth whose question is missing:
#         manipulation include integrating natural language understanding with reinforcement learning, understanding natural language directions for robotic navigation, and mapping instructions and visual observations to actions with reinforcement learning.
eval_set = """question, ground_truth, evolution_type, episode_done
"What are the limitations of symbolic planning in task and motion planning, and how can leveraging large language models help overcome these limitations?", "Symbolic planning in task and motion planning can be limited by the need for explicit primitives and constraints. Leveraging large language models can help overcome these limitations by enabling the robot to use language models for planning and execution, and by providing a way to extract and leverage knowledge from large language models to solve temporally extended tasks.", simple, TRUE
"What are some techniques used to fine-tune transformer models for personalized code generation, and how effective are they in improving prediction accuracy and preventing runtime errors? ", "The techniques used to fine-tune transformer models for personalized code generation include fine-tuning transformer models, adopting a novel approach called Target Similarity Tuning (TST) to retrieve a small set of examples from a training bank, and utilizing these examples to train a pretrained language model. The effectiveness of these techniques is shown in the improvement in prediction accuracy and the prevention of runtime errors.", simple, TRUE
How does the PPO-ptx model mitigate performance regressions in the few-shot setting?, "The PPO-ptx model mitigates performance regressions in the few-shot setting by incorporating pre-training and fine-tuning on the downstream task. This approach allows the model to learn generalizable features and adapt to new tasks more effectively, leading to improved few-shot performance.", simple, TRUE
How can complex questions be decomposed using successive prompting?, "Successive prompting is a method for decomposing complex questions into simpler sub-questions, allowing language models to answer them more accurately.
This approach was proposed by Dheeru Dua, Shivanshu Gupta, Sameer Singh, and Matt Gardner in their paper
'Successive Prompting for Decomposing Complex Questions', presented at EMNLP 2022.", simple, TRUE
"Which entity type in Named Entity Recognition is likely to be involved in information extraction, question answering, semantic parsing, and machine translation?", Organization, reasoning, TRUE
"How does chain of thought prompting elicit reasoning in large language models, and what are the potential applications of this technique in neural text generation and human-AI interaction?", "The context discusses the use of chain of thought prompting to elicit reasoning in large language models, which can be applied in neural text generation and human-AI interaction. Specifically, researchers have used this technique to train language models to generate coherent and contextually relevant text, and to create transparent and controllable human-AI interaction systems. The potential applications of this technique include improving the performance of language models in generating contextually appropriate responses, enhancing the interpretability and controllability of AI systems, and facilitating more effective human-AI collaboration.", simple, TRUE
"Using the given context, how can the robot be instructed to move objects around on a tabletop to complete rearrangement tasks?", "The robot can be instructed to move objects around on a tabletop to complete rearrangement tasks by using natural language instructions that specify the objects to be moved and their desired locations. The instructions can be parsed using functions such as parse_obj_name and parse_position to extract the necessary information, and then passed to a motion primitive that can pick up and place objects in the specified locations. The get_obj_names and get_obj_pos APIs can be used to access information about the available objects and their locations in the scene.", reasoning, TRUE
"How can searching over an organization's existing knowledge, data, or documents using LLM-powered applications reduce the time it takes to complete worker activities?", "Searching over an organization's existing knowledge, data, or documents using LLM-powered applications can reduce the time it takes to complete worker activities by retrieving information quickly and efficiently. This can be done by using the LLM's capabilities to search through large amounts of data and retrieve relevant information in a short amount of time.", simple, TRUE
"""
obj = StringIO(eval_set)
# skipinitialspace=True is required: fields are separated by ", " and a quoted
# field is only recognised when the quote is the first character of the field.
eval_df = pd.read_csv(obj, skipinitialspace=True)
display(eval_df)


In [None]:
from datasets import Dataset

test_questions = eval_df["question"].values.tolist()
test_groundtruths = eval_df["ground_truth"].values.tolist()

answers = []
contexts = []

# Answer each question in the dataset.
# The loop variable is `test_question` so the notebook-level `question`
# (reused later as the model input example) is not overwritten.
for test_question in test_questions:
    # Save the generated answer.
    chain_response = chain.invoke({"query": test_question})
    answers.append(chain_response["result"])

    # Save the contexts used.
    vs_response = vectorstore.invoke(test_question)
    contexts.append(list(map(lambda doc: doc.page_content, vs_response)))

# Construct the final dataset.
response_dataset = Dataset.from_dict({
    "inputs": test_questions,
    "answer": answers,
    "context": contexts,
    "ground_truth": test_groundtruths,
})

display(response_dataset.to_pandas())

## Calculate Evaluation Metrics

In [None]:
import mlflow
from mlflow.deployments import set_deployments_target

set_deployments_target("databricks")

# LLM-judged metrics, both scored by the databricks-dbrx-instruct endpoint.
dbrx_answer_similarity = mlflow.metrics.genai.answer_similarity(
    model="endpoints:/databricks-dbrx-instruct"
)

dbrx_relevance = mlflow.metrics.genai.relevance(
    model="endpoints:/databricks-dbrx-instruct"
)

# Evaluate the pre-computed answers against the ground truth.
results = mlflow.evaluate(
    data=response_dataset.to_pandas(),
    targets="ground_truth",
    predictions="answer",
    extra_metrics=[dbrx_answer_similarity, dbrx_relevance],
    evaluators="default",
)

display(results.tables['eval_results_table'])

In [None]:
from mlflow.models import infer_signature
import mlflow
import langchain

# Set the model registry to Unity Catalog.
mlflow.set_registry_uri("databricks-uc")
model_name = f"{DA.catalog_name}.{DA.schema_name}.rag_app_demo4"

with mlflow.start_run(run_name="rag_app_demo4") as run:
    signature = infer_signature(question, answer)
    model_info = mlflow.langchain.log_model(
        chain,
        loader_fn=get_retriever,  # called at load time to rebuild the retriever
        artifact_path="chain",
        registered_model_name=model_name,
        pip_requirements=[
            # Pin the library versions used in this notebook.
            "mlflow==" + mlflow.__version__,
            "langchain==" + langchain.__version__,
            "databricks-vectorsearch",
        ],
        input_example=question,
        signature=signature,
    )

## Planning a Compound AI System Architecture


In [None]:
%sh
# Libraries used to draw graphics (Graphviz system package plus its Python bindings)
echo 'Driver Installs...'
apt-get install -y graphviz
pip install graphviz

In [None]:
def get_multistage_html():
    """Assemble the Graphviz diagram of the multi-stage application.

    NOTE(review): only the beginning of this function is visible here. The
    graph is built but nothing is rendered or returned yet. TODO: restore the
    remaining nodes/edges and the HTML rendering step from the original notebook.
    """
    import re  # presumably used by the rendering step that is not visible here — confirm
    from graphviz import Digraph

    dot = Digraph('pt')
    dot.attr(compound='true')
    dot.graph_attr['rankdir'] = 'LR'
    dot.graph_attr['splines'] = 'ortho'
    dot.edge_attr.update(arrowhead='normal', arrowsize='1')
    dot.attr('node', shape='rectangle')

    def component_link(component, ttip=''):
        """Return node attributes (tooltip, href, target, width) linking to a component notebook."""
        url = "https://curriculum-dev.cloud.databricks.com"
        # Folder of the current notebook: its workspace path without the last segment.
        path = "/".join(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get().split("/")[:-1])
        path = path.replace(" ", "%20")
        return {'tooltip': ttip, 'href': f'{url}#workspace{path}/components/{component}', 'target': "_blank",
                'width': "1.5"}

    with dot.subgraph(name='cluster_workflow') as w:
        w.body.append('label="Model Serving"')
        w.body.append('style="filled"')
        w.body.append('color="#808080"')
        w.body.append('fillcolor="#F5F5F5"')

        w.node('question', 'question', fillcolor='#FFD580', style='filled', shape='oval',
               **component_link('question'))

        # The application cluster is nested inside the workflow cluster.
        with w.subgraph(name='cluster_app') as a:
            a.body.append('label="Compound_rag_app Class"')
            a.body.append('style="filled"')
            a.body.append('color="#808080"')
            a.body.append('fillcolor="#DCDCDC"')

In [None]:
def main(self, question: str) -> str:
    """Run the pipeline stages in order and return the final answer.

    Stages: search -> augment -> build context -> question answering.

    Args:
        question: The user question.

    Returns:
        The value of ``get_answer()`` on the QA stage result.
    """
    # Local annotations name the intended stage types; Python does not evaluate them at runtime.
    search_result: SimilaritySearchResult = self.run_search(question)
    # Augment the search result computed above (the original passed an undefined `search`).
    augmented_result: Tuple[SearchResultAugmentedContent, ...] = self.run_augment(search_result)
    context: str = self.run_get_context(augmented_result)
    qa_result: QaModelResult = self.run_qa(question, context)
    return qa_result.get_answer()

In [None]:
## Building Multi-stage AI systems

In [None]:
%run ../Includes/Classroom-Setup-03

In [None]:
# Show the classroom environment values exposed by the DA helper object.
print("Username:", DA.username)
print("Catalog Name:", DA.catalog_name)
print("Schema Name:", DA.schema_name, sep="")
print("Working Directory:", DA.paths.working_dir, sep="")
print("Dataset Location:", DA.paths.datasets, sep="")

In [None]:
# Prompts
from langchain.prompts import PromptTemplate
# {genre} and {actor} are template variables filled in by format()/invoke().
prompt_template = PromptTemplate.from_template("Tell me about a {genre} movie which {actor} is one of the actors.")
prompt_template.format(genre="romance", actor="Brad Pitt")

In [None]:
# LLM
from langchain_community.chat_models import ChatDatabricks

# Play with max_tokens to define the length of the response.
llm_dbrx = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=500)

# Stream the response and print each chunk on its own line.
for chunk in llm_dbrx.stream("Who is Brad Pitt?"):
    print(chunk.content, end="\n", flush=True)

In [None]:
# Retriever
from langchain_community.retrievers import WikipediaRetriever
retriever = WikipediaRetriever()
# docs = retriever.get_relevant_documents(query="Brad Pitt")
docs = retriever.invoke(input="Brad Pitt")
print(docs[0])

In [None]:
# Tools
from langchain_community.tools import YouTubeSearchTool
# The class name is YouTubeSearchTool, as imported in this cell.
tool = YouTubeSearchTool()
tool.run("Brad Pitt movie trailer")

In [None]:
# Print the tool's description, then its argument schema.
for attribute in ("description", "args"):
    print(getattr(tool, attribute))

In [None]:
# Chaining
from langchain_core.output_parsers import StrOutputParser

# Prompt -> LLM -> plain-string output.
chain = prompt_template | llm_dbrx | StrOutputParser()
print(chain.invoke({"genre":"romance", "actor":"Brad Pitt"}))

In [None]:
# Build a multi-stage chain
# Create a vector store
# Assign the Vector Search endpoint by username
vs_endpoint_prefix = "vs_endpoint_"
vs_endpoint_fallback = "vs_endpoint_genai_as"
vs_endpoint_name = vs_endpoint_prefix+str(get_fixed_integer(DA.unique_name("_")))
print(f"Vector Endpoint name: {vs_endpoint_name}. In case of any issues, replace variable 'vs_endpoint_name' with 'vs_endpoint_fallback' in demos and labs.")

In [None]:
from pyspark.sql import functions as F

vs_index_table_fullname = f"{DA.catalog_name}.{DA.schema_name}.dais_embeddings"
source_table_fullname = f"{DA.catalog_name}.{DA.schema_name}.dais_text"

# Load the dataset and add an id column.
df = spark.read.parquet(f"{DA.paths.datasets}/dais/dais23_talks.parquet")
df = df.withColumn("id", F.monotonically_increasing_id())
# df = df.withColumn("embedding", get_embedding("Abstract"))
df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(source_table_fullname)

# Enable Change Data Feed on the source table.
spark.sql(f"ALTER TABLE {source_table_fullname} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

# Store embeddings in the vector store.
create_vs_index(vs_endpoint_name, vs_index_table_fullname, source_table_fullname, "Title")

In [None]:
# Build first chain
from langchain_community.chat_models import ChatDatabricks
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.tools import YouTubeSearchTool
from databricks.vector_search.client import VectorSearchClient
from langchain.schema.runnable import RunnablePassthrough

llm_dbrx = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=1000)
tool_yt = YouTubeSearchTool()

prompt_template_1 = PromptTemplate.from_template(
    """You are a Databricks expert. You will get questions about Databricks. Try to give simple answers and be professional.
    Question: {question}
    Answer:
    """
)
# The chain input is passed through unchanged as the prompt's {question} value.
chain1 = ({"question": RunnablePassthrough()} | prompt_template_1 | llm_dbrx | StrOutputParser())
print(chain1.invoke({"question":"How machine learning models are stored in Unity Catalog?"}))

In [None]:
# Build second chain
from langchain_community.vectorstores import DatabricksVectorSearch
vsc = VectorSearchClient()
dais_index = vsc.get_index(vs_endpoint_name, vs_index_table_fullname)
query = "how do I use DatabricksSQL"

# Find the most similar entry for the query and search YouTube with its content.
dvs_delta_sync = DatabricksVectorSearch(dais_index)
docs = dvs_delta_sync.similarity_search(query)
videos = tool_yt.run(docs[0].page_content)
prompt_template_2 = PromptTemplate.from_template(
    """You will get a list of videos related to the user's question which are recorded in DAIS-2023. Encourage the user to watch the videos.
    List videos with their YouTube links.
    
    List of videos: {videos}
    """
)
chain2 = ({"videos": RunnablePassthrough()} | prompt_template_2 | llm_dbrx | StrOutputParser())

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter

# Combine both chains: the dict step runs chain1 under key "c" and chain2 under key "d";
# its output is then piped into RunnablePassthrough.assign(d=chain2).
# NOTE(review): "d" is assigned from chain2 in both steps — confirm the second step is intended.
multi_chain = ({
    "c": chain1,
    "d": chain2
}| RunnablePassthrough.assign(d=chain2))
# Invoke with the question and the `videos` value computed in an earlier cell.
multi_chain.invoke({"question": "How machine learning models are stored in Unity Catalog?", "videos": videos})

In [None]:
# Define the brain of the agent
from langchain_community.chat_models import ChatDatabricks
# Play with max_tokens to define the length of the response.
llm_dbrx = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=500)

In [None]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

from langchain_community.tools import YouTubeSearchTool

from langchain.agents import Tool
from langchain_experimental.utilities import PythonREPL

from langchain_community.tools import DuckDuckGoSearchRun
# Wikipedia tool for information retrieval (one result, truncated to 100 characters).
api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=100)
tool_wiki = WikipediaQueryRun(api_wrapper=api_wrapper)

# Tool to search YouTube videos.
tool_youtube = YouTubeSearchTool()

# Web search tool.
search = DuckDuckGoSearchRun()

# Tool to write and run Python code.
python_repl = PythonREPL()
repl_tool = Tool(
    name="python_repl",
    description=(
        "A Python shell. Use this to execute python commands. Input should be a valid python command. "
        "If you want to see the output of a value, you should print it out with `print(...)`."
    ),
    func=python_repl.run,
)

# Toolset given to the agent.
tools = [tool_wiki, tool_youtube, search, repl_tool]

In [None]:
# Define planning logic
from langchain.prompts import PromptTemplate
# Prompt that lays out the Thought / Action / Action Input / Observation format for the agent.
# Template variables: {tools}, {tool_names}, {input} and {agent_scratchpad}.
template='''Answer the following questions as best you can. You have access to the following tools:
{tools}
Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought: {agent_scratchpad}'''
prompt= PromptTemplate.from_template(template)

In [None]:
from langchain.agents import AgentExecutor
from langchain.agents.react.agent import create_react_agent

# Build the ReAct agent from the LLM, the tools and the prompt, then wrap it in an executor.
agent = create_react_agent(llm_dbrx, tools, prompt)
brixo = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)
brixo.invoke({"input":
    """What would be a nice movie to watch in rainy weather. Follow these steps.
    
    First, decide which movie you would recommend.
    
    Second, show me the trailler video of the movie that you suggest.
    
    Next, collect data about the movie using search tool and draw a bar chart using Python libraries. If you can't find latest data use some dummy data as we to show your abilities to the learners. Don't use for python code. Input should be sanitized by removing any leading or trailing backticks. if the input starts with "python", remove that word as well. The output must be the result of executed code.
    
    Finally, tell a funny joke about agents.
    """})

In [None]:
from datasets import load_dataset
# Load the dataset and convert its "train" split to a pandas DataFrame.
# NOTE: this reuses the name `df`, replacing any earlier DataFrame bound to it.
dataset = load_dataset("maharshipandya/spotify-tracks-dataset")
df = dataset['train'].to_pandas()

In [None]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

from langchain_community.chat_models import ChatDatabricks

llm_dbrx = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=500)

prefix = """Input should be sanitized by removing any leading or trailing backticks. if the input starts with "python", remove that word as well. Use the dataset provided. The output must start with a new line."""

# Agent that answers questions about `df`.
dataqio = create_pandas_dataframe_agent(
    llm_dbrx,
    df,
    verbose=True,
    max_iterations=3,
    prefix=prefix,
    agent_executor_kwargs={
        "handle_parsing_errors": True
    },
)

In [None]:
# Ask the DataFrame agent a question about the data.
dataqio.invoke("what is the artist name of most populat country song?")

In [None]:
# Ask the DataFrame agent for the row count.
dataqio.invoke("what is the total number of rows?")