### Demo how to use the LLM with DuckDB

In [1]:
import duckdb

duckdb.__version__

'0.9.0'

In [13]:
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.agents import AgentExecutor
from langchain.agents.agent_types import AgentType
from langchain import HuggingFaceHub, OpenAI
from langchain import PromptTemplate, LLMChain
from sqlalchemy import Column, Integer, Sequence, String, create_engine

from langchain.llms import GPT4All
from langchain import PromptTemplate, LLMChain

from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# n_gpu_layers = 1  # Metal set to 1 is enough.
# n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
# local_llm = LlamaCpp(
#     model_path="/home/wei/Developments/llms/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin",
#     n_gpu_layers=n_gpu_layers,
#     n_batch=n_batch,
#     n_ctx=4096,
#     f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
#     callback_manager=callback_manager,
#     temperature=0,
#     verbose=True,
# )

# load_dotenv()
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

local_path = ("/home/wei/Developments/llms/llama.cpp/models/starcoderbase-3b-ggml.bin")

# initialize the LLM and make chain it with the prompts

local_llm = GPT4All(
    model=local_path, 
    backend="llama", 
)

uri = 'duckdb:///db/cdn_open_data.db'

connect_args = {
        'read_only': True
    }

CONN = create_engine(uri)

db = SQLDatabase.from_uri(
    uri,
    include_tables=['inventory'], 
	sample_rows_in_table_info=3)

toolkit = SQLDatabaseToolkit(db=db, llm=local_llm)

agent_executor = create_sql_agent(
    llm=local_llm,
    toolkit=toolkit,
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)

template = """/
You are a SQL Analyst that is querying a database of Canada Open Data Inventory that about all the Canada Open Data from the Government of Canada website.

Below is a description of the columns, data types, and information in the columns:

The column name ref_number with the data type VARCHAR contains the following information: Unique identifier for every open data
The column name title_en with the data type VARCHAR contains the following information: English title of the open data
The column name title_fr with the data type VARCHAR contains the following information: French title of the open data
The column name description_en with the data type VARCHAR contains the following information: English description of the open data
The column name description_fr with the data type VARCHAR contains the following information: French description of the open data
The column name publisher_en with the data type VARCHAR contains the following information: Publisher name in English
The column name publisher_fr with the data type VARCHAR contains the following information: Publisher name in French
The column name date_published with the data type VARCHAR contains the following information: The date of this open data published
The column name language with the data type VARCHAR contains the following information: What language this open data
The column name size with the data type BIGINT contains the following information: The open data size
The column name program_alignment_architecture_en with the data type VARCHAR contains the following information: English name of the program alignment architecture
The column name program_alignment_architecture_fr with the data type VARCHAR contains the following information: French name of the program alignment architecture
The column name date_released with the data type VARCHAR contains the following information: The date this open data released
The column name portal_url_en with the data type VARCHAR contains the following information: English portal url 
The column name portal_url_fr with the data type VARCHAR contains the following information: French portal url 
The column name user_votes with the data type BIGINT contains the following information: The users votes count for this open data, which showing which open data are most request by the user
The column name owner_org with the data type VARCHAR contains the following information: Which org divison are the owner of this open data
The column name owner_org_title with the data type VARCHAR contains the following information: The org division title of the owner of this open data

Your job is to write an execute a query that answers the following question:
{query}
"""

prompt = PromptTemplate.from_template(template)

agent_executor.run(
    prompt.format(query = "Total records of inventory table?")
)


Found model file at  /home/wei/Developments/llms/llama.cpp/models/ggml-model-gpt4all-falcon-q4_0.bin
falcon_model_load: loading model from '/home/wei/Developments/llms/llama.cpp/models/ggml-model-gpt4all-falcon-q4_0.bin' - please wait ...
falcon_model_load: n_vocab   = 65024
falcon_model_load: n_embd    = 4544
falcon_model_load: n_head    = 71
falcon_model_load: n_head_kv = 1
falcon_model_load: n_layer   = 32
falcon_model_load: ftype     = 2
falcon_model_load: qntvr     = 0
falcon_model_load: ggml ctx size = 3872.64 MB
falcon_model_load: memory_size =    32.00 MB, n_mem = 65536
falcon_model_load: ........................ done
falcon_model_load: model size =  3872.59 MB / num tensors = 196






[1m> Entering new AgentExecutor chain...[0m


OutputParserException: Could not parse LLM output: `Action: Look at the tables in the database`

In [None]:
agent_executor.run(
    prompt.format(query = "What's most released owner_org title?")
)



[1m> Entering new AgentExecutor chain...[0m


Llama.generate: prefix-match hit


Action: sql_db_schema
... (this Thought/Action/Action Input/Observation can repeat N times)



llama_print_timings:        load time = 115236.72 ms
llama_print_timings:      sample time =    44.38 ms /    33 runs   (    1.34 ms per token,   743.54 tokens per second)
llama_print_timings: prompt eval time =  9178.05 ms /    40 tokens (  229.45 ms per token,     4.36 tokens per second)
llama_print_timings:        eval time = 11102.14 ms /    32 runs   (  346.94 ms per token,     2.88 tokens per second)
llama_print_timings:       total time = 20517.14 ms


OutputParserException: Could not parse LLM output: `Action: sql_db_schema
... (this Thought/Action/Action Input/Observation can repeat N times)
`

In [6]:
agent_executor.run(
    prompt.format(query = "what is the most voted open date between 2000 and 2023?")
)



[1m> Entering new AgentExecutor chain...[0m


Llama.generate: prefix-match hit


Action: sql_db_schema
... (this Thought/Action/Action Input/Observation can repeat N times)



llama_print_timings:        load time = 115236.72 ms
llama_print_timings:      sample time =    44.55 ms /    33 runs   (    1.35 ms per token,   740.79 tokens per second)
llama_print_timings: prompt eval time = 12838.93 ms /    55 tokens (  233.44 ms per token,     4.28 tokens per second)
llama_print_timings:        eval time = 10637.32 ms /    32 runs   (  332.42 ms per token,     3.01 tokens per second)
llama_print_timings:       total time = 23701.60 ms


OutputParserException: Could not parse LLM output: `Action: sql_db_schema
... (this Thought/Action/Action Input/Observation can repeat N times)
`

In [8]:
agent_executor.run(
    prompt.format(query = "can you show me the top 10 open date in french title which have been published but never been released?")
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3minventory[0m
Thought:[32;1m[1;3m I should query the schema of the inventory table to see what columns I can use.
Action: sql_db_schema
Action Input: inventory[0m
Observation: [33;1m[1;3m
CREATE TABLE inventory (
	ref_number VARCHAR, 
	title_en VARCHAR, 
	title_fr VARCHAR, 
	description_en VARCHAR, 
	description_fr VARCHAR, 
	publisher_en VARCHAR, 
	publisher_fr VARCHAR, 
	date_published VARCHAR, 
	language VARCHAR, 
	size BIGINT, 
	eligible_for_release VARCHAR, 
	program_alignment_architecture_en VARCHAR, 
	program_alignment_architecture_fr VARCHAR, 
	date_released VARCHAR, 
	portal_url_en VARCHAR, 
	portal_url_fr VARCHAR, 
	user_votes BIGINT, 
	owner_org VARCHAR, 
	owner_org_title VARCHAR
)

/*
3 rows from inventory table:
ref_number	title_en	title_fr	description_en	description_fr	publisher_en	publisher_fr	date_published	language	size	eligible_

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-8QRUoez80xE1pV7j8qlCobzS on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-davinci-003 in organization org-8QRUoez80xE1pV7j8qlCobzS on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/acco

[32;1m[1;3m I now know the final answer.
Final Answer: The top 10 open data in French title that have been published but never been released are: Rendu de décisions pour le comité sur la représentation à l'étranger (CORA), Requêtes de transport aux missions, Biens des missions et demandes de matériel, Requêtes d'informations financières et budgétaires des missions, Adhérence au standard de service de livraison pour les requêtes de services en ligne pour les missions (SLM), Niveau de satisfaction au service en ligne pour les missions, Liste des véhicules des services communs, SCDATA PSD 2023-2025 FINAL, Compte d’unités de logement résidentiel du ministère de la Défense nationale au Canada., and États financiers 21-22.[0m

[1m> Finished chain.[0m


"The top 10 open data in French title that have been published but never been released are: Rendu de décisions pour le comité sur la représentation à l'étranger (CORA), Requêtes de transport aux missions, Biens des missions et demandes de matériel, Requêtes d'informations financières et budgétaires des missions, Adhérence au standard de service de livraison pour les requêtes de services en ligne pour les missions (SLM), Niveau de satisfaction au service en ligne pour les missions, Liste des véhicules des services communs, SCDATA PSD 2023-2025 FINAL, Compte d’unités de logement résidentiel du ministère de la Défense nationale au Canada., and États financiers 21-22."