In [1]:
from llama_hub.tools.waii import WaiiToolSpec

waii_tool = WaiiToolSpec(
    url="https://tweakit.waii.ai/api/",
    # API Key of Waii (not OpenAI API key)
    api_key="3a...",
    # Which database you want to use, you need add the db connection to Waii first
    database_key="snowflake://..."
)

In [2]:
from llama_index import VectorStoreIndex

# Use as Data Loader, load data to index and query it
documents = waii_tool.load_data('Get all tables with their number of columns')
index = VectorStoreIndex.from_documents(documents).as_query_engine()

index.query('Which table contains most columns, tell me top 5 tables with number of columns?').response

"The table 'TABLES' contains the most columns. The top 5 tables with the number of columns are 'TABLES' with 25 columns, followed by 'SHOW' with 5 columns."

In [3]:
# Use as tool, initialize it
from llama_index.agent import OpenAIAgent
from llama_index.llms import OpenAI

agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=True)

In [4]:
# Ask simple questions
print(agent.chat("Give me top 3 countries with the most number of car factory"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: get_answer with args: {"ask":"What are the top 3 countries with the highest number of car factories?"}
Got output: Japan, Germany, and the USA.

STARTING TURN 2
---------------

The top 3 countries with the highest number of car factories are Japan, Germany, and the USA.


In [5]:
print(agent.chat("What are the car factories of these countries"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: get_answer with args: {"ask": "What are the car factories in Japan?"}
Got output: Nissan Motors, Honda, Mazda, Subaru, and Toyota are car factories in Japan.

=== Calling Function ===
Calling function: get_answer with args: {"ask": "What are the car factories in Germany?"}
Got output: Volkswagen, BMW, Daimler Benz, and Opel are car factories in Germany.

=== Calling Function ===
Calling function: get_answer with args: {"ask": "What are the car factories in the USA?"}
Got output: amc, gm, ford, and chrysler are car factories in the USA.

STARTING TURN 2
---------------

Here are some of the car factories in the top 3 countries with the most number of car factories:

- In Japan: Nissan Motors, Honda, Mazda, Subaru, and Toyota.
- In Germany: Volkswagen, BMW, Daimler Benz, and Opel.
- In the USA: AMC, GM, Ford, and Chrysler.


In [6]:
# Do performance analysis
print(agent.chat("Give me top 3 longest running queries, and their duration."))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: get_answer with args: {"ask":"What are the top 3 longest running queries and their durations?"}
Got output: The top 3 longest running queries and their durations are as follows:
1. Query ID: 01b08971-0001-e7a0-0022-ba8700c28122, Duration: 365143 milliseconds
2. Query ID: 01b08aca-0001-e830-0022-ba8700c2a09e, Duration: 190413 milliseconds
3. Query ID: 01b08ac4-0001-e7ed-0022-ba8700c2951a, Duration: 170837 milliseconds

STARTING TURN 2
---------------

The top 3 longest running queries and their respective durations are:

1. Query ID: 01b08971-0001-e7a0-0022-ba8700c28122, with a duration of 365,143 milliseconds.
2. Query ID: 01b08aca-0001-e830-0022-ba8700c2a09e, with a duration of 190,413 milliseconds.
3. Query ID: 01b08ac4-0001-e7ed-0022-ba8700c2951a, with a duration of 170,837 milliseconds.


In [7]:
print(agent.chat("analyze the 2nd-longest running query"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: performance_analyze with args: {"query_uuid":"01b08aca-0001-e830-0022-ba8700c2a09e"}
Got output: {
  "summary": [
    "The 'aggregate' operator with functions sum(sum(sum(sum_internal(ss.ss_sales_price, count(*))))) and grouping keys i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id is the most time-consuming, with an overall percentage of 0.31 of execution time and a significant amount of bytes spilled to local storage (4774297600 bytes).",
    "The 'windowfunction' operator with function rank() over (partition by i.i_category order by sum(sum(sum(sum_internal(ss.ss_sales_price, count(*))))) desc nulls first) also contributes to the high execution time with an overall percentage of 0.2 and a large amount of bytes spilled to local storage (15166328832 bytes).",
    "The 'tablescan' operator on the store_sales table is scanning a large amount of data (3514269696 bytes

In [8]:
# Diff two queries
previous_query = """
SELECT
    employee_id,
    department,
    salary,
    AVG(salary) OVER (PARTITION BY department) AS department_avg_salary,
    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg
FROM
    employees;
"""
current_query = """
SELECT
    employee_id,
    department,
    salary,
    MAX(salary) OVER (PARTITION BY department) AS department_max_salary,
    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg
FROM
    employees;
LIMIT 100;
"""
print(agent.chat(f"tell me difference between {previous_query} and {current_query}"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: diff_query with args: {"previous_query":"SELECT\n employee_id,\n department,\n salary,\n AVG(salary) OVER (PARTITION BY department) AS department_avg_salary,\n salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\nFROM\n employees;","current_query":"SELECT\n employee_id,\n department,\n salary,\n MAX(salary) OVER (PARTITION BY department) AS department_max_salary,\n salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\nFROM\n employees;\nLIMIT 100;"}
Got output: The query retrieves the employee ID, department, salary, maximum salary within the department, and the difference between the employee's salary and the average salary within the department. The result set is limited to 100 rows.

STARTING TURN 2
---------------

The difference between the two queries is as follows:

1. In the first query, the fourth column calculates the average salary for each department (`department_avg

In [9]:
# Describe dataset
print(agent.chat("Summarize the dataset"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: describe_dataset with args: {"ask":"Can you summarize the dataset for me?"}
Got output: The dataset consists of multiple schemas, each containing tables related to different domains. The schemas include "FLIGHT" which contains tables related to airlines, airports, and flights, "STUDENT_TRANSCRIPTS_TRACKING" which contains tables related to student information, courses, degree programs, departments, semesters, and transcripts, "WORLD" which contains tables related to cities, countries, and languages, "INFORMATION_SCHEMA" which provides information about the database objects and metadata in the WAII database, "EMPLOYEE_HIRE_EVALUATION" which contains tables related to employee information, evaluations, hiring evaluations, and shop details, and "PETS" which contains tables related to pet ownership and student information.

Each schema has a summary describing its purpose and the type of information it contains. Ad

In [11]:
q = """
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, lag, lead, round
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("yearly_car_analysis").getOrCreate()

yearly_avg_hp = cars_data.groupBy("year").agg(avg("horsepower").alias("avg_horsepower"))

windowSpec = Window.orderBy("year")

yearly_comparisons = yearly_avg_hp.select(
    "year",
    "avg_horsepower",
    lag("avg_horsepower").over(windowSpec).alias("prev_year_hp"),
    lead("avg_horsepower").over(windowSpec).alias("next_year_hp")
)

final_result = yearly_comparisons.select(
    "year",
    "avg_horsepower",
    round(
        (yearly_comparisons.avg_horsepower - yearly_comparisons.prev_year_hp) / 
        yearly_comparisons.prev_year_hp * 100, 2
    ).alias("percentage_diff_prev_year"),
    round(
        (yearly_comparisons.next_year_hp - yearly_comparisons.avg_horsepower) / 
        yearly_comparisons.avg_horsepower * 100, 2
    ).alias("percentage_diff_next_year")
).orderBy("year")

final_result.show()
"""
print(agent.chat(f"translate this pyspark query {q}, to Snowflake"))

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: transcode with args: {"instruction":"Translate this PySpark query to Snowflake SQL.","source_dialect":"pyspark","source_query":"from pyspark.sql import SparkSession\nfrom pyspark.sql.functions import avg, lag, lead, round\nfrom pyspark.sql.window import Window\n\nspark = SparkSession.builder.appName(\"yearly_car_analysis\").getOrCreate()\n\nyearly_avg_hp = cars_data.groupBy(\"year\").agg(avg(\"horsepower\").alias(\"avg_horsepower\"))\n\nwindowSpec = Window.orderBy(\"year\")\n\nyearly_comparisons = yearly_avg_hp.select(\n    \"year\",\n    \"avg_horsepower\",\n    lag(\"avg_horsepower\").over(windowSpec).alias(\"prev_year_hp\"),\n    lead(\"avg_horsepower\").over(windowSpec).alias(\"next_year_hp\")\n)\n\nfinal_result = yearly_comparisons.select(\n    \"year\",\n    \"avg_horsepower\",\n    round(\n        (yearly_comparisons.avg_horsepower - yearly_comparisons.prev_year_hp) / \n        yearly_comparisons.prev_ye