## SQL to NL to SQL architecture. 

In many cases, a company already has many analytic assets, such as SQL queries, stored procedures, OLAP cubes, and others.

SQL is a valuable tool, but it can be challenging to use in a reusable way because it is not always clear what the query is doing.

For example, 

If someone wants to know 'Which departments perform worse than the average?'

The goal is very simple, but the SQL that implements it is very complex.

There is a chasm between the business goal and the specific SQL implementation.

In this case, a pre-defined complex SQL query can help solve the problem, so we need a way to retrieve the appropriate query from the many pre-defined ones.

In [1]:
# TODO: Implement this function
import pandas as pd

def crawl_prefined_sqls():
  """Return the catalogue of pre-defined SQL assets.

  The catalogue is currently hard-coded and contains a single BigQuery
  script. A new list is built on every call.

  Returns:
    A list of SQL strings.
  """
  # goal = "SQL to check the delay time(second) before shipping for each order and product"
  shipping_delay_sql = """
CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP)
RETURNS INT64
AS (
  (EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 +
  EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 +
  EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 +
  EXTRACT(SECOND FROM (end_ts - start_ts)))
);
select convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, 
  convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, 
  a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, 
  b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at
 from `bigquery-public-data.thelook_ecommerce.orders` a
 join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id)
 where a.status not in ('Cancelled') 
   and a.created_at between '2022-01-01' and '2022-06-30'
order by a.order_id, b.product_id
  """
  return [shipping_delay_sql]


In [2]:
import vertexai
from langchain.chat_models import ChatVertexAI
from langchain.llms import VertexAI
import os

# The GCP project is read from the PROJECT_ID environment variable;
# os.getenv returns None when the variable is not set.
PROJECT_ID = os.getenv("PROJECT_ID")  # @param {type:"string"}
# Initialize the Vertex AI SDK for that project in the us-central1 region.
vertexai.init(project=PROJECT_ID, location="us-central1")

# Text model used for every prompt in this notebook.
llm_vertex = VertexAI(
    # Alternative model that was tried: model_name="text-bison@latest"
    model_name="text-bison-32k",
    max_output_tokens=8000,
    temperature=0,
    top_p=0.8,
    top_k=40,
)

# `llm` is the name the functions below call `.predict()` on.
llm = llm_vertex

In [3]:
import json

def parse_json_response(llm_json_response: str) -> "dict | list":
  """Extract and parse the JSON payload embedded in an LLM response.

  LLM answers often wrap the JSON in extra text or a Markdown code fence.
  This function takes the span from the first opening bracket (``{`` or
  ``[``, whichever comes first) to the last matching closing bracket and
  parses it.

  Args:
    llm_json_response: Raw text returned by the LLM.

  Returns:
    The decoded JSON value (a dict for an object, a list for an array).

  Raises:
    json.JSONDecodeError: If no JSON object/array is found or the extracted
      span is not valid JSON.
  """
  object_start = llm_json_response.find('{')
  array_start = llm_json_response.find('[')

  # Pick the bracket that appears first. `find` returns -1 when a bracket is
  # absent, so a missing '{' must not be treated as "earlier than '['".
  if array_start == -1 or (object_start != -1 and object_start < array_start):
    start_index, end_char = object_start, '}'
  else:
    start_index, end_char = array_start, ']'

  end_index = llm_json_response.rfind(end_char)
  if start_index == -1 or end_index < start_index:
    raise json.JSONDecodeError(
      "No JSON object or array found in LLM response", llm_json_response, 0)

  return json.loads(llm_json_response[start_index:end_index + 1])


In [17]:

def explain_sql(sql):
  """Ask the LLM to describe a SQL statement and turn it into a prepared statement.

  The prompt shows the LLM an example of the expected JSON (business goal,
  prepared SQL with ``?`` placeholders, and the list of extracted filters),
  followed by the target SQL. The raw LLM answer is printed and then parsed
  with ``parse_json_response``.

  Args:
    sql: The SQL text to explain.

  Returns:
    The parsed JSON answer of the LLM.
  """
  # The example must itself be valid JSON (no trailing commas); otherwise the
  # LLM tends to copy the mistake and the answer cannot be parsed.
  example_json = """
  {
    "business goal": "explanation of the SQL",
    "prepared sql": "select convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second from sample_table where created_at between ? and ?",
    "filters": [
      {
        "column": "created_at",
        "business goal": "filter by order created date",
        "operator": "between",
        "mandatory": true,
        "type" : "TIMESTAMP",
        "filter names": ["order_created_at_start","order_created_at_end"]
      }
    ]
  }
  """
  prompt_template = """You are a serverside developer. 
  Please convert SQL to prepared statement and extract filters from the SQL in a JSON format.

  output format json:
  {example_json}

  ----------------------------

  target sql:
  {sql}
  """
  # Braces inside `sql` / `example_json` are safe: they are substituted values,
  # not part of the template that str.format interprets.
  prompt = prompt_template.format(sql=sql, example_json=example_json)
  response = llm.predict(prompt)
  print(response)
  return parse_json_response(response)

In [20]:

# Explained ("assetized") form of every pre-defined SQL: one parsed LLM
# answer per query returned by crawl_prefined_sqls().
assetized_queries = []

for sql in crawl_prefined_sqls():
  # One LLM call per SQL; explain_sql also prints the raw response.
  assetized_queries.append(explain_sql(sql))
  

 ```python
{
    "business goal": "This SQL query calculates the order pending time and product pending time for each order, and then joins the order table with the order items table to get the order details. Finally, it filters the results by order status and created date.",
    "prepared sql": "CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP)\nRETURNS INT64\nAS (\n  (EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 +\n  EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 +\n  EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 +\n  EXTRACT(SECOND FROM (end_ts - start_ts)))\n);\nselect convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, \n  convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, \n  a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, \n  b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at\n from `bigquery-public-data.thelook_ecommerce.orders` a\n join `bigquery-pu

In [21]:
from vector_util import VectorDatabase

# VectorDatabase comes from the project-local vector_util module (not shown here).
vdb = VectorDatabase()

# NOTE(review): presumably removes previously stored records so the demo
# starts from an empty table — confirm in vector_util before running
# against shared data.
vdb.truncate_table()

In [22]:
from langchain.embeddings import VertexAIEmbeddings

# Embedding client used below to vectorize SQL descriptions and user questions.
embeddings = VertexAIEmbeddings()

In [23]:
def write_goal_to_vdb(assetized_queries):
  """Store each explained SQL in the vector database.

  For every entry, the 'business goal' text is embedded and written together
  with the 'prepared sql' and 'filters' values through ``vdb.insert_record``.

  Args:
    assetized_queries: Iterable of dicts with the keys 'business goal',
      'prepared sql' and 'filters'.
  """
  for entry in assetized_queries:
    goal_text = entry['business goal']
    prepared_sql = entry['prepared sql']
    filter_specs = entry['filters']
    vdb.insert_record(
      sql=prepared_sql,
      parameters=filter_specs,
      description=goal_text,
      explore_view=None,
      model_name=None,
      table_name=None,
      column_schema=None,
      # The description is the text that later questions are matched against.
      desc_vector=embeddings.embed_query(goal_text),
    )

# Persist every explained SQL collected in assetized_queries.
write_goal_to_vdb(assetized_queries)

In [26]:
def get_related_query(question, min_similarity=0.6):
  """Look up the stored SQL whose description best matches a question.

  The question is embedded and compared with the ``desc_vector`` column of
  ``rag_test``. Only rows whose similarity ``1 - (desc_vector <=> q)`` exceeds
  ``min_similarity`` qualify, and the closest one is returned.

  Args:
    question: Natural-language question from the user.
    min_similarity: Lower bound (exclusive) for the similarity score.
      Defaults to 0.6.

  Returns:
    A list with at most one ``(sql, parameters, description)`` row. The list
    is empty when nothing qualifies or when the lookup fails; failures are
    printed, not raised.
  """
  question_vector = embeddings.embed_query(question)
  # Text form "[v1,v2,...]" of the embedding, passed as a bound parameter.
  vector_literal = str(question_vector).replace(' ', '')
  results = []
  with vdb.get_connection() as conn:
    try:
      with conn.cursor() as cur:
        # NOTE(review): assumes pgvector, where `<=>` is cosine distance.
        # Without ORDER BY, LIMIT 1 would return an arbitrary qualifying row
        # instead of the nearest one.
        cur.execute(
          "SELECT sql, parameters, description FROM rag_test "
          "WHERE (1 - (desc_vector <=> %s)) > %s "
          "ORDER BY desc_vector <=> %s LIMIT 1",
          (vector_literal, min_similarity, vector_literal),
        )
        results = cur.fetchall()
        print(results)
    except Exception as e:
      # Best-effort lookup: report the problem and fall through to the
      # empty result so the notebook keeps running.
      print(e)
  return results


In [28]:
question = "How to check the delay time(second) before shipping for each order and product?"

# get_related_query returns a list of (sql, parameters, description) rows;
# indexing with [0] raises IndexError when no stored query matches.
assetized_query = get_related_query(question)[0]
# Show the retrieved row (the original printed `query`, a name that is never defined).
print(assetized_query)


[('CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP)\nRETURNS INT64\nAS (\n  (EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 +\n  EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 +\n  EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 +\n  EXTRACT(SECOND FROM (end_ts - start_ts)))\n);\nselect convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, \n  convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, \n  a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, \n  b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at\n from `bigquery-public-data.thelook_ecommerce.orders` a\n join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id)\n where a.status not in (?) \n   and a.created_at between ? and ?\norder by a.order_id, b.product_id', "[{'column': 'status', 'business goal': 'Filter by order status', 'operator': 'not in', 'mandatory': True, 'type': 'STRI

In [127]:

def extract_filter_values(assetized_query, question):
  """Ask the LLM to pull filter values for a stored SQL out of a question.

  Args:
    assetized_query: Indexable record whose element 0 is the SQL text and
      element 1 is the filter description.
    question: Natural-language question from the user.

  Returns:
    The raw LLM response text (not parsed).
  """
  output_example = """
  {
    "parameters": [
      {
        "table_name": "sample_table",
        "column": "col_1",
        "operator" : "in", 
        "type" : "STRING",
        "filter_names": ["col_1_filter"],
        "extracted_values": [null]
      },
      {
        "table_name": "sample_table",
        "column": "col_2",
        "operator" : "=", 
        "type" : "STRING",
        "filter_names": ["col_2_filter"],
        "extracted_values": ["2022-01"]
      }
    ]
  }
  """

  instructions = """You are a serverside developer. Extract filter values from the question and sql. If you can't find filter values from the question, check whether there is any missing filter values in the question. Do not suggest codes. Output should be the following JSON format.

  output format json:
  {example_json}

  ----------------------------
  given question:
  {question}
  
  given sql:
  {sql}

  given filters:
  {filters}
  """
  # Values substituted into the template, keyed by placeholder name.
  template_fields = {
    'sql': assetized_query[0],
    'filters': assetized_query[1],
    'example_json': output_example,
    'question': question,
  }
  return llm.predict(instructions.format(**template_fields))


In [128]:
# Same question as before, now carrying a time constraint (the year 2023).
question_enriched = "How to check the delay time(second) before shipping for each order and product in this year(2023)?"

# Raw LLM text; it is parsed into a dict in a later cell.
response = extract_filter_values(assetized_query, question_enriched)

In [129]:
response

' ```json\n  {\n    "parameters": [\n      {\n        "table_name": "bigquery-public-data.thelook_ecommerce.orders",\n        "column": "status",\n        "operator" : "not in", \n        "type" : "STRING",\n        "filter_names": ["order_status"],\n        "extracted_values": [null]\n      },\n      {\n        "table_name": "bigquery-public-data.thelook_ecommerce.orders",\n        "column": "created_at",\n        "operator" : "between", \n        "type" : "TIMESTAMP",\n        "filter_names": ["order_created_at_start", "order_created_at_end"],\n        "extracted_values": ["2023-01-01", "2023-12-31"]\n      }\n    ]\n  }\n```'

In [130]:
# Strip the surrounding text / code fence and decode the JSON payload.
filter_values = parse_json_response(response)

In [131]:
filter_values['parameters']

[{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
  'column': 'status',
  'operator': 'not in',
  'type': 'STRING',
  'filter_names': ['order_status'],
  'extracted_values': [None]},
 {'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
  'column': 'created_at',
  'operator': 'between',
  'type': 'TIMESTAMP',
  'filter_names': ['order_created_at_start', 'order_created_at_end'],
  'extracted_values': ['2023-01-01', '2023-12-31']}]

In [132]:

def check_unfilled_filters(filter_values):
  """Return the parameters that still lack a value.

  A parameter counts as unfilled when ``None`` appears anywhere in its
  'extracted_values' list.

  Args:
    filter_values: Dict with a 'parameters' list; each parameter is a dict
      with an 'extracted_values' list.

  Returns:
    A new list holding the unfilled parameter dicts, in their original order.
  """
  return [
    parameter
    for parameter in filter_values['parameters']
    if None in parameter['extracted_values']
  ]



In [133]:
# Parameters for which the LLM could not find a value in the question.
unfilled_filters = check_unfilled_filters(filter_values)

In [134]:
unfilled_filters

[{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
  'column': 'status',
  'operator': 'not in',
  'type': 'STRING',
  'filter_names': ['order_status'],
  'extracted_values': [None]}]

In [135]:

def make_response_to_fill_filter_values(unfilled_filters, question):
  """Ask the LLM to phrase a follow-up that requests the missing filter values.

  Args:
    unfilled_filters: Filter descriptions that still lack values; inserted
      into the prompt via string formatting.
    question: The user's original question.

  Returns:
    The raw LLM response text (not parsed).
  """
  output_example = """
  {
    "agent_response": "..."
  }
  """
  instructions = """You are an automatic agent to serve natural language to SQL conversion. Please guide the user to fill the missing filter values in the given question and unfilled filters. Output should be the following JSON format.

  question:
  {question}

  unfilled filters:
  {unfilled_filters}

  output format json:
  {example_json}
  """
  # Values substituted into the template, keyed by placeholder name.
  template_fields = {
    'unfilled_filters': unfilled_filters,
    'question': question,
    'example_json': output_example,
  }
  return llm.predict(instructions.format(**template_fields))

In [136]:
# Follow-up text (raw LLM output) asking the user for the missing filter values.
response = make_response_to_fill_filter_values(unfilled_filters, question=question_enriched)


In [137]:
response

' {\n    "agent_response": "What is the order status that you want to exclude?"\n  }'

In [138]:
# Simulated user reply to the agent's follow-up question.
additional_response = "I want to know not cancelled orders only"

In [139]:

def extract_filter_values_with_additional_response(unfilled_filters, additional_response):
  """Ask the LLM to fill the missing filters from the user's follow-up reply.

  Args:
    unfilled_filters: Filter descriptions that still lack values; inserted
      into the prompt via string formatting.
    additional_response: The user's reply to the follow-up question.

  Returns:
    The raw LLM response text (not parsed).
  """
  output_example = """
  {
    "parameters": [
      {
        "column": "col_1",
        "operator" : "in", 
        "type" : "STRING",
        "filter_names": ["col_1_filter"],
        "extracted_values": [null]
      }
    ]
  }
  """

  instructions = """You are a serverside developer. Extract filter values from the user response and unfilled filter information. Do not suggest codes. Output should be the following JSON format.

  output format json:
  {example_json}

  ----------------------------
  user response :
  {user_response}

  unfilled filters:
  {unfilled_filters}
  """
  # Values substituted into the template, keyed by placeholder name.
  template_fields = {
    'unfilled_filters': unfilled_filters,
    'user_response': additional_response,
    'example_json': output_example,
  }
  return llm.predict(instructions.format(**template_fields))

In [140]:
# Extract the missing values from the user's reply and decode the LLM's JSON answer.
additional_filter_values = parse_json_response(extract_filter_values_with_additional_response(unfilled_filters, additional_response))

In [141]:
additional_filter_values

{'parameters': [{'column': 'status',
   'operator': 'not in',
   'type': 'STRING',
   'filter_names': ['order_status'],
   'extracted_values': ['cancelled']}]}

In [142]:
def merge_filter_values(filter_values, additional_filter_values):
  """Copy newly extracted values onto the original filter parameters.

  Parameters are matched by their 'column' value. When several additional
  parameters share a column, the last one wins. ``filter_values`` is
  modified in place.

  Args:
    filter_values: Dict with a 'parameters' list to be updated.
    additional_filter_values: Dict with a 'parameters' list supplying the
      new 'extracted_values'.

  Returns:
    The same ``filter_values`` object that was passed in.
  """
  targets = filter_values['parameters']
  updates = additional_filter_values['parameters']
  for target in targets:
    for update in updates:
      if target['column'] != update['column']:
        continue
      target['extracted_values'] = update['extracted_values']
  return filter_values


In [143]:
# merge_filter_values updates filter_values in place and returns that same
# object, so merged_filter_values and filter_values refer to the same dict.
merged_filter_values = merge_filter_values(filter_values, additional_filter_values)

In [144]:
merged_filter_values

{'parameters': [{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column': 'status',
   'operator': 'not in',
   'type': 'STRING',
   'filter_names': ['order_status'],
   'extracted_values': ['cancelled']},
  {'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column': 'created_at',
   'operator': 'between',
   'type': 'TIMESTAMP',
   'filter_names': ['order_created_at_start', 'order_created_at_end'],
   'extracted_values': ['2023-01-01', '2023-12-31']}]}

As you can see, an LLM can handle extraction, transformation, and validation as well.

The next step is very similar to 'Direction Conversion'.