## SQL to NL to SQL architecture. 

In many cases, a company already has many analytic assets, such as SQL queries, stored procedures, OLAP cubes and others.

SQL is a valuable tool, but it can be challenging to use in a reusable way because it is not always clear what the query is doing.

For example, 

If someone wants to know 'Which departments perform worse than the average?'

The goal is very simple, but the SQL that implements it is very complex.

There is a chasm between the business goal and the specific SQL implementation.

In this case, pre-defined complex SQL can help to solve the problem. We then need a way to retrieve the appropriate SQL from the many pre-defined SQL queries.

In [1]:
#! pip install pandas google-cloud-aiplatform google-cloud-bigquery
#! pip install python-dotenv langchain
#! pip install langchain-google-vertexai

In [2]:
# TODO: Implement this function
import pandas as pd

def crawl_prefined_sqls():
  """Return the list of pre-defined SQL scripts that should be assetized.

  This is a stub: instead of crawling a real repository it returns a single
  hard-coded BigQuery script against the public `thelook_ecommerce` dataset.

  Returns:
    A new list containing one SQL string.
  """
  # goal = "SQL to check the delay time(second) before shipping for each order and product"
  shipping_delay_sql = """
CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP)
RETURNS INT64
AS (
  (EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 +
  EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 +
  EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 +
  EXTRACT(SECOND FROM (end_ts - start_ts)))
);
select convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, 
  convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, 
  a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, 
  b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at
 from `bigquery-public-data.thelook_ecommerce.orders` a
 join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id)
 where a.status not in ('Cancelled') 
   and a.created_at between '2022-01-01' and '2022-06-30'
order by a.order_id, b.product_id
  """
  return [shipping_delay_sql]


In [3]:
import vertexai
from langchain.chat_models import ChatVertexAI
from langchain_google_vertexai import VertexAI
import os
from dotenv import load_dotenv

# Load settings through python-dotenv, then read them from the process
# environment. os.getenv returns None for any variable that is not set, so a
# missing entry only surfaces later, when the value is used.
load_dotenv()

# LOCATION, DATASET and TABLE_NAME are passed to BigQueryVectorDatabase below.
PROJECT_ID=os.getenv("PROJECT_ID")  # @param {type:"string"}
LOCATION=os.getenv("LOCATION")
DATASET=os.getenv("DATASET")
TABLE_NAME=os.getenv("TABLE_NAME")



In [4]:
# NOTE(review): the Vertex AI region is hard-coded to "us-central1" and does
# not use the LOCATION environment variable — confirm this is intentional.
vertexai.init(project=PROJECT_ID, location="us-central1")

# Text model used for every prompt in this notebook (temperature 0).
llm_vertex = VertexAI(
    #model_name="text-bison@latest",
    model_name="text-bison-32k@002",
    max_output_tokens=8000,
    temperature=0,
    top_p=0.8,
    top_k=40,
)

# `llm` is the name the prompt helper functions call.
llm = llm_vertex

In [5]:
import json
import ast

def parse_json_response(llm_json_response) -> any:
  """Extract and parse the JSON payload embedded in an LLM text response.

  The payload is taken from the first '{' or '[' (whichever occurs first) up
  to the last matching closing character, so surrounding prose, leading
  whitespace or markdown code fences are ignored.

  Args:
    llm_json_response: Raw text returned by the LLM.

  Returns:
    The decoded JSON value (a dict for an object, a list for an array).

  Raises:
    json.JSONDecodeError: If the text contains no '{' or '[' at all, or the
      extracted span is not valid JSON.
  """
  object_start = llm_json_response.find('{')
  array_start = llm_json_response.find('[')
  if object_start == -1 and array_start == -1:
    raise json.JSONDecodeError('No JSON object or array found in LLM response', llm_json_response, 0)
  # Pick the delimiter that really appears first. Comparing against a clamped
  # index used to select '{' even when the response held only an array.
  if array_start == -1 or (object_start != -1 and object_start < array_start):
    start_index, end_char = object_start, '}'
  else:
    start_index, end_char = array_start, ']'
  end_index = llm_json_response.rfind(end_char)
  return json.loads(llm_json_response[start_index:end_index + 1])

def parse_python_object(llm_python_object) -> any:
  """Extract and evaluate the Python literal embedded in an LLM text response.

  The literal is taken from the first '{' or '[' (whichever occurs first) up
  to the last matching closing character and evaluated with
  ast.literal_eval, which only accepts literals and never executes code.
  The raw response and the extracted span are printed for debugging.

  Args:
    llm_python_object: Raw text returned by the LLM.

  Returns:
    The evaluated dict or list.

  Raises:
    SyntaxError, ValueError: If the extracted span is not a valid literal
      (including when the text contains neither '{' nor '[').
  """
  print('llm response:'+ llm_python_object)
  object_start = llm_python_object.find('{')
  array_start = llm_python_object.find('[')
  # A list that contains dicts ('[' before '{') used to leave the delimiters
  # unassigned and crash with UnboundLocalError; it now selects the list.
  if object_start != -1 and (array_start == -1 or object_start < array_start):
    start_char = '{'
    end_char = '}'
  else:
    start_char = '['
    end_char = ']'
  start_index = llm_python_object.find(start_char)
  end_index = llm_python_object.rfind(end_char)
  object_data = llm_python_object[start_index:end_index+1]
  print(object_data)
  return ast.literal_eval(object_data)



In [8]:

def explain_sql(sql):
  """Ask the LLM to turn a concrete SQL statement into a reusable asset.

  The prompt asks for a prepared statement (literals replaced by `?`), the
  predicate columns as `filter_columns`, and a business-goal description.
  The example output in the prompt is well-formed JSON whose keys
  (`filter_names`, `filter_order`) match the keys the retrieve stage of this
  notebook reads.

  Args:
    sql: The SQL text to describe.

  Returns:
    The parsed JSON value produced by parse_json_response from the model
    response.

  Raises:
    json.JSONDecodeError: If the model response contains no parseable JSON.
  """
  example_json = """
  {
    "business_goal": "explanation of the SQL",
    "prepared_statement": "select ... from .... where created_at between ? and ?",
    "filter_columns": [
      {
        "table_name": "project_id(optional).dataset_name.sample_table",
        "column_name": "created_at",
        "business_goal": "filter by order created date",
        "operator": "between",
        "column_type" : "TIMESTAMP",
        "filter_names": ["order_created_at_start","order_created_at_end"],
        "filter_order": 1
      }
    ]
  }
  """
  # The prepared-statement guidance lives in Step 1 rather than as a `//`
  # comment inside the example, because JSON has no comment syntax.
  prompt_template = """You are a server-side developer.
Please convert the provided SQL to a prepared_statement and extract the filters from the SQL in JSON format. The filter_columns should only include predicate columns. Do not suggest Python code. Describe the business goal based on the prepared_statement.
Step 1: Convert the provided SQL to a prepared_statement. The prepared_statement must be exactly the same as the given SQL (it can include temporary functions), except for parameter substitution.
Step 2: Extract the filters from the SQL in JSON format. The filter_columns should only include predicate columns. Do not suggest Python code.
Step 3: Describe the business goal based on the prepared_statement.

----------------------------
example sql :
select convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second from sample_table where created_at between '2023-01-01' and '2023-12-01'

example output json:
{example_json}

----------------------------
provided sql:
{sql}

output json:
"""
  prompt = prompt_template.format(sql=sql, example_json=example_json)
  response = llm.invoke(prompt)
  print(response)
  return parse_json_response(response)

In [9]:

# One explain_sql result (business goal, prepared statement, filter columns)
# per pre-defined SQL. Later cells read this list.
assetized_queries = []

## In some cases, the LLM model can't extract the filter columns properly.

for sql in crawl_prefined_sqls():
  assetized_queries.append(explain_sql(sql))
  

 ```json
{
  "business_goal": "This query aims to analyze order and product-related metrics for orders placed between January 1, 2022, and June 30, 2022, excluding canceled orders.",
  "prepared_statement": "CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP) RETURNS INT64 AS ((EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 + EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 + EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 + EXTRACT(SECOND FROM (end_ts - start_ts))));\nselect convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at from `bigquery-public-data.thelook_ecommerce.orders` a join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id) where a.status not in (?) and a.created_at between 

In [11]:
# from vector_util import VectorDatabase

# vdb = VectorDatabase()

# vdb.truncate_table()

from bigquery_vector_util import BigQueryVectorDatabase, SqlSearchSchema

# Vector store handle from the project-local bigquery_vector_util module,
# pointed at the project/location/dataset/table read from the environment.
vdb = BigQueryVectorDatabase(project_id=PROJECT_ID, location=LOCATION, dataset=DATASET, table_name=TABLE_NAME)
# NOTE(review): whether create_table() is safe to call on an existing table is
# not visible from this notebook — confirm in bigquery_vector_util.
vdb.create_table()

Empty DataFrame
Columns: []
Index: []
QueryJob<project=turnkey-charter-358922, location=asia-northeast3, id=69db7b1d-9bf5-463c-8577-c4cd42f080fa>


In [12]:
from langchain_google_vertexai import VertexAIEmbeddings

# Embedding model for business-goal descriptions. The name is spelled
# `embdding_model` here, and write_goal_to_vdb reads exactly that name.
embdding_model = VertexAIEmbeddings("textembedding-gecko-multilingual@latest")

In [13]:
def write_goal_to_vdb(assetized_queries):
  """Embed each asset's business goal and insert the assets into the vector DB.

  Args:
    assetized_queries: Iterable of dicts with the keys 'business_goal',
      'prepared_statement' and 'filter_columns' (the shape explain_sql
      asks the LLM to produce).

  All records are inserted with a single `vdb.insert_record` call; the filter
  columns are stored as a JSON string.
  """
  def to_record(asset):
    goal = asset['business_goal']
    statement = asset['prepared_statement']
    filters_json = json.dumps(asset['filter_columns'])
    return SqlSearchSchema(
      sql=statement,
      description=goal,
      desc_vector=embdding_model.embed_query(goal),
      parameters=filters_json,
      explore_view=None,
      model_name=None,
      table_name=None,
      column_schema=None,
    )

  vdb.insert_record([to_record(asset) for asset in assetized_queries])

# Ingest every assetized query produced above into the vector database.
write_goal_to_vdb(assetized_queries)


[]


## Retrieve Stage

After ingesting the predefined SQL queries into the vector database, we need to retrieve them in the right way.

In [14]:
import json
import ast

def parse_json_response(llm_json_response) -> any:
  """Extract and parse the JSON payload embedded in an LLM text response.

  The payload is taken from the first '{' or '[' (whichever occurs first) up
  to the last matching closing character, so surrounding prose, leading
  whitespace or markdown code fences are ignored.

  Args:
    llm_json_response: Raw text returned by the LLM.

  Returns:
    The decoded JSON value (a dict for an object, a list for an array).

  Raises:
    json.JSONDecodeError: If the text contains no '{' or '[' at all, or the
      extracted span is not valid JSON.
  """
  object_start = llm_json_response.find('{')
  array_start = llm_json_response.find('[')
  if object_start == -1 and array_start == -1:
    raise json.JSONDecodeError('No JSON object or array found in LLM response', llm_json_response, 0)
  # Pick the delimiter that really appears first. Comparing against a clamped
  # index used to select '{' even when the response held only an array.
  if array_start == -1 or (object_start != -1 and object_start < array_start):
    start_index, end_char = object_start, '}'
  else:
    start_index, end_char = array_start, ']'
  end_index = llm_json_response.rfind(end_char)
  return json.loads(llm_json_response[start_index:end_index + 1])

def parse_python_object(llm_python_object) -> any:
  """Extract and evaluate the Python literal embedded in an LLM text response.

  The literal is taken from the first '{' or '[' (whichever occurs first) up
  to the last matching closing character and evaluated with
  ast.literal_eval, which only accepts literals and never executes code.
  The raw response and the extracted span are printed for debugging.

  Args:
    llm_python_object: Raw text returned by the LLM.

  Returns:
    The evaluated dict or list.

  Raises:
    SyntaxError, ValueError: If the extracted span is not a valid literal
      (including when the text contains neither '{' nor '[').
  """
  print('llm response:'+ llm_python_object)
  object_start = llm_python_object.find('{')
  array_start = llm_python_object.find('[')
  # A list that contains dicts ('[' before '{') used to leave the delimiters
  # unassigned and crash with UnboundLocalError; it now selects the list.
  if object_start != -1 and (array_start == -1 or object_start < array_start):
    start_char = '{'
    end_char = '}'
  else:
    start_char = '['
    end_char = ']'
  start_index = llm_python_object.find(start_char)
  end_index = llm_python_object.rfind(end_char)
  object_data = llm_python_object[start_index:end_index+1]
  print(object_data)
  return ast.literal_eval(object_data)


In [15]:
import vertexai
from langchain.chat_models import ChatVertexAI
from langchain_google_vertexai import VertexAI
import os
from dotenv import load_dotenv

# Same environment loading as at the start of the notebook, repeated for the
# retrieve stage. os.getenv returns None for any variable that is not set.
load_dotenv()

# LOCATION, DATASET and TABLE_NAME are passed to BigQueryVectorDatabase below.
PROJECT_ID=os.getenv("PROJECT_ID")  # @param {type:"string"}
LOCATION=os.getenv("LOCATION")
DATASET=os.getenv("DATASET")
TABLE_NAME=os.getenv("TABLE_NAME")



In [16]:
# NOTE(review): the Vertex AI region is hard-coded to "us-central1" and does
# not use the LOCATION environment variable — confirm this is intentional.
vertexai.init(project=PROJECT_ID, location="us-central1")

# Text model used for every prompt in the retrieve stage (temperature 0).
llm_vertex = VertexAI(
    #model_name="text-bison@latest",
    model_name="text-bison-32k@002",
    max_output_tokens=8000,
    temperature=0,
    top_p=0.8,
    top_k=40,
)

# `llm` is the name the prompt helper functions call.
llm = llm_vertex

In [17]:
from langchain_google_vertexai import VertexAIEmbeddings

# Embedding model for user questions; get_related_query reads this name.
embedding_model = VertexAIEmbeddings("textembedding-gecko-multilingual@latest")

In [18]:
from bigquery_vector_util import BigQueryVectorDatabase, SqlSearchSchema

# Vector store handle for the retrieve stage. Unlike the ingest stage, this
# cell does not call create_table().
vdb = BigQueryVectorDatabase(project_id=PROJECT_ID, location=LOCATION, dataset=DATASET, table_name=TABLE_NAME)


In [19]:
def get_related_query(question):
  """Look up the stored SQL asset that best matches a natural-language question.

  The question is embedded and passed to `vdb.select_similar_query`; the first
  row of the result is flattened into a plain dict.

  Args:
    question: The user's question text.

  Returns:
    A dict with the keys uuid, sql, description, parameters, explore_view,
    model_name, table_name, column_schema and distance. When the search
    returns None, every field is "" and distance is 1.
  """
  text_fields = (
    "uuid", "sql", "description", "parameters",
    "explore_view", "model_name", "table_name", "column_schema",
  )
  question_vector = embedding_model.embed_query(question)
  matches = vdb.select_similar_query(question_vector)
  if matches is None:
    # No search result: return a placeholder with the same keys.
    placeholder = {field: "" for field in text_fields}
    placeholder["distance"] = 1
    return placeholder
  # Take the first row of each column.
  return {field: matches[field][0] for field in text_fields + ("distance",)}


In [20]:
# Sample question used to retrieve the ingested SQL asset.
question = "How to check the delay time(second) before shipping for each order and product?"

# Retrieved asset; later cells read its 'sql' and 'parameters' entries.
assetized_query = get_related_query(question)
print(assetized_query)


{'uuid': 'f98d0e64-03ae-4cb0-9e80-137ba594f697', 'sql': 'CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP) RETURNS INT64 AS ((EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 + EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 + EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 + EXTRACT(SECOND FROM (end_ts - start_ts))));\nselect convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at from `bigquery-public-data.thelook_ecommerce.orders` a join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id) where a.status not in (?) and a.created_at between ? and ? order by a.order_id, b.product_id', 'description': 'This query aims to analyze order and product-related metrics for orders placed between Janu

In [21]:

def extract_filter_values(assetized_query, question):
  """Ask the LLM to fill in filter values for a retrieved SQL asset.

  Args:
    assetized_query: Dict with at least the keys 'sql' (the prepared
      statement) and 'parameters' (its filter description).
    question: The user's question from which values should be extracted.

  Returns:
    The raw LLM response text; it is not parsed here.
  """
  output_example = """
  {
    "filter_columns": [
      {
        "table_name": "sample_table",
        "column_name": "col_1",
        "operator" : "in", 
        "column_type" : "STRING",
        "filter_names": ["col_1_filter"],
        "filter_values": [null],
        "filter_order": 1
      },
      {
        "table_name": "sample_table",
        "column_name": "col_2",
        "operator" : "=", 
        "column_type" : "STRING",
        "filter_names": ["col_2_filter"],
        "filter_values": ["2022-01"],
        "filter_order": 2
      }
    ]
  }
  """

  instructions = """You are a serverside developer. Extract filter values from the question and fill the values into the 'filter_values' item for the given filter columns. Do not suggest codes. Output should be the following JSON format.

  given question:
  {question}
  
  given sql:
  {sql}

  given filters:
  {filters}

  ----------------------------

  output format(example) : json
  {example_json}

  """
  filled_prompt = instructions.format(
    sql=assetized_query['sql'],
    filters=assetized_query['parameters'],
    example_json=output_example,
    question=question,
  )
  return llm.invoke(filled_prompt)


In [23]:
# Same question, now carrying an explicit year for the LLM to turn into
# filter values.
question_enriched = "How to check the delay time(second) before shipping for each order and product in this year(2023)?"

response = extract_filter_values(assetized_query, question_enriched)

In [24]:
response

' {\n    "filter_columns": [\n      {\n        "table_name": "bigquery-public-data.thelook_ecommerce.orders",\n        "column_name": "status",\n        "operator": "not in",\n        "column_type": "STRING",\n        "filter_names": ["order_status"],\n        "filter_values": ["canceled"],\n        "filter_order": 1\n      },\n      {\n        "table_name": "bigquery-public-data.thelook_ecommerce.orders",\n        "column_name": "created_at",\n        "operator": "between",\n        "column_type": "TIMESTAMP",\n        "filter_names": ["order_created_at_start", "order_created_at_end"],\n        "filter_values": ["2023-01-01", "2023-12-31"],\n        "filter_order": 2\n      }\n    ]\n  }'

In [25]:
# Decode the raw LLM text into a dict with a 'filter_columns' list.
filter_values = parse_json_response(response)

In [26]:
filter_values

{'filter_columns': [{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'status',
   'operator': 'not in',
   'column_type': 'STRING',
   'filter_names': ['order_status'],
   'filter_values': ['canceled'],
   'filter_order': 1},
  {'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'created_at',
   'operator': 'between',
   'column_type': 'TIMESTAMP',
   'filter_names': ['order_created_at_start', 'order_created_at_end'],
   'filter_values': ['2023-01-01', '2023-12-31'],
   'filter_order': 2}]}

In [27]:

def check_unfilled_filters(filter_values):
  """Return the filter columns that still lack usable values.

  A filter counts as unfilled when its 'filter_values' entry is missing, is
  None, is an empty list, or contains a None placeholder. Missing and None
  entries used to raise an exception; they are now reported as unfilled.

  Args:
    filter_values: Dict with a 'filter_columns' list of filter dicts.

  Returns:
    A new list holding the unfilled filter dicts, in their original order.
  """
  def is_unfilled(parameter):
    values = parameter.get('filter_values')
    # `not values` covers both a missing/None entry and an empty list.
    return not values or None in values

  return [parameter for parameter in filter_values['filter_columns'] if is_unfilled(parameter)]



In [28]:
# Filters the LLM could not fill from the question.
unfilled_filters = check_unfilled_filters(filter_values)

In [29]:
unfilled_filters

[]

In [30]:

def make_response_to_fill_filter_values(unfilled_filters, question):
  """Ask the LLM for a message guiding the user to supply missing filter values.

  Args:
    unfilled_filters: Filter dicts that still lack values (as returned by
      check_unfilled_filters); they are interpolated into the prompt as-is.
    question: The user's original question.

  Returns:
    The raw LLM response text, expected to hold a JSON object with an
    'agent_response' key; it is not parsed here.
  """
  example_json = """
  {
    "agent_response": "..."
  }
  """
  prompt_template = """You are an automatic agent to serve natural language to SQL conversion. Please guide the user to fill the missing filter values in the given question and unfilled filters. Output should be the following JSON format.

  question:
  {question}

  unfilled filters:
  {unfilled_filters}

  output format json:
  {example_json}
  """
  prompt = prompt_template.format(unfilled_filters=unfilled_filters, question=question, example_json=example_json)
  # `invoke` replaces the deprecated `predict` and matches the other prompt
  # helpers in this notebook; for a text LLM both return the completion string.
  response = llm.invoke(prompt)
  return response

In [31]:
# Ask the agent how the user should complete the missing filters. In this run
# unfilled_filters is empty, so the model has nothing specific to ask about.
response = make_response_to_fill_filter_values(unfilled_filters, question=question_enriched)


  warn_deprecated(


In [32]:
response

' {\n    "agent_response": "To check the delay time (in seconds) before shipping for each order and product in the year 2023, you can use the following SQL query:\\n\\n```sql\\nSELECT \\n    o.order_id, \\n    p.product_id, \\n    o.order_date, \\n    o.ship_date, \\n    JULIANDAY(o.ship_date) - JULIANDAY(o.order_date) AS delay_in_days,\\n    (JULIANDAY(o.ship_date) - JULIANDAY(o.order_date))*24*60*60 AS delay_in_seconds\\nFROM \\n    orders AS o\\nJOIN \\n    products AS p ON o.product_id = p.product_id\\nWHERE \\n    o.order_date BETWEEN \'2023-01-01\' AND \'2023-12-31\';\\n```\\n\\nThis query will give you the order ID, product ID, order date, ship date, delay in days, and delay in seconds for all orders and products in the year 2023."\n  }'

In [33]:
# Simulated follow-up answer from the user.
additional_response = "I want to know not cancelled orders only"

In [34]:

def extract_filter_values_with_additional_response(unfilled_filters, additional_response):
  """Ask the LLM to extract filter values from a user's follow-up answer.

  Args:
    unfilled_filters: Filter dicts that still lack values; they are
      interpolated into the prompt as-is.
    additional_response: The user's follow-up text.

  Returns:
    The raw LLM response text, expected to hold a JSON object with a
    'filter_columns' list; it is not parsed here.
  """
  example_json = """
  {
    "filter_columns": [
      {
        "column_name": "col_1",
        "operator" : "in", 
        "column_type" : "STRING",
        "filter_names": ["col_1_filter"],
        "filter_values": [null]
      }
    ]
  }
  """

  prompt_template = """You are a serverside developer. Extract filter values from the user response and unfilled filter information. Do not suggest codes. Output should be the following JSON format.

  output format json:
  {example_json}

  ----------------------------
  user response :
  {user_response}

  unfilled filters:
  {unfilled_filters}
  """
  prompt = prompt_template.format(unfilled_filters=unfilled_filters, user_response=additional_response, example_json=example_json)
  # `invoke` replaces the deprecated `predict` and matches the other prompt
  # helpers in this notebook; for a text LLM both return the completion string.
  response = llm.invoke(prompt)
  return response

In [35]:
# Extract values from the follow-up answer and decode the JSON reply.
additional_filter_values = parse_json_response(extract_filter_values_with_additional_response(unfilled_filters, additional_response))

In [36]:
additional_filter_values

{'filter_columns': [{'column_name': 'status',
   'operator': 'not in',
   'column_type': 'STRING',
   'filter_names': ['status_filter'],
   'filter_values': ['cancelled']}]}

In [37]:
def merge_filter_values(filter_values, additional_filter_values):
  """Overlay follow-up filter values onto the original filters, in place.

  Filters are matched by 'column_name' only. When several additional entries
  share a column name, the last one wins.

  Args:
    filter_values: Dict with a 'filter_columns' list; it is mutated.
    additional_filter_values: Dict with a 'filter_columns' list that supplies
      replacement 'filter_values'.

  Returns:
    The same `filter_values` object that was passed in.
  """
  targets = filter_values['filter_columns']
  overrides = additional_filter_values['filter_columns']
  for target in targets:
    for override in overrides:
      if target['column_name'] != override['column_name']:
        continue
      target['filter_values'] = override['filter_values']
  return filter_values


In [38]:
# merge_filter_values mutates and returns filter_values, so both names refer
# to the same dict afterwards.
merged_filter_values = merge_filter_values(filter_values, additional_filter_values)

In [39]:
merged_filter_values

{'filter_columns': [{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'status',
   'operator': 'not in',
   'column_type': 'STRING',
   'filter_names': ['order_status'],
   'filter_values': ['cancelled'],
   'filter_order': 1},
  {'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'created_at',
   'operator': 'between',
   'column_type': 'TIMESTAMP',
   'filter_names': ['order_created_at_start', 'order_created_at_end'],
   'filter_values': ['2023-01-01', '2023-12-31'],
   'filter_order': 2}]}

As you can see, the LLM can handle extraction, transformation, and validation as well.

The next step is very similar to 'Direction Conversion'.

In [40]:
from google.cloud import bigquery
# BigQuery client built with no explicit arguments; the helper functions
# below read this module-level name.
client = bigquery.Client()

In [41]:
def get_field_unique_values(matched_table, matched_field):
  """Fetch up to 500 distinct values of a column plus the total distinct count.

  Table and column names cannot be bound as query parameters, so they are
  interpolated into the SQL text. Both come from LLM output and are therefore
  validated first.

  Args:
    matched_table: Table path, either bare (`project.dataset.table`) or
      already wrapped in backticks.
    matched_field: Plain column name (letters, digits, underscore).

  Returns:
    A tuple `(values, total_count)`: `values` is a list of at most 500
    distinct column values and `total_count` the number of distinct values in
    the table. Returns `([], 0)` when the query yields no rows.

  Raises:
    ValueError: If the table or column name is not a safe identifier.
  """
  import re  # local import keeps this notebook cell self-contained

  if not matched_table.startswith('`'):
    matched_table = '`' + matched_table + '`'
  # Accept one or more backtick-quoted path segments joined by dots; a stray
  # backtick, backslash or newline could break out of the quoted identifier.
  if not re.fullmatch(r'`[^`\\\n]+`(?:\.`[^`\\\n]+`)*', matched_table):
    raise ValueError(f'Unsafe or empty table name: {matched_table!r}')
  if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', matched_field):
    raise ValueError(f'Unsafe column name: {matched_field!r}')

  sql_query = f"with distinct_values as ( select distinct {matched_field} as {matched_field} from {matched_table} ) select {matched_field}, (select count(1) from distinct_values) as total_count from distinct_values limit 500"
  df = client.query(sql_query).to_dataframe()
  if df.empty:
    # No rows: indexing total_count[0] would raise, so report an empty result.
    return [], 0
  return df[matched_field].tolist(), df['total_count'][0]
  

In [42]:
import ast


def choose_right_filter_value(filter_values, wanted_value):
  """Ask the LLM to map requested values onto values that exist in the column.

  Args:
    filter_values: Candidate values (the column's distinct values).
    wanted_value: The value(s) extracted from the user's request.

  Returns:
    The raw LLM response text, expected to contain a Python list; it is not
    parsed here.
  """
  prompt_template = """As a looker developer, choose right filter value for the wanted value below without changing filter value itself.

  filter_values : {filter_values}

  wanted_values: {wanted_value}

  answer format: python list
[filter_value1, filter_value2, ...]
  """
  # `invoke` replaces the deprecated `predict` and matches the other prompt
  # helpers in this notebook; for a text LLM both return the completion string.
  response = llm.invoke(prompt_template.format(filter_values=filter_values, wanted_value=wanted_value))
  return response

def adjust_filter_value(filter_columns):
  """Snap each filter's values to values that exist in the column, in place.

  For every filter the column's distinct values are fetched. When the column
  has fewer than 500 distinct values, the LLM picks the matching stored
  value(s); otherwise the extracted values are used unchanged. The result is
  stored under 'adjust_filter_values', the distinct count under
  'unique_count', and 'unique_values' is reset to None afterwards.

  Args:
    filter_columns: List of filter dicts with 'table_name', 'column_name'
      and 'filter_values'; each dict is mutated.
  """
  for column in filter_columns:
    table = column['table_name']
    field = column['column_name']
    column['unique_values'], column['unique_count'] = get_field_unique_values(table, field)
    if column['unique_count'] < 500:
      llm_answer = choose_right_filter_value(column['unique_values'], column['filter_values'])
      # A reply that opens with a ```json fence is parsed as JSON; anything
      # else is treated as a Python literal.
      if llm_answer.strip().startswith("```json"):
        column['adjust_filter_values'] = parse_json_response(llm_answer)
      else:
        column['adjust_filter_values'] = parse_python_object(llm_answer)
    else:
      column['adjust_filter_values'] = column['filter_values']
    # Drop the candidate list once it has served its purpose.
    column['unique_values'] = None
  
  

In [43]:
# Adds 'adjust_filter_values' (and 'unique_count') to each filter in place.
adjust_filter_value(merged_filter_values['filter_columns'])

llm response: ```python
['Cancelled']
```
['Cancelled']


In [44]:
merged_filter_values

{'filter_columns': [{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'status',
   'operator': 'not in',
   'column_type': 'STRING',
   'filter_names': ['order_status'],
   'filter_values': ['cancelled'],
   'filter_order': 1,
   'unique_values': None,
   'unique_count': 5,
   'adjust_filter_values': ['Cancelled']},
  {'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'created_at',
   'operator': 'between',
   'column_type': 'TIMESTAMP',
   'filter_names': ['order_created_at_start', 'order_created_at_end'],
   'filter_values': ['2023-01-01', '2023-12-31'],
   'filter_order': 2,
   'unique_values': None,
   'unique_count': 117304,
   'adjust_filter_values': ['2023-01-01', '2023-12-31']}]}

In [45]:
def prepared_statement_with_filter_values_in_bigquery(sql_and_filters):
  """Run a prepared statement in BigQuery with positional query parameters.

  Parameters are built per filter column, in list order:
    * one value (or none listed beyond the first): a single scalar parameter;
    * several values and several filter names: one scalar parameter per value;
    * several values but a single filter name: one array parameter.
  Column types FLOAT64 and INT64 are passed through; every other type is
  sent as STRING. The parameter list is printed before the query runs.

  Args:
    sql_and_filters: Dict with 'prepared_statement' (SQL using `?`
      placeholders) and 'filter_columns' (dicts with 'column_type',
      'filter_names' and 'adjust_filter_values').

  Returns:
    The query result as a DataFrame.
  """
  def bigquery_type(column):
    declared = column['column_type']
    return declared if declared in ('FLOAT64', 'INT64') else 'STRING'

  statement = sql_and_filters['prepared_statement']
  query_parameters = []
  for column in sql_and_filters['filter_columns']:
    values = column['adjust_filter_values']
    if len(values) <= 1:
      query_parameters.append(bigquery.ScalarQueryParameter(None, bigquery_type(column), values[0]))
    elif len(column['filter_names']) > 1:
      # One placeholder per value, e.g. `between ? and ?`.
      for value in values:
        query_parameters.append(bigquery.ScalarQueryParameter(None, bigquery_type(column), value))
    else:
      # Several values behind a single placeholder.
      query_parameters.append(bigquery.ArrayQueryParameter(None, bigquery_type(column), values))
  job_config = bigquery.QueryJobConfig(query_parameters=query_parameters)
  print(query_parameters)
  return client.query(statement, job_config=job_config).to_dataframe()

In [46]:
# Bundle the retrieved prepared statement with the adjusted filter columns in
# the shape prepared_statement_with_filter_values_in_bigquery expects.
sql_and_filters = {
  'prepared_statement':assetized_query['sql'],
  'filter_columns': merged_filter_values['filter_columns']
}

In [47]:
sql_and_filters

{'prepared_statement': 'CREATE TEMP FUNCTION convertIntervalToSecond(start_ts TIMESTAMP, end_ts TIMESTAMP) RETURNS INT64 AS ((EXTRACT(DAY FROM (end_ts - start_ts)) * 86400 + EXTRACT(HOUR FROM (end_ts - start_ts)) * 3600 + EXTRACT(MINUTE FROM (end_ts - start_ts)) * 60 + EXTRACT(SECOND FROM (end_ts - start_ts))));\nselect convertIntervalToSecond(a.created_at, a.shipped_at) as order_pending_second, convertIntervalToSecond(b.created_at, b.shipped_at) as product_pending_second, a.status, a.order_id, a.user_id, a.gender, a.created_at, a.shipped_at, b.product_id, b.created_at, b.shipped_at, b.delivered_at, b.returned_at from `bigquery-public-data.thelook_ecommerce.orders` a join `bigquery-public-data.thelook_ecommerce.order_items` b on (a.order_id = b.order_id) where a.status not in (?) and a.created_at between ? and ? order by a.order_id, b.product_id',
 'filter_columns': [{'table_name': 'bigquery-public-data.thelook_ecommerce.orders',
   'column_name': 'status',
   'operator': 'not in',
   

In [48]:
# Execute the parameterized query; the bound parameters are printed first.
df_result = prepared_statement_with_filter_values_in_bigquery(sql_and_filters)

[ScalarQueryParameter(None, 'STRING', 'Cancelled'), ScalarQueryParameter(None, 'STRING', '2023-01-01'), ScalarQueryParameter(None, 'STRING', '2023-12-31')]


In [49]:
# Preview the first ten result rows.
df_result.head(10)

Unnamed: 0,order_pending_second,product_pending_second,status,order_id,user_id,gender,created_at,shipped_at,product_id,created_at_1,shipped_at_1,delivered_at,returned_at
0,197100.0,206241.0,Complete,1,1,F,2023-06-26 08:14:00+00:00,2023-06-28 14:59:00+00:00,1076,2023-06-26 05:41:39+00:00,2023-06-28 14:59:00+00:00,2023-07-03 00:40:00+00:00,NaT
1,180480.0,193555.0,Complete,3,2,F,2023-02-19 12:58:00+00:00,2023-02-21 15:06:00+00:00,5061,2023-02-19 09:20:05+00:00,2023-02-21 15:06:00+00:00,2023-02-22 12:23:00+00:00,NaT
2,115320.0,122997.0,Shipped,5,3,M,2023-07-21 16:57:00+00:00,2023-07-23 00:59:00+00:00,16024,2023-07-21 14:49:03+00:00,2023-07-23 00:59:00+00:00,NaT,NaT
3,115320.0,-54714.0,Shipped,5,3,M,2023-07-21 16:57:00+00:00,2023-07-23 00:59:00+00:00,24282,2023-07-23 16:10:54+00:00,2023-07-23 00:59:00+00:00,NaT,NaT
4,,,Processing,6,4,M,2023-11-02 07:48:00+00:00,NaT,16206,2023-11-02 07:01:41+00:00,NaT,NaT,NaT
5,222000.0,231890.0,Complete,7,5,F,2023-12-25 18:53:00+00:00,2023-12-28 08:33:00+00:00,9250,2023-12-25 16:08:10+00:00,2023-12-28 08:33:00+00:00,2024-01-02 07:16:00+00:00,NaT
6,38280.0,-213306.0,Shipped,9,6,M,2023-09-22 05:39:00+00:00,2023-09-22 16:17:00+00:00,17894,2023-09-25 03:32:06+00:00,2023-09-22 16:17:00+00:00,NaT,NaT
7,38280.0,40113.0,Shipped,9,6,M,2023-09-22 05:39:00+00:00,2023-09-22 16:17:00+00:00,20063,2023-09-22 05:08:27+00:00,2023-09-22 16:17:00+00:00,NaT,NaT
8,137640.0,63251.0,Shipped,12,7,F,2023-12-07 04:42:00+00:00,2023-12-08 18:56:00+00:00,394,2023-12-08 01:21:49+00:00,2023-12-08 18:56:00+00:00,NaT,NaT
9,137640.0,137431.0,Shipped,12,7,F,2023-12-07 04:42:00+00:00,2023-12-08 18:56:00+00:00,7272,2023-12-07 04:45:29+00:00,2023-12-08 18:56:00+00:00,NaT,NaT
