In [1]:
# !pip install -q boto3
# !pip install -q requests
# !pip install -q requests-aws4auth
# !pip install -q opensearch-py
# !pip install -q tqdm

# !pip install "faiss-cpu" --quiet
# !pip install langchain --quiet
# !pip install jq --quiet

In [1]:
import os
import sys
from langchain.document_loaders.json_loader import JSONLoader
from langchain.docstore.document import Document
import json
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import BedrockEmbeddings
from functools import reduce
from langchain.prompts import PromptTemplate
from sqlalchemy import MetaData
from sqlalchemy import create_engine


import re
import pandas as pd
import numpy as np
import json
import sqlite3

# Load the schema metadata of every database described in ./tables.json.
with open('./tables.json', 'rb') as ofp:
    all_db_meta = json.load(ofp)

# Keep only the entry for the database this notebook works on.
matching_dbs = [i for i in all_db_meta if i['db_id'] == 'department_store']
if not matching_dbs:
    # Fail with a readable message instead of an IndexError on [0].
    raise ValueError("db_id 'department_store' not found in ./tables.json")
data = matching_dbs[0]

# Column names come as [table_idx, col_name] pairs. The first entry of both the
# names and the types list is skipped (.iloc[1:]) so the two frames stay index-aligned.
# NOTE(review): presumably that first entry is the table-less "*" placeholder — confirm.
columns = data["column_names_original"]
col_df = pd.DataFrame(columns).iloc[1:].rename(columns={0: 'table_idx', 1: 'col_name'})
types_df = pd.DataFrame(data["column_types"]).iloc[1:].rename(columns={0: 'type'})
merged_col = pd.concat([col_df, types_df], axis=1)

# The positional index of each table name is the key the column entries refer to.
tables_df = pd.DataFrame(data["table_names_original"]).reset_index()
tables_df.columns = ['table_idx', 'table_name']

# One row per (table_name, col_name, type); the numeric join key is no longer needed.
meta = pd.merge(tables_df, merged_col, on=['table_idx']).drop(columns=['table_idx'])

# The output directory may not exist yet on a fresh checkout.
os.makedirs('./data', exist_ok=True)
meta.to_csv('./data/total_meta.csv', index=False)

In [2]:

def ask_llm(question):
    """Send ``question`` as a single user message to the Bedrock model and return its text reply.

    Uses the module-level ``bedrock`` client. The reply text is read from the
    first element of the response body's ``content`` list.
    """
    bedrock_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
        ],
    }
    request_payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "temperature": 0.1,
        "top_p": 0.5,
        "messages": [user_message],
    }

    # Send the payload to Bedrock.
    response = bedrock.invoke_model(
        body=json.dumps(request_payload),
        modelId=bedrock_model_id,
        accept='application/json',
        contentType='application/json',
    )

    response_body = json.loads(response.get("body").read())
    return response_body.get("content")[0].get("text")


In [5]:
# Rebuild each loaded document so that it carries only the metadata used for indexing
def create_docs_with_correct_metadata(documents):
    """Return a new list of Documents whose text is the question and whose metadata is trimmed.

    Each input document's ``page_content`` is parsed as JSON. The resulting
    Document uses the ``question`` value as its page content and a metadata
    dict holding ``tableName``, ``question`` and ``tableSchema``.
    A missing key raises ``KeyError``.
    """
    wanted_keys = ("tableName", "question", "tableSchema")

    def _rebuild(doc):
        # page_content holds the JSON record for this document
        payload = json.loads(doc.page_content)
        fields = {key: payload[key] for key in wanted_keys}
        return Document(page_content=fields["question"], metadata=fields)

    return [_rebuild(doc) for doc in documents]

def load_json_file(filename):
    """Load ``filename`` through a JSONLoader and return the documents it produces.

    The loader is configured with the jq schema ``.[]`` and ``text_content=False``.
    """
    return JSONLoader(
        file_path=filename,
        jq_schema=".[]",
        text_content=False,
    ).load()


In [72]:
def write_questions_to_file(question_list_filename, table_name, table_schema, sql_query, answer):
    """Extract the question lines from an LLM answer and write them to a JSON file.

    Every line of ``answer`` that contains a ``?`` is treated as a question.
    A leading list marker ("1. ", "12) ", "- ", "* ", "• ") is removed and the
    remaining text is stripped of surrounding whitespace.

    :question_list_filename : path of the JSON file to (over)write
    :table_name             : stored under "tableName" in every record
    :table_schema           : stored under "tableSchema" with leading spaces removed
    :sql_query              : stored under "sql_query" in every record
    :answer                 : raw multi-line LLM output

    The file always receives a JSON list, which is empty when no line contains "?".
    """
    # The previous pattern (r"\d{1,5}.||. ||- ") had empty alternatives and an
    # unescaped dot, so bullets were never stripped and a leading number such as
    # "2024 " was cut off real question text. Anchor on an explicit marker instead.
    list_prefix = re.compile(r"^\s*(?:\d{1,5}[.)]|[-*•])\s+")
    schema_text = table_schema.lstrip(" ")

    data_list = []
    for line in answer.splitlines():
        # Skip if it doesn't really have a question
        if "?" not in line:
            continue

        question = list_prefix.sub("", line, count=1).strip()
        data_list.append(
            {
                "sql_query": sql_query,
                "tableName": table_name,
                "question": question,
                "tableSchema": schema_text,
            }
        )

    with open(question_list_filename, mode="w", newline="") as file:
        json.dump(data_list, file)

In [69]:
# This function asks the LLM which natural-language questions could have produced a given
# SQL query, then hands the answer to write_questions_to_file so it is stored as a JSON file.
def add_new_table(idx, sql_query, schema, table_name, is_incremental, bedrock_embeddings):
    """
    Generate candidate user questions for one SQL query and persist them.

    :idx                : used as the numeric suffix of the output file name
    :sql_query          : SQL text interpolated into the LLM prompt and passed to the writer
    :schema             : schema string; printed and passed to the writer
    :table_name         : used in the output file name; printed and passed to the writer
    :is_incremental     : accepted but not read by this function
    :bedrock_embeddings : accepted but not read by this function
    """
    print(f"Adding table {table_name} with schema {schema}")

    prompt = f"""
        ### Instruction:
        You are an AI trained to generate potential user questions based on an SQL query. Below is an example SQL query. Generate a possible natural language question that users might have asked to produce such an SQL query.

        ### Example SQL Query:
        SELECT e.FirstName, e.LastName, d.DepartmentName, e.Salary
        FROM Employee e
        JOIN Department d ON e.DepartmentID = d.DepartmentID
        WHERE e.Salary > 70000;

        ### Corresponding Question:
        What are the names and departments of employees who earn more than $70,000?

        ### Task:
        Generate possible natural language questions based on the following SQL query.

        ### Input SQL Query:
        {sql_query}

        ### Output Question:
        """

    answer = ask_llm(prompt)

    # The output directory is created only after the LLM call has succeeded.
    output_dir = './data/rag'
    os.makedirs(output_dir, exist_ok=True)
    question_list_filename = f"./data/rag/questionList_{table_name}_{idx}.json"

    progress_message = f"Writing questions to {question_list_filename}, with schema {schema}, with table name {table_name} and answer {answer}.\n\n"
    print(progress_message)

    write_questions_to_file(question_list_filename, table_name, schema, sql_query, answer)


In [50]:
import re

def extract_table_name(sql_query):
    """Return the first token that follows the first FROM keyword in ``sql_query``.

    The keyword is matched case-insensitively and only as a whole word, so a
    column such as ``date_from`` is not mistaken for the FROM clause. The
    captured token ends at the first whitespace, comma or semicolon.

    Returns ``None`` when the query has no FROM keyword.
    """
    # \b keeps identifiers ending in "from" (e.g. date_from) from matching.
    match = re.search(r'\bFROM\s+([^\s,;]+)', sql_query, re.IGNORECASE)
    return match.group(1) if match else None



answer_path = './evaluation/answer.txt'

# Each non-blank line is tab-separated; only the first field is kept as the query.
with open(answer_path) as f:
    alist = [line.strip().split('\t')[0] for line in f if line.strip()]
alist = pd.DataFrame(alist, columns=['query'])

# extract_table_name returns None when it finds no FROM clause; map that to ''
# so one unparsable query no longer aborts the whole cell with AttributeError.
alist['lower_table'] = alist['query'].apply(lambda q: (extract_table_name(q) or '').lower())


In [54]:
# Collapse the per-column metadata into one list of column names per table.
new_meta = (
    meta.groupby(['table_name'])['col_name']
    .apply(list)
    .reset_index()
    .set_index('table_name')
)

# Build [pipe-joined column names, table name] pairs, echoing each one.
tpc_ds = []
for table_name, col_names in new_meta['col_name'].items():
    entry = ['|'.join(col_names), table_name]
    print(entry)
    tpc_ds.append(entry)

['address_id|address_details', 'Addresses']
['customer_id|address_id|date_from|date_to', 'Customer_Addresses']
['order_id|customer_id|order_status_code|order_date', 'Customer_Orders']
['customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email', 'Customers']
['dept_store_chain_id|dept_store_chain_name', 'Department_Store_Chain']
['dept_store_id|dept_store_chain_id|store_name|store_address|store_phone|store_email', 'Department_Stores']
['department_id|dept_store_id|department_name', 'Departments']
['order_item_id|order_id|product_id', 'Order_Items']
['product_id|supplier_id|date_supplied_from|date_supplied_to|total_amount_purchased|total_value_purchased', 'Product_Suppliers']
['product_id|product_type_code|product_name|product_price', 'Products']
['staff_id|staff_gender|staff_name', 'Staff']
['staff_id|department_id|date_assigned_from|job_title_code|date_assigned_to', 'Staff_Department_Assignments']
['supplier_id|address_id|date_from|date

In [56]:
# Turn the [columns, table] pairs into a frame, then attach them to every query
# whose lower-cased table name matches; the helper join key is dropped afterwards.
tpc_ds = pd.DataFrame(tpc_ds, columns=['column', 'table'])
tpc_ds = (
    alist
    .merge(tpc_ds.assign(lower_table=tpc_ds['table'].str.lower()), on='lower_table')
    .drop(columns=['lower_table'])
)

In [65]:
import boto3
from botocore.config import Config

# Take the region from the default boto3 session and use it under both names.
bedrock_region = boto3.session.Session().region_name
athena_region = bedrock_region

# Retry settings handed to the client below.
retry_config = Config(retries={'max_attempts': 100})

session = boto3.Session(region_name=bedrock_region)
bedrock = session.client(
    'bedrock-runtime',
    region_name=bedrock_region,
    config=retry_config,
)

In [73]:
from langchain.embeddings import BedrockEmbeddings
from functools import reduce

# model_name = 'Titan-Embeddings-G1'
bedrock_embeddings = BedrockEmbeddings(client=bedrock)

# Generate and store questions for every (query, table) row, one LLM call per row.
separator = '-------------------'
for idx, query_row in tpc_ds.iterrows():
    print(query_row)
    table_kwargs = {
        'idx': idx,
        'sql_query': query_row['query'],
        'schema': query_row['column'],
        'table_name': query_row['table'],
        'is_incremental': True,
        'bedrock_embeddings': bedrock_embeddings,
    }
    add_new_table(**table_kwargs)
    print(separator)


query     SELECT product_id FROM product_suppliers ORDER...
column    product_id|supplier_id|date_supplied_from|date...
table                                     Product_Suppliers
Name: 0, dtype: object
Adding table Product_Suppliers with schema product_id|supplier_id|date_supplied_from|date_supplied_to|total_amount_purchased|total_value_purchased
Writing questions to ./data/rag/questionList_Product_Suppliers_0.json, with schema product_id|supplier_id|date_supplied_from|date_supplied_to|total_amount_purchased|total_value_purchased, with table name Product_Suppliers and answer What are the top 3 products with the highest total amount purchased from suppliers?.


-------------------
query     SELECT product_id FROM product_suppliers ORDER...
column    product_id|supplier_id|date_supplied_from|date...
table                                     Product_Suppliers
Name: 1, dtype: object
Adding table Product_Suppliers with schema product_id|supplier_id|date_supplied_from|date_supplied_to|total

Writing questions to ./data/rag/questionList_Order_Items_13.json, with schema order_item_id|order_id|product_id, with table name Order_Items and answer A possible natural language question that could produce the given SQL query is:

"What is the most frequently ordered product?"

Explanation:
- The query `SELECT product_id FROM order_items` selects the product IDs from the `order_items` table.
- `GROUP BY product_id` groups the rows by product ID.
- `ORDER BY count(*) DESC` orders the groups by the count of rows in each group in descending order.
- `LIMIT 1` takes only the first row, which will be the product ID with the highest count (i.e., the most frequently ordered product).

Therefore, the query is finding the product ID of the most frequently ordered product, which aligns with the question "What is the most frequently ordered product?"..


-------------------
query     SELECT T1.customer_name ,  T1.customer_phone ,...
column    customer_id|payment_method_code|customer_code|...
ta

Writing questions to ./data/rag/questionList_Customers_25.json, with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email, with table name Customers and answer A possible natural language question for the given SQL query could be:

"What are the distinct customer names who have pending orders, ordered by their customer ID?".


-------------------
query     SELECT T1.customer_name ,  T1.customer_address...
column    customer_id|payment_method_code|customer_code|...
table                                             Customers
Name: 26, dtype: object
Adding table Customers with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email
Writing questions to ./data/rag/questionList_Customers_26.json, with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email, with table name Customers and answer Possible natural langua

Writing questions to ./data/rag/questionList_Department_Stores_33.json, with schema dept_store_id|dept_store_chain_id|store_name|store_address|store_phone|store_email, with table name Department_Stores and answer A possible natural language question for the given SQL query could be:

"What are the top 2 department store chains with the most number of stores?".


-------------------
query     SELECT department_id FROM staff_department_ass...
column    staff_id|department_id|date_assigned_from|job_...
table                          Staff_Department_Assignments
Name: 34, dtype: object
Adding table Staff_Department_Assignments with schema staff_id|department_id|date_assigned_from|job_title_code|date_assigned_to
Writing questions to ./data/rag/questionList_Staff_Department_Assignments_34.json, with schema staff_id|department_id|date_assigned_from|job_title_code|date_assigned_to, with table name Staff_Department_Assignments and answer A possible natural language question that could produce t

Writing questions to ./data/rag/questionList_Customer_Orders_44.json, with schema order_id|customer_id|order_status_code|order_date, with table name Customer_Orders and answer A possible natural language question that could correspond to the given SQL query is:

"What are the distinct customer IDs that have placed orders after the earliest order cancellation date?".


-------------------
query     SELECT DISTINCT customer_id FROM Customer_Orde...
column    order_id|customer_id|order_status_code|order_date
table                                       Customer_Orders
Name: 45, dtype: object
Adding table Customer_Orders with schema order_id|customer_id|order_status_code|order_date
Writing questions to ./data/rag/questionList_Customer_Orders_45.json, with schema order_id|customer_id|order_status_code|order_date, with table name Customer_Orders and answer A possible natural language question that could correspond to the given SQL query is:

"What are the distinct customer IDs that have place

Writing questions to ./data/rag/questionList_Customers_57.json, with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email, with table name Customers and answer A possible natural language question that could produce the given SQL query is:

"What are all the unique phone numbers present in the customers and suppliers tables?".


-------------------
query     SELECT product_id FROM Order_Items GROUP BY pr...
column                    order_item_id|order_id|product_id
table                                           Order_Items
Name: 58, dtype: object
Adding table Order_Items with schema order_item_id|order_id|product_id
Writing questions to ./data/rag/questionList_Order_Items_58.json, with schema order_item_id|order_id|product_id, with table name Order_Items and answer A possible natural language question for the given SQL query could be:

"Which products have been ordered more than 3 times or have a total amount purchased grea

Writing questions to ./data/rag/questionList_Customers_69.json, with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email, with table name Customers and answer What are the distinct customer names who have ordered a product named "keyboard"?.


-------------------
query     SELECT DISTINCT T1.supplier_name ,  T1.supplie...
column             supplier_id|supplier_name|supplier_phone
table                                             Suppliers
Name: 70, dtype: object
Adding table Suppliers with schema supplier_id|supplier_name|supplier_phone
Writing questions to ./data/rag/questionList_Suppliers_70.json, with schema supplier_id|supplier_name|supplier_phone, with table name Suppliers and answer What are the distinct supplier names and phone numbers for suppliers who provide the product "red jeans"?.


-------------------
query     SELECT DISTINCT T1.supplier_name ,  T1.supplie...
column             supplier_id|supplier_name|suppl

Writing questions to ./data/rag/questionList_Customers_83.json, with schema customer_id|payment_method_code|customer_code|customer_name|customer_address|customer_phone|customer_email, with table name Customers and answer A possible natural language question for the given SQL query could be:

"What are the customer IDs and names of customers whose addresses are in Wyoming and who do not use credit cards as their payment method?".


-------------------
query     SELECT avg(product_price) FROM products WHERE ...
column    product_id|product_type_code|product_name|prod...
table                                              Products
Name: 84, dtype: object
Adding table Products with schema product_id|product_type_code|product_name|product_price
Writing questions to ./data/rag/questionList_Products_84.json, with schema product_id|product_type_code|product_name|product_price, with table name Products and answer What is the average price of products that belong to the 'Clothes' product type?.

