In [57]:
import os
import sys
from langchain.document_loaders.json_loader import JSONLoader
from langchain.docstore.document import Document
import json
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import BedrockEmbeddings
from functools import reduce
from langchain.prompts import PromptTemplate
from sqlalchemy import MetaData
from sqlalchemy import create_engine


import re
import pandas as pd
import numpy as np
import json
import sqlite3

# Flatten the schema description of one database from tables.json into a
# (table_name, col_name, type) table and persist it as CSV.
data_path = './data'
with open(f'{data_path}/tables.json', 'rb') as ofp:
    meta = json.load(ofp)

# Pick the entry for the target database; fail with a clear message rather
# than an opaque IndexError when it is absent.
data = next((entry for entry in meta if entry['db_id'] == 'department_store'), None)
if data is None:
    raise ValueError(f"db_id 'department_store' not found in {data_path}/tables.json")

# Each entry of column_names_original is a (table index, column name) pair.
# The first entry is skipped (presumably a wildcard placeholder - confirm
# against the data file).
columns = data["column_names_original"]
col_df = pd.DataFrame(columns).iloc[1:].rename(columns={0: 'table_idx', 1: 'col_name'})

# column_types is positionally parallel to column_names_original, so the same
# first entry is skipped.
types_df = pd.DataFrame(data["column_types"]).iloc[1:].rename(columns={0: 'type'})

# The column-wise concat aligns on the row index; differing lengths would
# silently introduce NaNs, so check up front.
if len(col_df) != len(types_df):
    raise ValueError("column_names_original and column_types have different lengths")
merged_col = pd.concat([col_df, types_df], axis=1)

# The position of a table in table_names_original is the index that the
# column entries refer to.
tables_df = pd.DataFrame(data["table_names_original"]).reset_index()
tables_df.columns = ['table_idx', 'table_name']

# Resolve table indices to names, then drop the helper index column.
meta = pd.merge(tables_df, merged_col, on=['table_idx'])
meta = meta.drop(columns=['table_idx'])
meta.to_csv(f'{data_path}/total_meta.csv', index=False)

In [2]:
from botocore.config import Config
import boto3
# Notebook-wide constants.
DB_NAME = "text2sql"
DB_FAISS_PATH = './vectorstore/db_faiss'

# The region of the default boto3 session is used for both region variables.
athena_region = boto3.session.Session().region_name
bedrock_region = athena_region

# Retry configuration handed to the Bedrock runtime client.
retry_config = Config(retries=dict(max_attempts=100))

session = boto3.Session(region_name=bedrock_region)
bedrock = session.client(
    'bedrock-runtime',
    region_name=bedrock_region,
    config=retry_config,
)

In [3]:
# Load every file under ./data/rag into one DataFrame.
# Entries are sorted so the row order is deterministic, and anything that is
# not a regular file (e.g. a `.ipynb_checkpoints` directory) is skipped
# instead of crashing open().
rag_dir = './data/rag'
files = sorted(
    f_name
    for f_name in os.listdir(rag_dir)
    if os.path.isfile(os.path.join(rag_dir, f_name))
)

frames = []
for f_name in files:
    with open(f'{rag_dir}/{f_name}', 'rb') as ofp:
        frames.append(pd.DataFrame(json.load(ofp)))

# Concatenate once instead of growing the frame inside the loop. Each file
# keeps its own row index; an empty directory yields an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

df

Unnamed: 0,tableName,question,tableSchema
0,Customer_Addresses,What is the total number of unique customers ...,customer_id|address_id|date_from|date_to
1,Customer_Addresses,Which customers have had multiple addresses a...,customer_id|address_id|date_from|date_to
2,Customer_Addresses,How many customers have had a change in their...,customer_id|address_id|date_from|date_to
3,Customer_Addresses,Can you identify the customers who have had t...,customer_id|address_id|date_from|date_to
4,Customer_Addresses,What is the distribution of address changes a...,customer_id|address_id|date_from|date_to
...,...,...,...
5,Department_Stores,Can customers reach out to a department store...,dept_store_id|dept_store_chain_id|store_name|s...
6,Department_Stores,Which department store locations are part of ...,dept_store_id|dept_store_chain_id|store_name|s...
7,Department_Stores,How many department store chains are represen...,dept_store_id|dept_store_chain_id|store_name|s...
8,Department_Stores,What is the geographic distribution of the de...,dept_store_id|dept_store_chain_id|store_name|s...


In [4]:
import pandas as pd


def create_text(row, max_len=509):
    """Flatten one record into a single ``"col: val,"`` string.

    Args:
        row: Mapping-like object exposing ``.items()`` (e.g. a DataFrame row).
        max_len: Longest text kept verbatim; anything longer is cut to this
            many characters and suffixed with ``"..."``.

    Returns:
        The concatenated (and possibly truncated) text, right-stripped of
        whitespace.
    """
    joined = "".join(f"{col}: {val}," for col, val in row.items())
    if len(joined) <= max_len:
        return joined.rstrip()
    return (joined[:max_len] + "...").rstrip()


# Build the `text` column from all other columns of `df`. An existing `text`
# column is dropped first so that re-running this cell does not fold the
# previously generated text into the new one.
df["text"] = df.drop(columns=["text"], errors="ignore").apply(create_text, axis=1)
df.head(10)

Unnamed: 0,tableName,question,tableSchema,text
0,Customer_Addresses,What is the total number of unique customers ...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: What ..."
1,Customer_Addresses,Which customers have had multiple addresses a...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: Which..."
2,Customer_Addresses,How many customers have had a change in their...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: How m..."
3,Customer_Addresses,Can you identify the customers who have had t...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: Can y..."
4,Customer_Addresses,What is the distribution of address changes a...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: What ..."
5,Customer_Addresses,Which customers have had the most frequent ad...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: Which..."
6,Customer_Addresses,How does the frequency of address changes var...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: How d..."
7,Customer_Addresses,Can you identify any patterns or trends in th...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: Can y..."
8,Customer_Addresses,What is the average duration of an address as...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: What ..."
9,Customer_Addresses,Which customers have had gaps or overlaps in ...,customer_id|address_id|date_from|date_to,"tableName: Customer_Addresses,question: Which..."


In [5]:
def find_long_plot_items(df, max_len=512):
    """Return the rows of ``df`` whose ``text`` value exceeds ``max_len`` characters.

    Args:
        df: DataFrame with a string column named ``text``.
        max_len: Length threshold; rows with strictly longer text are returned.

    Returns:
        The filtered DataFrame (empty when no row is over the threshold).
    """
    return df[df["text"].str.len() > max_len]


# Per-column count of non-null values among the rows whose `text` is too long;
# all zeros means no row exceeds the length limit.
find_long_plot_items(df).count()

tableName      0
question       0
tableSchema    0
text           0
dtype: int64

In [6]:
def get_cfn_outputs(stackname, cfn):
    """Return the outputs of a CloudFormation stack as a plain dict.

    Args:
        stackname: Name passed as ``StackName`` to ``describe_stacks``.
        cfn: Client object exposing ``describe_stacks``.

    Returns:
        ``{OutputKey: OutputValue}`` for the first stack in the response;
        an empty dict when the stack description carries no ``Outputs``.
    """
    stack = cfn.describe_stacks(StackName=stackname)["Stacks"][0]
    # A stack without outputs may omit the "Outputs" key altogether.
    return {
        output["OutputKey"]: output["OutputValue"]
        for output in stack.get("Outputs", [])
    }

In [14]:
import boto3, json


# Region and CloudFormation stack from which the OpenSearch settings are read.
region_name = "us-west-2"
stackname = "opensearch-workshop"

cfn = boto3.client("cloudformation", region_name=region_name)
# NOTE: despite its name, `kms` is a Secrets Manager client.
kms = boto3.client("secretsmanager", region_name=region_name)

cfn_outputs = get_cfn_outputs(stackname, cfn)

# The `OpenSearchSecret` stack output identifies the secret whose JSON payload
# is decoded into the credentials dict.
secret_id = cfn_outputs["OpenSearchSecret"]
aos_credentials = json.loads(kms.get_secret_value(SecretId=secret_id)["SecretString"])

aos_host = cfn_outputs["OpenSearchDomainEndpoint"]
aos_host

'search-opensearch-workshop-ashew5agtjkgsyxprzgu2m2oua.us-west-2.es.amazonaws.com'

In [15]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

# (username, password) pair taken from the decoded secret; used for HTTP auth.
auth = (aos_credentials["username"], aos_credentials["password"])

# Client for the OpenSearch domain: HTTPS on port 443 with certificate
# verification enabled.
aos_client = OpenSearch(
    hosts=[dict(host=aos_host, port=443)],
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
)

In [16]:
import requests

# Look up the id of the registered model whose name matches "OpenSearch-Cohere"
# through the domain's model search endpoint.
search_model = {"query": {"match": {"name": "OpenSearch-Cohere"}}, "size": 10}

response = requests.get(
    f"https://{aos_host}/_plugins/_ml/models/_search",
    auth=auth,
    json=search_model,
    timeout=30,  # do not hang the notebook on an unreachable endpoint
)
# Surface HTTP errors (bad credentials, wrong host, ...) here rather than as a
# confusing KeyError while parsing the body below.
response.raise_for_status()
model_info = response.json()

model_hits = model_info["hits"]["hits"]
if not model_hits:
    raise LookupError("No model matching 'OpenSearch-Cohere' was found on the domain")
# The first (best-scoring) hit is used.
model_id = model_hits[0]["_id"]
model_id

'DTDeTJAB3Hj2edbFglKU'

In [17]:
pipeline_id = "text2sql_plot"

# Ingest pipeline with a single `text_embedding` processor: the `text` field of
# each document is embedded with `model_id` and written to `vector_field`.
pipeline = dict(
    description="An neural search pipeline for movie index - OpenSearch-cohere-060124084807",
    processors=[
        {
            "text_embedding": {
                "model_id": model_id,
                "field_map": {"text": "vector_field"},
            }
        }
    ],
)

# Uncomment to remove a previously registered pipeline with the same id:
# aos_client.ingest.delete_pipeline(id=pipeline_id)
aos_client.ingest.put_pipeline(id=pipeline_id, body=pipeline)

{'acknowledged': True}

In [18]:
# Name of the OpenSearch index used by the index-creation, bulk-load and
# search cells of this notebook.
index_name = "rag_semantic_ver1"

In [20]:
# aos_client.indices.delete(index=index_name)

In [21]:
def _english_text_field():
    """Mapping for a `text` field that also has an `english`-analyzed sub-field."""
    return {
        "type": "text",
        "fields": {"english": {"type": "text", "analyzer": "english"}},
    }


# Index definition: k-NN enabled, documents routed through the ingest pipeline
# by default, three searchable text fields and one vector field.
rag_semantic = {
    "settings": {
        "max_result_window": 15000,
        "index.knn": True,
        "default_pipeline": pipeline_id,
        "index.knn.space_type": "l2",
    },
    "mappings": {
        "properties": {
            "tableName": _english_text_field(),
            "question": _english_text_field(),
            "tableSchema": _english_text_field(),
            # NOTE(review): dimension presumably has to match the embedding
            # model's output size - confirm against the deployed model.
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1024,
                "method": {"name": "hnsw", "space_type": "l2", "engine": "faiss"},
                "store": True,
            },
        }
    },
}

aos_client.indices.create(index=index_name, body=rag_semantic)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'rag_semantic_ver1'}

In [22]:
from tqdm import tqdm
from opensearchpy import helpers

# Serialise every row as one JSON document per line. Blank lines are filtered
# out instead of blindly dropping the last element, so no record is lost when
# the serialised text does not end with a newline.
json_data = df.to_json(orient="records", lines=True)
docs = [line for line in json_data.split("\n") if line]


def _generate_data():
    """Yield one bulk index action per serialised document."""
    for doc in docs:
        yield {"_index": index_name, "_source": doc}


succeeded = []
failed = []
# raise_on_error=False makes the helper report per-document failures through
# `success` instead of raising on the first bad chunk, so `failed` is actually
# populated and can be reported in full below.
for success, item in tqdm(
    helpers.parallel_bulk(
        aos_client,
        actions=_generate_data(),
        chunk_size=10,
        thread_count=1,
        queue_size=1,
        raise_on_error=False,
    ),
    total=len(docs),
    desc="indexing",
):
    if success:
        succeeded.append(item)
    else:
        failed.append(item)

# Still fail loudly, but only after every document has been attempted.
if failed:
    raise RuntimeError(
        f"{len(failed)} of {len(docs)} documents failed to index; first failure: {failed[0]}"
    )

In [23]:
# Refresh the index to make the changes visible
aos_client.indices.refresh(index=index_name)

count = aos_client.count(index=index_name)
print(count)

{'count': 140, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}}


In [37]:
def keyword_search(query_text, size=10):
    """Run a lexical ``multi_match`` search over the text fields of the index.

    Args:
        query_text: Free-text query.
        size: Maximum number of hits to return (defaults to 10).

    Returns:
        DataFrame with columns ``_score``, ``tableName``, ``question`` and
        ``tableSchema``, one row per hit; empty (but with those columns) when
        nothing matches.
    """
    query = {
        "size": size,
        # The embedding vector is large and not needed in the result table.
        "_source": {"excludes": ["vector_field"]},
        "query": {
            "multi_match": {
                "query": query_text,
                "fields": ["tableName", "question", "tableSchema"],
            }
        },
    }

    res = aos_client.search(index=index_name, body=query)

    source_fields = ["tableName", "question", "tableSchema"]
    rows = [
        [hit["_score"]] + [hit["_source"][field] for field in source_fields]
        for hit in res["hits"]["hits"]
    ]
    return pd.DataFrame(data=rows, columns=["_score"] + source_fields)

In [38]:
def semantic_search(query_text, size=10, k=None):
    """Run a ``neural`` (vector) search against ``vector_field``.

    Args:
        query_text: Free-text query; it is embedded server-side with ``model_id``.
        size: Maximum number of hits to return (defaults to 10).
        k: Number of nearest neighbours requested from the k-NN search;
            defaults to ``size``.

    Returns:
        DataFrame with columns ``_score``, ``tableName``, ``question`` and
        ``tableSchema``, one row per hit; empty (but with those columns) when
        nothing matches.
    """
    if k is None:
        k = size

    query = {
        "size": size,
        # The embedding vector is large and not needed in the result table.
        "_source": {"excludes": ["vector_field"]},
        "query": {
            "neural": {
                "vector_field": {"query_text": query_text, "model_id": model_id, "k": k}
            },
        },
    }

    res = aos_client.search(index=index_name, body=query)

    source_fields = ["tableName", "question", "tableSchema"]
    rows = [
        [hit["_score"]] + [hit["_source"][field] for field in source_fields]
        for hit in res["hits"]["hits"]
    ]
    return pd.DataFrame(data=rows, columns=["_score"] + source_fields)

In [39]:
def hybrid_search(query_text, keyword_weight=0.3, semantic_weight=0.7, size=10, k=30):
    """Combine a lexical and a neural query with a weighted score.

    Both sub-query scores are min-max normalised and merged with a weighted
    arithmetic mean by a search pipeline defined inline in the request.

    Args:
        query_text: Free-text query.
        keyword_weight: Relative weight of the ``multi_match`` sub-query.
        semantic_weight: Relative weight of the ``neural`` sub-query.
        size: Maximum number of hits to return (defaults to 10).
        k: Number of nearest neighbours requested from the neural sub-query.

    Returns:
        DataFrame with columns ``_score``, ``tableName``, ``question`` and
        ``tableSchema``, one row per hit.

    Raises:
        ValueError: If a weight is negative or both weights are zero.
    """
    if keyword_weight < 0 or semantic_weight < 0:
        raise ValueError("keyword_weight and semantic_weight must be non-negative")
    total_weight = keyword_weight + semantic_weight
    if total_weight <= 0:
        raise ValueError("at least one of keyword_weight / semantic_weight must be positive")
    # Rescale so the weights sum to 1; pairs that already sum to 1 are unchanged.
    weights = [keyword_weight / total_weight, semantic_weight / total_weight]

    query = {
        "size": size,
        # "excludes" is the spelling used by the other search helpers.
        "_source": {"excludes": ["text", "vector_field"]},
        "query": {
            "hybrid": {
                "queries": [
                    {
                        "multi_match": {
                            "query": query_text,
                            "fields": ["tableName", "question", "tableSchema"],
                        }
                    },
                    {
                        "neural": {
                            "vector_field": {
                                "query_text": query_text,
                                "model_id": model_id,
                                "k": k,
                            }
                        }
                    },
                ]
            }
        },
        # The order of the weights follows the order of the sub-queries above.
        "search_pipeline": {
            "description": "Post processor for hybrid search",
            "phase_results_processors": [
                {
                    "normalization-processor": {
                        "normalization": {"technique": "min_max"},
                        "combination": {
                            "technique": "arithmetic_mean",
                            "parameters": {"weights": weights},
                        },
                    }
                }
            ],
        },
    }

    res = aos_client.search(index=index_name, body=query)

    source_fields = ["tableName", "question", "tableSchema"]
    rows = [
        [hit["_score"]] + [hit["_source"][field] for field in source_fields]
        for hit in res["hits"]["hits"]
    ]
    return pd.DataFrame(data=rows, columns=["_score"] + source_fields)

In [40]:
# Run the same free-text query through the three search helpers so that their
# rankings can be compared side by side.
query_text = "people email"

# Lexical (multi_match) search.
display(keyword_search(query_text))

# Neural (vector) search.
display(semantic_search(query_text))

# Hybrid search, with the semantic score weighted 0.9 against 0.1 for keywords.
display(hybrid_search(query_text, keyword_weight=0.1, semantic_weight=0.9))

Unnamed: 0,_score,tableName,question,tableSchema
0,3.005003,Department_Stores,Can customers reach out to a department store...,dept_store_id|dept_store_chain_id|store_name|s...
1,2.675094,Customers,Which customers have provided both phone numb...,customer_id|payment_method_code|customer_code|...


Unnamed: 0,_score,tableName,question,tableSchema
0,0.489988,Customers,Which customers have provided both phone numb...,customer_id|payment_method_code|customer_code|...
1,0.478523,Department_Stores,Can customers reach out to a department store...,dept_store_id|dept_store_chain_id|store_name|s...
2,0.466766,Customers,Which customers have provided their contact i...,customer_id|payment_method_code|customer_code|...
3,0.465974,Addresses,• 10. Which addresses are associated with high...,address_id|address_details
4,0.465111,Addresses,• 4. Can you provide a list of addresses that ...,address_id|address_details
5,0.462699,Customer_Addresses,Which customers have had gaps or overlaps in ...,customer_id|address_id|date_from|date_to
6,0.462589,Suppliers,• 2. Which suppliers have a phone number listed?,supplier_id|supplier_name|supplier_phone
7,0.461825,Customers,Can customers' preferred communication channe...,customer_id|payment_method_code|customer_code|...
8,0.459224,Addresses,• 2. Which addresses have been recently added ...,address_id|address_details
9,0.455127,Customer_Addresses,Which customers have had multiple addresses a...,customer_id|address_id|date_from|date_to


Unnamed: 0,_score,tableName,question,tableSchema
0,0.9001,Customers,Which customers have provided both phone numb...,customer_id|payment_method_code|customer_code|...
1,0.860234,Department_Stores,Can customers reach out to a department store...,dept_store_id|dept_store_chain_id|store_name|s...
2,0.616901,Customers,Which customers have provided their contact i...,customer_id|payment_method_code|customer_code|...
3,0.607252,Addresses,• 10. Which addresses are associated with high...,address_id|address_details
4,0.59673,Addresses,• 4. Can you provide a list of addresses that ...,address_id|address_details
5,0.567321,Customer_Addresses,Which customers have had gaps or overlaps in ...,customer_id|address_id|date_from|date_to
6,0.56599,Suppliers,• 2. Which suppliers have a phone number listed?,supplier_id|supplier_name|supplier_phone
7,0.556671,Customers,Can customers' preferred communication channe...,customer_id|payment_method_code|customer_code|...
8,0.52496,Addresses,• 2. Which addresses have been recently added ...,address_id|address_details
9,0.475017,Customer_Addresses,Which customers have had multiple addresses a...,customer_id|address_id|date_from|date_to


In [42]:
# Hybrid search for the query, then keep the first five distinct table names
# in the order in which they were ranked.
rag_df = hybrid_search(query_text, semantic_weight=0.9, keyword_weight=0.1)
rag_table = rag_df['tableName'].drop_duplicates().head(5).values

# Reload the full schema metadata written earlier in the notebook.
meta = pd.read_csv('./data/total_meta.csv')

In [75]:
def merge_total_meta_rag(rag_table, meta):
    """Keep only the metadata rows of the retrieved tables, in retrieval order.

    Args:
        rag_table: Sequence of table names, most relevant first. Duplicates
            are ignored after their first occurrence.
        meta: DataFrame with at least a ``table_name`` column.

    Returns:
        A new DataFrame (``meta`` is left untouched) containing the rows whose
        ``table_name`` is in ``rag_table``, grouped in the order given by
        ``rag_table`` with the original row order preserved inside each
        table, and a fresh 0..n-1 index.
    """
    # De-duplicate while keeping first-seen order.
    ordered_tables = list(dict.fromkeys(rag_table))
    position = {name: rank for rank, name in enumerate(ordered_tables)}

    selected = meta[meta['table_name'].isin(ordered_tables)]
    # A stable sort keeps the columns of each table in their original order;
    # sorting through `key` avoids writing a helper column into a filtered slice.
    ranked = selected.sort_values(
        by='table_name',
        key=lambda names: names.map(position),
        kind='stable',
    )
    return ranked.reset_index(drop=True)

In [77]:
# SQL DDL generation function
def generate_sql_ddl(df):
    """Render ``CREATE TABLE`` statements from a column-level metadata frame.

    Args:
        df: DataFrame with ``table_name``, ``col_name`` and ``type`` columns,
            one row per column.

    Returns:
        One ``CREATE TABLE`` statement per table, in order of first
        appearance, separated by blank lines; an empty string when ``df`` has
        no rows.
    """

    def _render(table_name, column_lines):
        # One statement: a header, one indented line per column, and a closer.
        columns_str = ",\n".join(column_lines)
        return f'CREATE TABLE {table_name} (\n{columns_str}\n);'

    columns_by_table = {}
    for _, record in df.iterrows():
        declared_type = record['type']
        # 'numbe' is treated as a truncated spelling of 'number'.
        if declared_type == 'numbe':
            declared_type = 'number'
        columns_by_table.setdefault(record['table_name'], []).append(
            f"    {record['col_name']} {declared_type.upper()}"
        )

    return "\n\n".join(
        _render(table_name, column_lines)
        for table_name, column_lines in columns_by_table.items()
    )

In [None]:
# Restrict the metadata to the retrieved tables, ordered as in `rag_table`.
meta = merge_total_meta_rag(rag_table, meta)
# Generate the SQL DDL
sql_ddl = generate_sql_ddl(meta)
print(sql_ddl)