# Lab. 3-1 Schema Preparation-2

In this notebook, we'll be focusing on the '3. Table Summarizer' process as illustrated in the diagram below.

Typically, the Schema Linking process for a multi-table structure is divided into two steps: table selection followed by column selection. It's crucial to have comprehensive descriptions for each table because, if the wrong table is selected, all subsequent steps become meaningless.

This notebook will simulate the process of using an LLM to create detailed descriptive documents for each table.

![Intro](../images/text2sql/schema-prep-1.png)


## Step 0: OpenSearch Configurations

In [None]:
from libs.ssm import parameter_store

# Read the OpenSearch connection settings through the project's parameter_store helper (libs.ssm).
pm = parameter_store('us-west-2')
domain_endpoint = pm.get_params(key="chatbot-opensearch_domain_endpoint", enc=False)
# The stored value is prefixed with the https:// scheme to form the endpoint URL.
opensearch_domain_endpoint = f"https://{domain_endpoint}"
opensearch_user_id = pm.get_params(key="chatbot-opensearch_user_id", enc=False)
# NOTE(review): enc=True presumably asks the helper to decrypt the value - confirm in libs.ssm.
opensearch_user_password = pm.get_params(key="chatbot-opensearch_user_password", enc=True)
print(opensearch_domain_endpoint)

## Step 1: Loading `Schema Description` & `Example Queries`

In [None]:
import json 
# Input files: the schema description (parsed as one JSON document) and the
# example queries (read line by line, one JSON object per line).
SCHEMA_FILE_PATH = "./chinook_schema.json"
SAMPLE_QUERY_FILE_PATH = "./example_queries_temp.jsonl"

def load_schema(file_path):
    """Read the UTF-8 JSON document at ``file_path`` and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as schema_file:
        return json.load(schema_file)

def load_queries(file_path):
    """Return all lines of the UTF-8 text file at ``file_path`` as a list.

    Lines are returned exactly as read, so each one keeps its line ending.
    """
    with open(file_path, mode='r', encoding='utf-8') as query_file:
        return list(query_file)

# Load both inputs once; `queries` holds the raw JSONL lines, which are parsed later.
schema = load_schema(SCHEMA_FILE_PATH)
queries = load_queries(SAMPLE_QUERY_FILE_PATH)

## Step 2: Table summarization

We utilize various information to generate table summary documents.

We create table summaries using all available resources, including the basic Schema Description document and Sample Queries.

Below is an LLM prompt template designed to incorporate this information.

In [None]:
# System prompt for table summarization: a one-element list holding a single
# text block. It is passed as the `system` argument of the Converse request.
summarization_sys_prompt = [{
    "text": """ 
You are a data analyst. Summarize the provided SQL table based on the given context.

<instruction> 
- Write a detailed summary based solely on the provided information. 
- Focus on the table's contents and structure, not on sample queries. 
- Describe the data types and content objectively, without subjective adjectives. 
- Do not mention sample queries or speculate on who might use the table. 
- Include potential use cases: questions the table can answer and analyses it enables. 
- Provide a concise summary without any preamble or introduction. 
</instruction>
""" 
}]

def get_summarization_prompt(table_schema, sample_queries):
    """Build the user message list for one table-summarization request.

    Both arguments are interpolated with f-string formatting: the schema goes
    inside ``<table schema>`` tags and the queries inside ``<sample queries>``
    tags.

    Returns:
        A one-element list containing a ``user`` message with a single text
        content block.
    """
    prompt_text = (
        f"<table schema>\n{table_schema}\n</table schema>\n\n"
        f"<sample queries>\n{sample_queries}\n</sample queries>"
    )
    user_message = {"role": "user", "content": [{"text": prompt_text}]}
    return [user_message]

In [None]:
import boto3
from botocore.config import Config

# AWS region for the Bedrock client and the model ID sent with each Converse request.
region_name = "us-west-2"
llm_model = "anthropic.claude-3-5-haiku-20241022-v1:0"

def init_boto3_client(region: str):
    """Create a ``bedrock-runtime`` boto3 client for ``region``.

    The client is configured with up to 10 attempts in "standard" retry mode.
    """
    retry_settings = {"max_attempts": 10, "mode": "standard"}
    client_config = Config(region_name=region, retries=retry_settings)
    return boto3.client(
        "bedrock-runtime",
        region_name=region,
        config=client_config,
    )

def converse_with_bedrock(boto3_client, sys_prompt, usr_prompt):
    """Send one Converse request and return the text of the reply's first content block.

    The request uses the module-level ``llm_model`` as the model ID, with
    temperature 0.0 and topP 0.1.

    Args:
        boto3_client: Client object exposing ``converse``.
        sys_prompt: Value passed as the ``system`` argument.
        usr_prompt: Value passed as the ``messages`` argument.
    """
    request = {
        "modelId": llm_model,
        "messages": usr_prompt,
        "system": sys_prompt,
        "inferenceConfig": {"temperature": 0.0, "topP": 0.1},
    }
    reply = boto3_client.converse(**request)
    first_block = reply['output']['message']['content'][0]
    return first_block['text']

def search_table_queries(queries, table_name):
    """Return the parsed example queries whose SQL references ``table_name``.

    Args:
        queries: Iterable of JSON Lines strings. Each non-blank line is
            expected to decode to an object with a ``query`` field holding
            the SQL text.
        table_name: Table to look for. Matching is case-insensitive and on
            whole identifiers, so ``Invoice`` does not match ``InvoiceLine``
            or ``InvoiceId``.

    Returns:
        The decoded query objects that mention the table, in input order.
        Lines that are not valid JSON are reported with ``print`` and skipped.
    """
    import re  # local import keeps this function self-contained within its cell

    # Lookarounds rather than \b so the match is anchored on identifier
    # boundaries even if the table name starts or ends with a non-word char.
    table_pattern = re.compile(
        rf"(?<!\w){re.escape(table_name)}(?!\w)", re.IGNORECASE
    )
    matched_queries = []

    for line in queries:
        if not line.strip():
            # Blank lines (e.g. a trailing newline in the file) are not records.
            continue
        try:
            query_data = json.loads(line)
        except json.JSONDecodeError:
            print(f"Invalid JSON line: {line}")
            continue
        if table_pattern.search(query_data['query']):
            matched_queries.append(query_data)

    return matched_queries

# Module-level Bedrock runtime client; the functions in this notebook read it as a global.
boto3_client = init_boto3_client(region_name)

#### Based on the given information, we will extract a summary document for the table named `Customer`.

In [None]:
table_name = 'Customer'

# Collect the example queries that mention this table and show them.
matched_queries = search_table_queries(queries, table_name)
print("matched queries:\n", matched_queries, "\n")

# schema[0] is indexed by table name here, so the first schema entry is
# expected to contain a 'Customer' key.
table_summary = converse_with_bedrock(boto3_client, 
                                      summarization_sys_prompt, 
                                      get_summarization_prompt(schema[0][table_name], matched_queries))

print(table_summary)

#### The code below performs this operation for all tables in the Schema Description (it takes about 2-3 minutes)

In [None]:
import os

# Intermediate output: schema entries with the generated table_summary added.
OUTPUT_FILE_PATH1 = "./chinook_detailed_schema_temp.json"

def summarize_table(table_name, table_desc, summarization_sys_prompt, queries):
    """Generate an LLM summary for one table and return its enriched description.

    Args:
        table_name: Name used as the key of the returned mapping.
        table_desc: Mapping describing the table; it is rendered into the prompt.
        summarization_sys_prompt: System prompt passed to the Converse request.
        queries: Example queries rendered into the prompt alongside the schema.

    Returns:
        ``{table_name: description}``, where ``description`` is a shallow copy
        of ``table_desc`` with the generated text stored under
        ``table_summary``.
    """
    table_summary = converse_with_bedrock(
        boto3_client,
        summarization_sys_prompt,
        get_summarization_prompt(table_desc, queries),
    )
    # Build a new dict instead of writing into the caller's schema entry:
    # otherwise a second run would render the previous summary into the prompt.
    return {table_name: {**table_desc, 'table_summary': table_summary}}

def write_summaries_to_file(summaries, file_path):
    """Write ``summaries`` to ``file_path`` as 4-space-indented UTF-8 JSON.

    The file is opened in write mode, and non-ASCII characters are written
    as-is rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(summaries, sink, **dump_options)

def process_schema(schema, summarization_sys_prompt, queries):
    """Summarize every table found in ``schema``.

    ``schema`` is iterated as a sequence of mappings from table name to table
    description. For each table, the matching example queries are looked up
    and passed to ``summarize_table``.

    Returns:
        A list with one ``summarize_table`` result per table, in iteration order.
    """
    return [
        summarize_table(
            table_name,
            table_desc,
            summarization_sys_prompt,
            search_table_queries(queries, table_name),
        )
        for table_info in schema
        for table_name, table_desc in table_info.items()
    ]

# Summarize every table in the schema (one Converse request per table) and save the result.
table_summaries = process_schema(schema, summarization_sys_prompt, queries)
write_summaries_to_file(table_summaries, OUTPUT_FILE_PATH1)

In the `chinook_detailed_schema_temp.json` file, you'll see that the table_summary has been added to the schema document.

As demonstrated above, providing the LLM with detailed information about 1) what columns are in the table, and 2) how the table is used, helps in selecting the correct table.

However, when the table summaries become too long, it's not feasible to pass summaries of all tables to the LLM. In such cases, it's better to explore the table summary information using vector similarity search.

## Step 3: Transform documents to vector embeddings and Store in OpenSearch

This step proceeds similarly to the sample query storage process performed in `1.sample_queries.ipynb`.

In [None]:
import yaml
from opensearchpy import OpenSearch, RequestsHttpConnection
# Name of the OpenSearch index that receives the detailed schema documents.
INDEX_NAME = "schema_description"

def load_opensearch_config():
    """Parse ``../libs/opensearch.yml`` with ``yaml.safe_load`` and return the result."""
    with open("../libs/opensearch.yml", mode='r', encoding='utf-8') as config_file:
        parsed_config = yaml.safe_load(config_file)
    return parsed_config

def init_opensearch(config):
    """Create an OpenSearch client and (re)create the target index.

    Args:
        config: Mapping providing ``settings`` and ``mappings-detailed-schema``,
            which together form the index body.

    Returns:
        The ``OpenSearch`` client, connected over HTTPS on port 443 with basic
        auth taken from the module-level endpoint, user ID and password.
    """
    index_body = {
        "settings": config['settings'],
        "mappings": config['mappings-detailed-schema'],
    }
    # The client takes a bare host name, so the scheme is stripped from the endpoint.
    host_name = opensearch_domain_endpoint.replace("https://", "")
    credentials = (opensearch_user_id, opensearch_user_password)

    os_client = OpenSearch(
        hosts=[{'host': host_name, 'port': 443}],
        http_auth=credentials,
        use_ssl=True,
        verify_certs=True,
        timeout=300,
        connection_class=RequestsHttpConnection,
    )

    create_os_index(os_client, index_body)
    return os_client

def create_os_index(os_client, mapping):
    """Create the ``INDEX_NAME`` index from ``mapping``, replacing any existing one.

    If the index already exists it is deleted first, so all documents
    previously stored in it are discarded.

    Args:
        os_client: OpenSearch client whose ``indices`` API is used.
        mapping: Index body passed to ``indices.create``.
    """
    # Pass the index by keyword in every call, matching the delete call, so
    # the code does not depend on the client accepting it positionally.
    if os_client.indices.exists(index=INDEX_NAME):
        os_client.indices.delete(index=INDEX_NAME)
        print("Existing index has been deleted. Create new one.")
    else:
        print("Index does not exist, Create one.")

    os_client.indices.create(index=INDEX_NAME, body=mapping)

# Load the index settings/mappings and connect. Note that init_opensearch
# deletes and recreates the index if it already exists.
config = load_opensearch_config()
os_client = init_opensearch(config)

In [None]:
# Bedrock model ID used to embed each table summary.
embed_model = "amazon.titan-embed-text-v2:0"
region_name = "us-west-2"

# Final output: schema entries with both table_summary and its embedding (table_summary_v).
OUTPUT_FILE_PATH2 = "./chinook_detailed_schema.json"

def summary_embedding():
    """Embed every table summary and save the enriched schema.

    Loads the entries from ``OUTPUT_FILE_PATH1``, sends each entry's
    ``table_summary`` to ``embed_model`` through ``boto3_client.invoke_model``,
    stores the returned ``embedding`` under ``table_summary_v`` and writes the
    whole list to ``OUTPUT_FILE_PATH2``.
    """
    with open(OUTPUT_FILE_PATH1, mode='r', encoding='utf-8') as source:
        entries = json.load(source)

    for entry in entries:
        # Only the first key of each entry is used as the table name.
        table_name = list(entry)[0]
        table_info = entry[table_name]

        request_body = json.dumps({"inputText": table_info["table_summary"]})
        response = boto3_client.invoke_model(modelId=embed_model, body=request_body)

        payload = json.loads(response['body'].read())
        table_info["table_summary_v"] = payload['embedding']

    with open(OUTPUT_FILE_PATH2, mode='w', encoding='utf-8') as sink:
        json.dump(entries, sink, ensure_ascii=False, indent=4)

# Run the embedding step; this writes OUTPUT_FILE_PATH2.
summary_embedding()

If you now open the `chinook_detailed_schema.json` file,
you'll see that the table_summary and its corresponding embedding have been added to the schema document.

In [None]:
def load_detailed_schema_descriptions(os_client):
    """Bulk-index the detailed schema documents from ``OUTPUT_FILE_PATH2``.

    One document per table is sent to ``INDEX_NAME`` with the table name as
    the document ID. Each document carries the table description, the column
    names and descriptions, the table summary and its embedding vector.

    Args:
        os_client: OpenSearch client whose ``bulk`` API is used.

    The outcome is reported with ``print``: either a success message or the
    reason for every item whose status is 400 or above.
    """
    # Read as UTF-8 explicitly: the file is written with encoding='utf-8' and
    # ensure_ascii=False, so the platform default encoding could mis-decode it.
    with open(OUTPUT_FILE_PATH2, 'r', encoding='utf-8') as file:
        schema_data = json.load(file)

    bulk_data = []
    for table in schema_data:
        for table_name, table_info in table.items():
            table_doc = {
                "table_name": table_name,
                "table_desc": table_info["table_desc"],
                "columns": [{"col_name": col["col"], "col_desc": col["col_desc"]} for col in table_info["cols"]],
                "table_summary": table_info["table_summary"],
                "table_summary_v": table_info["table_summary_v"]
            }
            # The bulk format alternates an action line with its document line.
            bulk_data.append({"index": {"_index": INDEX_NAME, "_id": table_name}})
            bulk_data.append(table_doc)

    # Newline-delimited JSON body, terminated by a final newline.
    bulk_data_str = '\n'.join(json.dumps(item) for item in bulk_data) + '\n'

    response = os_client.bulk(body=bulk_data_str)
    if response["errors"]:
        print("There were errors during bulk indexing:")
        for item in response["items"]:
            if 'index' in item and item['index']['status'] >= 400:
                print(f"Error: {item['index']['error']['reason']}")
    else:
        print("Bulk-inserted all items successfully.")

# Index the detailed schema documents using the client created above.
load_detailed_schema_descriptions(os_client)

#### Now the schema description has been stored in OpenSearch.