In [1]:
!pip install -q boto3
!pip install -q requests
!pip install -q requests-aws4auth
!pip install -q opensearch-py
!pip install -q tqdm

In [2]:
!pip install "faiss-cpu" --quiet
!pip install langchain --quiet
!pip install jq --quiet

In [7]:
import os
import sys
from langchain.document_loaders.json_loader import JSONLoader
from langchain.docstore.document import Document
import json
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import BedrockEmbeddings
from functools import reduce
from langchain.prompts import PromptTemplate
from sqlalchemy import MetaData
from sqlalchemy import create_engine


import re
import pandas as pd
import numpy as np
import json
import sqlite3

data_path = './data'
with open(f'{data_path}/tables.json', 'rb') as ofp:
    meta = json.load(ofp)
data = meta[0]

data = [i for i in meta if i['db_id'] == 'department_store']

data  = data[0]    
columns = data["column_names_original"]
col_df = pd.DataFrame(columns).iloc[1:]
col_df.rename(columns={0: 'table_idx', 1: 'col_name'}, inplace=True)
col_df

types_df = pd.DataFrame(data["column_types"]).iloc[1:]
types_df.rename(columns={0: 'type'}, inplace=True)
types_df

merged_col = pd.concat([col_df, types_df], axis=1)

tables_df = pd.DataFrame(data["table_names_original"])
tables_df.reset_index(inplace=True)
tables_df.columns = ['table_idx', 'table_name']

meta = pd.merge(tables_df, merged_col, on=['table_idx'])
meta = meta.drop(columns=['table_idx'])

In [8]:
from botocore.config import Config
import boto3
DB_NAME = "text2sql"
DB_FAISS_PATH = './vectorstore/db_faiss'

bedrock_region = athena_region = boto3.session.Session().region_name
retry_config = Config(retries = {'max_attempts': 100})
session = boto3.Session(region_name=bedrock_region)
bedrock = session.client('bedrock-runtime', region_name=bedrock_region, config=retry_config)

In [9]:
files = os.listdir('./data/rag')

with open(f'./data/rag/{files[0]}', 'rb') as ofp:
    df = json.load(ofp)

In [10]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings

# model_name="Titan-Embeddings-G1"
llm_emb = BedrockEmbeddings(client=bedrock)
dimension = 1536
llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f4497e530d0>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None, normalize=False)

In [11]:
def get_cfn_outputs(stackname, cfn):
    outputs = {}
    for output in cfn.describe_stacks(StackName=stackname)["Stacks"][0]["Outputs"]:
        outputs[output["OutputKey"]] = output["OutputValue"]
    return outputs

In [12]:
import boto3, json


region_name = "us-west-2"

cfn = boto3.client("cloudformation", region_name)
kms = boto3.client("secretsmanager", region_name)

stackname = "opensearch-workshop"
cfn_outputs = get_cfn_outputs(stackname, cfn)

aos_credentials = json.loads(
    kms.get_secret_value(SecretId=cfn_outputs["OpenSearchSecret"])["SecretString"]
)

aos_host = cfn_outputs["OpenSearchDomainEndpoint"]
aos_host

'search-opensearch-workshop-ashew5agtjkgsyxprzgu2m2oua.us-west-2.es.amazonaws.com'

In [13]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

auth = (aos_credentials["username"], aos_credentials["password"])

aos_client = OpenSearch(
    hosts=[{"host": aos_host, "port": 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [14]:
import requests

search_model = {"query": {"match": {"name": "OpenSearch-Cohere"}}, "size": 10}

response = requests.get(
    "https://" + aos_host + "/_plugins/_ml/models/_search", auth=auth, json=search_model
)
model_info = json.loads(response.text)
model_id = model_info["hits"]["hits"][0]["_id"]
model_id

'DTDeTJAB3Hj2edbFglKU'

In [15]:
pipeline = {
    "description": "Text to Sql Task - OpenSearch-cohere-060124084807",
    "processors": [
        {
            "text_embedding": {
                "model_id": model_id,
                "field_map": {
                    "text": "vector_field",
                },
            }
        }
    ],
}

pipeline_id = "text2sql_meta_data"
# aos_client.ingest.delete_pipeline(id=pipeline_id)
aos_client.ingest.put_pipeline(id=pipeline_id, body=pipeline)

{'acknowledged': True}

In [45]:
index_name = "rag_semantic_ver11"

# aos_client.indices.delete(index=index_name)

rag_semantic = {
    "settings": {
        "max_result_window": 15000,
        "analysis": {"analyzer": {"analysis-nori": {"type": "nori", "stopwords": "_korean_"}}},
        "index.knn": True,
        "default_pipeline": pipeline_id,
        "index.knn.space_type": "l2",
    },
    "mappings": {
        "properties": {
            "tableName": {
                "type": "text",
                "fields": {
                          "english": {
                            "type": "text",
                            "analyzer": "english"},                
                            },
            },
            "question": {
                "type": "text",
                "fields": {
                          "english": {
                            "type": "text",
                            "analyzer": "english"},                
                            },                
            },
            "tableSchema": {
                "type": "text",
                "fields": {
                          "english": {
                            "type": "text",
                            "analyzer": "english"},                
                            },     #{"keyword": {"type": "keyword", "ignore_above": 256}},
            },
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1024,
                "method": {"name": "hnsw", "space_type": "l2", "engine": "faiss"},
                "store": True,
            },

        }
    },
}



aos_client.indices.create(index=index_name, body=rag_semantic)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'rag_semantic_ver11'}

In [46]:
from tqdm import tqdm
from opensearchpy import helpers

def _generate_data():
    for doc in docs:
        yield {"_index": index_name, "_source": doc}

succeeded = []
failed = []

json_files = os.listdir('./data/rag')
for p in json_files:
    with open(f"./data/rag/{p}", 'rb') as ofp:
        docs = json.load(ofp)

    for success, item in helpers.parallel_bulk(
        aos_client, actions= _generate_data(), chunk_size=10, thread_count=1, queue_size=1
    ):
        if success:
            succeeded.append(item)
        else:
            failed.append(item)

In [47]:
# Refresh the index to make the changes visible
aos_client.indices.refresh(index=index_name)

count = aos_client.count(index=index_name)
print(count)

{'count': 140, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}}


In [48]:
def keyword_search(query_text):
    query = {
        "size": 10,
        "_source": {"excludes": ["vector_field"]},
        "query": {
            "multi_match": {
                "query": query_text,
                "fields": ["tableName", "question", "tableSchema"],
            }
        },
    }

    res = aos_client.search(index=index_name, body=query)

    query_result = []
    for hit in res["hits"]["hits"]:
        row = [
            hit["_score"],
            hit["_source"]["tableName"],
            hit["_source"]["question"],
            hit["_source"]["tableSchema"],            
        ]
        query_result.append(row)

    query_result_df = pd.DataFrame(
        data=query_result, columns=["_score", "tableName", "question", "tableSchema"]
    )
    display(query_result_df)

In [49]:
query_text = "customer table"
keyword_search(query_text)

Unnamed: 0,_score,tableName,question,tableSchema
0,3.837943,Customers,What is the relationship between customer cod...,customer_id|payment_method_code|customer_code|...
1,3.30594,Customers,What are the different types of customer code...,customer_id|payment_method_code|customer_code|...
2,2.894174,Staff,• 3. How many unique staff members are listed ...,staff_id|staff_gender|staff_name
3,2.697738,Customer_Addresses,What is the distribution of address changes a...,customer_id|address_id|date_from|date_to
4,2.658295,Customer_Orders,What is the total number of orders placed by ...,order_id|customer_id|order_status_code|order_date
5,2.61007,Customers,How can customers be categorized based on the...,customer_id|payment_method_code|customer_code|...
6,2.570412,Department_Stores,How can a customer contact a specific departm...,dept_store_id|dept_store_chain_id|store_name|s...
7,2.152261,Addresses,• 4. Can you provide a list of addresses that ...,address_id|address_details
8,2.152261,Customer_Orders,Can you determine the average time between or...,order_id|customer_id|order_status_code|order_date


In [50]:
def semantic_search(query_text):
    query = {
        "size": 10,
        "_source": {"excludes": ["vector_field"]},
        "query": {
            "neural": {"vector_field": {"query_text": query_text, "model_id": model_id, "k": 10}},
        },
    }

    res = aos_client.search(index=index_name, body=query)
    return res

#     query_result = []
#     for hit in res["hits"]["hits"]:
#         row = [
#             hit["_score"],
#             hit["_source"]["tableName"],
#             hit["_source"]["question"],
#             hit["_source"]["tableSchema"],            
#         ]
#         query_result.append(row)

#     query_result_df = pd.DataFrame(
#         data=query_result, columns=["_score", "tableName", "question", "tableSchema"]
#     )
#     display(query_result_df)

In [51]:
res = semantic_search(query_text)
# print(index_name)

In [52]:
res

{'took': 365,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [53]:
!git status

On branch rag
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   04_0_load_pdf_kr_docs_opensearch.ipynb[m
	[31mmodified:   rag-lexical search(opensearch).ipynb[m
	[31mmodified:   rag-semantic search(langchain).ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!git ad