In [1]:
# Restore every variable previously saved with `%store`.
# NOTE(review): later cells use account_id, region, workgroup_arn, secret_arn,
# redshift_dbname, sample_question and sample_query without defining them in
# this notebook - presumably they are restored here; confirm an earlier
# notebook stores them.
%store -r

In [None]:
!pip install -U boto3

## Checking OpenSearch Serverless

If you haven't set up OpenSearch Serverless yet, please refer to the `lab2_text2sql_schema_preparation/1.setup_opensearch.ipynb` notebook.

In [2]:
from dotenv import load_dotenv, get_key
import os

# The .env file is looked up in the parent of the current working directory.
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
env_path = os.path.join(parent_path, '.env')

# Load the file's entries into the process environment as well.
load_dotenv(dotenv_path=env_path)

# Read the OpenSearch Serverless collection settings from the same .env file.
collection_name = get_key(key_to_get='COLLECTION_NAME', dotenv_path=env_path)
collection_arn = get_key(key_to_get='COLLECTION_ARN', dotenv_path=env_path)
collection_id = get_key(key_to_get='COLLECTION_ID', dotenv_path=env_path)
collection_endpoint = get_key(key_to_get='COLLECTION_ENDPOINT', dotenv_path=env_path)

In [None]:
# Display the COLLECTION_ARN value that was read from the .env file.
collection_arn

In [None]:
import json
import boto3

# Client for the OpenSearch Serverless control-plane API.
oss_client = boto3.client('opensearchserverless')

access_policy_name = 'text2sql-access-policy'

# Fetch the data access policy with the name above; the call raises if the
# lookup fails, so the policy has to exist before this cell is run.
response = oss_client.get_access_policy(
    name=access_policy_name,
    type='data'
)
# Keep the policy detail in a module-level name and show it.
policy_detail = response['accessPolicyDetail']
print(policy_detail)

The data access policy will be updated to include the `Knowledge Base service role`.

## IAM Role for Knowledge Base for Structured Data Source

In [None]:
import boto3
import json

def delete_iam_role(iam, role_name):
    """Delete an IAM role after removing every policy associated with it.

    Managed policies are detached and inline policies are deleted before the
    role itself is removed. Both listings are read through paginators so that
    policies beyond the first response page are not left behind (which would
    make the final ``delete_role`` call fail).

    Args:
        iam: boto3 IAM client (or an object exposing the same methods).
        role_name: Name of the role to delete.
    """
    # Collect everything first so that mutating the role does not interfere
    # with an in-progress listing.
    attached_pages = iam.get_paginator('list_attached_role_policies').paginate(RoleName=role_name)
    attached_arns = [
        policy['PolicyArn']
        for page in attached_pages
        for policy in page['AttachedPolicies']
    ]
    for policy_arn in attached_arns:
        iam.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

    inline_pages = iam.get_paginator('list_role_policies').paginate(RoleName=role_name)
    inline_names = [
        policy_name
        for page in inline_pages
        for policy_name in page['PolicyNames']
    ]
    for policy_name in inline_names:
        iam.delete_role_policy(RoleName=role_name, PolicyName=policy_name)

    iam.delete_role(RoleName=role_name)

    print(f"IAM Role '{role_name}' deleted successfully.")


def create_iam_role(account_id, region, workgroup_arn, secret_arn):
    """Create (or re-create) the Knowledge Base service role and return its ARN.

    If a role named ``KnowledgeBase_Structured_Role`` already exists it is
    deleted via ``delete_iam_role`` and created again; if it does not exist it
    is simply created. An inline policy granting the Redshift Data API,
    Secrets Manager, SQL Workbench and ``bedrock:GenerateQuery`` permissions
    listed below is then put on the role.

    Args:
        account_id: AWS account ID used in the trust policy conditions.
        region: AWS region used to build the knowledge-base source ARN pattern.
        workgroup_arn: Resource for ``redshift-data:ExecuteStatement``.
        secret_arn: Resource for ``secretsmanager:GetSecretValue``.

    Returns:
        The ARN of the role, taken from the create/get role response.
    """
    iam = boto3.client('iam')
    role_name = "KnowledgeBase_Structured_Role"

    # Only Bedrock, acting for a knowledge base in this account/region, may
    # assume the role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "TrustPolicyStatement",
                "Effect": "Allow",
                "Principal": {
                    "Service": "bedrock.amazonaws.com"
                },
                "Action": "sts:AssumeRole",
                "Condition": {
                    "StringEquals": {
                        "aws:SourceAccount": account_id
                    },
                    "ArnLike": {
                        "aws:SourceArn": f"arn:aws:bedrock:{region}:{account_id}:knowledge-base/*"
                    }
                }
            }
        ]
    }

    # get_role raises NoSuchEntityException when the role is absent; that is
    # the normal first-run case and must not abort role creation.
    try:
        iam.get_role(RoleName=role_name)
    except iam.exceptions.NoSuchEntityException:
        pass
    else:
        print(f"IAM Role '{role_name}' already exists. Deleting it...")
        delete_iam_role(iam, role_name)

    try:
        response = iam.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trust_policy)
        )
        print(f"IAM Role '{role_name}' created successfully.")
    except iam.exceptions.EntityAlreadyExistsException:
        # The role appeared between the check and the create call; reuse it.
        print(f"IAM Role '{role_name}' already exists.")
        response = iam.get_role(RoleName=role_name)

    policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "RedshiftDataAPIStatementPermissions",
                "Effect": "Allow",
                "Action": [
                    "redshift-data:GetStatementResult",
                    "redshift-data:DescribeStatement",
                    "redshift-data:CancelStatement"
                ],
                "Resource": ["*"],
                "Condition": {
                    "StringEquals": {
                        "redshift-data:statement-owner-iam-userid": "${aws:userid}"
                    }
                }
            },
            {
                "Sid": "RedshiftDataAPIExecutePermissions",
                "Effect": "Allow",
                "Action": ["redshift-data:ExecuteStatement"],
                "Resource": [workgroup_arn]
            },
            {
                "Sid": "GetSecretPermissions",
                "Effect": "Allow",
                "Action": ["secretsmanager:GetSecretValue"],
                "Resource": [secret_arn]
            },
            {
                "Sid": "SqlWorkbenchAccess",
                "Effect": "Allow",
                "Action": [
                    "sqlworkbench:GetSqlRecommendations",
                    "sqlworkbench:PutSqlGenerationContext",
                    "sqlworkbench:GetSqlGenerationContext",
                    "sqlworkbench:DeleteSqlGenerationContext"
                ],
                "Resource": "*"
            },
            # {
            #     "Sid": "OpenSearchServerlessAPIAccessAllStatement",
            #     "Effect": "Allow",
            #     "Action": [
            #         "aoss:APIAccessAll"
            #     ],
            #     "Resource": [ collection_arn ]
            # },
            {
                "Sid": "KbAccess",
                "Effect": "Allow",
                "Action": ["bedrock:GenerateQuery"],
                "Resource": "*"
            }
        ]
    }

    iam.put_role_policy(
        RoleName=role_name,
        PolicyName="KnowledgeBaseSturcturedSourcePolicy",
        PolicyDocument=json.dumps(policy)
    )
    print(f"Inline policy attached to role '{role_name}'.")

    # Get and return the role ARN
    role_arn = response['Role']['Arn']
    return role_arn


# Create the service role and keep its ARN for the knowledge base request.
# account_id, region, workgroup_arn and secret_arn are not defined in this
# notebook - presumably restored by `%store -r` at the top; confirm.
kb_role_arn = create_iam_role(account_id, region, workgroup_arn, secret_arn)
print(f"Created IAM Role ARN: {kb_role_arn}")

### Adding the Knowledge Base role as a principal to the OpenSearch Data Access policy

In [70]:
# policy_detail['policy'][0]['Principal'].append(kb_role_arn)

# updated_policy = json.dumps(policy_detail['policy'])
# update_response = oss_client.update_access_policy(
#     name=access_policy_name,
#     description=policy_detail['description'],
#     policy=updated_policy,
#     policyVersion=policy_detail['policyVersion'],
#     type='data'
# )

# print("Updated Access Policy:")
# print(json.dumps(update_response, indent=2))

## Bedrock Knowledge Base Creation

**Note:** Currently, each name and description passed in `generationContext` must be at most 200 characters long, so longer values are truncated below.

In [None]:
import json

def truncate_string(s, max_length=200):
    """Return ``s`` shortened to at most ``max_length`` characters.

    Strings that already fit are returned unchanged. Longer strings are cut
    and end with ``"..."`` so the result is exactly ``max_length`` characters.
    When ``max_length`` is too small to hold the ellipsis the string is cut
    hard instead, so the result never exceeds ``max_length``.

    Args:
        s: The string to shorten.
        max_length: Maximum allowed length of the result.

    Returns:
        The original or shortened string.
    """
    if len(s) <= max_length:
        return s

    ellipsis = "..."
    if max_length < len(ellipsis):
        # A negative slice bound would count from the end and return too much.
        return s[:max(max_length, 0)]
    return s[:max_length - len(ellipsis)] + ellipsis

def transform_schema(input_file, database_name, schema_name):
    """Convert a schema description file into a ``{"tables": [...]}`` dict.

    The input file is JSON: a list of single-key objects, each mapping a table
    name to an object with ``table_summary`` and ``cols`` (a list of objects
    with ``col`` and ``col_desc``). Every table and column is marked
    ``"INCLUDE"``; descriptions and column names are passed through
    ``truncate_string``. Only the first key of each list item is used.

    Args:
        input_file: Path to the JSON schema description.
        database_name: Database part of the fully qualified table name.
        schema_name: Schema part of the fully qualified table name.

    Returns:
        A dict with a single ``"tables"`` key holding the converted tables.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    transformed_schema = {"tables": []}

    for table in data:
        table_name, table_info = next(iter(table.items()))

        columns = [
            {
                "description": truncate_string(col["col_desc"]),
                "inclusion": "INCLUDE",
                "name": truncate_string(col["col"])
            }
            for col in table_info["cols"]
        ]

        transformed_schema["tables"].append({
            "columns": columns,
            "description": truncate_string(table_info["table_summary"]),
            "inclusion": "INCLUDE",
            "name": f"{database_name}.{schema_name}.{table_name}"
        })

    return transformed_schema

# Convert the schema file that sits next to this notebook, qualifying table
# names with the "public" schema. redshift_dbname is not defined in this
# notebook - presumably restored by `%store -r`; confirm.
input_file = "./redshift_schema.json"
transformed_schema = transform_schema(input_file, database_name=redshift_dbname, schema_name="public")

print(json.dumps(transformed_schema, indent=2))

In [None]:
def transform_queries(input_file):
    """Read example queries from a JSON Lines file.

    Each non-blank line must be a JSON object with ``input`` (the natural
    language question) and ``query`` (the SQL). The first entry of the result
    is then replaced by a manual example built from the module-level
    ``sample_question`` and ``sample_query``; if the file holds no queries the
    manual example becomes the only entry.

    Args:
        input_file: Path to the ``.jsonl`` file.

    Returns:
        A list of ``{'naturalLanguage': ..., 'sql': ...}`` dicts.
    """
    transformed_queries = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            query_data = json.loads(line)
            transformed_queries.append({
                'naturalLanguage': query_data['input'],
                'sql': query_data['query']
            })

    manual_query = {
        'naturalLanguage': sample_question,
        'sql': sample_query
    }
    if transformed_queries:
        transformed_queries[0] = manual_query
    else:
        # Nothing to replace; avoid an IndexError on an empty file.
        transformed_queries.append(manual_query)
    return transformed_queries

# Load the example queries that sit next to this notebook; this rebinds
# input_file, which earlier pointed at the schema file.
input_file = "./redshift_example_queries.jsonl"
transformed_queries = transform_queries(input_file)

print(json.dumps(transformed_queries, indent=2))

In [8]:
import uuid
import boto3

# Bedrock Agent client plus the settings read by create_knowledge_base() and
# the lookup cell below.
bedrock_agent = boto3.client('bedrock-agent')
kb_name = 'SQLKnowledgeBase'
# Passed as `embeddingModelArn` in the knowledge base request.
embedding_model = "amazon.titan-embed-text-v2:0"

def create_knowledge_base():
    """Send the create-knowledge-base request and return the service response.

    The request is assembled from module-level names: ``kb_name``,
    ``workgroup_arn``, ``secret_arn``, ``redshift_dbname``,
    ``transformed_queries``, ``transformed_schema``, ``embedding_model`` and
    ``kb_role_arn``. A fresh UUID is used as the client token on every call.

    NOTE(review): the configuration type is 'SQL' yet a vector configuration
    is also sent, and ``embeddingModelArn`` receives a model ID rather than a
    full ARN - confirm the service needs/accepts this.
    """
    # Copy only the fields the API expects from each curated query.
    curated_queries = [
        {'naturalLanguage': entry['naturalLanguage'], 'sql': entry['sql']}
        for entry in transformed_queries
    ]

    # Same for the table/column descriptions.
    table_contexts = []
    for tbl in transformed_schema['tables']:
        column_contexts = [
            {
                'description': column['description'],
                'inclusion': column['inclusion'],
                'name': column['name'],
            }
            for column in tbl['columns']
        ]
        table_contexts.append({
            'columns': column_contexts,
            'description': tbl['description'],
            'inclusion': tbl['inclusion'],
            'name': tbl['name'],
        })

    query_engine = {
        'type': 'SERVERLESS',
        'serverlessConfiguration': {
            'workgroupArn': workgroup_arn,
            'authConfiguration': {
                'type': 'USERNAME_PASSWORD',
                'usernamePasswordSecretArn': secret_arn,
            },
        },
    }

    query_generation = {
        'executionTimeoutSeconds': 120,
        'generationContext': {
            'curatedQueries': curated_queries,
            'tables': table_contexts,
        },
    }

    storage = [
        {
            'type': 'REDSHIFT',
            'redshiftConfiguration': {'databaseName': redshift_dbname},
        }
    ]

    vector_configuration = {
        'embeddingModelArn': embedding_model,
        'embeddingModelConfiguration': {
            'bedrockEmbeddingModelConfiguration': {
                'dimensions': 1024,
                'embeddingDataType': 'FLOAT32',
            }
        },
    }

    kb_configuration = {
        'sqlKnowledgeBaseConfiguration': {
            'type': 'REDSHIFT',
            'redshiftConfiguration': {
                'queryEngineConfiguration': query_engine,
                'queryGenerationConfiguration': query_generation,
                'storageConfigurations': storage,
            },
        },
        'type': 'SQL',
        'vectorKnowledgeBaseConfiguration': vector_configuration,
    }

    client_token = str(uuid.uuid4())
    return bedrock_agent.create_knowledge_base(
        name=kb_name,
        knowledgeBaseConfiguration=kb_configuration,
        roleArn=kb_role_arn,
        clientToken=client_token,
    )


In [None]:

# Look for an existing knowledge base with this name. The listing is read
# through a paginator so a match is found even when the account has more
# knowledge bases than fit in a single response page.
kb_paginator = bedrock_agent.get_paginator('list_knowledge_bases')
existing_kb = next(
    (
        kb
        for page in kb_paginator.paginate()
        for kb in page['knowledgeBaseSummaries']
        if kb['name'] == kb_name
    ),
    None,
)

if existing_kb:
    # Reuse the existing knowledge base instead of creating a duplicate.
    kb_id = existing_kb['knowledgeBaseId']
    print(f"Knowledge base '{kb_name}' already exists. Skipping creation.")
    print(f"Knowledge base ID: {kb_id}")
    print(f"Status: {existing_kb['status']}")
else:
    kb_response = create_knowledge_base()
    kb_id = kb_response['knowledgeBase']['knowledgeBaseId']
    print(f"Knowledge base '{kb_name}' created successfully.")
    print(f"Knowledge base ID: {kb_id}")

# Either way, fetch the full record to obtain the ARN.
response = bedrock_agent.get_knowledge_base(knowledgeBaseId=kb_id)
kb_arn = response['knowledgeBase']['knowledgeBaseArn']
print(f"Knowledge base ARN: {kb_arn}")

### Create Data Source

In [None]:
data_source_name = "redshift-data-source"

# Best-effort cleanup: remove a data source left over from a previous run so
# the name can be reused. A failure here is reported but not fatal; if the
# old data source really is in the way, the create call below will fail.
try:
    list_response = bedrock_agent.list_data_sources(knowledgeBaseId=kb_id)
    for data_source in list_response.get('dataSourceSummaries', []):
        if data_source['name'] == data_source_name:
            bedrock_agent.delete_data_source(
                dataSourceId=data_source['dataSourceId'],
                knowledgeBaseId=kb_id
            )
            print(f"Existing data source '{data_source_name}' deleted.")
            break
except Exception as e:
    print(f"Error checking/deleting existing data source: {str(e)}")

try:
    response = bedrock_agent.create_data_source(
        name=data_source_name,
        clientToken=str(uuid.uuid4()),
        dataSourceConfiguration={
            "type": "REDSHIFT_METADATA"
        },
        knowledgeBaseId=kb_id
    )

    data_source_id = response['dataSource']['dataSourceId']
    print(f"New Data Source ID: {data_source_id}")
except Exception as e:
    print(f"Error creating new data source: {str(e)}")
    # Stop here: continuing would leave data_source_id undefined, or worse,
    # holding a stale value restored by `%store -r`, and later cells would
    # operate on the wrong data source.
    raise

In [None]:
# Persist the knowledge base and data source identifiers so other notebooks
# can restore them with `%store -r`.
%store kb_id kb_arn data_source_id

### Start Ingestion Job (Sync)

In [None]:
import time

POLL_INTERVAL_SECONDS = 30
# Upper bound on the wait so the cell cannot poll forever if the job never
# reaches a terminal status; raise it if ingestion legitimately takes longer.
MAX_WAIT_SECONDS = 60 * 60
TERMINAL_STATUSES = ['COMPLETE', 'FAILED', 'STOPPED']

response = bedrock_agent.start_ingestion_job(
    dataSourceId=data_source_id,
    knowledgeBaseId=kb_id
)

ingestion_job_id = response['ingestionJob']['ingestionJobId']
print(f"Ingestion job started. Job ID: {ingestion_job_id}")

deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    job_status = bedrock_agent.get_ingestion_job(
        dataSourceId=data_source_id,
        knowledgeBaseId=kb_id,
        ingestionJobId=ingestion_job_id
    )
    ingestion_job = job_status['ingestionJob']
    status = ingestion_job['status']
    print(f"Current status: {status}")

    if status in TERMINAL_STATUSES:
        break

    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Ingestion job {ingestion_job_id} did not finish within "
            f"{MAX_WAIT_SECONDS} seconds (last status: {status})."
        )

    time.sleep(POLL_INTERVAL_SECONDS)

print(f"Ingestion job ended with status: {status}")

# Surface the reasons reported by the service instead of only the bare status.
if status == 'FAILED':
    for reason in ingestion_job.get('failureReasons', []):
        print(f"Failure reason: {reason}")