# Step 3. 두 개의 KB 간 내용 비교

### Setup
* 아래 패키지 설치 후, 커널 Restart 해주세요.

In [None]:
# Install / reinstall the AWS SDK and LangChain packages used by this notebook.
# Restart the kernel after this cell finishes so the freshly installed versions are imported.
%pip install --upgrade pip
%pip install boto3 --force-reinstall --quiet
%pip install botocore --force-reinstall --quiet
%pip install sqlalchemy==2.0.0 --quiet
%pip install langchain --force-reinstall --quiet
%pip install langchain_aws langchain-community --force-reinstall --quiet

### Bedrock Client 초기화

In [1]:
import boto3
import pprint
from botocore.client import Config
import json
import time
import os

pp = pprint.PrettyPrinter(indent=2)

# The default boto3 session supplies the region both clients are created in.
session = boto3.session.Session()
region = session.region_name

# 120-second connect/read timeouts with retry attempts set to 0.
# Only the agent-runtime client below is created with this config.
bedrock_config = Config(
    retries={"max_attempts": 0},
    read_timeout=120,
    connect_timeout=120,
)
bedrock_client = boto3.client("bedrock-runtime", region_name=region)
bedrock_agent_client = boto3.client(
    "bedrock-agent-runtime",
    region_name=region,
    config=bedrock_config,
)
print(region)

us-east-1


### LangChain 초기화

In [2]:
import langchain
from langchain_aws import ChatBedrock
from langchain.retrievers.bedrock import AmazonKnowledgeBasesRetriever


# Bedrock model identifier; change it to use a different version from the model provider.
modelId = "anthropic.claude-3-sonnet-20240229-v1:0"

# Chat model wrapper that reuses the bedrock-runtime client created earlier.
llm = ChatBedrock(client=bedrock_client, model_id=modelId)

### 사전에 생성한 KB 목록 확인
* 사용하려는 2개의 KB ID를 확인합니다.

In [3]:
import boto3

def list_knowledge_bases(bedrock_agent=None):
    """Print a summary of every Bedrock Knowledge Base returned by the API.

    All pages of ``list_knowledge_bases`` are walked by following
    ``nextToken``. For each Knowledge Base the name, ID, description
    (``N/A`` when absent), status and last-updated timestamp are printed.

    Args:
        bedrock_agent: Optional object exposing ``list_knowledge_bases``
            (normally a boto3 ``bedrock-agent`` client). When omitted, a
            client is created with ``boto3.client('bedrock-agent')``.

    Returns:
        None. The listing is written to standard output.
    """
    if bedrock_agent is None:
        # Create the Bedrock agent client only when the caller did not supply one.
        bedrock_agent = boto3.client('bedrock-agent')

    request = {'maxResults': 100}  # maximum number of results per page
    found_any = False

    while True:
        response = bedrock_agent.list_knowledge_bases(**request)

        for kb in response.get('knowledgeBaseSummaries', []):
            if not found_any:
                # Print the header once, and only if there is something to list.
                print("Found Knowledge Bases:")
                found_any = True
            print(f"- Name: {kb['name']}")
            print(f"  ID: {kb['knowledgeBaseId']}")
            print(f"  Description: {kb.get('description', 'N/A')}")
            print(f"  Status: {kb['status']}")
            print(f"  Last Updated: {kb['updatedAt']}")
            print("---")

        # Pagination: keep requesting pages while the service returns a token.
        next_token = response.get('nextToken')
        if not next_token:
            break
        request['nextToken'] = next_token

    if not found_any:
        print("No Knowledge Bases found.")


if __name__ == "__main__":
    list_knowledge_bases()

Found Knowledge Bases:
- Name: ITB-Civil_Eng
  ID: RYYEVXXULD
  Description: N/A
  Status: ACTIVE
  Last Updated: 2024-07-19 02:34:45.555184+00:00
---
- Name: 240718-test
  ID: LEX6UDHS1Q
  Description: N/A
  Status: ACTIVE
  Last Updated: 2024-07-18 04:14:32.625010+00:00
---
- Name: knowledge-base-quick-start-aafi3
  ID: KRQKUHNJIV
  Description: N/A
  Status: ACTIVE
  Last Updated: 2024-02-26 08:27:55.881890+00:00
---
- Name: ITB-ES_Contracts
  ID: JXGUVQQIK6
  Description: N/A
  Status: ACTIVE
  Last Updated: 2024-07-19 00:42:45.537844+00:00
---
- Name: knowledge-base-quick-start-en
  ID: DJLWVWNGBT
  Description: N/A
  Status: ACTIVE
  Last Updated: 2024-02-26 12:26:22.625358+00:00
---


In [4]:
# IDs of the two Knowledge Bases to compare; check them against the listing and edit as needed.
kb_id_1, kb_id_2 = (
    "JXGUVQQIK6",  # ITB-ES_Contracts
    "RYYEVXXULD",  # ITB-Civil_Eng
)

In [5]:
# Look up the Name of a Knowledge Base from its ID.

def list_knowledge_bases(kb_id, bedrock_agent=None):
    """Return the name of the Knowledge Base whose ID equals ``kb_id``.

    Pages of ``list_knowledge_bases`` are fetched (following ``nextToken``)
    until a summary with a matching ``knowledgeBaseId`` is found or the
    listing is exhausted. The outcome is also printed.

    NOTE(review): this definition reuses the name of the zero-argument
    listing function defined in an earlier cell and replaces it once this
    cell has run — confirm that is intended.

    Args:
        kb_id: Knowledge Base ID to search for.
        bedrock_agent: Optional object exposing ``list_knowledge_bases``
            (normally a boto3 ``bedrock-agent`` client). When omitted, a
            client is created with ``boto3.client('bedrock-agent')``.

    Returns:
        The matching Knowledge Base name, or ``None`` when no Knowledge
        Base with that ID is listed.
    """
    if bedrock_agent is None:
        bedrock_agent = boto3.client('bedrock-agent')

    request = {'maxResults': 100}
    while True:
        response = bedrock_agent.list_knowledge_bases(**request)

        for kb in response.get('knowledgeBaseSummaries', []):
            if kb['knowledgeBaseId'] == kb_id:
                name = kb['name']
                print(f"Knowledge Base with ID {kb_id} is named: {name}")
                # Stop immediately; later pages do not need to be fetched.
                return name

        next_token = response.get('nextToken')
        if not next_token:
            break
        request['nextToken'] = next_token

    print(f"No Knowledge Base found with ID {kb_id}")
    return None


# Resolve the display name of each Knowledge Base from its ID.
# A name is None when the lookup does not find the ID.
kb_name_1 = list_knowledge_bases(kb_id_1)
kb_name_2 = list_knowledge_bases(kb_id_2)

Knowledge Base with ID JXGUVQQIK6 is named: ITB-ES_Contracts
Knowledge Base with ID RYYEVXXULD is named: ITB-Civil_Eng


In [6]:
from langchain.prompts import PromptTemplate

# 프롬프트 템플릿
PROMPT_TEMPLATE = """
Human: You are a ITB advisor AI system, and provides answers to questions by using fact based when possible. 
Use the following pieces of information to provide a detail answer to the question enclosed in <question> tags. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

<context>
{context}
</context>


<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:"""
claude_prompt = PromptTemplate(template=PROMPT_TEMPLATE, 
                               input_variables=["context","question"])


# 사용자 쿼리

query = """
Reference Site condition 에 대해서 알려주세요.
"""


In [7]:
%%time

# Retrieve passages relevant to the query from each Knowledge Base.
def _build_kb_retriever(knowledge_base_id, number_of_results=10):
    """Create a Knowledge Base retriever that returns up to ``number_of_results`` hits.

    Both Knowledge Bases are queried with the same settings, so the
    configuration lives in one place. ``endpoint_url``, ``region_name`` and
    ``credentials_profile_name`` can additionally be passed to
    ``AmazonKnowledgeBasesRetriever`` if the defaults are not suitable.
    """
    return AmazonKnowledgeBasesRetriever(
        knowledge_base_id=knowledge_base_id,
        retrieval_config={
            "vectorSearchConfiguration": {
                "numberOfResults": number_of_results,
                "overrideSearchType": "HYBRID",  # optional
            }
        },
    )


retriever1 = _build_kb_retriever(kb_id_1)
retriever2 = _build_kb_retriever(kb_id_2)

# `get_relevant_documents` is deprecated in LangChain and emits a deprecation
# warning; `invoke` is the supported replacement and returns the documents.
docs1 = retriever1.invoke(query)
docs2 = retriever2.invoke(query)

  warn_deprecated(


CPU times: user 174 ms, sys: 37.6 ms, total: 212 ms
Wall time: 1.34 s


### 두 KB (문서들)에 대해 검색된 결과를 각각 확인합니다.

#### KB 1 검색 내용 출력

In [8]:
# Collect the source URI and text of every document retrieved from KB 1.
uri_list_1 = []
content_list_1 = []

for doc in docs1:
    uri = doc.metadata['location']['s3Location']['uri']
    content = doc.page_content

    # Append to the lists
    uri_list_1.append(uri)
    content_list_1.append(content)

    # Print a short preview
    print(f"URI: {uri}")
    print(f"Content: {content[:100]}...")  # only the first 100 characters
    print("-" * 50)  # separator line

# Check the stored data
print(f"\n총 {len(uri_list_1)}개의 문서가 처리되었습니다.")
if uri_list_1:
    # Guarded: indexing [0] would raise IndexError when nothing was retrieved.
    print(f"URI 리스트의 첫 번째 항목: {uri_list_1[0]}")
    print(f"Content 리스트의 첫 번째 항목: {content_list_1[0][:100]}...")

URI: s3://240719-jesamkim-bucket/ES_Contracts/2_UHP/Schedules_Execution Version/Execution Version_Schedule 22A_Technical Limits/Schedule 22A - Annex 3 - Section 3.2_Design and Performance Data.pdf
Content: # SCHEDULE 22A, ANNEX 3
Rev 5_19Mar2019
Bid Forms Formsheet F: Design and Performance Data
| Descrip...
--------------------------------------------------
URI: s3://240719-jesamkim-bucket/ES_Contracts/1_UHP/Schedules_Execution Version/Execution Version_Schedule 16A_Part 3 EPC Contractor Proposal/Section 3/Section 3.2 Design and Peroformance Data/Section 3.2_Design and Performance Data.pdf
Content: # SCHEDULE 22A, ANNEX 3
Rev 5_19Mar2019
Bid Forms Formsheet F: Design and Performance Data
| Descrip...
--------------------------------------------------
URI: s3://240719-jesamkim-bucket/ES_Contracts/2_UHP/Schedules_Execution Version/Execution Version_Schedule 16A_Part 1_MFS/Part 1 Appendix C/Sch 16A Part 1 App C - Annex 6 Section 3.2_Design and Performance Data.pdf
Content: # SCHEDULE 16A

#### KB 2 검색 내용 출력

In [9]:
# Collect the source URI and text of every document retrieved from KB 2.
uri_list_2 = []
content_list_2 = []

for doc in docs2:
    uri = doc.metadata['location']['s3Location']['uri']
    content = doc.page_content

    # Append to the lists
    uri_list_2.append(uri)
    content_list_2.append(content)

    # Print a short preview
    print(f"URI: {uri}")
    print(f"Content: {content[:100]}...")  # only the first 100 characters
    print("-" * 50)  # separator line

# Check the stored data
print(f"\n총 {len(uri_list_2)}개의 문서가 처리되었습니다.")
if uri_list_2:
    # Guarded: indexing [0] would raise IndexError when nothing was retrieved.
    print(f"URI 리스트의 첫 번째 항목: {uri_list_2[0]}")
    print(f"Content 리스트의 첫 번째 항목: {content_list_2[0][:100]}...")

URI: s3://240719-jesamkim-bucket/Civil_Eng_Contracts/[0] RML7 ITB/Conditions of Contracts/M-ESD-200000-GN00-CON-000002-03 Addendum 01.pdf
Content: The inspection carried out by the Employer's Representative to verify the execution of works at thei...
--------------------------------------------------
URI: s3://240719-jesamkim-bucket/Civil_Eng_Contracts/[0] RML7 ITB/Conditions of Contracts/Exhibit C_General Conditions_Rev 01.pdf
Content: # SECTION I: GENERAL PROVISIONS
## 1 DEFINITIONS
The following terms wherever mentioned herein or in...
--------------------------------------------------
URI: s3://240719-jesamkim-bucket/Civil_Eng_Contracts/[0] RML7 ITB/Conditions of Contracts/Exhibit B_Special Conditions_Rev 01.pdf
Content: Reference to a Completion Certificate in General Condition 66 "RESPONSIBILITY FOR THE WORKS" shall m...
--------------------------------------------------
URI: s3://240719-jesamkim-bucket/Civil_Eng_Contracts/[0] RML7 ITB/Conditions of Contracts/Exhibit B_Special Co

### 두 KB 검색 결과에 대해서 차이점을 Claude 3 Sonnet 에게 질의

In [10]:
# Helper that calls Claude 3 Sonnet on Amazon Bedrock.
def get_text_response(input_content):
    """Send ``input_content`` to Claude 3 Sonnet via Bedrock and return the reply.

    A new ``ChatBedrock`` instance is created on every call. The credentials
    profile, region and endpoint URL are read from the ``BWB_PROFILE_NAME``,
    ``BWB_REGION_NAME`` and ``BWB_ENDPOINT_URL`` environment variables; a
    variable that is not set is passed as ``None``.

    Args:
        input_content: Prompt passed to the model.

    Returns:
        The ``content`` of the message returned by the model
        (presumably a plain string for text-only replies — TODO confirm).
    """
    llm = ChatBedrock(
        credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"),
        region_name=os.environ.get("BWB_REGION_NAME"),
        endpoint_url=os.environ.get("BWB_ENDPOINT_URL"),
        model_id="anthropic.claude-3-sonnet-20240229-v1:0",      # Claude 3 Sonnet
        #model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0", # Claude 3.5 Sonnet
        model_kwargs={
            "max_tokens": 4096,
            "temperature": 0,
            "top_p": 0.0,
            "top_k": 0,
        }
    )
    # `predict` is deprecated on LangChain chat models; `invoke` returns a
    # message object whose `.content` carries the generated text.
    return llm.invoke(input_content).content

In [11]:
context_1 = content_list_1
context_2 = content_list_2

# Interpolating a list directly into an f-string inserts its Python repr
# (brackets, quotes, escaped newlines). Join the passages into plain text so
# the model receives the document content itself.
context_text_1 = "\n\n".join(context_1)
context_text_2 = "\n\n".join(context_2)

# Prompt asking the model to compare the two retrieval results.
comp_prompt = f"""
{kb_name_1}의 내용인 <result1></result1> 과 {kb_name_2}의 내용인 <result2></result2>를 비교 합니다.
답변은 자세하게 기술해주세요. 
모르는 내용은 말하지 않습니다.

<result1>
{context_text_1}
</result1>

<result2>
{context_text_2}
</result2>

XML tag는 답변에 포함하지 마세요.

"""

In [13]:
%%time

# Send the comparison prompt to the model and print its answer.
input_text = comp_prompt
response_content = get_text_response(input_content=input_text)
print(response_content)
print()

ITB-ES_Contracts의 내용은 주로 발전소 설계 및 성능 데이터에 관한 것으로 보입니다. 주요 내용은 다음과 같습니다:

- 설계 조건(기준 현장 조건, 연료 데이터 등)
- 다양한 생산 조건(100%, 75%, 70% 등)에서의 해수 담수화 생산량

ITB-Civil_Eng의 내용은 주로 건설 공사 계약 조건에 관한 것으로 보입니다. 주요 내용은 다음과 같습니다:

- 현장 숨겨진 결함에 대한 계약자의 책임
- 현장 정보 및 현장 조건에 대한 계약자의 의무
- 공사 기간 중 계약자의 현장 운영
- 하자보수기간 중 계약자의 현장 운영
- 불가항력 조건

두 문서의 내용은 서로 다른 주제를 다루고 있으며, 직접적인 연관성은 없는 것으로 보입니다.

CPU times: user 135 ms, sys: 7.05 ms, total: 142 ms
Wall time: 8.43 s
