In [None]:
!pip install boto3 faiss-cpu

In [None]:
import boto3
import json
import re
import faiss
import numpy as np

# Load Data

In [None]:
# S3 client shared by the cells below for listing and downloading case files.
s3_client = boto3.client('s3')

In [None]:
# Bucket that holds the case JSON files.
bucket_name = 'sageon-mungchi-service'
# Key prefix passed to list_objects_v2 when listing the case files.
key = 'case_data'

In [None]:
# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page with a paginator to avoid silently missing case files.
paginator = s3_client.get_paginator('list_objects_v2')
file_keys = [
    content['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key)
    for content in page.get('Contents', [])
]

file_keys

In [None]:
def clean_content(content):
    """Split raw case text into a list of whitespace-normalised sentences.

    Whitespace runs are collapsed to a single space and the text is split
    after every occurrence of '다.' (used here as the sentence terminator).

    Args:
        content: Raw text of one case field.

    Returns:
        A list of stripped, non-empty sentences. Text that ends with '다.'
        no longer produces a trailing empty string, and empty input yields
        an empty list.
    """
    content = re.sub(r'\s+', ' ', content)
    content = content.replace('다.', '다.\n')
    # Drop the empty fragment that follows the final '다.' (and any other
    # fragment that is blank after stripping).
    return [sent.strip() for sent in content.split('\n') if sent.strip()]

In [None]:
# Download the first listed case file and parse its JSON payload.
case_data = json.loads(
    s3_client.get_object(Bucket=bucket_name, Key=file_keys[0])['Body'].read().decode('utf-8')
)

# Replace each free-text field with its list of cleaned sentences.
for field in ('case_issue', 'case_summary', 'case_detail'):
    case_data[field] = clean_content(case_data[field])

case_data

# 임베딩 테스트

In [None]:
# Bedrock runtime client (ap-northeast-1) used for every embedding request below.
br_client = boto3.client("bedrock-runtime", region_name="ap-northeast-1")

In [None]:
# Titan: embed the first cleaned sentence of the case detail as a smoke test.
model_id = "amazon.titan-embed-text-v1"

response = br_client.invoke_model(
    modelId=model_id,
    body=json.dumps({"inputText": case_data['case_detail'][0]}),
)
model_response = json.loads(response["body"].read())

input_token_count = model_response["inputTextTokenCount"]
embedding = model_response["embedding"]

print(input_token_count, embedding)

In [None]:
# Cohere: embed the same sentence with the multilingual model for comparison.
model_id = "cohere.embed-multilingual-v3"

response = br_client.invoke_model(
    modelId=model_id,
    body=json.dumps({"texts": [case_data['case_detail'][0]], 'input_type': 'search_query'}),
)
model_response = json.loads(response["body"].read())
# The response carries one embedding per input text; only one text was sent.
embedding = model_response['embeddings'][0]

embedding

# 벡터 데이터베이스

In [None]:
import gc
# Run a full garbage-collection pass; presumably to release memory left over
# from the exploratory cells above before building the indexes.
gc.collect()

In [None]:
def clean_text(text):
    """Remove newlines and squeeze every whitespace run into a single space.

    Newlines are deleted outright (not replaced by a space), so the text on
    both sides of a line break is joined directly.
    """
    without_newlines = ''.join(text.split('\n'))
    return re.sub(r'\s+', ' ', without_newlines)

In [None]:
def load_content(file_key):
    """Download one case JSON object from S3 and clean its text fields.

    Args:
        file_key: S3 object key inside ``bucket_name``.

    Returns:
        The parsed case dict with 'case_issue', 'case_summary' and
        'case_detail' replaced by the result of ``clean_text``.
    """
    raw_bytes = s3_client.get_object(Bucket=bucket_name, Key=file_key)['Body'].read()
    case_data = json.loads(raw_bytes.decode('utf-8'))

    for field in ('case_issue', 'case_summary', 'case_detail'):
        case_data[field] = clean_text(case_data[field])

    return case_data

In [None]:
def get_titan_embedding(text):
    """Return the Titan text embedding for the first 2048 characters of ``text``."""
    payload = json.dumps({"inputText": text[0:2048]})
    response = br_client.invoke_model(modelId="amazon.titan-embed-text-v1", body=payload)
    return json.loads(response["body"].read())["embedding"]

In [None]:
def get_cohere_embedding(text, input_type='search_query'):
    """Return the Cohere multilingual embedding for ``text``.

    Args:
        text: Text to embed. Only the first 2048 characters are sent.
        input_type: Cohere ``input_type`` for the request. Defaults to
            'search_query' (the previous hard-coded value) so existing callers
            behave the same. Pass 'search_document' when embedding the texts
            that are stored in the index, as the Cohere Embed documentation
            recommends for retrieval.

    Returns:
        The embedding (list of floats) of the single submitted text.
    """
    model_id = "cohere.embed-multilingual-v3"
    native_request = {"texts": [text[0:2048]], 'input_type': input_type}
    request = json.dumps(native_request)
    response = br_client.invoke_model(modelId=model_id, body=request)
    model_response = json.loads(response["body"].read())
    # One text was submitted, so the first embedding is the only one.
    return model_response['embeddings'][0]

In [None]:
def get_embeddings(documents, embedding_type='titan'):
    """Embed the 'text' of every document with the chosen model.

    Args:
        documents: Iterable of dicts that each contain a 'text' entry.
        embedding_type: 'titan' or 'cohere'.

    Returns:
        A list with one embedding per document, in input order.

    Raises:
        ValueError: If ``embedding_type`` is neither 'titan' nor 'cohere'.
            The check happens before any request is made, so an invalid type
            is reported even when ``documents`` is empty.
    """
    if embedding_type not in ('titan', 'cohere'):
        raise ValueError('Embedding type must be titan or cohere.')

    use_titan = embedding_type == 'titan'
    embeddings = []

    for doc in documents:
        text = doc['text']
        if use_titan:
            embeddings.append(get_titan_embedding(text))
        else:
            embeddings.append(get_cohere_embedding(text))

    return embeddings

### titan, cohere 중 더 좋은 걸 사용하기 위한 비교

In [None]:
%%time

documents = []

for file_key in file_keys[0:5]:
    case_data = load_content(file_key)
    
    for key in ['case_issue', 'case_summary', 'case_detail']:
        text = case_data[key]
        
        if text == '':
            continue
            
        documents.append({
            'text': text,
            'metadata': {
                'case_no': case_data['case_no'],
                'case_name': case_data['case_name'],
                'case_date': case_data['case_date'],
                'case_court': case_data['case_court'],
            }
        })

len(documents)

%%time

titan_embeddings = get_embeddings(documents, 'titan')
titan_index = faiss.IndexFlat(len(titan_embeddings[0]))
titan_index.add(np.array(titan_embeddings))

In [None]:
%%time

cohere_embeddings = get_embeddings(documents, 'cohere')
cohere_index = faiss.IndexFlat(len(cohere_embeddings[0]))
cohere_index.add(np.array(cohere_embeddings))

In [None]:
import random

# Peek at one randomly chosen document to sanity-check the loaded text.
random_index = random.randrange(len(documents))
documents[random_index]['text']

In [None]:
# Test question in Korean, roughly: "I moved my office within the same
# neighbourhood; do I have to pay more tax because the new building costs more?"
query = '사무실을 같은 동네에서 이전했는데 새로운 건물에 가격이 더 비싸다고 더 세금을 많이 내야 해?'

In [None]:
titan_query_embedding = get_titan_embedding(query)
# FAISS expects a float32 (n_queries, dim) matrix; build it explicitly.
titan_query_matrix = np.asarray([titan_query_embedding], dtype=np.float32)
distances, indices = titan_index.search(titan_query_matrix, 3)
print("검색 결과:")
for i, idx in enumerate(indices[0]):
    print(f"Rank {i+1}:")
    print(f"Text: {documents[idx]['text']}")
    print(f"Distance: {distances[0][i]}")

In [None]:
cohere_query_embedding = get_cohere_embedding(query)
# FAISS expects a float32 (n_queries, dim) matrix; build it explicitly.
cohere_query_matrix = np.asarray([cohere_query_embedding], dtype=np.float32)
distances, indices = cohere_index.search(cohere_query_matrix, 3)
print("검색 결과:")
for i, idx in enumerate(indices[0]):
    print(f"Rank {i+1}:")
    print(f"Text: {documents[idx]['text']}")
    print(f"Distance: {distances[0][i]}")

<span style="color: red;"> titan vs cohere = 0: 3 </span>
- cohere와 titan 중 cohere가 좀 더 짧은 텍스트에서 핵심을 추출하는 경향이 있음
- ranking 내에서도 핵심인 내용이 상위로 더 잘 올라옴
- 가격은 동일함
- 따라서 cohere로 결정!

### 입력한계인 2048자를 넘기는 경우에 대해

In [None]:
over_2048 = []
full_count = 0

for file_key in file_keys:
    case_data = load_content(file_key)

    # The loop variable is named `field`, not `key`: this cell runs at module
    # level, so `key` would overwrite the global S3 prefix used for listing.
    for field in ['case_issue', 'case_summary', 'case_detail']:
        text = case_data[field]
        # Every field counts towards the denominator, including empty ones.
        full_count += 1

        if text == '':
            continue

        if len(text) > 2048:
            # Print the length and by what percentage it exceeds 2048 characters.
            print(len(text), round((len(text)-2048)/2048*100, 0))
            over_2048.append(text)

# Percentage of all fields whose text is longer than 2048 characters.
print(round(len(over_2048)/full_count*100, 2))

<span style="color: red;"> 30%가 2048자를 넘는다. 많이 넘는건 최대 400% 이상 넘는다. </span>
- 텍스트를 잘라서 내용 손실을 보지 않는 것이 좋을 듯 하다.

In [None]:
# def cos_sim(a, b):
#     a = np.array(a)
#     b = np.array(b)

#     dot = np.dot(a, b)
#     norm_a = np.linalg.norm(a)
#     norm_b = np.linalg.norm(b)

#     return dot / (norm_a * norm_b)

In [None]:
# threshold = 0.5

# def divide_text(sent_list):
#     part_list = []
#     before_embedding = None
    
#     for i, sent in enumerate(sent_list):
#         if i == 0:
#             part_list.append(sent)
#             before_embedding = get_cohere_embedding(sent)
#             continue

#         embedding = get_cohere_embedding(sent)
#         if cos_sim(before_embedding, embedding) > threshold:
#             part_list[-1] = part_list[-1] + ' ' + sent
#         else:
#             part_list.append(sent)

#     return part_list

In [None]:
# Revised clean_text.
# idea1: split the text around the 【이 유】 (reasoning) marker.
# idea2: split the reasoning into numbered sections / sentences.
def _sliding_windows(text, size, step):
    """Return windows of at most ``size`` characters that start every ``step`` characters."""
    return [text[i:i + size] for i in range(0, len(text), step)]


def clean_text(text, max_chars=2048, slide_step=1024, min_section_chars=150):
    """Normalise whitespace and split long case text into chunks of at most ``max_chars``.

    Text whose raw length is within ``max_chars`` is returned as a single
    cleaned chunk. Longer text is split around the '【주 문】' and '【이 유】'
    markers: the first chunk is everything up to '【이 유】', and every
    numbered section after '【이 유】' becomes a chunk prefixed with the
    '【주 문】' text so each chunk keeps that context.

    Differences from the earlier draft:
      * text without both markers falls back to plain sliding windows instead
        of raising IndexError;
      * the trailing section number (' N. ') is removed only when it is
        actually present, so the last section no longer loses its final
        four characters;
      * the prefix length is taken into account, so no chunk exceeds
        ``max_chars``;
      * when the prefix would leave fewer than ``slide_step`` characters per
        window (which would create gaps between windows) the prefix is
        dropped for the section chunks;
      * everything after the first occurrence of each marker is kept, even
        when a marker appears more than once.

    Args:
        text: Raw text of one case field.
        max_chars: Maximum chunk length in characters.
        slide_step: Offset between consecutive windows of an oversized section.
        min_section_chars: Sections shorter than this are skipped.

    Returns:
        A list of cleaned text chunks.
    """
    # The short/long decision is made on the raw length, as before.
    fits_as_is = len(text) <= max_chars

    text = text.replace('\n', '')
    text = re.sub(r'\s+', ' ', text)
    if fits_as_is:
        return [text]

    head, order_marker, after_order = text.partition('【주 문】')
    order, reason_marker, reasons = after_order.partition('【이 유】')
    if not order_marker or not reason_marker:
        # Markers missing: fall back to overlapping windows over the whole text.
        return _sliding_windows(text, max_chars, slide_step)

    part_list = []

    intro = head + ' 【주 문】 ' + order
    if len(intro) <= max_chars:
        part_list.append(intro)
    else:
        part_list += _sliding_windows(intro, max_chars, slide_step)

    prefix = '【주 문】 ' + order + ' 【이 유】 '
    budget = max_chars - len(prefix)
    if budget < slide_step:
        # Windows narrower than the step would skip text between them.
        prefix, budget = '', max_chars
    step = min(slide_step, budget)

    # Split right after '다. N. ', i.e. at the start of each numbered section.
    for section in re.split(r'(?<=다\.\s\d\.\s)', reasons):
        if len(section) < min_section_chars:
            continue

        # The split leaves the next section's number at the end; remove it.
        section = re.sub(r'\s\d\.\s$', '', section)

        if len(section) <= budget:
            part_list.append(prefix + section)
        else:
            part_list += [prefix + window
                          for window in _sliding_windows(section, budget, step)]

    return part_list

In [None]:
import random
# Inspect one randomly chosen text that is longer than 2048 characters.
random_index = random.randrange(len(over_2048))
over_2048[random_index]

# 최종 코드

In [None]:
import boto3
import json
import re
import faiss
import numpy as np
from datetime import datetime

In [None]:
# S3 client used for listing, downloading and uploading.
s3_client = boto3.client('s3')

# Bucket that holds the case files and receives the saved index.
bucket_name = 'sageon-mungchi-service'
# Key prefix used when listing the case JSON files.
key = 'case_data'
# Key prefix under which the FAISS index and metadata are uploaded.
save_key = 'vector_db/faiss'

In [None]:
# Bedrock runtime client (ap-northeast-1) used by the embedding helpers.
br_client = boto3.client("bedrock-runtime", region_name="ap-northeast-1")

In [None]:
def get_s3_file_keys():
    """Return the keys of all objects under the ``key`` prefix in ``bucket_name``.

    A single list_objects_v2 call returns at most 1,000 keys, so the listing
    is paginated to make sure every case file is included.

    Returns:
        A list of S3 object keys (empty when nothing matches the prefix).
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    file_keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key):
        # 'Contents' is absent from a page that holds no objects.
        file_keys.extend(content['Key'] for content in page.get('Contents', []))
    return file_keys

In [None]:
def load_content(file_key):
    """Download one case JSON object from S3 and run its text fields through ``clean_text``.

    Args:
        file_key: S3 object key inside ``bucket_name``.

    Returns:
        The parsed case dict with 'case_issue', 'case_summary' and
        'case_detail' replaced by the result of ``clean_text``.
    """
    raw_bytes = s3_client.get_object(Bucket=bucket_name, Key=file_key)['Body'].read()
    case_data = json.loads(raw_bytes.decode('utf-8'))

    for field in ('case_issue', 'case_summary', 'case_detail'):
        case_data[field] = clean_text(case_data[field])

    return case_data

In [None]:
def _sliding_windows(text, size, step):
    """Return windows of at most ``size`` characters that start every ``step`` characters."""
    return [text[i:i + size] for i in range(0, len(text), step)]


def clean_text(text, max_chars=2048, slide_step=1024, min_section_chars=150):
    """Normalise whitespace and split long case text into chunks of at most ``max_chars``.

    Text whose raw length is within ``max_chars`` is returned as a single
    cleaned chunk. Longer text is split around the '【주 문】' and '【이 유】'
    markers: the first chunk is everything up to '【이 유】', and every
    numbered section after '【이 유】' becomes a chunk prefixed with the
    '【주 문】' text so each chunk keeps that context. Text without both
    markers is cut into plain overlapping windows.

    Fixes over the previous implementation:
      * the trailing section number (' N. ') is removed only when it is
        actually present, so the last section no longer loses its final
        four characters;
      * the prefix length is taken into account, so no chunk exceeds
        ``max_chars`` (longer chunks were truncated at embedding time);
      * when the prefix would leave fewer than ``slide_step`` characters per
        window (which would create gaps between windows) the prefix is
        dropped for the section chunks;
      * everything after the first occurrence of each marker is kept, even
        when a marker appears more than once;
      * when '【이 유】' is missing, the oversized head chunk is no longer
        emitted in addition to the fallback windows.

    Args:
        text: Raw text of one case field.
        max_chars: Maximum chunk length in characters.
        slide_step: Offset between consecutive windows of an oversized section.
        min_section_chars: Sections shorter than this are skipped.

    Returns:
        A list of cleaned text chunks.
    """
    # The short/long decision is made on the raw length, as before.
    fits_as_is = len(text) <= max_chars

    text = text.replace('\n', '')
    text = re.sub(r'\s+', ' ', text)
    if fits_as_is:
        return [text]

    head, order_marker, after_order = text.partition('【주 문】')
    order, reason_marker, reasons = after_order.partition('【이 유】')
    if not order_marker or not reason_marker:
        # Markers missing: fall back to overlapping windows over the whole text.
        return _sliding_windows(text, max_chars, slide_step)

    part_list = []

    intro = head + ' 【주 문】 ' + order
    if len(intro) <= max_chars:
        part_list.append(intro)
    else:
        part_list += _sliding_windows(intro, max_chars, slide_step)

    prefix = '【주 문】 ' + order + ' 【이 유】 '
    budget = max_chars - len(prefix)
    if budget < slide_step:
        # Windows narrower than the step would skip text between them.
        prefix, budget = '', max_chars
    step = min(slide_step, budget)

    # Split right after '다. N. ', i.e. at the start of each numbered section.
    for section in re.split(r'(?<=다\.\s\d\.\s)', reasons):
        if len(section) < min_section_chars:
            continue

        # The split leaves the next section's number at the end; remove it.
        section = re.sub(r'\s\d\.\s$', '', section)

        if len(section) <= budget:
            part_list.append(prefix + section)
        else:
            part_list += [prefix + window
                          for window in _sliding_windows(section, budget, step)]

    return part_list

In [None]:
def get_cohere_embedding(text, input_type='search_query'):
    """Return the Cohere multilingual embedding for ``text``.

    Args:
        text: Text to embed. Only the first 2048 characters are sent.
        input_type: Cohere ``input_type`` for the request. Defaults to
            'search_query' (the previous hard-coded value) so existing callers
            behave the same. Pass 'search_document' when embedding the texts
            that are stored in the index, as the Cohere Embed documentation
            recommends for retrieval.

    Returns:
        The embedding (list of floats) of the single submitted text.
    """
    model_id = "cohere.embed-multilingual-v3"
    native_request = {"texts": [text[0:2048]], 'input_type': input_type}
    request = json.dumps(native_request)
    response = br_client.invoke_model(modelId=model_id, body=request)
    model_response = json.loads(response["body"].read())
    # One text was submitted, so the first embedding is the only one.
    return model_response['embeddings'][0]

In [None]:
def get_titan_embedding(text):
    """Return the Titan text embedding for the first 2048 characters of ``text``."""
    payload = json.dumps({"inputText": text[0:2048]})
    response = br_client.invoke_model(modelId="amazon.titan-embed-text-v1", body=payload)
    return json.loads(response["body"].read())["embedding"]

In [None]:
def get_documents(file_keys):
    """Load every case file and flatten its text chunks into documents.

    Args:
        file_keys: Iterable of S3 object keys to load with ``load_content``.

    Returns:
        A list of ``{'text': ..., 'metadata': {...}}`` dicts, one per
        non-empty chunk of the 'case_issue', 'case_summary' and 'case_detail'
        fields. The metadata holds the case number, name, date and court.
    """
    documents = []

    for file_key in file_keys:
        case_data = load_content(file_key)

        for field in ('case_issue', 'case_summary', 'case_detail'):
            for text in case_data[field]:
                # Skip empty or missing chunks.
                if not text:
                    continue

                documents.append({
                    'text': text,
                    # A fresh dict per document so entries never share state.
                    'metadata': {
                        'case_no': case_data['case_no'],
                        'case_name': case_data['case_name'],
                        'case_date': case_data['case_date'],
                        'case_court': case_data['case_court'],
                    }
                })

    return documents


# Backward-compatible alias: existing cells call the original, misspelled name.
get_coduments = get_documents

In [None]:
def get_embeddings(documents, embedding_type='cohere'):
    """Embed every document and collect its metadata by position.

    Args:
        documents: Sequence of dicts with 'text' and 'metadata' entries.
        embedding_type: 'titan' or 'cohere'.

    Returns:
        A tuple ``(embeddings, metadata)``: ``embeddings`` is a list with one
        embedding per document in input order, and ``metadata`` maps each
        document's position (int) to its metadata dict.

    Raises:
        ValueError: If ``embedding_type`` is neither 'titan' nor 'cohere'.
            The check happens before any request is made, so an invalid type
            is reported even when ``documents`` is empty.
    """
    if embedding_type not in ('titan', 'cohere'):
        raise ValueError('Embedding type must be titan or cohere.')

    use_titan = embedding_type == 'titan'
    embeddings = []
    metadata = {}

    for i, doc in enumerate(documents):
        text = doc['text']
        if use_titan:
            embeddings.append(get_titan_embedding(text))
        else:
            embeddings.append(get_cohere_embedding(text))
        metadata[i] = doc['metadata']

    return embeddings, metadata

In [None]:
def save_faiss_db(index, metadata):
    """Write the FAISS index to a local file and upload it with its metadata to S3.

    Both objects are stored under ``{save_key}/{YYYYMMDD}/`` in ``bucket_name``.
    The metadata is serialised with ``json.dumps``, so integer keys are
    written as JSON strings.
    """
    local_index_path = 'faiss_index.index'
    faiss.write_index(index, local_index_path)

    # Date-stamped folder, computed once so both objects share it.
    s3_prefix = f"{save_key}/{datetime.now().strftime('%Y%m%d')}"

    s3_client.upload_file(local_index_path, bucket_name, f"{s3_prefix}/faiss_index.index")
    s3_client.put_object(
        Bucket=bucket_name,
        Key=f"{s3_prefix}/metadata.json",
        Body=json.dumps(metadata),
    )

In [None]:
%%time

# List the case files under the S3 prefix, then load and clean them into documents.
print("======= Load Documents ======")
s3_file_keys = get_s3_file_keys()
documents = get_coduments(s3_file_keys)

In [None]:
%%time

print("======= Loading Vector Database ======")
embeddings, metadata = get_embeddings(documents, 'cohere')
index = faiss.IndexFlat(len(embeddings[0]))
index.add(np.array(embeddings))

In [None]:
%%time

print("======= Run Test Query ======")
query = '사무실을 같은 동네에서 이전했는데 새로운 건물에 가격이 더 비싸다고 더 세금을 많이 내야 해?'
query_embedding = get_cohere_embedding(query)
distances, indices = index.search(np.array([query_embedding]), 3)
print("검색 결과:")
for i, idx in enumerate(indices[0]):
    print(f"Rank {i+1}:")
    print(f"Metadata: {metadata[i]}")
    print(f"Text: {documents[idx]['text']}")
    print(f"Distance: {distances[0][i]}")

In [None]:
# Persist the index and its position-to-metadata mapping to S3.
print("======= Save to S3 ======")
save_faiss_db(index, metadata)