In [1]:
from config import config
from notebook_utils import update_doc_to_index
from dataloader import augment_text
from aws.dynamodb import DynamoDBWrapper
from aws.opensearch import OpenSearchWrapper
from aws.embedding import BedrockEmbedding
from utils import encode_image_base64

---

## Vector store로 사용할 OpenSearch Index 생성
- [Prerequisites] OpenSearch Service가 생성되어 있어야 합니다

In [5]:
# Embedding client used by the indexing loop later in the notebook.
embedding = BedrockEmbedding()

# Both wrappers take the same endpoint and region from config; only the index differs.
_os_connection = {
    'endpoint': config.OPENSEARCH_ENDPOINT,
    'region': config.OPENSEARCH_REGION,
}
osImage = OpenSearchWrapper(index=config.OPENSEARCH_INDEX_IMAGE, **_os_connection)
osText = OpenSearchWrapper(index=config.OPENSEARCH_INDEX_TEXT, **_os_connection)

# Connectivity check; as the last expression its result is shown as the cell output.
osImage.client.ping()

True

In [7]:
# NOTE(review): the recorded output of this cell prints a "delete index" line before
# each "create index" line, so create_index appears to drop an existing index first.
# Confirm before re-running against an index whose documents you want to keep.
INDEX_SCHEMA_PATH = './os-index-schema.json'  # the same schema file is used for both indices

osImage.create_index(index_path=INDEX_SCHEMA_PATH)
osText.create_index(index_path=INDEX_SCHEMA_PATH)

delete index: product-image
create index: product-image
delete index: product-text
create index: product-text


---

## Embedding 해서 OpenSearch(VectorDB)에 저장
- DynamoDB에서 아이템을 읽은 후, embedding 하여 OpenSearch에 저장합니다.

```mermaid
graph LR
    A[\image\]:::imageStyle
    B[\image + namekor\]:::imageStyle
    C[\text summary\]:::descStyle
    D[\description\]:::descStyle
    E[\image summary\]:::descStyle
    
    A --> F[[Multimodal Embedding]]:::multiEmbeddingStyle
    B --> F[[Multimodal Embedding]]:::multiEmbeddingStyle
    C --> G[[Text Embedding]]:::textEmbeddingStyle
    D --> G[[Text Embedding]]:::textEmbeddingStyle
    E --> G[[Text Embedding]]:::textEmbeddingStyle

    F --> H[(product-image)]:::outputStyle
    G --> I[(product-text)]:::outputStyle

    classDef imageStyle fill:#B090F3,stroke:#E6E6FA,stroke-width:2px,color:#000000;
    classDef descStyle fill:#87CAF0,stroke:#E6E6FA,stroke-width:2px,color:#000000;
    classDef multiEmbeddingStyle fill:#FFA500,stroke:#F08D2B,stroke-width:2px,color:#000000;
    classDef textEmbeddingStyle fill:#FFE451,stroke:#F08D2B,stroke-width:2px,color:#000000;
    classDef outputStyle fill:#F08D2B,stroke:#FFFFFF,stroke-width:2px,color:#000000;
```

In [10]:
def _flatten_text(*parts):
    """Join the non-empty parts with single spaces and replace newlines with spaces.

    Parts that are None or empty are skipped, so a missing field never puts the
    literal string "None" into the text that gets embedded. When every part is a
    non-empty string the result equals f"{a} {b} {c}".replace('\n', ' ').
    """
    return ' '.join(str(part) for part in parts if part).replace('\n', ' ')


db = DynamoDBWrapper(table_name=config.DYNAMODB_TABLE)
# NOTE(review): the query is issued with 'Limit': 100 and no pagination, so presumably
# at most 100 items are indexed. Confirm whether the full table should be processed.
items = db.query({
    'Limit': 100
}).get('Items', [])

for item in items:
    # The item's 'id' is intentionally not bound to a local: it was unused here and
    # assigning it would overwrite the builtin `id` in the kernel namespace.
    namekor = item.get('namekor')
    nameeng = item.get('productDisplayName')
    image = encode_image_base64(item.get('thumbnail'))
    image_summary = item.get('image_summary')
    # Tolerate items without tags instead of raising TypeError on join(None).
    tags = ', '.join(item.get('tags') or [])
    text = augment_text(item)
    summary = _flatten_text(text, item.get('summary'), tags)
    description = _flatten_text(nameeng, namekor, item.get('description'))

    # [image vector] image
    update_doc_to_index(
        opensearch=osImage,
        item=item,
        vector=embedding.embedding_multimodal(
            image=image
        ),
        text=image_summary,
        embedType='image'
    )

    # [image vector] image + namekor
    update_doc_to_index(
        opensearch=osImage,
        item=item,
        vector=embedding.embedding_multimodal(
            image=image,
            text=namekor,
        ),
        text=namekor,
        embedType='image-namekor'
    )

    # [text vector] description
    update_doc_to_index(
        opensearch=osText,
        item=item,
        vector=embedding.embedding_text(
            text=description,
        ),
        text=description,
        embedType='text'
    )

    # [text vector] summary
    update_doc_to_index(
        opensearch=osText,
        item=item,
        vector=embedding.embedding_text(
            text=summary,
        ),
        text=summary,
        embedType='text-summary'
    )

    # [text vector] image summary
    update_doc_to_index(
        opensearch=osText,
        item=item,
        vector=embedding.embedding_text(
            text=image_summary,
        ),
        text=image_summary,
        embedType='text-imgdesc'
    )