In [58]:
!pip install opensearch-py

Collecting opensearch-py
  Downloading opensearch_py-2.5.0-py2.py3-none-any.whl.metadata (6.8 kB)
Collecting urllib3<2,>=1.26.18 (from opensearch-py)
  Downloading urllib3-1.26.18-py2.py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading opensearch_py-2.5.0-py2.py3-none-any.whl (266 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: urllib3, opensearch-py
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.2.1
    Uninstalling urllib3-2.2.1:
      Successfully uninstalled urllib3-2.2.1
[31mERROR: pip's dependency resolver does not currently take into account all

### > Setup

In [14]:
import json
import boto3
import sys
import os
import io
from PIL import Image
import time
import shutil
import pandas as pd
from sagemaker.utils import name_from_base
from opensearch_util import OpenSearchManager

# Add ../image-generator to the Python module search path before importing helper
sys.path.append("../image-generator")

from helper import (
    get_text_response,
    _encode,
    download_file_from_s3,
    get_mm_embedding,
    calc_total_cost,
    load_jsonl
)

%store -r provisioned_model_id

# Manager object through which every OpenSearch call in this notebook is made.
os_manager = OpenSearchManager()
# Derive the index name from a fixed prefix with sagemaker's name_from_base
# (the recorded run produced "mm-index-2024-04-26-01-32-44-432").
prefix = "mm-index"
index_name = name_from_base(prefix)

### > Load data

In [20]:
# Load the training and validation splits. The files are only read, so they
# are opened in plain read mode ('r'); 'r+' would also demand write access.
train_data = "data/train.json"
with open(train_data, 'r') as f:
    train_dataset = json.load(f)

valid_data = "data/valid.json"
with open(valid_data, 'r') as f:
    valid_dataset = json.load(f)

# Merge both corpora into a NEW dict rather than updating
# train_dataset['corpus'] in place, so the training split keeps only its own
# entries. On duplicate ids the validation entry wins (same as dict.update).
image_data = {**train_dataset['corpus'], **valid_dataset['corpus']}
image_data

{'e2191ba5-ad8b-46d1-8271-22767c656705': {'image-ref': 's3://sagemaker-us-west-2-374212921621/titan-finetuning/image070.png',
  'caption': 'storyboard sketch of A fearless explorer climbing a towering tree, their eyes filled with wonder at the world below.'},
 'b2c6ab4b-8caa-4f8d-a3f0-6523ec434dc4': {'image-ref': 's3://sagemaker-us-west-2-374212921621/titan-finetuning/image063.png',
  'caption': "storyboard sketch of A tiny gardener, beaming with pride, gently caressing the petals of a vibrant sunflower they've nurtured from a seed."},
 '266e8fa3-2bd5-4ed2-8a46-cd1378532f3c': {'image-ref': 's3://sagemaker-us-west-2-374212921621/titan-finetuning/image013.png',
  'caption': 'storyboard sketch of A brave knight standing tall, shield raised, ready to defend the kingdom from any danger.'},
 '606a82ba-e640-4e82-93aa-3b5a275e1f6b': {'image-ref': 's3://sagemaker-us-west-2-374212921621/titan-finetuning/image095.png',
  'caption': 'storyboard sketch of Hands diligently tend to a vegetable garden

### > Generate the index data

In [21]:
# Build one OpenSearch document per image: id, image embedding, S3 reference
# and caption.
index_object = []

# Iterate over items() directly and avoid binding a loop variable named `id`,
# which would shadow the builtin for the rest of the notebook.
for key, record in image_data.items():
    image = download_file_from_s3(record['image-ref'])
    image_base64 = _encode(image)

    metadata = {
        'id': key,
        # Embedding of the image itself (no text passed to the helper).
        'vector_field': get_mm_embedding(image_base64=image_base64),
        'image-ref': record['image-ref'],
        'caption': record['caption'],
    }
    index_object.append(metadata)

### > Set up a vector index

In [22]:
# vector_store_name = name_from_base(index_name)[:20]
# index_name = f"{vector_store_name}-index"
# encryption_policy_name = f"{vector_store_name}-ep"
# network_policy_name = f"{vector_store_name}-np"
# access_policy_name = f"{vector_store_name}-ap"

# host = os_manager.create_opensearch_collection(
#         vector_store_name=vector_store_name,
#         index_name=index_name,
#         encryption_policy_name=encryption_policy_name,
#         network_policy_name=network_policy_name,
#         access_policy_name=access_policy_name
#     )

# print(f"hosting url: {host}")

In [23]:
# Endpoint of the OpenSearch Serverless collection (host name only, no scheme).
host = "zs7omkhikyvo5t5w2hyj.us-west-2.aoss.amazonaws.com"

# Reject an empty string as well as None, so a blanked-out endpoint fails here
# with a clear message instead of later inside the client.
if not host:
    raise ValueError("Must provide a host url for Opensearch Serverless collection")

### > Initialize client

In [24]:
# Point the manager at the collection endpoint stored in `host`.
# NOTE(review): presumably required before the index/query calls further down;
# confirm in opensearch_util.
os_manager.initialize_client(host=host)

### > Create a new index

In [25]:
# Index definition passed to create_index: three text metadata fields plus a
# k-NN vector field, followed by the index-level settings.
index_body = {
    "mappings": {
        "properties": {
            # Metadata stored alongside each vector.
            "id": {"type": "text"},
            "image-ref": {"type": "text"},
            "caption": {"type": "text"},
            # Embedding field; dimension is fixed at 1024
            # (presumably the length returned by get_mm_embedding; TODO confirm).
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1024,
                "method": {
                    "engine": "nmslib",
                    "space_type": "cosinesimil",
                    "name": "hnsw",
                    "parameters": {"ef_construction": 512, "m": 16},
                },
            },
        },
    },
    "settings": {
        "index": {
            "number_of_shards": 2,
            "knn.algo_param": {"ef_search": 512},
            "knn": True,
        },
    },
}

In [26]:
# Create the index using the mappings/settings defined in index_body.
resp = os_manager.create_index(index_name=index_name, index_body=index_body)
# Fixed 40 s pause. NOTE(review): presumably gives the new index time to become
# available before ingestion; confirm the wait is needed and sufficient.
time.sleep(40)

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "mm-index-2024-04-26-01-32-44-432"
}


### > Bulk ingestion

In [28]:
# Bulk-load the documents collected in index_object into the index.
# NOTE(review): `sucess` is a misspelling of "success"; neither returned value
# is inspected in this cell, so failed documents would go unnoticed here.
sucess, failed = os_manager.bulk_index_ingestion(index_name=index_name,
                                                     data=index_object)
# Fixed 20 s pause. NOTE(review): presumably lets the ingested documents become
# searchable before the evaluation queries run; confirm.
time.sleep(20)
print("validate query ...")

Indexed 81 documents
validate query ...


### > Test the top-hit performance before and after fine-tuning

In [29]:
# k-NN query template: the empty "vector" placeholder, "k" and "size" are
# filled in at evaluation time; only the metadata fields are returned.
os_query = {
    "size": 5,
    "query": {
        "knn": {
            "vector_field": {"vector": [], "k": 5},
        },
    },
    "_source": ["id", "image-ref", "caption"],
}

In [30]:
from tqdm.notebook import tqdm

def evaluate_top_hit(dataset, top_k=5, model_id=None):
    """Measure top-k retrieval hits for every query in ``dataset``.

    Each query text is embedded with ``get_mm_embedding`` and sent as a k-NN
    search against ``index_name``. A query counts as a hit when its first
    relevant document id appears among the returned ids.

    Args:
        dataset: Mapping with a ``"queries"`` dict (query id -> query text) and
            a ``"relevant_docs"`` dict (query id -> sequence of document ids).
            Only the first relevant id of each query is checked.
        top_k: Number of neighbours requested; used for both ``size`` and ``k``.
        model_id: Optional model identifier forwarded to ``get_mm_embedding``.
            When falsy, the helper is called without a ``model_id`` argument.

    Returns:
        A list with one dict per query containing ``is_hit``, ``retrieved``
        (ids in the order returned), ``expected`` and ``query`` (the query id).

    Raises:
        KeyError / IndexError: if a query id has no entry, or an empty entry,
            in ``dataset["relevant_docs"]``.
    """
    import copy

    queries = dataset["queries"]
    mapping = dataset["relevant_docs"]

    # Work on a private copy so the module-level os_query template is never
    # mutated: repeated calls (e.g. with different top_k) cannot leak state
    # into each other or into other cells that reuse the template.
    query_body = copy.deepcopy(os_query)
    knn_clause = query_body["query"]["knn"]["vector_field"]

    # size and k do not depend on the query, so set them once up front.
    query_body["size"] = top_k
    knn_clause["k"] = top_k

    # Only pass model_id through when one was supplied.
    embed_kwargs = {"model_id": model_id} if model_id else {}

    eval_results = []
    for q_id, query in tqdm(queries.items()):
        knn_clause["vector"] = get_mm_embedding(text_description=query, **embed_kwargs)

        results = os_manager.opensearch_query(query_body,
                                              index_name=index_name)
        retrieved_ids = [hit["_source"]["id"] for hit in results]

        expected_id = mapping[q_id][0]  # assume 1 relevant doc

        eval_results.append({
            'is_hit': expected_id in retrieved_ids,
            'retrieved': retrieved_ids,
            'expected': expected_id,
            'query': q_id,
        })
    return eval_results

### > Original Base Model

In [31]:
# Baseline run: no model_id is passed, so evaluate_top_hit calls
# get_mm_embedding without one.
eval_results = evaluate_top_hit(valid_dataset, top_k=5)

df_base = pd.DataFrame(eval_results)
# Mean of the boolean is_hit column = fraction of queries whose expected
# document was among the 5 retrieved ids.
top_hits = df_base['is_hit'].mean()

print("percent of top hits: {:.2f} %".format(top_hits*100))

  0%|          | 0/340 [00:00<?, ?it/s]

percent of top hits: 92.35 %


### > Fine-tuned Titan Multi-Modal Embedding model

In [32]:
# Evaluate the same validation queries with the model identified by
# provisioned_model_id (restored via %store in the setup cell).
# The results get their own names so the base-model eval_results / df_base /
# top_hits are not overwritten and both runs remain available for comparison.
eval_results_ft = evaluate_top_hit(valid_dataset, top_k=5, model_id=provisioned_model_id)

df_finetuned = pd.DataFrame(eval_results_ft)
# Fraction of queries whose expected document was among the 5 retrieved ids.
top_hits_ft = df_finetuned['is_hit'].mean()

print("percent of top hits: {:.2f} %".format(top_hits_ft*100))

  0%|          | 0/340 [00:00<?, ?it/s]

percent of top hits: 95.00 %
