# Import

In [1]:
import os

os.chdir(os.path.dirname(os.getcwd()))

In [2]:
import json
import random
from typing import List
from tqdm.auto import tqdm
from dataclasses import dataclass
from collections import defaultdict

import faiss
import pickle
import numpy as np
from elasticsearch import Elasticsearch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from run import ModelRunner
from jovis_model.config import Config
from jovis_model.utils.helper import build_faiss_index
from jovis_model.utils.report import ReportMaker

# Common

### Runner

In [26]:
params = {
    "pkg": "llm",
    "task": "bedrock",
    "use_hf_model": False,
    "params": {
    }
}
config = Config(**params)
bd_runner = ModelRunner(
    config=config,
    mode="inference"
)

01:01:29:PM:INFO: data module loaded
01:01:29:PM:INFO: model module loaded


In [4]:
params = {
    "pkg": "llm",
    "task": "sentence_embedding",
    "use_hf_model": True,
    "params": {
        "hf_name": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    }
}
config = Config(**params)
se_runner = ModelRunner(
    config=config,
    mode="inference"
)

10:54:08:AM:INFO: data module loaded
10:54:15:AM:INFO: model module loaded


### Parser

In [5]:
def parse_desc(desc, target: list, dropna: bool = True):
    parsed = {}
    try:
        desc = json.loads(desc)
        for k, v in desc.items():
            if k in target: 
                if k == "Fashion attribute":
                    if isinstance(v, dict):
                        fa_dict = {}
                        for k_, v_ in v.items():
                            if dropna:
                                if v_ != "N/A":
                                    fa_dict[k_] = v_
                            else:
                                fa_dict[k_] = v_
                        v = fa_dict
                    parsed[k] = v 
                else:
                    if dropna:
                        if v != "N/A":
                            parsed[k] = v
                    else:
                        parsed[k] = v
    except Exception as e:
        return None
    return parsed

### Descriptions

In [22]:
with open("outputs/skb/descriptions_original.json", "r") as f:
    desc = json.load(f)

### labels

In [7]:
target_columns = ["Season", "Place", "Occasion", "Style"]
dropna=True

In [8]:
pids = []
documents = []
for pid, des in tqdm(desc.items()):
    des = parse_desc(
        des,
        target=target_columns,
        dropna=dropna
    )
    if des:
        documents.append(des)
        pids.append(pid)
print(f"Total: {len(desc)}, Success: {len(pids)}")

  0%|          | 0/19180 [00:00<?, ?it/s]

Total: 19180, Success: 17368


In [9]:
labels = [[f"{k}_{v}" for k, v in sorted(d.items(), key=lambda x: x[0])] for d in documents]
pid2label = {k: v for k, v in zip(pids, labels)}

### Inference

In [10]:
def inference(pids, embeddings, model_id):
    y = []
    y_hat = []
    res = {}
    cur, batch_size, top_k = 0, 100, 10
    pbar = tqdm(total=len(embeddings) // batch_size + 1)
    while cur < len(embeddings):
        batch_embeddings = embeddings[cur:cur+batch_size]
        batch_query_pids = pids[cur:cur+batch_size]
        batch_pred_scores, batch_pred_indices = desc_index.search(batch_embeddings, top_k)

        for query_pid, pred_scores, pred_indices in zip(batch_query_pids, batch_pred_scores, batch_pred_indices):
            pred_pids = [desc_map.get(str(idx), -1) for idx in pred_indices]

            pred_descs = []
            pred_labels = []
            for pred_score, pred_pid in zip(pred_scores, pred_pids):
                pred_label = pid2label.get(pred_pid, [])
                pred_labels.append(pred_label)
                pred_descs.append(
                    [pred_pid, f"[{model_id}] {pred_score:.4f}"] + pred_label
                )

            res[query_pid] = defaultdict(list)
            query_label = pid2label.get(query_pid, [])
            res[query_pid]["<br>".join(query_label)].append({"text": pred_descs, "image": pred_pids})

            y_hat.append(pred_labels)
            y.append(query_label)

        cur += batch_size
        pbar.update(1)
    pbar.close()
    return y, y_hat, res

### metrics

In [11]:
def get_metrics(labels : List[List[str]], preds : List[List[List[str]]]):
    topk = len(preds[0])
    hits = []
    for label, pred in zip(labels, preds):
        hits.append([1 if set(label) == set(p) else 0 for p in pred])
    hits = np.array(hits)

    hr = np.mean(np.sum(hits, axis=1) > 0)
    mAP = np.mean(
        [
            np.mean([np.mean(hit[: idx + 1]) for idx in range(topk)])
            for hit in hits
        ]
    )
    return hr, mAP

# Case 1

### Build search embeddings : mpnet

In [11]:
target_columns = ["Season", "Place", "Occasion", "Style", "Gender", "Background", "Model", "Fashion attribute"]
dropna=True

In [13]:
pids = []
embeddings = []
for pid, des in tqdm(desc.items()):
    des = parse_desc(
        des,
        target=target_columns,
        dropna=dropna
    )
    if des:
        des = json.dumps(des)
        pids.append(pid)
        embeddings.append(se_runner.run([des]).detach().cpu().numpy()[0])
        
print(f"Total: {len(desc)}, Success: {len(pids)}")

  0%|          | 0/19180 [00:00<?, ?it/s]

Total: 19180, Success: 19174


In [14]:
build_faiss_index(
    embeddings=embeddings,
    save_path="outputs/skb",
    save_name="descriptions_all_mpnet",
    pids=pids
)

### Build query embeddings : mpnet

In [15]:
target_columns = ["Season", "Place", "Occasion", "Style"]
dropna=True

In [16]:
pids = []
embeddings = []
for pid, des in tqdm(desc.items()):
    des = parse_desc(
        des,
        target=target_columns,
        dropna=dropna
    )
    if des:
        des = json.dumps(des)
        pids.append(pid)
        embeddings.append(se_runner.run([des]).detach().cpu().numpy()[0].tolist())
print(f"Total: {len(desc)}, Success: {len(pids)}")

  0%|          | 0/19180 [00:00<?, ?it/s]

Total: 19180, Success: 17368


In [17]:
f_name = "query_embeddings_4cols_mpnet"

In [18]:
with open(f"outputs/skb/{f_name}.json", "w") as f:
    json.dump({k:v for k, v in zip(pids, embeddings)}, f)

### Evaluation : mpnet

In [12]:
index_file_name = "descriptions_all_mpnet"
query_file_name = "query_embeddings_4cols_mpnet"

desc_index = faiss.read_index(f"outputs/skb/{index_file_name}.index")
with open(f"outputs/skb/{index_file_name}_map.json", "r") as f:
    desc_map = json.load(f)
with open(f"outputs/skb/{query_file_name}.json", "r") as f:
    desc_text_embeddings = json.load(f)

pids = []
embeddings = []
for pid, embedding in desc_text_embeddings.items():
    pids.append(pid)
    embeddings.append(embedding)

In [13]:
pids = np.array(pids)
embeddings = np.array(embeddings)

In [20]:
case1_y, case1_y_hat, case1_res = inference(pids, embeddings, "mpnet")

  0%|          | 0/174 [00:00<?, ?it/s]

In [21]:
get_metrics(case1_y, case1_y_hat)

(0.5912597881160755, 0.41146930954939936)

### Build search embeddings : cohere

In [294]:
target_columns = ["Season", "Place", "Occasion", "Style", "Gender", "Background", "Model", "Fashion attribute"]
dropna=True

In [295]:
pids = []
documents = []
for pid, des in tqdm(desc.items()):
    des = parse_desc(
        des,
        target=target_columns,
        dropna=dropna
    )
    if des:
        des = json.dumps(des)
        documents.append(des)
        pids.append(pid)
print(f"Total: {len(desc)}, Success: {len(pids)}")

  0%|          | 0/19180 [00:00<?, ?it/s]

Total: 19180, Success: 19174


In [296]:
batch_size = 1
cur = 0
embeddings = []
pbar = tqdm(total=len(documents) // batch_size)
while cur < len(documents):
    tmp_docs = documents[cur:cur+batch_size]
    body_dict = {
        "texts": tmp_docs,
        "input_type": "search_document",
    }
    body_json = json.dumps(body_dict)
    response = bd_runner.mm.processor.bedrock_runtime.invoke_model(
        modelId="cohere.embed-multilingual-v3",
        body=body_json
    )
    embedding = json.loads(response.get("body").read())["embeddings"]
    embeddings += embedding
    cur += batch_size
    pbar.update(1)
pbar.close()

  0%|          | 0/19174 [00:00<?, ?it/s]

In [298]:
build_faiss_index(
    embeddings=embeddings,
    save_path="outputs/skb",
    save_name="descriptions_all_cohere",
    pids=pids
)

### Build query embeddings : cohere

In [44]:
target_columns = ["Season", "Place", "Occasion", "Style"]
dropna=True

In [45]:
pids = []
documents = []
for pid, des in tqdm(desc.items()):
    des = parse_desc(
        des,
        target=target_columns,
        dropna=dropna
    )
    if des:
        des = json.dumps(des)
        documents.append(des)
        pids.append(pid)
print(f"Total: {len(desc)}, Success: {len(pids)}")

  0%|          | 0/19180 [00:00<?, ?it/s]

Total: 19180, Success: 17368


In [47]:
batch_size = 50
cur = 0
embeddings = []
pbar = tqdm(total=len(documents) // batch_size + 1)
while cur < len(documents):
    tmp_docs = documents[cur:cur+batch_size]
    body_dict = {
        "texts": tmp_docs,
        "input_type": "search_query",
    }
    body_json = json.dumps(body_dict)
    response = bd_runner.mm.processor.bedrock_runtime.invoke_model(
        modelId="cohere.embed-multilingual-v3",
        body=body_json
    )
    embedding = json.loads(response.get("body").read())["embeddings"]
    embeddings += embedding
    cur += batch_size
    pbar.update(1)
pbar.close()

  0%|          | 0/348 [00:00<?, ?it/s]

ClientError: An error occurred (ExpiredTokenException) when calling the InvokeModel operation: The security token included in the request is expired

In [288]:
# embeddings = np.array(embeddings).astype(np.float32)
# faiss.normalize_L2(embeddings)
# embeddings = embeddings.tolist()

In [30]:
f_name = "query_embeddings_4cols_cohere"

In [31]:
with open(f"outputs/skb/{f_name}.json", "w") as f:
    json.dump({k:v for k, v in zip(pids, embeddings)}, f)

### Evaluation : cohere

In [299]:
index_file_name = "descriptions_all_cohere"
query_file_name = "query_embeddings_4cols_cohere"

desc_index = faiss.read_index(f"outputs/skb/{index_file_name}.index")
with open(f"outputs/skb/{index_file_name}_map.json", "r") as f:
    desc_map = json.load(f)
with open(f"outputs/skb/{query_file_name}.json", "r") as f:
    desc_text_embeddings = json.load(f)

pids = []
embeddings = []
for pid, embedding in desc_text_embeddings.items():
    pids.append(pid)
    embeddings.append(embedding)

pids = np.array(pids)
embeddings = np.array(embeddings)

In [300]:
case2_y, case2_y_hat, case2_res = inference(pids, embeddings, "cohere")

  0%|          | 0/174 [00:00<?, ?it/s]

In [24]:
get_metrics(case2_y, case2_y_hat)

(0.37931828650391525, 0.10115503425383665)

In [293]:
get_metrics(case2_y, case2_y_hat)

(0.5892445877475817, 0.3614104460492934)

# Case 2

### Build search embeddings : cohere

In [11]:
@dataclass
class Config:
    ES_URL: str = "http://flush-es-v3.dev.omnious.com"
    ES_PORT: int = 30313
    ES_LOGIN_ID: str = "elastic"
    ES_LOGIN_PWD: str = "5P44CV414Pxb4f7R7nfU1rkq"
    ES_INDEX: str = "01hz64dqt6h8fqkmm78eyz52sd"
    
    ES_RETRY_COUNT: int = 3
    ES_TIMEOUT: int = 2
    ES_MAX_COUNT: int = 1000

In [12]:
class ElasticsearchService:
    def __init__(self, config, timeout: int = 30):
        self.es = Elasticsearch(
            f"{config.ES_URL}:{config.ES_PORT}",
            http_auth=(config.ES_LOGIN_ID, config.ES_LOGIN_PWD),
            timeout=config.ES_TIMEOUT,
            max_retries=config.ES_RETRY_COUNT,
            retry_on_timeout=True,
        )

In [13]:
config = Config

es = ElasticsearchService(config=config)

  self.es = Elasticsearch(
  self.es = Elasticsearch(


In [33]:
query = {
    "bool": {
        "must": {
            "exists": {
                "field": "semantic_vectors_l2_norm"
            }
        }
    }
}
hits = []
scroll_id_list = []

es_results = es.es.search(
    index=config.ES_INDEX,
    query=query,
    size=10000,
    scroll="1m",
    track_total_hits=True
)
tmp_hits = es_results.get("hits", dict()).get("hits", list())
hits.extend(tmp_hits)

scroll_id = es_results["_scroll_id"]
scroll_id_list.append(scroll_id)
while len(tmp_hits):
    es_results = es.es.scroll(
        scroll_id=scroll_id_list[-1],
        scroll="1m"
    )
    tmp_scroll_id = es_results["_scroll_id"]
    scroll_id_list.append(tmp_scroll_id)
    tmp_hits = es_results.get("hits", dict()).get("hits", list())
    hits.extend(tmp_hits)

10:47:02:AM:INFO: POST http://flush-es-v3.dev.omnious.com:30313/01hz64dqt6h8fqkmm78eyz52sd/_search?scroll=1m [status:200 duration:17.698s]
10:47:25:AM:INFO: POST http://flush-es-v3.dev.omnious.com:30313/_search/scroll [status:200 duration:19.650s]
10:47:29:AM:INFO: POST http://flush-es-v3.dev.omnious.com:30313/_search/scroll [status:200 duration:0.131s]
10:47:29:AM:INFO: POST http://flush-es-v3.dev.omnious.com:30313/_search/scroll [status:200 duration:0.008s]


In [35]:
pids, embeddings = [], []
for h in hits:
    pid = h["_source"]["productId"]
    embed = h["_source"]["semantic_vectors_l2_norm"]
    pids.append(pid)
    embeddings.append(embed)

In [38]:
build_faiss_index(
    embeddings=embeddings,
    save_path="outputs/skb",
    save_name="KB_with_des_cohere",
    pids=pids
)

### Build search embeddings: cohere - original

In [36]:
with open("outputs/skb/ppl_documents.json", "rb") as f:
    data = json.load(f)

In [37]:
pids = []
documents = []
for pid, des in tqdm(data.items()):
    pids.append(pid)
    documents.append(des)

  0%|          | 0/20101 [00:00<?, ?it/s]

In [39]:
batch_size = 50
cur = 0
embeddings = []
pbar = tqdm(total=len(documents) // batch_size) + 1
while cur < len(documents):
    tmp_docs = documents[cur:cur+batch_size]
    body_dict = {
        "texts": tmp_docs,
        "input_type": "search_document",
    }
    body_json = json.dumps(body_dict)
    response = bd_runner.mm.processor.bedrock_runtime.invoke_model(
        modelId="cohere.embed-multilingual-v3",
        body=body_json
    )
    embedding = json.loads(response.get("body").read())["embeddings"]
    embeddings += embedding
    cur += batch_size
    pbar.update(1)
pbar.close()

  0%|          | 0/402 [00:00<?, ?it/s]

In [40]:
build_faiss_index(
    embeddings=embeddings,
    save_path="outputs/skb",
    save_name="KB_with_des_cohere_orig",
    pids=pids
)

### Build search embeddings : mpnet

In [12]:
with open("outputs/skb/ppl_documents.json", "rb") as f:
    documents = json.load(f)

In [14]:
pids = []
embeddings = []
for pid, des in tqdm(documents.items()):
    pids.append(pid)
    embeddings.append(se_runner.run([des]).detach().cpu().numpy()[0])

  0%|          | 0/20101 [00:00<?, ?it/s]

In [15]:
build_faiss_index(
    embeddings=embeddings,
    save_path="outputs/skb",
    save_name="KB_with_des_mpnet",
    pids=pids
)

### Evaluation : cohere

In [41]:
index_file_name = "KB_with_des_cohere_orig"
query_file_name = "query_embeddings_4cols_cohere"

desc_index = faiss.read_index(f"outputs/skb/{index_file_name}.index")
with open(f"outputs/skb/{index_file_name}_map.json", "r") as f:
    desc_map = json.load(f)
with open(f"outputs/skb/{query_file_name}.json", "r") as f:
    desc_text_embeddings = json.load(f)

pids = []
embeddings = []
for pid, embedding in desc_text_embeddings.items():
    pids.append(pid)
    embeddings.append(embedding)

pids = np.array(pids)
embeddings = np.array(embeddings)

In [42]:
case2_y_cohere, case2_y_hat_cohere, case2_res_cohere = inference(pids, embeddings, "cohere")

  0%|          | 0/174 [00:00<?, ?it/s]

In [43]:
get_metrics(case2_y_cohere, case2_y_hat_cohere)

(0.2455665591893137, 0.05006628227062359)

### Evaluation : mpnet

In [59]:
index_file_name = "KB_with_des_mpnet"
query_file_name = "query_embeddings_4cols_mpnet"

desc_index = faiss.read_index(f"outputs/skb/{index_file_name}.index")
with open(f"outputs/skb/{index_file_name}_map.json", "r") as f:
    desc_map = json.load(f)
with open(f"outputs/skb/{query_file_name}.json", "r") as f:
    desc_text_embeddings = json.load(f)

pids = []
embeddings = []
for pid, embedding in desc_text_embeddings.items():
    pids.append(pid)
    embeddings.append(embedding)

pids = np.array(pids)
embeddings = np.array(embeddings)

In [60]:
case2_y_mpnet, case2_y_hat_mpnet, case2_res_mpnet = inference(pids, embeddings, "mpnet")

  0%|          | 0/174 [00:00<?, ?it/s]

In [61]:
get_metrics(case2_y_mpnet, case2_y_hat_mpnet)

(0.47794795025333947, 0.20191140841028568)

# Report

In [62]:
subset_res = dict(random.sample(case2_res_cohere.items(), 100))

since Python 3.9 and will be removed in a subsequent version.
  subset_res = dict(random.sample(case2_res_cohere.items(), 100))


In [63]:
for query in list(subset_res.keys()):
    key, val = list(case2_res_mpnet[query].items())[0]
    subset_res[query][key] += val

In [64]:
rm = ReportMaker(
    data_dict=subset_res,
    image_path="jovis_model/_db/skb/ppl_images",
    max_len=15
)

In [65]:
rm.make_report(
    save_path="outputs/skb",
    save_name="multimodal_1"
)

# Visualize

In [73]:
index_file_name = "descriptions_all_cohere"
query_file_name = "query_embeddings_4cols_cohere"

desc_index = faiss.read_index(f"outputs/skb/{index_file_name}.index")
with open(f"outputs/skb/{index_file_name}_map.json", "r") as f:
    desc_map = json.load(f)
with open(f"outputs/skb/{query_file_name}.json", "r") as f:
    desc_text_embeddings = json.load(f)

In [72]:
# label2pid = defaultdict(list)
# for pid, label in pid2label.items():
#     label2pid[" ".join(label)].append(pid)

In [66]:
# model = TSNE(n_components=2)
# embeddings = list(desc_text_embeddings.values())
# reduced = model.fit_transform(np.array(embeddings))

In [272]:
def get_cohere_embed(texts, input_type="search_query"):
    body_dict = {
        "texts": texts,
        "input_type": input_type,
    }
    body_json = json.dumps(body_dict)
    response = bd_runner.mm.processor.bedrock_runtime.invoke_model(
        modelId="cohere.embed-multilingual-v3",
        body=body_json
    )
    embedding = json.loads(response.get("body").read())["embeddings"]
    return embedding

In [273]:
query = '{"Style": "Dandy/Classic"}'
target_1 = '{"스타일": "댄디/클래식"}'
target_2 = '{"Fashion attribute": {"Color": "Yellow", "Texture": "Rubber-like"}}'
target_3 = '{"Style": "N/A", "Gender": "N/A", "Fashion attribute": {"Category": "N/A", "Color": "Yellow", "Shape": "N/A", "Texture": "Rubber-like"}}'
target_4 = '{"Style": "Dandy/Classic", "Gender": "Unisex", "Fashion attribute": {"Category": "Jewelry", "Color": "Gold", "Shape": "Round", "Texture": "Smooth"}}'