# Import

In [None]:
import os

# Step up to the parent of the current working directory so that the relative
# paths used in later cells resolve from there.
# NOTE: each execution of this cell climbs one more level, so run it only once
# per kernel session.
_parent_dir = os.path.dirname(os.getcwd())
os.chdir(_parent_dir)

In [1]:
import json

from glob import glob
from collections import defaultdict

from tqdm import tqdm
import PIL
import faiss
import pickle
import pandas as pd
import numpy as np

from jovis_model.config import Config
from jovis_model.utils.helper import build_faiss_index
from jovis_model.models.llm.mclip import CLIPModel
from run import ModelRunner
from jovis_model.utils.report import ReportMaker

  from .autonotebook import tqdm as notebook_tqdm


# InternVL vs CLIP Report

### CLIP

In [2]:
# Configuration for the multilingual CLIP checkpoint; `model` is used by the
# following cells to embed the text queries.
params = dict(
    pkg="llm",
    task="mclip",
    use_hf_model=True,
    params=dict(hf_name="M-CLIP/XLM-Roberta-Large-Vit-B-32"),
)
config = Config(**params)
model = CLIPModel(config=config)

In [1]:
# NOTE: this step is currently disabled (fully commented out). When enabled it
# embeds every *.webp file under image_path with `model`, using the file name
# up to the first "." as the pid, and hands the embeddings to build_faiss_index
# with save_path="outputs/skb" and save_name="clip_image".
# Presumably this is what produced the clip_image index/map files read in the
# Evaluation section -- confirm against build_faiss_index.
# image_path = "/data/local/multimodal_for_skb/images/skb"
# image_lst = glob(os.path.join(image_path, "*.webp"))

# pids = []
# embeddings = []
# for img_path in tqdm(image_lst):
#     pid = os.path.basename(img_path).split(".")[0]
#     img = PIL.Image.open(img_path)
#     embed = model.inference(img).detach().cpu().numpy()[0]
#     pids.append(pid)
#     embeddings.append(embed)
# build_faiss_index(
#     embeddings=embeddings,
#     save_path="outputs/skb",
#     save_name="clip_image",
#     pids=pids
# )

In [14]:
# Load the evaluation queries. Only the "ko" and "en" entries are read here.
# JSON text is UTF-8 by specification, so decode explicitly instead of relying
# on the platform's default encoding (which would corrupt or reject non-ASCII
# queries on systems whose default is not UTF-8).
with open("jovis_model/_db/llm/multimodal/query.json", "r", encoding="utf-8") as f:
    querys = json.load(f)

# Embed both query sets with the CLIP model held in `model`.
ko_embeddings = model.inference(querys["ko"])
en_embeddings = model.inference(querys["en"])

In [23]:
# Attach the embeddings to the query dict as nested Python lists so the whole
# structure can be serialised to JSON.
querys.update(
    ko_embeddings=ko_embeddings.detach().cpu().numpy().tolist(),
    en_embeddings=en_embeddings.detach().cpu().numpy().tolist(),
)

In [24]:
# Persist the queries together with their CLIP text embeddings.
with open("outputs/skb/clip_text_embeddings.json", "w") as out_file:
    out_file.write(json.dumps(querys))

### InternVL

In [4]:
# Configuration for the InternVL chat checkpoint, wrapped in an inference-mode
# ModelRunner.
params = dict(
    pkg="llm",
    task="internvl",
    use_hf_model=True,
    params=dict(
        hf_name="OpenGVLab/InternVL-Chat-V1-5",
        max_new_tokens=512,
    ),
)
config = Config(**params)
runner = ModelRunner(config=config, mode="inference")

### Sentence Embedding : description & query

In [4]:
# Configuration for the multilingual sentence-embedding checkpoint.
# NOTE: this rebinds `params`, `config` and `runner`, replacing whatever runner
# an earlier cell created.
params = dict(
    pkg="llm",
    task="sentence_embedding",
    use_hf_model=True,
    params=dict(
        hf_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    ),
)
config = Config(**params)
runner = ModelRunner(config=config, mode="inference")



In [3]:
# Load the evaluation queries. JSON text is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's default encoding.
with open("jovis_model/_db/llm/multimodal/query.json", "r", encoding="utf-8") as f:
    querys = json.load(f)

# Embed both query sets with the current `runner` and store them as nested
# lists so the dict stays JSON-serialisable.
ko_embeddings = runner.run(querys["ko"])
en_embeddings = runner.run(querys["en"])
querys["ko_embeddings"] = ko_embeddings.detach().cpu().numpy().tolist()
querys["en_embeddings"] = en_embeddings.detach().cpu().numpy().tolist()

# json.dump escapes non-ASCII characters by default, so the output is plain
# ASCII regardless of the platform encoding.
with open("outputs/skb/query_text_embeddings.json", "w") as f:
    json.dump(querys, f)

In [4]:
# NOTE(review): the absolute paths below tie this cell to one machine, whereas
# the other cells use paths relative to the working directory -- confirm
# whether these can be made relative as well.
file_path = "/home/omnious/workspace/jovis/jovis-model/outputs/skb/descriptions_v2.json"
# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than the
# platform default, so non-ASCII description text is read correctly everywhere.
with open(file_path, "r", encoding="utf-8") as f:
    descriptions = json.load(f)

# Embed the descriptions one at a time; pids and embeddings are appended in the
# same iteration so their positions stay aligned.
pids = []
embeddings = []
for pid, description in tqdm(descriptions.items()):
    pids.append(pid)
    # runner.run is given a one-element batch; [0] takes that single row.
    embeddings.append(runner.run([description]).detach().cpu().numpy()[0])

build_faiss_index(
    embeddings=embeddings,
    save_path="/home/omnious/workspace/jovis/jovis-model/outputs/skb",
    save_name="descriptions_v2",
    pids=pids
)

### Evaluation

In [31]:
# Load the two FAISS indexes that the evaluation compares: the description
# index first, then the CLIP image index.
desc_index, clip_index = (
    faiss.read_index(index_file)
    for index_file in (
        "outputs/skb/descriptions_v2.index",
        "outputs/skb/clip_image.index",
    )
)

In [11]:
# Lookup tables that accompany the two indexes; the evaluation cell indexes
# them with the stringified row ids returned by FAISS.
with open("outputs/skb/clip_image_map.json", "r") as map_file:
    clip_image_map = json.loads(map_file.read())

with open("outputs/skb/descriptions_v2_map.json", "r") as map_file:
    ivl_desc_map = json.loads(map_file.read())

In [12]:
# Reload the query embeddings saved by the CLIP cell and by the
# sentence-embedding cell.
with open("outputs/skb/clip_text_embeddings.json", "r") as embed_file:
    clip_text_embeddings = json.loads(embed_file.read())

with open("outputs/skb/query_text_embeddings.json", "r") as embed_file:
    query_text_embeddings = json.loads(embed_file.read())

In [36]:
def _top_k_hits(index, id_map, embedding, label, k=5):
    """Search `index` with one query embedding and format the top-k hits.

    Args:
        index: FAISS index to query.
        id_map: Mapping from the stringified FAISS row id to the id shown in
            the report.
        embedding: One query embedding as a flat sequence of floats.
        label: Short tag naming the retrieval method; prefixed to each score.
        k: Number of neighbours to request.

    Returns:
        A dict with "text" (labelled score strings) and "image" (mapped ids),
        both in the order returned by the index.
    """
    # FAISS works on float32; lists loaded from JSON would otherwise become a
    # float64 array.
    query = np.asarray(embedding, dtype=np.float32).reshape(1, -1)
    scores, ids = index.search(query, k)
    # FAISS pads the result with id -1 when fewer than k neighbours exist;
    # skip those so the id_map lookup cannot raise KeyError.
    hits = [(score, row_id) for score, row_id in zip(scores[0], ids[0]) if row_id != -1]
    return {
        "text": [f"[{label}] {score:.4f}" for score, _ in hits],
        "image": [id_map[str(row_id)] for _, row_id in hits],
    }


# For every query (Korean and English variants) collect two result rows:
# first the CLIP text->image search, then the sentence-embedding search over
# the descriptions index. The labels name the method that produced each row;
# previously they were swapped ("[invl]" on CLIP hits and vice versa).
res = {}
query_columns = zip(
    clip_text_embeddings["ko"], clip_text_embeddings["ko_embeddings"], query_text_embeddings["ko_embeddings"],
    clip_text_embeddings["en"], clip_text_embeddings["en_embeddings"], query_text_embeddings["en_embeddings"],
)
for idx, (ko_query, ko_clip_embed, ko_query_embed, en_query, en_clip_embed, en_query_embed) in enumerate(query_columns):
    per_query = defaultdict(list)
    for query_text, clip_embed, query_embed in (
        (ko_query, ko_clip_embed, ko_query_embed),
        (en_query, en_clip_embed, en_query_embed),
    ):
        per_query[query_text].append(_top_k_hits(clip_index, clip_image_map, clip_embed, "clip"))
        per_query[query_text].append(_top_k_hits(desc_index, ivl_desc_map, query_embed, "invl"))
    res[f"query_{idx}"] = per_query

In [5]:
# Wrap the collected search results in a ReportMaker; image_path points at the
# directory holding the source images.
rm = ReportMaker(data_dict=res, image_path="/data/local/multimodal_for_skb/images/skb", max_len=10)

In [6]:
# Generate the comparison report using the save_path/save_name given below.
rm.make_report(save_path="outputs/skb", save_name="multimodal_internvl_clip")