In [1]:
# Uncomment to install the dependency; `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
# %pip install lancedb

In [19]:
import json

import lancedb
import pandas as pd
from PIL import Image

## Load the Dataset

In [20]:
# Root folder that holds one sub-directory per raw dataset.
RAW_DATASETS_DIR = "data/raw_datasets"

# Per-dataset folders; the cells below read `annot_wo_answer.json` and the
# `images/` sub-folder from each of them.
MYCHART_DIR = f"{RAW_DATASETS_DIR}/mychart"
MYDOC_DIR = f"{RAW_DATASETS_DIR}/mydoc"
MYINFOGRAPHIC_DIR = f"{RAW_DATASETS_DIR}/myinfographic"

In [21]:
def _load_annotations(dataset_dir):
    """Return the parsed contents of ``annot_wo_answer.json`` in ``dataset_dir``.

    The file is read inside a ``with`` block so the handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 explicitly instead of
    relying on the platform's default encoding.
    """
    with open(f"{dataset_dir}/annot_wo_answer.json", "r", encoding="utf-8") as annot_file:
        return json.load(annot_file)


# Raw annotation records of each dataset, as parsed from JSON.
mychart = _load_annotations(MYCHART_DIR)
mydoc = _load_annotations(MYDOC_DIR)
myinfographic = _load_annotations(MYINFOGRAPHIC_DIR)

In [22]:
# MyChart annotations as a DataFrame, tagged with the dataset they come from
# and marked as belonging to the "test" slice.
df_mychart = pd.read_json(f"{MYCHART_DIR}/annot_wo_answer.json").assign(
    dataset="mychart",
    slice="test",
)
# Disabled: the image directory is currently not stored as a column.
# df_mychart["image_dir"] = MYCHART_DIR

In [23]:
# MyDoc annotations as a DataFrame, tagged with the dataset they come from
# and marked as belonging to the "test" slice.
df_mydoc = pd.read_json(f"{MYDOC_DIR}/annot_wo_answer.json").assign(
    dataset="mydoc",
    slice="test",
)
# Disabled: the image directory is currently not stored as a column.
# df_mydoc["image_dir"] = MYDOC_DIR

In [24]:
# MyInfographic annotations as a DataFrame, tagged with the dataset they come
# from and marked as belonging to the "test" slice.
df_myinfographic = pd.read_json(f"{MYINFOGRAPHIC_DIR}/annot_wo_answer.json").assign(
    dataset="myinfographic",
    slice="test",
)
# Disabled: the image directory is currently not stored as a column.
# df_myinfographic["image_dir"] = MYINFOGRAPHIC_DIR

### Init DB

In [25]:
# Location of the LanceDB database, given relative to the working directory.
uri = "data/lancedb"
# Connection handle used by the following cells to list, create and open tables.
db = lancedb.connect(uri)

In [26]:
# List the tables present in the database before the "mmfm" table is created.
db.table_names()

[]

In [27]:
# Reuse the "mmfm" table when the database already lists it; otherwise create
# it from the three annotation frames stacked row-wise.
if "mmfm" in db.table_names():
    mmfm_table = db.open_table("mmfm")
else:
    all_annotations = pd.concat([df_mychart, df_mydoc, df_myinfographic])
    mmfm_table = db.create_table("mmfm", data=all_annotations)

In [28]:
# List the tables again to confirm that "mmfm" is now present.
db.table_names()

['mmfm']

In [29]:
# Inspect the column names and types of the "mmfm" table.
mmfm_table.schema

id: string
image: string
conversations: list<item: struct<from: string, value: string>>
  child 0, item: struct<from: string, value: string>
      child 0, from: string
      child 1, value: string
dataset: string
slice: string

In [None]:
# Preview three rows of the table. The seed is fixed so that re-running the
# notebook on the same table shows the same rows instead of a new random draw.
mmfm_table.to_pandas().sample(3, random_state=0)

## Analysis

In [13]:
# Number of annotation records per dataset, in the order
# (mychart, mydoc, myinfographic).
tuple(len(records) for records in (mychart, mydoc, myinfographic))

(200, 400, 428)

In [14]:
# Number of distinct image file names referenced per dataset, in the order
# (mychart, mydoc, myinfographic).
tuple(
    len({record["image"] for record in records})
    for records in (mychart, mydoc, myinfographic)
)

(200, 360, 428)

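Note: only `mydoc` contains duplicate images (400 annotations over 360 unique images); `mychart` and `myinfographic` have exactly one annotation per image.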

### MyChart

In [15]:
# Index of the MyChart annotation inspected in the following cells.
TEST_IDX = 4

In [16]:
# Show the raw annotation record of the selected MyChart sample.
mychart[TEST_IDX]

{'id': 'mychart_82_hbar',
 'image': '82_hbar_0635a64e806fee03461eddecf061fd851a801f3b482d208e5c651e7c68c69585_36.png',
 'conversations': [{'from': 'human',
   'value': '<image>\nWhat was the cost of software licenses for the year ended December 31, 2004?'}]}

In [None]:
# Open the image file referenced by the selected MyChart annotation and show
# it as the cell's output.
sample_image_name = mychart[TEST_IDX]["image"]
image = Image.open(f"{MYCHART_DIR}/images/{sample_image_name}")
image

In [None]:
# Tally the MyChart questions by the phrases they contain (case-insensitive):
#   what_cnt             - questions containing "what"
#   is_there_cnt         - questions containing "is there" (each one is printed
#                          together with its image name)
#   other_question_types - indices of questions containing neither phrase
# A question containing both phrases is counted by both counters.
what_cnt = 0
is_there_cnt = 0
other_question_types = set()
for idx, chart in enumerate(mychart):
    question = chart["conversations"][0]["value"].lower()
    has_what = "what" in question
    has_is_there = "is there" in question
    if has_what:
        what_cnt += 1
    if has_is_there:
        is_there_cnt += 1
        print(question, chart["image"])
    is_others = not (has_what or has_is_there)
    if is_others:
        other_question_types.add(idx)

In [37]:
# Summary: total questions, "what" matches, "is there" matches, and questions
# matching neither phrase. The last figure is taken from the set of indices
# collected by the tally loop rather than derived by subtraction, because a
# question containing both phrases is counted by both counters and would make
# the subtraction under-count.
len(mychart), what_cnt, is_there_cnt, len(other_question_types)

(200, 176, 2, 22)

In [None]:
# Print the full annotation record of every MyChart question that contained
# neither "what" nor "is there".
for idx in other_question_types:
    print(mychart[idx])

### MyDoc

In [16]:
# Index of the MyDoc annotation inspected in the following cells.
# NOTE: this rebinds the same TEST_IDX name used for the MyChart sample, so
# re-running the MyChart cells after this one will use this value instead.
TEST_IDX = 21

In [None]:
# Show the raw annotation record of the selected MyDoc sample.
mydoc[TEST_IDX]

In [None]:
# Open the image file referenced by the selected MyDoc annotation and show it
# as the cell's output.
sample_image_name = mydoc[TEST_IDX]["image"]
image = Image.open(f"{MYDOC_DIR}/images/{sample_image_name}")
image

In [21]:
# Check that every MyDoc question follows the template
# "<image>\nWhat is the <key> in the image?". On failure the assertion message
# names the offending record so it can be located without re-running the loop.
for doc in mydoc:
    question = doc["conversations"][0]["value"]
    assert question.startswith("<image>\nWhat is the "), (
        f"unexpected question prefix in record {doc.get('id')!r}: {question!r}"
    )
    assert question.endswith(" in the image?"), (
        f"unexpected question suffix in record {doc.get('id')!r}: {question!r}"
    )

In [None]:
# Fixed text surrounding the key in every MyDoc question.
QUESTION_PREFIX = "<image>\nWhat is the "
QUESTION_SUFFIX = " in the image?"

# Print the key of each MyDoc question, i.e. the text left after cutting the
# fixed prefix and suffix off by length.
for idx, doc in enumerate(mydoc):
    question = doc["conversations"][0]["value"]
    autofill_key = question[len(QUESTION_PREFIX):-len(QUESTION_SUFFIX)]
    # Disabled debug aid for spotting empty keys:
    # if len(autofill_key) == 0:
    #     print(idx, doc, autofill_key)
    print(autofill_key)

### MyInfographic

In [60]:
# Index of the MyInfographic annotation inspected in the following cells.
# NOTE: this rebinds the TEST_IDX name already used by the earlier sections.
TEST_IDX = 4

In [None]:
# Show the raw annotation record of the selected MyInfographic sample.
myinfographic[TEST_IDX]

In [None]:
# Open the image file referenced by the selected MyInfographic annotation and
# show it as the cell's output.
sample_image_name = myinfographic[TEST_IDX]["image"]
image = Image.open(f"{MYINFOGRAPHIC_DIR}/images/{sample_image_name}")
image