In [None]:
import json
from pathlib import Path
from tqdm import tqdm

In [2]:
# --- Configuration: dataset locations, relative to the notebook's directory ---
DATA_DIR = Path("../data")
IMG_DIR = DATA_DIR / "icons_256"               # directory the image names are resolved against
META_FILE = DATA_DIR / "icons_metadata.jsonl"  # JSON Lines: one record per line

# Fail fast with a readable message if the expected inputs are absent.
assert META_FILE.exists(), "metadata jsonl file missing!"
assert IMG_DIR.exists(), "icons_256 directory missing!"

# Parse the metadata into a list with one entry per non-empty line.
# - The encoding is pinned to UTF-8 so non-ASCII captions decode the same way
#   on every platform instead of depending on the locale's default encoding.
# - Whitespace-only lines (e.g. a trailing blank line) are skipped rather than
#   making json.loads raise a JSONDecodeError.
with open(META_FILE, encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

len(records)


4108

In [3]:
# Verify that every metadata record points at an image file on disk.
# is_file() is used instead of exists() so that an entry resolving to a
# directory (for example an empty "image" value, which resolves to IMG_DIR
# itself) is reported as missing instead of silently passing the check.
missing = [
    (r["image"], IMG_DIR / r["image"])
    for r in records
    if not (IMG_DIR / r["image"]).is_file()
]

# Show a sample of the offending (name, path) pairs plus the total count.
missing[:5], len(missing)

([], 0)

In [4]:
# Plain captioning pairs: one {"image", "text"} dict per metadata record.
caption_dataset = [
    {
        "image": str(rec["image"]),  # relative name
        "text": rec["caption"],      # LLaVA caption
    }
    for rec in tqdm(records, desc="Building caption dataset")
]

len(caption_dataset)

Building caption dataset: 100%|██████████| 4108/4108 [00:00<00:00, 2294912.20it/s]


4108

In [5]:
# Serialize the caption pairs as JSON Lines: one JSON document per line.
CAPTION_FILE = DATA_DIR / "train_caption.jsonl"

with CAPTION_FILE.open("w") as out:
    out.writelines(json.dumps(entry) + "\n" for entry in caption_dataset)

CAPTION_FILE


PosixPath('../data/train_caption.jsonl')

In [6]:
# Instruction used for the user turn of every sample; named so it can be
# changed in one place.
USER_PROMPT = "<image>\nDescribe this icon."

# One two-turn conversation per record: the user prompt, answered by the
# record's caption. The image name goes through str() so this dataset's
# "image" field is coerced the same way as in the caption dataset.
chat_dataset = [
    {
        "image": str(r["image"]),
        "messages": [
            {"role": "user", "content": USER_PROMPT},
            {"role": "assistant", "content": r["caption"]},
        ],
    }
    for r in tqdm(records, desc="Building chat-VLM dataset")
]

Building chat-VLM dataset: 100%|██████████| 4108/4108 [00:00<00:00, 1433699.52it/s]


In [7]:
# Serialize the chat samples as JSON Lines: one JSON document per line.
CHAT_FILE = DATA_DIR / "train_chat.jsonl"

with CHAT_FILE.open("w") as out:
    out.writelines(json.dumps(sample) + "\n" for sample in chat_dataset)

CHAT_FILE

PosixPath('../data/train_chat.jsonl')

In [8]:
# Final sanity check: print the size of the source records and of each
# derived dataset so they can be compared at a glance.
for label, collection in (
    ("Total records:", records),
    ("Caption dataset:", caption_dataset),
    ("Chat dataset:", chat_dataset),
):
    print(label, len(collection))


Total records: 4108
Caption dataset: 4108
Chat dataset: 4108
