In [1]:
import random
import json
import os
from collections import defaultdict
from tqdm import tqdm


def load_list(file_path: str) -> dict[int, list[str]]:
    """
    Read an annotation list such as list_train.txt / list_val.txt.

    Every non-blank line is expected to hold two whitespace-separated
    fields: ``<img_path> <pid>``.

    Args:
        file_path: Path of the list file to read.

    Returns:
        A ``defaultdict(list)`` mapping each integer person ID to the image
        paths listed for it, in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields, or
            its second field is not an integer.
    """
    id_to_imgs = defaultdict(list)
    with open(file_path, "r") as f:
        for line in f:
            fields = line.split()
            # Blank / whitespace-only lines (e.g. an extra newline at the end
            # of the file) carry no record; skip them instead of failing on
            # the two-value unpack below.
            if not fields:
                continue
            img_path, pid = fields
            id_to_imgs[int(pid)].append(img_path)
    return id_to_imgs


def pid_from_path(img_path: str) -> int:
    """
    Return the integer person ID encoded in the first path component.

    Example: '0953/0953_066_...' -> 953
    """
    # Everything before the first "/" is the ID directory; int() drops the
    # zero padding.
    id_dir, _sep, _rest = img_path.partition("/")
    return int(id_dir)


def generate_pairs(id_to_imgs: dict[int, list[str]], img_root: str) -> list[dict]:
    """
    Build positive/negative image pairs and return them as samples in the
    LLaVA-interleave-Qwen conversation format.

    For each person ID, every unordered pair of its images becomes a
    positive ("yes") pair, and one negative ("no") pair is drawn at random
    against a different ID. An ID with no images, or with no other non-empty
    ID to pair against, contributes no negative pair instead of raising
    ``IndexError`` from ``random.choice`` on an empty sequence.

    Args:
        id_to_imgs: Mapping of person ID -> list of image paths.
        img_root: Directory prefix joined in front of every image path.

    Returns:
        A shuffled list of sample dicts with the keys ``system_prompt``,
        ``image`` (two joined paths) and ``conversations`` (human question
        followed by the gpt answer).
    """
    positive_pairs, negative_pairs = [], []
    pids = list(id_to_imgs.keys())

    for pid in tqdm(pids, desc="Generating pairs"):
        imgs = id_to_imgs[pid]
        # (1) Same ID -> positive: every unordered pair (i < j) of its images.
        for i, first in enumerate(imgs):
            for second in imgs[i + 1:]:
                positive_pairs.append((first, second, "yes"))

        # (2) Different ID -> negative (a single sampled pair per ID).
        other_pids = [p for p in pids if p != pid and id_to_imgs[p]]
        if not imgs or not other_pids:
            # Nothing to sample from (single-ID input or an empty image
            # list); skip rather than crash in random.choice.
            continue
        neg_pid = random.choice(other_pids)
        negative_pairs.append(
            (random.choice(imgs), random.choice(id_to_imgs[neg_pid]), "no")
        )

    # Merge and shuffle.
    all_pairs = positive_pairs + negative_pairs
    random.shuffle(all_pairs)

    # User (question) prompt; identical for every sample, so built once.
    user_prompt = (
        "<image><image> Are these two people the same? "
        "Also, guess the person-ID of each image."
    )

    samples = []
    for img1, img2, label in all_pairs:
        # NOTE: the IDs written into the answer are parsed from each image
        # path's leading directory, not taken from the id_to_imgs keys.
        pid1, pid2 = pid_from_path(img1), pid_from_path(img2)

        # gpt (ground-truth) answer.
        if label == "yes":
            assistant_answer = f"yes, id: {pid1}"
        else:
            assistant_answer = f"no, id1: {pid1}, id2: {pid2}"

        samples.append(
            {
                "system_prompt": "You are a helpful assistant.",
                "image": [
                    os.path.join(img_root, img1),
                    os.path.join(img_root, img2),
                ],
                "conversations": [
                    {"from": "human", "value": user_prompt},
                    {"from": "gpt", "value": assistant_answer},
                ],
            }
        )
    return samples


def main():
    """
    Generate the train and val pair datasets from the MSMT17_V1 list files
    and write each one, as an indented JSON document, into the output
    directory.
    """
    # Output location.
    output_dir = "./"
    os.makedirs(output_dir, exist_ok=True)

    # (split, list file, image root, output file name), handled in this
    # order. Both splits use the train image folder as their image root.
    splits = (
        ("train", "MSMT17_V1/list_train.txt", "MSMT17_V1/train/",
         "msmt17_train_llava_format.jsonl"),
        ("val", "MSMT17_V1/list_val.txt", "MSMT17_V1/train/",
         "msmt17_val_llava_format.jsonl"),
    )

    counts = {}
    for split, list_path, img_root, out_name in splits:
        samples = generate_pairs(load_list(list_path), img_root)
        with open(os.path.join(output_dir, out_name), "w") as f:
            json.dump(samples, f, indent=4)
        counts[split] = len(samples)

    n_train, n_val = counts["train"], counts["val"]
    print(f"✅ 저장 완료! (train: {n_train}개, val: {n_val}개)")


if __name__ == "__main__":
    main()

Generating pairs: 100%|██████████| 1041/1041 [00:00<00:00, 7664.14it/s]
Generating pairs: 100%|██████████| 1041/1041 [00:00<00:00, 30290.19it/s]


✅ 저장 완료! (train: 726228개, val: 3679개)


In [1]:
import random
import json
import os
from collections import defaultdict
from tqdm import tqdm

random.seed(42)          # ★ Fix the seed so the same 10 IDs are picked on every run

# ---------------- 유틸 ----------------
def load_list(file_path: str) -> dict[int, list[str]]:
    """
    Parse a ``<img_path> <pid>`` list file into a per-ID image index.

    Args:
        file_path: Path of the list file to read.

    Returns:
        A ``defaultdict(list)`` keyed by integer person ID whose values are
        the image paths for that ID in file order. Because it is a
        defaultdict, indexing an ID that never appeared yields an empty list.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            fields, or the ID field is not an integer.
    """
    id_to_imgs = defaultdict(list)
    with open(file_path, "r") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Tolerate blank / whitespace-only lines (such as a trailing
                # newline) rather than failing on the unpack below.
                continue
            img_path, pid = parts
            id_to_imgs[int(pid)].append(img_path)
    return id_to_imgs

def pid_from_path(img_path: str) -> int:
    """Return the leading path component of ``img_path`` as an integer ID."""
    leading_dir = img_path.split("/", 1)[0]
    return int(leading_dir)

# ---------------- 페어 생성 ----------------
def generate_pairs(id_to_imgs: dict[int, list[str]],
                   img_root: str,
                   keep_pids: set[int]) -> list[dict]:
    """
    Build positive/negative image pairs restricted to ``keep_pids`` and
    return them as conversation-format samples.

    For each kept ID, every unordered pair of its images becomes a positive
    ("yes") pair and one negative ("no") pair is drawn at random against
    another kept ID. A kept ID that is missing from ``id_to_imgs`` or has no
    images contributes nothing, and no negative pair is drawn when there is
    no other kept ID with images; previously those cases raised
    ``KeyError`` / ``IndexError``.

    Args:
        id_to_imgs: Mapping of person ID -> list of image paths.
        img_root: Directory prefix joined in front of every image path.
        keep_pids: Set of person IDs to generate pairs for.

    Returns:
        A shuffled list of sample dicts with the keys ``system_prompt``,
        ``image`` (two joined paths) and ``conversations`` (human question
        followed by the gpt answer).
    """
    positive_pairs, negative_pairs = [], []

    for pid in keep_pids:
        imgs = id_to_imgs.get(pid, [])
        # (1) positive: every unordered pair (i < j) of this ID's images.
        for i, first in enumerate(imgs):
            for second in imgs[i + 1:]:
                positive_pairs.append((first, second, "yes"))

        # (2) negative (one random pair) against another kept ID that
        # actually has images to sample from.
        candidates = [p for p in keep_pids - {pid} if id_to_imgs.get(p)]
        if not imgs or not candidates:
            continue
        neg_pid = random.choice(candidates)
        negative_pairs.append(
            (random.choice(imgs), random.choice(id_to_imgs[neg_pid]), "no")
        )

    all_pairs = positive_pairs + negative_pairs
    random.shuffle(all_pairs)

    # The question is identical for every sample, so build it once.
    user_prompt = "<image><image> Are these two people the same? Also, guess the person-ID of each image."

    samples = []
    for img1, img2, label in all_pairs:
        # NOTE: answer IDs are parsed from each image path's leading
        # directory, not taken from the id_to_imgs keys.
        pid1, pid2 = pid_from_path(img1), pid_from_path(img2)
        answer = f"yes, id: {pid1}" if label == "yes" else f"no, id1: {pid1}, id2: {pid2}"

        samples.append(
            {
                "system_prompt": "You are a helpful assistant.",
                "image": [os.path.join(img_root, img1), os.path.join(img_root, img2)],
                "conversations": [
                    {"from": "human", "value": user_prompt},
                    {"from": "gpt",   "value": answer},
                ],
            }
        )
    return samples

# ---------------- 메인 ----------------
def main():
    """
    Sample 10 person IDs from the train list, build the train and val pair
    datasets for just those IDs, and write both as indented JSON documents.
    """
    out_dir = "./"
    os.makedirs(out_dir, exist_ok=True)

    # Load every ID from both list files.
    train_dict = load_list("MSMT17_V1/list_train.txt")
    val_dict = load_list("MSMT17_V1/list_val.txt")

    # ★ Keep only 10 IDs, drawn at random from the train IDs.
    keep_pids = set(random.sample(list(train_dict.keys()), 10))
    print("사용 ID 10개:", sorted(keep_pids))

    # Filter train and val down to those 10 IDs. Train is generated before
    # val so the random draws happen in a fixed order; both splits use the
    # train image folder as their image root.
    train_subset = {pid: train_dict[pid] for pid in keep_pids}
    train_samples = generate_pairs(train_subset, "MSMT17_V1/train/", keep_pids)
    val_subset = {pid: val_dict[pid] for pid in keep_pids}
    val_samples = generate_pairs(val_subset, "MSMT17_V1/train/", keep_pids)

    # Save.
    outputs = (
        ("msmt17_train_small.jsonl", train_samples),
        ("msmt17_val_small.jsonl", val_samples),
    )
    for file_name, samples in outputs:
        with open(os.path.join(out_dir, file_name), "w") as f:
            json.dump(samples, f, indent=4)

    print(f"✅ 저장 완료! (train: {len(train_samples)}, val: {len(val_samples)})")

if __name__ == "__main__":
    main()

사용 ID 10개: [51, 65, 178, 209, 228, 285, 457, 501, 563, 864]
✅ 저장 완료! (train: 2754, val: 21)


In [2]:
import shutil
import os

# Source base folder (contains the train/ and test/ image folders)
src_base = '/workspace/lmms-finetune/example_data/MSMT17_V1'
# Destination base folder the sub-folders are copied into
dst_base = '/workspace/lmms-finetune/example_data/MSMT17_V1/images'

# Sub-folders to copy
folders = ['train', 'test']

for folder in folders:
    src = os.path.join(src_base, folder)
    dst = os.path.join(dst_base, folder)

    # ✅ If the destination folder already exists, delete it first so the
    # copy below starts from a clean target (re-running replaces the copy).
    if os.path.exists(dst):
        print(f"⚠️ 기존 {dst} 폴더 삭제 중...")
        shutil.rmtree(dst)

    # ✅ Copy the whole folder tree
    shutil.copytree(src, dst)
    print(f"✅ {folder}/ 폴더 복사 완료: {dst}")

✅ train/ 폴더 복사 완료: /workspace/lmms-finetune/example_data/MSMT17_V1/images/train
✅ test/ 폴더 복사 완료: /workspace/lmms-finetune/example_data/MSMT17_V1/images/test


In [3]:
import json

# Open the file and parse it as a single JSON document (json.load reads the
# whole file at once, not line by line, despite the .jsonl extension)
with open('msmt17_train_small.jsonl', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print the first 5 items, pretty-printed, as a quick sanity check
for i, item in enumerate(data[:5]):
    print(f"Item {i+1}:")
    print(json.dumps(item, indent=2, ensure_ascii=False))
    print()

Item 1:
{
  "system_prompt": "You are a helpful assistant.",
  "image": [
    "MSMT17_V1/train/0501/0501_019_06_0113noon_0960_0.jpg",
    "MSMT17_V1/train/0501/0501_023_06_0113noon_0962_1.jpg"
  ],
  "conversations": [
    {
      "from": "human",
      "value": "<image><image> Are these two people the same? Also, guess the person-ID of each image."
    },
    {
      "from": "gpt",
      "value": "yes, id: 501"
    }
  ]
}

Item 2:
{
  "system_prompt": "You are a helpful assistant.",
  "image": [
    "MSMT17_V1/train/0228/0228_005_13_0303afternoon_0348_0.jpg",
    "MSMT17_V1/train/0228/0228_003_13_0303afternoon_0346_1.jpg"
  ],
  "conversations": [
    {
      "from": "human",
      "value": "<image><image> Are these two people the same? Also, guess the person-ID of each image."
    },
    {
      "from": "gpt",
      "value": "yes, id: 228"
    }
  ]
}

Item 3:
{
  "system_prompt": "You are a helpful assistant.",
  "image": [
    "MSMT17_V1/train/0285/0285_008_11_0303afternoon_1504_0