In [1]:
import json
from tqdm import tqdm
import os

def load_json(file_path):
    """
    Load a JSON file and return its content as a Python dictionary.

    Parameters:
        file_path (str): The path to the JSON file.

    Returns:
        dict: The content of the JSON file as a dictionary.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [2]:
path = "/data_ssd/MMAD/MMAD_for_llava-onevision.json"
data = load_json(path)

In [3]:
print(len(data))
print(data[0].keys())
[print(k,v) for k, v in data[0].items()]  # Exclude 'image' key for brevity

39672
dict_keys(['id', 'image', 'conversations', 'Answer', 'Question', 'Options', 'type', 'annotation', 'mask_path', 'similar_templates', 'random_templates'])
id Anomaly Detection_DS-MVTec/bottle/image/broken_large/000.png
image DS-MVTec/bottle/image/broken_large/000.png
conversations [{'from': 'human', 'value': "Test image:\n<image>\nIs there any defect in the object?\nA. Yes.\nB. No.\nAnswer with the option's letter from the given choices directly."}, {'from': 'gpt', 'value': 'A'}]
Answer A
Question Is there any defect in the object?
Options {'A': 'Yes.', 'B': 'No.'}
type Anomaly Detection
annotation True
mask_path rbg_mask/broken_large/000_rbg_mask.png
similar_templates ['MVTec-AD/bottle/train/good/001.png', 'MVTec-AD/bottle/train/good/061.png', 'MVTec-AD/bottle/train/good/199.png', 'MVTec-AD/bottle/train/good/124.png', 'MVTec-AD/bottle/train/good/149.png', 'MVTec-AD/bottle/train/good/147.png', 'MVTec-AD/bottle/train/good/089.png', 'MVTec-AD/bottle/train/good/066.png']
random_templa

[None, None, None, None, None, None, None, None, None, None, None]

In [4]:
task_dataset_name_dict = {}
for item in tqdm(data):
    task= item["id"].split("/")[0].split("_")[0]
    dataset_name = item["image"].split("/")[0]
    if task not in task_dataset_name_dict:
        task_dataset_name_dict[task] = {}
    if dataset_name not in task_dataset_name_dict[task]:
        task_dataset_name_dict[task][dataset_name] = []
    
    task_dataset_name_dict[task][dataset_name].append(item)

100%|██████████| 39672/39672 [00:00<00:00, 663465.28it/s]


In [5]:
print("task_dataset_name_dict keys:", task_dataset_name_dict.keys())

data_count = 0
print(f"Total number of items across all tasks and datasets: {data_count}")
for task, datasets in task_dataset_name_dict.items():
    print(f"Task: {task}")
    for dataset_name, items in datasets.items():
        print(f"  Dataset: {dataset_name}, Number of items: {len(items)}")
        data_count += len(items)
        # Uncomment the next line to see the first item in each dataset
        # print(f"    Example item: {items[0]}")
    print()  # Add a newline for better readability between tasks
    
print(f"Total number of items across all tasks and datasets: {data_count}")

task_dataset_name_dict keys: dict_keys(['Anomaly Detection', 'Defect Classification', 'Defect Localization', 'Defect Description', 'Defect Analysis', 'Object Classification', 'Object Structure', 'Object Details', 'Object Analysis'])
Total number of items across all tasks and datasets: 0
Task: Anomaly Detection
  Dataset: DS-MVTec, Number of items: 1691
  Dataset: MVTec-LOCO, Number of items: 1565
  Dataset: VisA, Number of items: 2141
  Dataset: GoodsAD, Number of items: 2900

Task: Defect Classification
  Dataset: DS-MVTec, Number of items: 1205
  Dataset: MVTec-LOCO, Number of items: 982
  Dataset: VisA, Number of items: 1190
  Dataset: GoodsAD, Number of items: 1311

Task: Defect Localization
  Dataset: DS-MVTec, Number of items: 1193
  Dataset: MVTec-LOCO, Number of items: 982
  Dataset: VisA, Number of items: 1197
  Dataset: GoodsAD, Number of items: 1506

Task: Defect Description
  Dataset: DS-MVTec, Number of items: 1213
  Dataset: MVTec-LOCO, Number of items: 974
  Dataset: Vis

In [7]:
save_path = "./MMAD_task_dataset_name_dict.json"
with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(task_dataset_name_dict, f, ensure_ascii=False, indent=4)

In [9]:
save_data = load_json("/data_ssd/MMAD/MMAD_task_dataset_name_dict_llava-oenvision.json")
print(f"Data saved to {save_path}")
print(f"Loaded data from {save_path}: {len(save_data)} items")

print("task_dataset_name_dict keys:", save_data.keys())

data_count = 0
print(f"Total number of items across all tasks and datasets: {data_count}")
for task, datasets in save_data.items():
    print(f"Task: {task}")
    for dataset_name, items in datasets.items():
        print(f"  Dataset: {dataset_name}, Number of items: {len(items)}")
        data_count += len(items)
        # Uncomment the next line to see the first item in each dataset
        # print(f"    Example item: {items[0]}")
    print()  # Add a newline for better readability between tasks
    
print(f"Total number of items across all tasks and datasets: {data_count}")

Data saved to ./MMAD_task_dataset_name_dict.json
Loaded data from ./MMAD_task_dataset_name_dict.json: 9 items
task_dataset_name_dict keys: dict_keys(['Anomaly Detection', 'Defect Classification', 'Defect Localization', 'Defect Description', 'Defect Analysis', 'Object Classification', 'Object Structure', 'Object Details', 'Object Analysis'])
Total number of items across all tasks and datasets: 0
Task: Anomaly Detection
  Dataset: DS-MVTec, Number of items: 1691
  Dataset: MVTec-LOCO, Number of items: 1565
  Dataset: VisA, Number of items: 2141
  Dataset: GoodsAD, Number of items: 2900

Task: Defect Classification
  Dataset: DS-MVTec, Number of items: 1205
  Dataset: MVTec-LOCO, Number of items: 982
  Dataset: VisA, Number of items: 1190
  Dataset: GoodsAD, Number of items: 1311

Task: Defect Localization
  Dataset: DS-MVTec, Number of items: 1193
  Dataset: MVTec-LOCO, Number of items: 982
  Dataset: VisA, Number of items: 1197
  Dataset: GoodsAD, Number of items: 1506

Task: Defect Des

In [4]:
def display_dict_keys_and_items(dictionary):
    """
    Display the keys and items of a dictionary.

    Parameters:
        dictionary (dict): The dictionary to display.

    Returns:
        None
    """
    for key, value in dictionary.items():
        print(f"Key: {key}, Value: {value}")
        


In [5]:
display_dict_keys_and_items(data[0])

Key: sample_id, Value: 5390
Key: conversations, Value: [{'from': 'human', 'value': "<image><image>\nWhat's the detailed difference between the 2 images? Please list in detail."}, {'from': 'gpt', 'value': 'The differences between the two images are:\n\n1. In the second image, there are leaves falling from the sunflowers and the surrounding foliage.\n2. The ground in the second image is covered with a layer of fallen leaves, adding a carpet-like appearance.'}]
Key: image, Value: ['HQ-Edit/images/83425.jpg', 'HQ-Edit/images/83426.jpg']
Key: choice_list, Value: None
Key: metadata, Value: {'dataset': 'HQ-Edit-Diff', 'split': 'train', 'num_sample': 98675, 'task_instruction': "What's the difference between 2 images?", 'question_type': 'open-ended'}


In [6]:
import os
from tqdm import tqdm
image_root_dir = "/home/omote/local-share-data_ssd/huggingface_dataset/lmms-lab/M4-Instruct-Data"

break_flag = False
chcek_data = data[14200:]
for d in tqdm(chcek_data):
    if type(d["image"]) == str:
        image_path_list = [d["image"]]
    elif type(d["image"]) == list:
        image_path_list = d["image"]
        
    for image_path in image_path_list:
        dataset_name = image_path.split("/")[0]
        image_name = image_path[len(dataset_name)+1:]
        image_path = os.path.join(image_root_dir,dataset_name,dataset_name,image_name)
        if not os.path.exists(image_path):
            print(f"Image path does not exist: {image_path}")
            break_flag = True
            break
    if break_flag:
        break
 

  0%|          | 140/601614 [00:00<14:22, 697.65it/s]

100%|██████████| 601614/601614 [06:06<00:00, 1639.34it/s] 


In [5]:
print(data[0]["conversations"])

[{'from': 'human', 'value': "<image><image>\nWhat's the detailed difference between the 2 images? Please list in detail."}, {'from': 'gpt', 'value': 'The differences between the two images are:\n\n1. In the second image, there are leaves falling from the sunflowers and the surrounding foliage.\n2. The ground in the second image is covered with a layer of fallen leaves, adding a carpet-like appearance.'}]


In [7]:
def check_conversation_format(item):
    conversation = item["conversations"]
    if conversation[0]["from"] != "human":
        return False
    
    if conversation[-1]["from"] != "gpt":
        return False
    
    return True

def check_multiple_images(item):
    if type(item["image"]) == list and len(item["image"]) > 1:
        return True
    return False
        
    
    

In [None]:
from tqdm import tqdm
pass_data_list = []
not_pass_data_list = []
for i, d in enumerate(tqdm(data)):
    if not check_conversation_format(d) or not check_multiple_images(d):
        not_pass_data_list.append(d)
        continue
    
    pass_data_list.append(d)
    

100%|██████████| 615814/615814 [00:00<00:00, 1060753.37it/s]


In [11]:
print("All image paths exist.")
print(f"Total data: {len(data)}")
print(f"Pass data: {len(pass_data_list)}")
print(f"Not pass data: {len(not_pass_data_list)}")

All image paths exist.
Total data: 615814
Pass data: 610080
Not pass data: 5734


In [15]:
print(not_pass_data_list[0]["conversations"][0]["value"].count("<image"))

9


In [34]:
import regex as re
def check_image_number(item):
    conversation = item["conversations"]
    image_count = 0

    for message in conversation:
        image_count += message["value"].count("<image>")

    if image_count != len(item["image"]):
        return False
    
    return True
def check_image_in_human_conversation(item):
    conversation = item["conversations"]
    for message in conversation:            
        if "<image>" in message["value"] and message["from"] == "gpt":
            return False
    return True

In [54]:
not_conversation_list = []
not_multiple_images_list = []

for d in tqdm(not_pass_data_list):
    if not check_image_in_human_conversation(d):
        not_conversation_list.append(d)
        
        continue
    if not check_image_number(d) and not check_multiple_images(d):
        not_multiple_images_list.append(d)
        continue
    


100%|██████████| 5734/5734 [00:00<00:00, 1422075.40it/s]


In [42]:
print(len(not_conversation_list))
print(len(not_multiple_images_list))

5734
0


In [56]:
print(not_conversation_list[0])

{'datasource': 'twitter_post', 'id': 0, 'conversations': [{'from': 'gpt', 'value': 'Help me write a Twitter post considering the following images.\n<image><image><image><image><image><image><image><image><image>'}, {'from': 'human', 'value': '"Embracing the serenity of island life where the water is as clear as the skies. 🌊☀️ #IslandVibes #CrystalClear #BeachDays #TravelDiaries"'}], 'image': ['mmchat/images/mw2048_832851e1gy1foxoneylnaj22c02c07wi.jpg', 'mmchat/images/mw2048_832851e1gy1foxop6mkvgj22av2av7wi.jpg', 'mmchat/images/mw2048_832851e1gy1foxoo3z9v0j2276276hdu.jpg', 'mmchat/images/mw2048_832851e1gy1foxonjqlsyj21o02t8x6q.jpg', 'mmchat/images/mw2048_832851e1gy1foxoswo4w9j229d29db2h.jpg', 'mmchat/images/mw2048_832851e1gy1foxongl7khj212p0t0wog.jpg', 'mmchat/images/mw2048_832851e1gy1foxou97bjvj229d29d7wk.jpg', 'mmchat/images/mw2048_832851e1gy1foxonujr34j22c02c1kjn.jpg', 'mmchat/images/mw2048_832851e1gy1foxonbhlqoj22z228a7wi.jpg'], 'metadata': {'dataset': 'twitter_post', 'split': 'trai

In [67]:
def swap_gpt_and_human_conversation(item):
    """
    Swap the 'from' field of the first and last conversation messages.

    Parameters:
        item (dict): The item containing conversations.

    Returns:
        dict: The item with swapped conversation roles.
    """
    conversation = item["conversations"]
    for i,message in enumerate(conversation):
        if message["from"] == "human" and i % 2 == 1:
            conversation[i]["from"] = "gpt"
        elif message["from"] == "gpt" and i % 2 == 0:
            conversation[i]["from"] = "human"
        else:
            print(item)
            raise ValueError(f"Unknown conversation role: {message['from']}")
    return item

In [70]:
from copy import deepcopy
swap_data_list = []
for d in tqdm(not_conversation_list):
    swapped_item = swap_gpt_and_human_conversation(deepcopy(d))
    swap_data_list.append(swapped_item)

  0%|          | 0/5734 [00:00<?, ?it/s]

100%|██████████| 5734/5734 [00:00<00:00, 27124.18it/s]


In [71]:
print(swap_data_list[0])
print(not_conversation_list[0])

{'datasource': 'twitter_post', 'id': 0, 'conversations': [{'from': 'human', 'value': 'Help me write a Twitter post considering the following images.\n<image><image><image><image><image><image><image><image><image>'}, {'from': 'gpt', 'value': '"Embracing the serenity of island life where the water is as clear as the skies. 🌊☀️ #IslandVibes #CrystalClear #BeachDays #TravelDiaries"'}], 'image': ['mmchat/images/mw2048_832851e1gy1foxoneylnaj22c02c07wi.jpg', 'mmchat/images/mw2048_832851e1gy1foxop6mkvgj22av2av7wi.jpg', 'mmchat/images/mw2048_832851e1gy1foxoo3z9v0j2276276hdu.jpg', 'mmchat/images/mw2048_832851e1gy1foxonjqlsyj21o02t8x6q.jpg', 'mmchat/images/mw2048_832851e1gy1foxoswo4w9j229d29db2h.jpg', 'mmchat/images/mw2048_832851e1gy1foxongl7khj212p0t0wog.jpg', 'mmchat/images/mw2048_832851e1gy1foxou97bjvj229d29d7wk.jpg', 'mmchat/images/mw2048_832851e1gy1foxonujr34j22c02c1kjn.jpg', 'mmchat/images/mw2048_832851e1gy1foxonbhlqoj22z228a7wi.jpg'], 'metadata': {'dataset': 'twitter_post', 'split': 'trai

In [3]:
path = "/data_ssd/M4-Instruct-Data/m4_instruct_annotations_fixed.json"

saved_data = pass_data_list.extend(swap_data_list)
print(f"Total saved data: {len(pass_data_list)}")

import os
if not os.path.exists(os.path.dirname(path)):
    os.makedirs(os.path.dirname(path))
with open(path, 'w', encoding='utf-8') as f:
    json.dump(pass_data_list, f, ensure_ascii=False, indent=4)

NameError: name 'pass_data_list' is not defined

In [8]:
fixed_data = load_json(path)
print(f"Fixed data length: {len(fixed_data)}")

image_root_dir = "/data_ssd/llava-onevision-data-symbolic-link"

break_flag = False
for d in tqdm(fixed_data):
    if type(d["image"]) == str:
        image_path_list = [d["image"]]
    elif type(d["image"]) == list:
        image_path_list = d["image"]
        
    for image_path in image_path_list:
        image_path = os.path.join(image_root_dir, image_path)
        if not os.path.exists(image_path):
            print(f"Image path does not exist: {image_path}")
            break_flag = True
            break
    if break_flag:
        break
 
from tqdm import tqdm
pass_data_list = []
not_pass_data_list = []
for i, d in enumerate(tqdm(fixed_data)):
    if not check_conversation_format(d) or not check_multiple_images(d):
        not_pass_data_list.append(d)
        continue
    
    pass_data_list.append(d)

Fixed data length: 621548


100%|██████████| 621548/621548 [07:41<00:00, 1345.43it/s] 
100%|██████████| 621548/621548 [00:00<00:00, 673190.70it/s]


In [10]:
print(len(not_pass_data_list[0]))

5
