# フォルダ名の変更

# 欠陥名当て用データ作成

## フォルダの選別
1. 子フォルダ内の画像枚数が0枚→子フォルダを削除
2. 子フォルダ数が0→親フォルダを削除

※ 子フォルダ名を "None" or "欠陥"にしておく

In [1]:
# ベースパス
base_folder = "/data/yyama_dataset/AC_images"

In [2]:
import os
import shutil

def delete_folder(folder_path):
    try:
        shutil.rmtree(folder_path)
    except Exception as e:
        print(f"エラー：{e}")

def count_image_files(folder_path):
    image_extensions = [".jpg", ".jpeg", ".png"]
    image_count = 0
    for file in os.listdir(folder_path):
        if any(file.lower().endswith(ext) for ext in image_extensions):
            image_count += 1
    return image_count

if os.path.isdir(base_folder):  # ベースフォルダが存在するかチェック
    for parent_folder in os.listdir(base_folder):
        parent_folder_path = os.path.join(base_folder, parent_folder)
        if os.path.isdir(parent_folder_path):  # parent_folder_pathがディレクトリであるかチェック
            for child_folder in os.listdir(parent_folder_path):
                child_folder_path = os.path.join(parent_folder_path, child_folder)
                if os.path.isdir(child_folder_path):  # child_folder_pathがディレクトリであるかチェック
                    num_images = count_image_files(child_folder_path)
                    if num_images == 0:
                        print(f"削除フォルダ名: {child_folder_path}, 枚数: {num_images}")
                        delete_folder(child_folder_path)
            if len(os.listdir(parent_folder_path)) == 0:
                print(f"削除フォルダ名: {parent_folder_path}")
                delete_folder(parent_folder_path)

#### 拡張子を除いたフォルダ名の重複を修正<br>
punctured_tire内に"image_49.jpeg"と"image_49.jpg"が存在したため、全てチェックする

In [3]:
import os
import re

# ベースフォルダが存在するかどうかを確認
if os.path.isdir(base_folder):
    # 親フォルダ内のサブフォルダを取得
    for folder_name in os.listdir(base_folder):
        subfolder_path = os.path.join(base_folder, folder_name)
        
        if os.path.isdir(subfolder_path):  # サブフォルダであるか確認
            # サブフォルダ内の子フォルダを取得
            for subfolder_name in os.listdir(subfolder_path):
                child_folder_path = os.path.join(subfolder_path, subfolder_name)
                
                if os.path.isdir(child_folder_path):  # 子フォルダであるか確認
                    # 子フォルダ内のファイル名を取得
                    files = os.listdir(child_folder_path)
                    
                    # ファイル名から数字部分を取得し、リストに保存
                    numbers = [int(re.search(r"\d+", file).group()) for file in files if re.search(r"\d+", file)]
                    
                    # ファイル名の重複チェックと変更
                    for file in files:
                        match = re.search(r"\d+", file)
                        if match:
                            number = int(match.group())
                            
                            # 同じ番号が複数存在する場合
                            if numbers.count(number) > 1:
                                new_number = 0
                                
                                # 重複していない番号を探す
                                while new_number in numbers:
                                    new_number += 1
                                
                                # 新しいファイル名を作成
                                new_file = file.replace(str(number), str(new_number))
                                new_file_path = os.path.join(child_folder_path, new_file)
                                
                                # ファイル名を変更
                                os.rename(
                                    os.path.join(child_folder_path, file), 
                                    new_file_path
                                )
                                
                                # ファイル名の変更を表示
                                print(f"{child_folder_path}/{file} -> {new_file_path}")
                                
                                # リストを更新
                                numbers.remove(number)
                                numbers.append(new_number)

train：val = 8：2 に分ける

In [18]:
import os
import shutil
import random

# ディレクトリのパスを指定

src_dir = "/data/yyama_dataset/AC_images"
train_dir = "/data/yyama_dataset/tasks/VI_simple_prompt/train/"
val_dir = "/data/yyama_dataset/tasks/VI_simple_prompt/val/"

# trainとvalの割合
ratio = 0.8
random.seed(42)
# trainとvalのディレクトリを作成
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# src_dir内の各フォルダに対して
for root, dirs, files in os.walk(src_dir):
    # 画像のリストを取得し、シャッフル
    images = [f for f in files if os.path.isfile(os.path.join(root, f))]
    random.shuffle(images)
    
    # trainとvalに分割
    train_images = images[:int(ratio * len(images))]
    val_images = images[int(ratio * len(images)):]
    # print(len(train_images))
    # print(len(val_images))
    
    # trainとvalのサブディレクトリを作成
    # print(os.path.join(train_dir, os.path.relpath(root, src_dir)))
    train_folder = os.path.join(train_dir, os.path.relpath(root, src_dir))
    val_folder = os.path.join(val_dir, os.path.relpath(root, src_dir))
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)
    
    # # 画像をコピー
    for img in train_images:
        shutil.copy(os.path.join(root, img), os.path.join(train_folder, img))
        # print(os.path.join(root, img))
        # print(os.path.join(train_folder, img))
    for img in val_images:
        shutil.copy(os.path.join(root, img), os.path.join(val_folder, img))
        # print(os.path.join(root, img))
        # print(os.path.join(val_folder, img))

print("Images copied to train and val folders.")

Images copied to train and val folders.


val中の画像枚数が1の子フォルダをtrainへ移動

In [19]:
import os
import shutil
from pathlib import Path
# 子フォルダを移動させる関数
def move_child_folder(src, dst):
    """
    :param src: 移動するフォルダのパス
    :param dst: 移動先のパス
    """ 
    if os.path.exists(dst):
        # 移動先のディレクトリが存在する場合、ファイルのみを移動
        for file_name in os.listdir(src):
            full_file_name = os.path.join(src, file_name)
            if os.path.isfile(full_file_name):
                shutil.move(full_file_name, dst)
        # もとのフォルダを削除
        os.rmdir(src)
    else:
        # 移動先のフォルダが存在しない場合、フォルダごと移動
        if not os.path.exists(os.path.dirname(dst)):
            os.makedirs(os.path.dirname(dst))
        shutil.move(src, os.path.join(os.path.dirname(dst), os.path.basename(src)))

# 親フォルダごと移動させる関数
def move_parent_folder(src, dst):
    """
    :param src: 移動するフォルダのパス
    :param dst: 移動先のパス
    """
    for src_dir, dirs, files in os.walk(src):
        dst_dir = src_dir.replace(src, dst, 1)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        for file_ in files:
            src_file = os.path.join(src_dir, file_)
            dst_file = os.path.join(dst_dir, file_)
            if os.path.exists(dst_file):
                os.remove(dst_file)
            shutil.move(src_file, dst_dir)
    shutil.rmtree(src)
    
# 移動先のパスを生成する関数
def change_path(path, new_base):
    # パスを分割
    parts = path.split("/")
    # 新しいベースでパスを再構築
    parts[1] = new_base
    return "/".join(parts)

# 与えられたフォルダ内の画像ファイル数を数える関数
def count_image_files(folder_path):
    image_extensions = [".jpg", ".jpeg", ".png"]  # 画像ファイルの拡張子を追加
    image_count = 0
    for file in os.listdir(folder_path):
        if any(file.lower().endswith(ext) for ext in image_extensions):
            image_count += 1
    return image_count

def val(val_dir, val_base):
    if os.path.isdir(val_dir):  # val_dirがディレクトリであるか確認
        for parent_folder in os.listdir(val_dir):
            parent_folder_path = os.path.join(val_dir, parent_folder)
            parent_dir_name = Path(parent_folder_path).name
            if os.path.isdir(parent_folder_path):  # ディレクトリであるか確認
                for child_folder in os.listdir(parent_folder_path):
                    child_folder_path = os.path.join(parent_folder_path, child_folder)
                    if os.path.isdir(child_folder_path):  # ディレクトリであるか確認
                        num_images = count_image_files(child_folder_path)
                        if num_images == 1:
                            # print(f"子フォルダ:{child_folder_path}")
                            print(f'親:{Path(child_folder_path).parent.name}    子:{Path(child_folder_path).name}')
                            print(f'移動フォルダ名: {child_folder_path}, 移動先: {val_base+Path(child_folder_path).parent.name+"/"+Path(child_folder_path).name}, 枚数: {num_images}')
                            move_child_folder(child_folder_path, val_base+Path(child_folder_path).parent.name+"/"+Path(child_folder_path).name)

val_dir = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
val_base = "/data/yyama_dataset/tasks/VI_simple_prompt/train/"
val(val_dir, val_base)

親:acrylic_paint_on_canvas    子:crack
移動フォルダ名: /data/yyama_dataset/tasks/VI_simple_prompt/val/acrylic_paint_on_canvas/crack, 移動先: /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint_on_canvas/crack, 枚数: 1


Error: Destination path '/data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint_on_canvas/crack/image_0.jpg' already exists

valとtrainから空のフォルダを削除

In [20]:
# 空のフォルダを削除
def delete_empty_folders(base_folder):
    if os.path.isdir(base_folder):
        for parent_folder in os.listdir(base_folder):
            parent_folder_path = os.path.join(base_folder, parent_folder)
            if os.path.isdir(parent_folder_path):  # ディレクトリであるか確認
                for child_folder in os.listdir(parent_folder_path):
                    child_folder_path = os.path.join(parent_folder_path, child_folder)
                    if os.path.isdir(child_folder_path):  # ディレクトリであるか確認
                        num_images = count_image_files(child_folder_path)
                        # 画像枚数が2枚未満の場合、その子フォルダを削除
                        if num_images==0:
                            print(f'削除フォルダ名: {child_folder_path}, 枚数: {num_images}')
                            delete_folder(child_folder_path)
                # 親フォルダ内の子フォルダ数が0の場合、親フォルダを削除
                if len(os.listdir(parent_folder_path))==0 :
                    print(f'削除フォルダ名: {parent_folder_path}')
                    delete_folder(parent_folder_path)
                    
train_dir = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
print(train_dir)
delete_empty_folders(train_dir)
val_dir = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
print(val_dir)
delete_empty_folders(val_dir)

/data/yyama_dataset/tasks/VI_simple_prompt/train
/data/yyama_dataset/tasks/VI_simple_prompt/val


In [21]:
import os

def find_missing_folders(folder1, folder2):
    """
    指定された2つのフォルダ間で存在しないフォルダを見つけて出力します。
    Args:
        folder1 (str): 最初のフォルダのパス
        folder2 (str): 2番目のフォルダのパス
    """
    # フォルダ1内のフォルダリストを取得
    folders1 = os.listdir(folder1)

    # フォルダ2内のフォルダリストを取得
    folders2 = os.listdir(folder2)

    # フォルダ1に存在し、フォルダ2に存在しないフォルダを見つける
    missing_folders = [folder for folder in folders1 if folder not in folders2]

    # 結果を出力
    print(f'フォルダ "{folder1}" にあって "{folder2}" に存在しないフォルダ:')
    for folder in missing_folders:
        print(folder)
    print("FINISH")

# 使用例
folder1_path = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
folder2_path = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
find_missing_folders(folder1_path, folder2_path)
find_missing_folders(folder2_path, folder1_path)

フォルダ "/data/yyama_dataset/tasks/VI_simple_prompt/train" にあって "/data/yyama_dataset/tasks/VI_simple_prompt/val" に存在しないフォルダ:
FINISH
フォルダ "/data/yyama_dataset/tasks/VI_simple_prompt/val" にあって "/data/yyama_dataset/tasks/VI_simple_prompt/train" に存在しないフォルダ:
FINISH


## ○○_instructions.jsonの作成

In [22]:
import os
import json

def natural_sort_key(s):
    """
    ファイル名の数字部分を考慮してソートするためのキー関数
    """
    import re
    return [int(text) if text.isdigit() else text.lower() for text in re.split("(\d+)", s)]

def generate_json_from_directory(directory_path, output_json_path):
    output = {"data": {}}
    i = 0
    # メインディレクトリ内のサブディレクトリを走査
    for main_folder in os.listdir(directory_path):
        main_folder_path = os.path.join(directory_path, main_folder)
        print(f'main folder : {main_folder_path}')
        # サブディレクトリがディレクトリであるかの確認
        if os.path.isdir(main_folder_path):
            
            # サブディレクトリ内のサブディレクトリを走査
            for sub_folder in os.listdir(main_folder_path):
                sub_folder_path = os.path.join(main_folder_path, sub_folder)
                print(f'sub folder : {sub_folder_path}')
                # サブディレクトリ内のファイルを昇順に走査
                for image_file in sorted(os.listdir(sub_folder_path), key=natural_sort_key):
                    # 画像ファイルの拡張子を除去
                    image_name_without_extension = os.path.splitext(image_file)[0]
                    
                    # キーの名前を生成
                    key_name = f"{main_folder}+{sub_folder}+{image_name_without_extension}"
                    print(key_name)
                    # JSONのデータ構造を生成
                    output["data"][key_name] = {
                        "instruction": "",
                        "answer": "",
                        "image_ids": [key_name],
                        "label": i
                    }
                i += 1
    
    # JSONをファイルに書き出し
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(output, json_file, indent=4, ensure_ascii=False)

# プログラムの実行
context_path = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_instructions.json"
generate_json_from_directory(context_path, output_json_path)

context_path = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_instructions.json"
generate_json_from_directory(context_path, output_json_path)

### default用のjsonを作成
context_path = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_train_instructions.json"
generate_json_from_directory(context_path, output_json_path)

context_path = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_val_instructions.json"
generate_json_from_directory(context_path, output_json_path)

main folder : /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint
sub folder : /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint/None
acrylic_paint+None+image_1
acrylic_paint+None+image_2
acrylic_paint+None+image_4
acrylic_paint+None+image_6
acrylic_paint+None+image_7
acrylic_paint+None+image_9
acrylic_paint+None+image_10
acrylic_paint+None+image_20
acrylic_paint+None+image_26
acrylic_paint+None+image_32
acrylic_paint+None+image_38
acrylic_paint+None+image_51
main folder : /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint_on_canvas
sub folder : /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint_on_canvas/crack
acrylic_paint_on_canvas+crack+image_0
acrylic_paint_on_canvas+crack+image_2
acrylic_paint_on_canvas+crack+image_3
sub folder : /data/yyama_dataset/tasks/VI_simple_prompt/train/acrylic_paint_on_canvas/fading
acrylic_paint_on_canvas+fading+image_2
acrylic_paint_on_canvas+fading+image_3
acrylic_paint_on_canvas+fading+image_4
acrylic_p

### instructionとanswerを埋める

In [23]:
question_lines = [
    "In this image featuring a {product}, can you identify any issues?",
]

yes_responses_array = [
    "Yes. This {product} has {defect}, indicating an issue.",
]

no_responses_array = [
    "No. This photograph of the {product} does not exhibit any signs. It appears to be non-defective.",
]

In [35]:
import json
import os
import random
import numpy as np


# 指定されたリスト形式の文字列を作成する関数
def generate_list_string(items):
    # アンダースコアをスペースに変換
    items = [item.replace("_", " ") for item in items]
    
    if len(items) == 1:
        return items[0]
    elif len(items) == 2:
        return f"{items[0]} and {items[1]}"
    else:
        return ", ".join(items[:-1]) + f", and {items[-1]}"

def fill_instruction_and_answer(json_path, train_context_dir,flag=0):
    # JSONを読み込む
    no_count = 0
    yes_count = 0
    random_list = np.zeros(len(question_lines))
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]

    # ["data"]からキーを順に読み込む
    for key, value in data.items():
        # キーの名前から親フォルダ名と子フォルダ名を認識する
        parent_folder, child_folder, _ = key.split("+")

        # "./1_train_context/親フォルダ名"にアクセスし、その中にある子フォルダ名をリスト化する
        subfolder_list = os.listdir(os.path.join(train_context_dir, parent_folder))
        subfolder_list = [folder for folder in subfolder_list if folder != parent_folder] # 親フォルダ名を除外

        # "instruction"にテキストを入力する
        defect = generate_list_string(subfolder_list)
        parent_folder__ = parent_folder.replace("_", " ")
        if flag==0:
            random_idx = random.randint(0, len(question_lines)-1)
            random_list[random_idx] += 1
            value["instruction"] = question_lines[random_idx].format(product=parent_folder, defect=child_folder)
        else:
            value["instruction"] = f"Does this image have any defects? If there are any defects, please provide the defect name. If not, please say None."
       
        # "answer"にテキストを入力する
        if child_folder=="None":
            no_count += 1
            if flag==0:
                value["answer"] = no_responses_array[random_idx].format(product=parent_folder, defect=child_folder)
                print(value["answer"])
            else:
                value["answer"] = f"None"
                print(value["answer"])   
        else:
            yes_count += 1
            child_folder = child_folder.replace("_", " ")
            if flag==0:
                value["answer"] = yes_responses_array[random_idx].format(product=parent_folder, defect=child_folder)
            else:
                value["answer"] = f"{child_folder}"
                print(value["answer"])
            # print(f"selected answer:  {yes_responses_array[random_idx][0]}")
    print(f"no_count: {no_count}")
    print(f"yes_count: {yes_count}")
    print(f"random_list: {random_list}")

    # JSONをファイルに書き出し
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump({"data": data}, json_file, indent=4, ensure_ascii=False)

# プログラムの実行
json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_instructions.json"
dir = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
fill_instruction_and_answer(json_path, dir)

json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_instructions.json"
dir = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
fill_instruction_and_answer(json_path, dir)

### default用の実験
# プログラムの実行
json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_train_instructions.json"
dir = "/data/yyama_dataset/tasks/VI_simple_prompt/train"
fill_instruction_and_answer(json_path, dir,flag=1)

json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_val_instructions.json"
dir = "/data/yyama_dataset/tasks/VI_simple_prompt/val"
fill_instruction_and_answer(json_path, dir,flag=1)

No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-defective.
No. This photograph of the acrylic_paint does not exhibit any signs. It appears to be non-d

## ○○_train.jsonの作成

In [30]:
import json
import random

def create_visual_inspection_train(input_json_path, output_json_path):
    # JSONを読み込む
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]
    # print(data)

    train_data = {}

    # ["data"]からキーを順に読み込む
    keys = []
    for key, value in data.items():
        keys.append(key)
        # print(f"key: {key}  value: {value}" )
    
    # ランダムにシャッフルしてデータを保存
    random.shuffle(keys)
    for key_name in keys:
        train_data[f"{key_name}"] = []
        print(key_name)

    # JSONをファイルに書き出し
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(train_data, json_file, indent=4, ensure_ascii=False)

# プログラムの実行
input_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_train.json"
create_visual_inspection_train(input_json_path, output_json_path)

input_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_train.json"
create_visual_inspection_train(input_json_path, output_json_path)

# プログラムの実行
input_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_train_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_train_train.json"
create_visual_inspection_train(input_json_path, output_json_path)

input_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_val_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_val_train.json"
create_visual_inspection_train(input_json_path, output_json_path)

leather_jacket+wrinkle+image_20
copper+None+image_4
camera+crack+image_4
printed_circuit_board+None+image_25
wood+wormhole+image_97
candle+None+image_47
glass_pendant+None+image_13
cup+None+image_23
spoon+None+image_24
metal_key+None+image_16
automobile_body+rust+image_20
eraser+None+image_9
steel_ladder+None+image_39
wheel+None+image_11
cardboard_tube+None+image_2
satin_ribbon+None+image_12
belt+None+image_37
cotton_socks+None+image_29
lamp_shade+None+image_44
hard_drive_disk+None+image_14
cup+None+image_61
steel_beam+scratch+image_3
platinum_necklace+None+image_37
lotion_bottle+None+image_11
spoon+None+image_83
linen_fabric+None+image_41
backpack+None+image_31
mouse+None+image_54
cork_board+None+image_28
gypsum_board+None+image_6
wicker_basket+None+image_29
chair+None+image_68
shoe_sole+wear+image_22
toothbrush+None+image_4
spoon+discoloration+image_18
bottle+None+image_18
automobile_body+scratch+image_8
teflon_coated_pan+None+image_42
rubber+crack+image_6
cutting_board+warp+image_30

disk+crack+image_106
window+scratch+image_5
book+aging+image_16
painting+None+image_26
battery+leak+image_64
steel_bridge+None+image_8
bed+None+image_25
curtain+None+image_23
granite_floor+None+image_1
soap_bar+None+image_31
linen_fabric+None+image_4
wooden_door+chip+image_8
disk+crack+image_78
cork_board+None+image_23
sheet_of_paper+crumpled+image_3
silk_pillowcase+None+image_1
computer_keyboard+None+image_43
spoon+None+image_75
power_socket+None+image_16
quartz_watch+None+image_42
wooden_chair+stain+image_30
magazine_page+None+image_8
jeans_jacket+None+image_22
cup+None+image_20
printed_circuit_board+burn+image_19
tire+puncture+image_11
metal_chain+None+image_34
printer+None+image_24
porcelain_plate+None+image_40
fork+None+image_5
silver_coin+tarnish+image_18
hair_brush+None+image_4
wine_bottle+None+image_37
mirror+crack+image_45
brass_doorknob+None+image_19
calculator+None+image_42
metal+hole+image_62
keyboard+missing+image_92
keyboard+None+image_31
spoon+bent+image_23
copper_pipe+N

In [31]:
# 数の確認
json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"データ数: {count}")

json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"データ数: {count}")

# 数の確認
json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_train_train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"データ数: {count}")

json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/default_VI_val_train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"データ数: {count}")

データ数: 15271
データ数: 4543
データ数: 15271
データ数: 4543


## ○○.jsonの作成

In [32]:
import json
import os
from PIL import Image
import io
import base64

# コンテキスト　→　クエリの順で実行すること

def image_to_urlsafe_base64_png(img_path):
    """画像をメモリ上でPNGに変換し、その後URL-safeなBase64に変換する関数"""
    with Image.open(img_path) as image:
        # CMYKモードの画像をRGBモードに変換
        if image.mode == "CMYK":
            image = image.convert("RGB")
        # パレットモードの画像をRGBAモードに変換
        if image.mode == "P":
            image = image.convert("RGBA")
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.urlsafe_b64encode(buffered.getvalue()).decode("utf-8")
    return img_str

def create_visual_inspection(input_json_path, base_folder, output_json_path):
    # JSONを読み込む
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]

    # 既存のvisual_inspection.jsonが存在する場合、その内容を読み込む
    if os.path.exists(output_json_path):
        with open(output_json_path, "r", encoding="utf-8") as f:
            visual_data = json.load(f)
    else:
        visual_data = {}

    extensions = [".png", ".jpg", ".jpeg","JPEG",".JPG",".PNG"]

    # ["data"]からキーを順に読み込む
    for key in data.keys():
        parent_folder, child_folder, image_name = key.split("+")
        
        # 各拡張子を試して、存在するファイルを見つける
        for ext in extensions:
            img_path = os.path.join(base_folder, parent_folder, child_folder, image_name + ext)
            if os.path.exists(img_path):
                # 画像をURL-safeなBase64 PNG形式に変換
                # print(img_path)
                visual_data[key] = image_to_urlsafe_base64_png(img_path)
                break

    # JSONをファイルに書き出し
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(visual_data, json_file, indent=4, ensure_ascii=False)

# プログラムの実行
base_folder = "/data/yyama_dataset/tasks/VI_simple_prompt/train"  # これは一つ上のベースフォルダのパス
input_json_path = "/data/yyama_dataset/tasks/VI_simple_prompt/VI_train_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_train.json"
create_visual_inspection(input_json_path, base_folder, output_json_path)
print(f"finish making train.json")

base_folder = "/data/yyama_dataset/tasks/VI_simple_prompt/val"  # これは一つ上のベースフォルダのパス
input_json_path = "/data/yyama_dataset/tasks/VI_simple_prompt/VI_val_instructions.json"
output_json_path = f"/data/yyama_dataset/tasks/VI_simple_prompt/VI_val.json"
create_visual_inspection(input_json_path, base_folder, output_json_path)
print(f"finish making val.json")

# base_folder = "/home/dataset/yyama_dataset/tasks/AC/train"  # これは一つ上のベースフォルダのパス
# input_json_path = "/home/dataset/yyama_dataset/tasks/AC/default_AC_train_instructions.json"
# output_json_path = f"/home/dataset/yyama_dataset/tasks/AC/default_AC_train.json"
# create_visual_inspection(input_json_path, base_folder, output_json_path)
# print(f"finish making default train.json")
# base_folder = "/home/dataset/yyama_dataset/tasks/AC/val"  # これは一つ上のベースフォルダのパス
# input_json_path = "/home/dataset/yyama_dataset/tasks/AC/default_AC_val_instructions.json"
# output_json_path = f"/home/dataset/yyama_dataset/tasks/default_AC/AC_val.json"
# create_visual_inspection(input_json_path, base_folder, output_json_path)
# print(f"finish making default val.json")

finish making train.json
finish making val.json


In [33]:
import json

# 数の確認
json_path = "/data/yyama_dataset/tasks/VI_simple_prompt/VI_train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"train画像枚数: {count}")

json_path = "/data/yyama_dataset/tasks/VI_simple_prompt/VI_val.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    
count = len(data.keys())
print(f"val画像枚数: {count}")

train画像枚数: 15271
val画像枚数: 4543
