In [1]:
from pathlib import Path
import pandas as pd
import shutil
import os
import src.pic_preprocess as pp
import re

def generate_dataframe_from_image(file_name, defect_class, dataframe=None):
    # if not file_name.endswith(".tif"):
    #     raise ValueError("Input file must be a .tif file")
    # file_base = os.path.splitext(file_name)[0]

    if dataframe is None:
        raise ValueError("No dataframe to save data")
    parts = file_name.split("_")
    if len(parts) < 5:
        raise ValueError("File name format is invalid, unable to parse required fields")

    image_id = file_name
    original_id = "_".join(parts[:-1]) if len(parts) > 5 else file_name
    measurement_id = "_".join(parts[:3])
    bevel_section = parts[3]

    try:
        flame_no = int(parts[4])
    except ValueError:
        flame_no = parts[4]

    split = parts[5] if len(parts) > 5 else ""

    new_row = {
        "image_id": image_id,
        "original_id": original_id,
        "measurement_id": measurement_id,
        "foup_slot": "",  
        "bevel_section": bevel_section,
        "flame_no": flame_no,
        "split": split,
        "making_defect_type": "",  
        "selection_no": "", 
        "defect_class": defect_class
    }

    dataframe = pd.concat([dataframe, pd.DataFrame([new_row])], ignore_index=True)
    return dataframe
                    
def extract_parts(measurement_id):
    # Split the measurement_id based on underscores
    parts = measurement_id.split('_')
    
    # Extract the relevant parts (200, 01, and A1_0000)
    part1 = parts[0]+parts[1]  # '200'
    part2 = parts[2]  # '01'
    part3 = parts[3] + '_' + parts[4]  # 'A1_0000'
    
    return part1, part2, part3


def find_matching_folder(base_path, folder_name):
    for folder in base_path.iterdir():
        if folder.is_dir() and folder.name.startswith(folder_name):  # Match folder that starts with folder_name
            return folder
    return None

def find_and_copy_images(src_dir, dest_dir, id):

    part1, part2, part3 = extract_parts(id)
    
    src_path = Path(src_dir)
    dest_path = Path(dest_dir)   
    if not dest_path.exists():
        dest_path.mkdir(parents=True) 
    first_folder = find_matching_folder(src_path, part1)   
    if first_folder:
        second_folder = find_matching_folder(first_folder, part2)
        if second_folder:
            raw_folder = second_folder / 'Raw'
            if raw_folder.is_dir():
                tif_file = raw_folder / (part3 + '.tif')
                if tif_file.is_file():
                    destination_file = dest_path / ( id+ '.tif')
                    shutil.copy(tif_file, destination_file)
                    print(f"Copied {tif_file.name} to {destination_file}")
                else:
                    print(f"{tif_file.name} not found in {raw_folder}")
            else:
                print(f"Raw folder not found in {second_folder}")
        else:
            print(f"Second folder matching {part2} not found")
    else:
        print(f"First folder matching {part1} not found")
        
        
def process_image_name(image,is_vertical=False):
        if not is_vertical:           
            image_id = image
            original_id=image.rsplit('_',1)[0]
            measurement_id=image.rsplit('_',3)[0]
            bevel_section=image.split('_')[3]
            flame_no=image.split('_')[4]
            split=image.rsplit('_',1)[1]
        else:
            image_id = image
            original_id=image
            measurement_id=image.rsplit('_',2)[0]
            bevel_section=image.split('_')[3]
            flame_no=image.split('_')[4]
            split=""
        return image_id, original_id, measurement_id, bevel_section, flame_no, split


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\huang\work\dataset\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\huang\work\dataset\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\huang\work\dataset\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\hu

In [2]:
input_file_names=["アノテーションDB_20250117-141645__パターン付き_0-正常_一覧",
"アノテーションDB_20250117-141645__パターン付き_1-汚れ_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_2-擦り跡_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_3-発塵痕_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_4-打痕、傷_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_5-異物_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_6-ウォーターマーク_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_7-残渣_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_8-ヒゲ_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_9-膜境界不良_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_10-コメット_一覧",                 
"アノテーションDB_20250117-141645__パターン付き_11-タイガーストライプ_一覧",]

fieldnames = [
        "image_id", "original_id", "measurement_id", "foup_slot", "bevel_section",
        "flame_no", "split", "making_defect_type", "selection_no", "defect_class"
    ]
df_split_info = pd.DataFrame(columns=fieldnames)

for i in range(len(input_file_names)):
    input_file_name=input_file_names[i]
    excel_path = Path(r"pattern/") / f"{input_file_name}.csv"

    output_file_folder_1st=input_file_name.split("_")[3]
    output_file_folder_2nd=input_file_name.split("_")[4]
 
    src_dir = r"D:\H23057-J_とめ研社内共有\画像データ\オリジナル\20241004_画像データ\提出画像_20241003"
    dest_dir = Path(r"D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120")
    dest_dir = dest_dir / output_file_folder_1st / output_file_folder_2nd

    category_names=["01_normal","02_blot","03_scratch","04_dust","05_dent","06_foreigh_substance","07_watermark","08_residue","09_barb","10_defective_boundary","11_comet","12_tiger_stripe"]
    defect_class=category_names[i]

    df=pd.read_csv(excel_path,encoding='shift_jis', dtype=str,on_bad_lines='warn')   
    df['画像コード'] = df['画像コード'].str.replace('-', '_')
    
    if df.empty or df.shape[0] <= 1:
        print(f"Failed to load or no data except index row in {excel_path}")
        continue
       
    for image in df['画像コード']:
        is_vertical = "C" in image
        image_id,original_id,measurement_id,bevel_section,flame_no,split=process_image_name(image,is_vertical)             
        find_and_copy_images(src_dir, dest_dir, original_id)  
              
    pp.split_images_in_folder(dest_dir, dest_dir)    
    for image in df['画像コード']: 
        tif_file=dest_dir/ "splitted"/ (image+ '.tif')
        if not (dest_dir.parent/"input_data").exists():
            (dest_dir.parent/"input_data") .mkdir(parents=True) 
        destination_file = dest_dir.parent/"input_data" / ( image+ '.tif')
        if (tif_file.exists()) and not (destination_file.exists()):          
            shutil.copy(tif_file, destination_file) 
            df_split_info=generate_dataframe_from_image(image, defect_class, dataframe=df_split_info)         
            print(f"Copied: {image}.tif")
        else:
            print(f"File not found: {image}.tif")    

df_split_info.to_csv(dest_dir.parent/"input_data" /"split_image_info.csv", index=False, encoding='shift_jis', mode='w') 


Copied A1_0000.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0000.tif
Copied A1_0000.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0000.tif
Copied A1_0001.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0001.tif
Copied A1_0001.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0001.tif
Copied A1_0002.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0002.tif
Copied A1_0002.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0002.tif
Copied A1_0003.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0003.tif
Copied A1_0003.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0003.tif
Copied A1_0004.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0004.tif
Copied A1_0004.tif to D:\H23057-J_とめ研社内共有\画像データ\アノテーション済\20250120\パターン付き\0-正常\2_00_01_A1_0004.tif
Copied A1_0005.tif t

In [3]:
def select_in_range(idx_start1, idx_end1, idx_start2, idx_end2):
    def get_range(start, end):
        start = start % 480
        end = end % 480
        if start <= end:
            return list(range(start, end + 1))
        else:
            return list(range(start, 480)) + list(range(0, end + 1))

    range1 = get_range(idx_start1, idx_end1)
    range2 = get_range(idx_start2, idx_end2)
    return set(range1 + range2)

def copy_and_rename_files(src_dir, dest_dir,excel_path ,target_folders):
       
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    for _, row in df.iterrows():
        sample_folder = row["サンプルフォルダ名"]

        #if sample_folder not in target_folders:
            #continue

        wafer_folder = row["ウエハフォルダ名"]
        wafer_number = row["ウエハ番号"].replace('-', '_')

        idx_start1 = row["有効エリア1_開始(idx)"]
        idx_end1 = row["有効エリア1_終了(idx)"]
        idx_start2 = row["有効エリア2_開始(idx)"]
        idx_end2 = row["有効エリア2_終了(idx)"]

        valid_indices = select_in_range(idx_start1, idx_end1, idx_start2, idx_end2)

        source_path = os.path.join(src_dir, sample_folder, wafer_folder, "Raw")

        if not os.path.exists(source_path):
            continue

        for file_name in os.listdir(source_path):
            if file_name.endswith(".tif"):
                file_index = int(file_name.split("_")[1].split(".")[0])
                if file_index in valid_indices:
                    new_file_name = f"{wafer_number}_{file_name}"
                    shutil.copy2(
                        os.path.join(source_path, file_name),
                        os.path.join(dest_dir, new_file_name)
                    )