# 篩選資料

## 留下圖(所有的標記檔長寬比>1.5 且標記文字中沒有*且含有中文字)

In [1]:
# import os
# import json
# import shutil
# from PIL import Image, ImageDraw

# def copy_images_with_conditions(annotation_file_path, source_folder, target_folder):
#     with open(annotation_file_path, 'r', encoding='utf-8') as file:
#         annotations = json.load(file)

#     for image_key, annotation in annotations.items():
#         image_filename = annotation["filename"]
#         image_path = os.path.join(source_folder, image_filename)

#         regions = annotation["regions"]

#         # Check conditions for all regions
#         all_regions_valid = all(is_valid_region(region) for region in regions)

#         if all_regions_valid:
#             # Create target folder if it doesn't exist
#             os.makedirs(target_folder, exist_ok=True)

#             # Copy the image to the target folder
#             target_path = os.path.join(target_folder, image_filename)
#             shutil.copy(image_path, target_path)

# def is_valid_region(region):
#     word = region["region_attributes"].get("word", "")
#     if '*' not in word and any(c.isalpha() for c in word):
#         # Calculate width and height of the bounding box
#         x_coordinates = region["shape_attributes"]["all_points_x"]
#         y_coordinates = region["shape_attributes"]["all_points_y"]
#         width = max(x_coordinates) - min(x_coordinates)
#         height = max(y_coordinates) - min(y_coordinates)

#         # Check the width/height ratio condition
#         ratio = width / height
#         return ratio >= 1.5

#     return False

# # Replace these paths with your actual paths
# annotation_file_path = r"C:\Users\MAGIC\Ching\survey\plan\rawdata_all\1Tainan\1Tainan_ori_all.json"
# source_folder = r"C:\Users\MAGIC\Ching\survey\plan\rawdata_all\1Tainan\images"
# target_folder = r"C:\Users\MAGIC\Ching\survey\plan\rawdata_all\1Tainan\ocrRaw2"
# os.makedirs(target_folder, exist_ok=True)

# copy_images_with_conditions(annotation_file_path, source_folder, target_folder)

## 真正洗出來的data 放在各個資料夾內
先找出ROI ratio>1.5，再找出無*有中文字的部分(忽視直立路牌)

In [5]:
# Region folder names iterated by the filtering loop in this section;
# '2Penghu' and '3Taoyuan' are currently commented out.
fields = ['1Tainan'] #, '2Penghu', '3Taoyuan']

In [10]:
import os
import json
import shutil
import csv
from PIL import Image, ImageDraw

def copy_images_with_conditions(annotation_file_path, source_folder, target_folder, csv_file_path):
    """Copy images that contain at least one valid region and log those regions to a CSV.

    The annotation JSON maps an arbitrary key to an entry holding a
    "filename" and a list of "regions". A region is kept when
    ``is_valid_region`` accepts it. Every image with at least one kept
    region is copied from ``source_folder`` to ``target_folder``, and one
    CSV row is written per kept region whose category is '1'..'6'.

    Args:
        annotation_file_path: Path to the UTF-8 annotation JSON file.
        source_folder: Folder containing the original images.
        target_folder: Folder that receives the selected images (created
            if missing).
        csv_file_path: Output CSV path; written with Big5 encoding and the
            columns raw_filename, ROI_count, category, box, words.

    Notes:
        ROI_count is the number of valid regions of the image, including
        regions whose category is not exported.
        Rows containing characters that Big5 cannot encode are reported on
        stdout and skipped.
    """
    # Only regions with one of these category labels are exported.
    # A tuple (not a set) so that an unhashable category value cannot raise.
    exported_categories = ('1', '2', '3', '4', '5', '6')

    with open(annotation_file_path, 'r', encoding='utf-8') as file:
        annotations = json.load(file)

    # Create the destination once, before the CSV is opened, instead of on
    # every matching image.
    os.makedirs(target_folder, exist_ok=True)

    with open(csv_file_path, 'w', newline='', encoding='big5') as csvfile:
        fieldnames = ['raw_filename', 'ROI_count', 'category', 'box', 'words']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for annotation in annotations.values():
            image_filename = annotation["filename"]

            # is_valid_region already enforces the width/height ratio, so a
            # non-empty list is the whole selection criterion.
            valid_regions = [region for region in annotation["regions"] if is_valid_region(region)]
            if not valid_regions:
                continue

            shutil.copy(
                os.path.join(source_folder, image_filename),
                os.path.join(target_folder, image_filename),
            )

            roi_count = len(valid_regions)

            for region in valid_regions:
                shape_attributes = region['shape_attributes']
                region_attributes = region['region_attributes']

                category = region_attributes.get('category', '')
                if category not in exported_categories:
                    continue

                word = region_attributes.get('word', '')
                # Polygon vertices as (x, y) pairs.
                box = list(zip(
                    shape_attributes.get('all_points_x', []),
                    shape_attributes.get('all_points_y', []),
                ))

                try:
                    writer.writerow({
                        'raw_filename': image_filename,
                        'ROI_count': roi_count,
                        'category': category,
                        'box': box,
                        'words': word
                    })
                except UnicodeEncodeError as e:
                    # The label holds a character Big5 cannot represent;
                    # report it and carry on with the next region.
                    print(f"Error writing row for '{image_filename}': {e}")

def is_valid_region(region):
    """Return True when a region's label and bounding box pass the filter.

    A region passes when its "word" attribute contains no '*', contains at
    least one alphabetic character (``str.isalpha``), and the bounding box
    of its polygon is wider than 1.5 times its height.

    Side effect: when the ratio can be computed it is stored in
    ``region["width_height_ratio"]``.

    Args:
        region: Dict with "region_attributes" (optional "word") and
            "shape_attributes" ("all_points_x" / "all_points_y" lists).

    Returns:
        bool: True if the region passes, otherwise False. Regions without
        polygon points or with a zero-height box are rejected instead of
        raising.
    """
    word = region["region_attributes"].get("word", "")
    # NOTE(review): isalpha() also accepts Latin letters, so an
    # English-only label passes; confirm whether Chinese text is required.
    if '*' in word or not any(c.isalpha() for c in word):
        return False

    shape_attributes = region["shape_attributes"]
    x_coordinates = shape_attributes.get("all_points_x") or []
    y_coordinates = shape_attributes.get("all_points_y") or []
    if not x_coordinates or not y_coordinates:
        # No polygon to measure.
        return False

    width = max(x_coordinates) - min(x_coordinates)
    height = max(y_coordinates) - min(y_coordinates)
    if height <= 0:
        # Degenerate (flat) polygon: the ratio is undefined.
        return False

    ratio = width / height

    # Kept on the region so later steps can read the ratio.
    region["width_height_ratio"] = ratio

    return ratio > 1.5

# Replace these paths with your actual paths

# Run the filter for every region listed in `fields`.
for field in fields:
    # Per-region annotation file, raw image folder and output folder.
    annotation_file_path = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\{field}_ori_all.json"
    source_folder = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\images"
    target_folder = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\ocrRaw2"
    # The CSV lives inside the output folder, so the folder must exist first.
    os.makedirs(target_folder, exist_ok=True)
    csv_file_path = os.path.join(target_folder, 'output2.csv')

    copy_images_with_conditions(annotation_file_path, source_folder, target_folder, csv_file_path)
    print(f'{field}的圖片和 CSV: {csv_file_path} 已完成')

1Tainan的圖片和 CSV: C:\Users\MAGIC\Ching\survey\plan\rawdata_all\1Tainan\ocrRaw2\output2.csv 已完成


### 記得要刪掉重疊的!!!!
CSV檔案中存在重複的標記:
    filename                            words                   raw_filename  \
148  185.jpg              大臺南會展中心\nICC Tainan  221005_014356855_Camera_0.jpg   
152  185.jpg              大臺南會展中心\nICC Tainan  221005_014356855_Camera_0.jpg   
153  185.jpg              大臺南會展中心\nICC Tainan  221005_014356855_Camera_0.jpg   
250  306.jpg  汽機車停車場\nCar and Motorcycle Park  221005_014559627_Camera_0.jpg   
252  306.jpg  汽機車停車場\nCar and Motorcycle Park  221005_014559627_Camera_0.jpg   
261  319.jpg                 快速公路\nEXPRESSWAY  221005_014648729_Camera_0.jpg   
264  319.jpg                 快速公路\nEXPRESSWAY  221005_014648729_Camera_0.jpg   
265  319.jpg                 快速公路\nEXPRESSWAY  221005_014648729_Camera_0.jpg   


In [10]:
import os
import pandas as pd

# Label file and image folder of the split being inspected.
csv_path = "C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\train\\1Tainan\\labels.csv"
folder_path = "C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\train\\1Tainan"

df = pd.read_csv(csv_path, encoding='big5')

# Filenames listed in the CSV that have no matching entry in the folder.
csv_filenames = set(df['filename'].tolist())
folder_filenames = set(os.listdir(folder_path))
missing_files = csv_filenames - folder_filenames

# Every row whose filename appears more than once in the CSV.
duplicate_labels = df[df.duplicated(subset=['filename'], keep=False)]

if missing_files:
    print("資料夾中缺少的圖片:")
    print(*missing_files, sep="\n")
else:
    print("資料夾中的所有圖片都在CSV中找到。")

if duplicate_labels.empty:
    print("\nCSV檔案中沒有重複的標記。")
else:
    print("\nCSV檔案中存在重複的標記:")
    print(duplicate_labels)

資料夾中的所有圖片都在CSV中找到。

CSV檔案中沒有重複的標記。


### 「南崁‧」中的間隔號（U+2027）無法以 Big5 編碼（encode）

Error writing row for '220707_060614804_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060617550_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060722774_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060801663_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060950255_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060951671_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060952938_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030343373_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030715653_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030716727_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030717264_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030728389_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030728915_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030728915_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030729441_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_030729441_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_032755076_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_033559873_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_061120592_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_061123781_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 92: illegal multibyte sequence
Error writing row for '220707_060908688_Camera_1.jpg': 'big5' codec can't encode character '\u2027' in position 93: illegal multibyte sequence

# 切割出ROI

In [2]:
# Region folder names processed by the ROI cropping loop in this section.
fields = ['1Tainan', '2Penghu', '3Taoyuan']

In [3]:
import os
import csv
from ast import literal_eval
from PIL import Image
import matplotlib.pyplot as plt
import time

# Start timestamp; the cropping loop prints the seconds elapsed since this point.
t0 = time.time()

def crop_and_save_images(src_csv, src_img, dst_img, new_csv_file, csv_header, field):
    """Crop every labelled region out of its source image and index the crops in a CSV.

    Reads the Big5 CSV ``src_csv`` (columns raw_filename, category, box,
    words). For each row whose category is an integer 1..6, the source
    image is opened, rotated according to its EXIF orientation, cropped to
    the axis-aligned bounding box of ``box`` and saved as
    ``<row position>.jpg`` in ``dst_img``. One row
    ``[filename, words, raw_filename, category]`` is written to
    ``new_csv_file`` per saved crop.

    Args:
        src_csv: Input CSV path.
        src_img: Folder holding the source images.
        dst_img: Output folder for the crops (created if missing).
        new_csv_file: Output CSV path (Big5).
        csv_header: Header row written to ``new_csv_file``.
        field: Unused; kept so existing callers keep working.

    Notes:
        Rows that raise ValueError (for example a non-numeric category or
        an empty box) or whose box cannot be parsed are reported on stdout
        and skipped.
    """
    with open(src_csv, 'r', encoding='big5') as csv_file:
        rows = list(csv.DictReader(csv_file))
    os.makedirs(dst_img, exist_ok=True)

    with open(new_csv_file, 'w', encoding='big5', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(csv_header)

        # enumerate() gives each row its own position. rows.index(row)
        # returned the first *equal* row, so identical rows shared one
        # output filename and overwrote each other's crop.
        for row_index, row in enumerate(rows):
            try:
                raw_filename = row['raw_filename']
                category = int(row['category'])
                box = literal_eval(row['box'])
                words = row['words'].strip()

                if category not in (1, 2, 3, 4, 5, 6):
                    continue

                image_path = os.path.join(src_img, raw_filename)

                # The context manager releases the file handle of the
                # source image once the crop has been saved.
                with Image.open(image_path) as image:
                    try:
                        exif = image._getexif()
                        orientation = exif.get(274, 1)  # 274 is the EXIF orientation tag
                    except (AttributeError, KeyError, IndexError):
                        orientation = 1  # no EXIF data: treat as upright

                    # Undo the camera rotation recorded in EXIF.
                    if orientation == 3:
                        oriented = image.rotate(180, expand=True)
                    elif orientation == 6:
                        oriented = image.rotate(-90, expand=True)
                    elif orientation == 8:
                        oriented = image.rotate(90, expand=True)
                    else:
                        oriented = image

                    # Axis-aligned bounding box of the polygon vertices.
                    x1 = min(point[0] for point in box)
                    y1 = min(point[1] for point in box)
                    x2 = max(point[0] for point in box)
                    y2 = max(point[1] for point in box)

                    new_filename = f'{row_index}.jpg'
                    new_image_path = os.path.join(dst_img, new_filename)
                    oriented.crop((x1, y1, x2, y2)).save(new_image_path)

                csv_writer.writerow([new_filename, words, raw_filename, category])

            except (ValueError, SyntaxError) as e:
                print(f"Skipping row: {row['raw_filename']} {e}")
                continue

# Crop the ROIs of every region and report the cumulative elapsed time.
for field in fields:
    # NOTE(review): this reads ocrRaw\output.csv, while the filtering loop in
    # this notebook writes ocrRaw2\output2.csv -- confirm which run is intended.
    src_img = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\ocrRaw'
    dst_img = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\ocrRaw\\ROI'
    src_csv = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\ocrRaw\\output.csv'
    new_csv_file = os.path.join(dst_img, 'ROI.csv')
    csv_header = ['filename', 'words', 'raw_filename', 'category']

    print(f'{field=}')
    crop_and_save_images(src_csv, src_img, dst_img, new_csv_file, csv_header, field)
    print(f'{new_csv_file} 已完成')
    # Seconds since t0 was set, i.e. cumulative across regions.
    print(time.time()-t0)
    print()

field='1Tainan'
Skipping row: 221005_022119052_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_022119269_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_022119491_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032452506_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032453266_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032453512_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032626335_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032626335_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_032626548_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_033320224_Camera_0.jpg invalid literal for int() with base 10: 'B_4'
Skipping row: 221005_033404633_Camera_0.jpg invalid literal for int() with base 10: 

# 依比例切割資料集

In [4]:
# Region folder names whose ROI crops are split into train / val / test.
fields = ['1Tainan', '2Penghu', '3Taoyuan']

In [5]:
import os
import pandas as pd
import random
from shutil import copyfile

# Train : validation : test ratio (K:N:M); 8:1:1 gives roughly 80% / 10% / 10%.
K = 8
N = 1
M = 1

for field in fields:
    # Per-region input folder (ROI crops + ROI.csv) and the three output folders.
    data_folder = dst_img = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\{field}\\ocrRaw\\ROI'
    output_train_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\train\\{field}'
    output_val_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\val\\{field}'
    output_test_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\test\\{field}'
    labels_file = os.path.join(data_folder, 'ROI.csv')

    os.makedirs(output_train_folder, exist_ok=True)
    os.makedirs(output_val_folder, exist_ok=True)
    os.makedirs(output_test_folder, exist_ok=True)

    labels_df = pd.read_csv(labels_file, encoding='big5')

    # Group the crop filenames by their text label so every label is split
    # on its own.
    label_to_filenames = {}
    for filename, label in zip(labels_df['filename'], labels_df['words']):
        label_to_filenames.setdefault(label, []).append(filename)

    # Filenames assigned to each split during THIS run. The label files are
    # built from these lists rather than from os.listdir(), so crops left in
    # the output folders by an earlier run cannot end up labelled in more
    # than one split.
    train_files = []
    val_files = []
    test_files = []

    print(f'{field=}')
    for label, filenames in label_to_filenames.items():
        num_samples = len(filenames)
        if num_samples == 1:
            # A single sample can only go to training.
            num_test_samples = 0
            num_val_samples = 0
        elif num_samples == 2:
            num_test_samples = 1
            num_val_samples = 0
        elif num_samples < 6:
            num_test_samples = 1
            num_val_samples = 1
        else:
            num_test_samples = round((num_samples / (K + N + M)) * M)
            num_val_samples = round((num_samples / (K + N + M)) * N)
            # NOTE(review): only groups of 6+ samples are shuffled; smaller
            # groups are split in CSV order -- confirm this is intended.
            random.shuffle(filenames)

        held_out = num_test_samples + num_val_samples
        test_samples = filenames[:num_test_samples]
        val_samples = filenames[num_test_samples:held_out]
        train_samples = filenames[held_out:]

        # Copy the crops (train, then val, then test) and record the assignment.
        for samples, out_folder, assigned in (
            (train_samples, output_train_folder, train_files),
            (val_samples, output_val_folder, val_files),
            (test_samples, output_test_folder, test_files),
        ):
            for filename in samples:
                source_image = os.path.join(data_folder, filename)
                destination_image = os.path.join(out_folder, filename)
                copyfile(source_image, destination_image)
            assigned.extend(samples)

    # Write one label file per split.
    train_labels = labels_df[labels_df['filename'].isin(train_files)]
    val_labels = labels_df[labels_df['filename'].isin(val_files)]
    test_labels = labels_df[labels_df['filename'].isin(test_files)]

    train_labels.to_csv(os.path.join(output_train_folder, 'labels.csv'), index=False, encoding='big5')
    val_labels.to_csv(os.path.join(output_val_folder, 'labels.csv'), index=False, encoding='big5')
    test_labels.to_csv(os.path.join(output_test_folder, 'labels.csv'), index=False, encoding='big5')

    print(f"{os.path.join(output_train_folder, 'labels.csv')} is done")
    print(f"{os.path.join(output_val_folder, 'labels.csv')} is done")
    print(f"{os.path.join(output_test_folder, 'labels.csv')} is done")

field='1Tainan'
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\train\1Tainan\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\val\1Tainan\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\test\1Tainan\labels.csv is done
field='2Penghu'
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\train\2Penghu\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\val\2Penghu\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\test\2Penghu\labels.csv is done
field='3Taoyuan'
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\train\3Taoyuan\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\val\3Taoyuan\labels.csv is done
C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\test\3Taoyuan\labels.csv is done


# 合併不同區域的標記檔

In [6]:
import os
import pandas as pd
from shutil import copyfile

fields = ['1Tainan', '2Penghu', '3Taoyuan']

# Pooled folders that receive every region's files for each split.
output_train_folder = r'C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\train_all'
output_val_folder = r'C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\val_all'
output_test_folder = r'C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\test_all'

for pooled_folder in (output_train_folder, output_val_folder, output_test_folder):
    os.makedirs(pooled_folder, exist_ok=True)

# Copy each region's split folders into the pooled folders.
for field in fields:
    field_train_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\train\\{field}'
    field_val_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\val\\{field}'
    field_test_folder = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\test\\{field}'

    # (per-region source, pooled destination), handled train -> val -> test.
    split_pairs = (
        (field_train_folder, output_train_folder),
        (field_val_folder, output_val_folder),
        (field_test_folder, output_test_folder),
    )
    for split_source, pooled_folder in split_pairs:
        for filename in os.listdir(split_source):
            # Every file in the folder is copied; the region prefix keeps
            # same-named files from different regions apart.
            copyfile(
                os.path.join(split_source, filename),
                os.path.join(pooled_folder, f'{field}_{filename}'),
            )

In [7]:
# Names of the pooled split folders under ocrROI whose label files are merged next.
datasets = ['test_all', 'train_all', 'val_all']

In [8]:
# 合併標記檔
# Build one labels.csv per pooled split from the per-region label files.
for dataset in datasets:
    output_combined_csv = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\{dataset}\\labels.csv'

    labels_dfs = []
    for field in fields:
        # Per-region label file, expected as "<field>_labels.csv" inside the pooled folder.
        field_labels_file = f'C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\{dataset}\\{field}_labels.csv'
        region_labels = pd.read_csv(field_labels_file, encoding='big5')
        # Prefix each filename with its region name.
        region_labels['filename'] = [f'{field}_{name}' for name in region_labels['filename']]
        labels_dfs.append(region_labels)

    merged_labels_df = pd.concat(labels_dfs, ignore_index=True)

    # Replace commas with spaces in both text columns.
    for column in ('filename', 'words'):
        merged_labels_df[column] = merged_labels_df[column].map(lambda text: text.replace(',', ' '))

    merged_labels_df.to_csv(output_combined_csv, index=False, encoding='big5')

# 確認是否有重疊

In [9]:
import os
import pandas as pd

def compare_labels_and_images(field):
    """Print a consistency report for one split folder under ocrROI.

    Reads ``ocrROI\\<field>\\labels.csv`` (Big5) and reports
    (1) filenames listed in the CSV that are not present in the folder and
    (2) rows whose filename occurs more than once in the CSV.

    Args:
        field: Name of the folder under ocrROI, e.g. 'train_all'.
    """
    labels_path = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\{field}\\labels.csv"
    images_path = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\{field}"

    df = pd.read_csv(labels_path, encoding='big5')

    csv_filenames = set(df['filename'].tolist())
    folder_filenames = set(os.listdir(images_path))

    # Listed in the CSV but missing from the folder.
    missing_files = csv_filenames - folder_filenames

    # Rows whose filename is not unique.
    duplicate_labels = df[df.duplicated(subset=['filename'], keep=False)]

    # Use the function's own argument for the header; the previous code read
    # a global named `dataset`, which only exists inside the caller's loop.
    print(f'=== {field} ===')
    if missing_files:
        print("資料夾中缺少的圖片:")
        for file in missing_files:
            print(file)
    else:
        # Reached when every filename in the CSV exists in the folder.
        print("資料夾中的所有圖片都在CSV中找到")

    print('*'*40)
    if not duplicate_labels.empty:
        print("CSV檔案中存在重複的標記:")
        print(duplicate_labels)
    else:
        print("CSV檔案中沒有重複的標記。")

# Check every pooled split and print a separator line after each report.
datasets = ['test_all', 'train_all', 'val_all']
for dataset in datasets:
    compare_labels_and_images(dataset)
    print('-'*80)

=== test_all ===
資料夾中的所有圖片都在CSV中找到
****************************************
CSV檔案中沒有重複的標記。
--------------------------------------------------------------------------------
=== train_all ===
資料夾中的所有圖片都在CSV中找到
****************************************
CSV檔案中沒有重複的標記。
--------------------------------------------------------------------------------
=== val_all ===
資料夾中的所有圖片都在CSV中找到
****************************************
CSV檔案中沒有重複的標記。
--------------------------------------------------------------------------------


# 將英文字拿掉

In [23]:
import pandas as pd

def clean_and_save_labels(labels_path):
    """Strip Latin letters, digits and a few symbols from the 'words' column.

    Reads the Big5 CSV at ``labels_path``, removes every run of the
    characters a-z, A-Z, 0-9, newline, '.', '&' and '＆' from 'words', and
    writes the result next to the input as ``<name>_cleaned<ext>``.

    Args:
        labels_path: Path to the label CSV to clean.
    """
    import os  # local import: the defining cell only imports pandas

    labels_df = pd.read_csv(labels_path, encoding='big5')

    # NOTE(review): spaces that surrounded the removed text stay in the
    # label -- confirm whether they should be trimmed as well.
    labels_df['words'] = labels_df['words'].replace(to_replace="[a-zA-Z0-9\n.&＆]+", value="", regex=True)

    # Only the final extension is rewritten. str.replace(".csv", ...) would
    # also alter a ".csv" inside a directory name, and would leave the path
    # unchanged (overwriting the input) for any other extension.
    root, extension = os.path.splitext(labels_path)
    cleaned_labels_path = f"{root}_cleaned{extension}"
    labels_df.to_csv(cleaned_labels_path, index=False, encoding='big5')

    print(f"已將標記檔字符刪除，並儲存到 {cleaned_labels_path}")

# Clean the merged label file of every pooled split.
datasets = ['test_all', 'train_all', 'val_all']
for dataset in datasets:
    labels_path = f"C:\\Users\\MAGIC\\Ching\\survey\\plan\\rawdata_all\\ocrROI\\{dataset}\\labels.csv"
    clean_and_save_labels(labels_path)

已將標記檔字符刪除，並儲存到 C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\test_all\labels_cleaned.csv
已將標記檔字符刪除，並儲存到 C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\train_all\labels_cleaned.csv
已將標記檔字符刪除，並儲存到 C:\Users\MAGIC\Ching\survey\plan\rawdata_all\ocrROI\val_all\labels_cleaned.csv
