In [1]:
import os
from tqdm import tqdm

image_folder_path = '../../pilot data/ocr/'  # dir/image inside
commonkaha = ['は', 'か', 'へ', 'で', 'す', 'あ', 'お', 'の', 'に', 'を', 'る', 'く', 'し', 'な', 'よ', 'ス', 'ル']
commonkanji = ['日', '事', '人', '一', '見', '本', '子', '出', '年', '大', '言', '学', '分', '中', '記', '会', '新', '月', '時', '行', '本', '立', '気', '報', '思', '上', '語', '自', '者', '生', '文', '明', '情', '国', '朝', '用', '書', '私', '手', '間', '小', '合']

# Create sets for commonkaha and commonkanji
kaha_sets = {char: set() for char in commonkaha}
kanji_sets = {char: set() for char in commonkanji}

# Traverse through the image folder
for img_dir_name in tqdm(os.listdir(image_folder_path)):
    for filename in os.listdir(os.path.join(image_folder_path, img_dir_name)):
        if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
            first_char = filename.split('_')[0]
            # Check if the first character is in commonkaha
            if first_char in commonkaha:
                kaha_sets[first_char].add(filename)
            # Check if the first character is in commonkanji
            elif first_char in commonkanji:
                kanji_sets[first_char].add(filename)

# Print or manipulate the sets as needed
for char, img_set in kaha_sets.items():
    if img_set:
        print(f"{char}: {img_set}")

for char, img_set in kanji_sets.items():
    if img_set:
        print(f"{char}: {img_set}")

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 65.57it/s]

あ: {'あ__56.jpg.jpg', 'あ__109.jpg.jpg', 'あ__7.jpg.jpg', 'あ__62.jpg.jpg', 'あ__50.jpg.jpg', 'あ__91.jpg.jpg', 'あ__31.jpg.jpg'}
の: {'の__yongsong_1_12.jpg.jpg', 'の__yongsong_1_5.jpg.jpg'}
く: {'く__33.jpg.jpg', 'く__23.jpg.jpg', 'く__13.jpg.jpg'}





In [54]:
import cv2
import numpy as np
import os

def load_local_image(file_path):
    """ 从本地加载图像 """
    if os.path.exists(file_path):
        image = cv2.imread(file_path)
        return image
    else:
        print(f"文件 {file_path} 不存在")
        return None

def preprocess_image(image, margin=10, fixed_size=(200, 200)):
    if image is None:
        print("图像为空，无法进行预处理")
        return None

    # 转换为灰度图
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # 自适应阈值
    binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

    # 寻找轮廓
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        # 寻找最大轮廓
        c = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(c)

        # 裁剪并添加留白
        cropped = binary[y-margin:y+h+margin, x-margin:x+w+margin]

        # 检查裁剪后的图像尺寸
        if cropped.shape[0] < fixed_size[0] or cropped.shape[1] < fixed_size[1]:
            # 如果尺寸小于fixed_size，则先填充到fixed_size
            padded = cv2.copyMakeBorder(cropped, 
                                        top=margin, bottom=margin, 
                                        left=margin, right=margin, 
                                        borderType=cv2.BORDER_CONSTANT, 
                                        value=[0, 0, 0])
            cropped = padded[:fixed_size[0], :fixed_size[1]]

        # 调整到固定尺寸
        resized = cv2.resize(cropped, fixed_size)

        return resized
    else:
        print("none")
        return None


def extract_features(binary_image):
    # """ 在处理后的图像上提取特征 """
    # if np.sum(binary_image == 255) == 0:
    #     print("图像中没有白色像素，无法提取特征")
    #     return None

    features = {
        'white_pixels_ratio': np.sum(binary_image == 255) / binary_image.size
    }
    return features

# 遍历文件夹中的图像并提取特征
# dirurl = "testocr2"  # 替换为您的图像文件夹路径
# image_folder_path


In [None]:
#extract features with images set
def process_image_set(image_set, dir_path):
    """ 处理一组图像文件，并提取特征 """
    features_list = []
    for file_name in image_set:
        file_path = os.path.join(dir_path, file_name)
        image = cv2.imread(file_path)
        if image is not None:
            preprocessed_image = preprocess_image(image)
            if preprocessed_image is not None:
                features = extract_features(preprocessed_image)
                if features is not None:
                    features_list.append(features)
            else:
                print(f"预处理图像 {file_name} 失败")
        else:
            print(f"加载图像 {file_name} 失败")
    return features_list


In [55]:
#extract features from image
dirurl = "testocr2"
feature_list = []
for img in os.listdir(dirurl):
    if img.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        file_path = os.path.join(dirurl, img)
        image = load_local_image(file_path)
        
        if image is not None:
            preprocessed_image = preprocess_image(image)
            if preprocessed_image is not None:
                features = extract_features(preprocessed_image)
                if features is not None:
                    feature_list.append((img,features))
                    print(img)

print(feature_list)

1.1.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.1.jpg
6.jpg
7.jpg
8.jpg
[('1.1.jpg', {'white_pixels_ratio': 0.0}), ('1.jpg', {'white_pixels_ratio': 0.06665}), ('2.jpg', {'white_pixels_ratio': 0.0799}), ('3.jpg', {'white_pixels_ratio': 0.0}), ('4.jpg', {'white_pixels_ratio': 0.07485}), ('5.jpg', {'white_pixels_ratio': 0.0}), ('6.1.jpg', {'white_pixels_ratio': 0.086675}), ('6.jpg', {'white_pixels_ratio': 0.087025}), ('7.jpg', {'white_pixels_ratio': 0.059625}), ('8.jpg', {'white_pixels_ratio': 0.0})]
