## (0) Import Library

In [1]:
import numpy as np
import torch
import cv2
import csv
import sys
import os
import pandas as pd
import multiprocessing
import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from PIL import Image


current_dir = os.getcwd() # Get the current directory
sys.path.append(current_dir) # Add the directory where the module is located

# from pathlib import Path
from yolov9_2.utils.general import non_max_suppression, scale_boxes
# from utils.plots import Annotator, colors
from yolov9_2.models.common import DetectMultiBackend
# from utils.dataloaders import LoadImages
from yolov9_2.utils.general import check_img_size
from yolov9_2.utils.torch_utils import select_device
from yolov9_2.utils.augmentations import (Albumentations, augment_hsv, classify_albumentations, classify_transforms, copy_paste,
                                 letterbox, mixup, random_perspective)
from yolov9_2.utils.plots import Annotator, colors
from itertools import zip_longest


## (1) Data Setup

### (1.1) Helper function <br>

In [2]:
def get_char_index(char):
    """
    Convert a letter or digit to its corresponding index.

    Args:
        char: str, the character to convert.

    Returns:
        int, the index of the character.

    Raises:
        ValueError: If the character is unsupported.
    """
    if char.isdigit():
        return ord(char) - ord('0')  # Digits (0-9) -> 0-9
    elif 'a' <= char <= 'z':
        return ord(char) - ord('a') + 10  # Lowercase letters (a-z) -> 10-35
    elif 'A' <= char <= 'Z':
        return ord(char) - ord('A') + 36
    else:
        raise ValueError(f"Unsupported character: {char}")


### (1.2) Load datasets <br>

In [3]:
def load_data(image_dir, labels, limit, predict_split=1000, img_height=90, img_width=30, num_classes=62):
    """
        Load images and labels from the specified directory.

        Args:
            image_dir: str, the directory containing the images.
            labels: dict, a mapping from image filenames to labels.
            limit: int, the maximum number of images to load.
            predict_split: int, the number of images to reserve for prediction.
            img_height: int, the height of the images.
            img_width: int, the width of the images.
            num_classes: int, the number of classes.

        Returns:
            Tuple of NumPy arrays: (images, target_labels, predict_images, predict_labels)
    """
    images = []
    target_labels = []
    predict_images = [] 
    predict_labels = []  # Store labels for reserved images

  
    all_image_filenames = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

  
    for i, img_name in enumerate(all_image_filenames):
        img_path = os.path.join(image_dir, img_name)
  
    
        label = labels[img_name]
        img = load_img(img_path, color_mode="grayscale", target_size=(img_height, img_width))
        img_array = img_to_array(img) / 255.0  # Normalize pixel values to [0, 1]
            
         
        char_index = get_char_index(label)  # Convert char to index (0-61)
        label_array = to_categorical(char_index, num_classes=num_classes)
            
           
        if i < predict_split:
                predict_images.append(img_array)
                predict_labels.append(label_array)
        else:
                # Store in main arrays if within limit
                if len(images) < limit:
                    images.append(img_array)
                    target_labels.append(label_array)

    # Convert lists to NumPy arrays
    images = np.array(images)
    target_labels = np.array(target_labels)
    predict_images = np.array(predict_images)
    predict_labels = np.array(predict_labels)
   
    return images, target_labels, predict_images, predict_labels



### (1.3) Loads Label.csv

In [4]:
def load_labels_from_csv(csv_path):
    """
        Load image labels from a CSV file.

        Args:
            csv_path: str, the path to the CSV file.

        Returns:
            dict, a mapping from image filenames to labels.
    """
    labels = {}
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            labels[row['image']] = row['label']
    return labels

### (1.4) Caller & Separation

In [5]:
img_height, img_width = 90, 30  # Image dimensions (adjust for individual characters)
num_classes = 62  # Number of classes (0-9, A-Z, a-z)

training_data_path = "dataset\\overlap\\train\\segment"      # Directory containing images
training_data_label_path = "dataset\\overlap\\train\\labels\\segmented_labels.csv"  # Path to the CSV file with labels
validation_data_path = "dataset\\overlap\\val\\segment"  # Directory containing images
validation_data_label_path = "dataset\\overlap\\val\\labels\\segmented_labels.csv"  # Path to the CSV file with labels

# Load labels from CSV
train_labels = load_labels_from_csv(training_data_label_path)

# Load train images and labels
X_train, y_train, _, _ = load_data(training_data_path, train_labels, limit=100000, predict_split=0, img_height=img_height, img_width=img_width, num_classes=num_classes)

# Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(images, target_labels, test_size=0.2, random_state=42)

# Load validation images and labels
val_labels = load_labels_from_csv(validation_data_label_path)
X_val, y_val, _, _ = load_data(validation_data_path, val_labels, limit=20000, predict_split=0, img_height=img_height, img_width=img_width, num_classes=num_classes)


## (2) Model

#### (2.0) Test if your GPU is working

In [6]:
### Testing if your GPU is available
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

### print the shape of the training and validation data
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
# print(predict_images.shape, predict_labels.shape)

Num GPUs Available:  1
(100000, 90, 30, 1) (100000, 62)
(20000, 90, 30, 1) (20000, 62)


### (2.1) Training

In [7]:
input_shape = (90, 30, 1)  # Shape of the input image (for single character)
input_img = Input(shape=input_shape)

# Convolutional layers #1
x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Convolutional layers #2
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Convolutional layers #3
x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Flatten
x = Flatten()(x)
# Dense and dropout layers
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
# Single output layer for character classification
output = Dense(num_classes, activation='softmax')(x)

# Create the model
model = Model(inputs=input_img, outputs=output)
# Compile the model
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(
    X_train,
    y_train,
    epochs=20,            
    batch_size=64,         
    validation_data=(X_val, y_val)
)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation accuracy: {accuracy:.2f}")
print(f"Validation loss: {loss:.2f}")

model.save("captcha_model_2.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation accuracy: 0.94
Validation loss: 0.59


### (2.2) Testing

#### (2.2.1) Data Preprocess Function

In [8]:
# from prediction import init_process, process_single_image, sliding_predict_and_validate 
def preprocess_image(image, input_shape):
    """
    Preprocess image for model prediction.
    Args:
        image: np.array, input image.
        input_shape: tuple, (height, width) required by the model.

    Returns:
        np.array, preprocessed image.
    """
    # Resize image to match model's expected sizing and scale pixel values
    image = cv2.resize(image, (input_shape[1], input_shape[0]))
    image = image.astype("float32") / 255.0
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    return image

def repair_gray(frame, print_img=False):
    # 将图像转换为灰度
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # gray = frame

    # 进行二值化处理，将数字和背景区分开
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # this method can automatically find the best threshold value
    # Other threshold methods
    # _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

    if print_img:
        plt.imshow(binary, cmap='gray')
        plt.axis('off')
        plt.title('Binary Image')
        plt.show()

    # 使用形态学操作去除小噪声
    # Erode
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.erode(binary, kernel, iterations=1)
    # Dilate
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.dilate(cleaned, kernel, iterations=2)

    # Remove lines
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.erode(cleaned, kernel, iterations=3)

    # 反转图像，使数字为白色，背景为黑色
    mask = cv2.bitwise_not(cleaned)
    
    # 检查生成的掩码
    # cv2.imwrite("cleaned_mask.jpg", mask)

    # 使用掩码进行修复
    # dst = cv2.inpaint(frame, mask, 18, cv2.INPAINT_TELEA)

    # 保存修复后的图像
    # cv2.imwrite("repaired_image.png", dst)
    dst = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
    return dst

def to_black_and_white(image, threshold=200, print_img=False):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    binary_image = cv2.bitwise_not(binary_image)
    if print_img:
        plt.imshow(binary_image, cmap='gray')
        plt.axis('off')
        plt.title('Binary Image')
        plt.show()

    return binary_image

def segment_using_column_scan(image, captcha_text, print_msg=False):
    '''
        Segment characters using column scanning method
        
        Args:
            image: cv2 image (binary black and white)
        
        Returns:
            character_images: list of segmented character images
    '''
    captcha_text_length = len(captcha_text)
    if print_msg:
        print(f'CAPTCHA text length: {captcha_text_length}')

    # Convert image to binary
    binary_image = image.copy()
    # Reverse the color from background white to black, character black to white
    # White: 255, Black: 0
    binary_image = cv2.bitwise_not(binary_image)

    # Get height and width of image
    height, width = binary_image.shape
    if print_msg:
        print(f'height: {height}, width: {width}')
    
    # Initialize variables
    char_start = None
    character_images = []
    
    # 扫描每一列
    for i, col in enumerate(range(width)):
        column_sum = np.sum(binary_image[:, col])
        # print(f'column {i+1} sum: {column_sum}')
        # white: 0, black: 255
        if column_sum > 255 * 1 and char_start is None:
            char_start = col
        # if column sum < 50, it may be end of character
        elif column_sum < 255 * 1 and char_start is not None:

            # That might be connected characters, so we need to use the threshold to split them
            num_col = col - char_start
            threshold = 50
            average_col = num_col / threshold    
            overlapping_col = 0

            if average_col <= 1:
                # Split character directly
                char_image = binary_image[:, char_start:col]

                # Calculate the width of the character
                char_width = col - char_start
                # If the width of the character is too small, it may be noise
                if char_width > 10:
                    character_images.append(cv2.bitwise_not(char_image))
            else:
                average_col = int(average_col)
                # Split connected characters
                for j in range(average_col):
                    # Split character using threshold & add some overlapping
                    if j == 0:
                        char_image = binary_image[:, char_start + j * threshold : char_start + (j + 1) * threshold + overlapping_col]
                    else:
                        char_image = binary_image[:, char_start + j * threshold : char_start + (j + 1) * threshold + overlapping_col]
                    character_images.append(cv2.bitwise_not(char_image))

            char_start = None


    # 检查最后一个字符是否未完成
    if char_start is not None:
        char_image = binary_image[:, char_start:width]
        character_images.append(cv2.bitwise_not(char_image))

    if len(character_images) != captcha_text_length:
        if print_msg:
            print(f'Warning: Number of segmented characters is not equal to CAPTCHA text length')

        character_images = []
    return character_images

def preprocess_captcha_image(image_path, captcha_text='1234', print_img=False):
    '''
        Preprocess CAPTCHA image and segment characters

        Args:
            image_path: str, path to the image file
        
        Returns:
            characters: list, list of segmented characters in image
    '''
    # Load the image
    image = cv2.imread(image_path)
    # Display the image
    if print_img:
        plt.imshow(image)
        plt.axis('off')
        plt.title('Original CAPTCHA Image')
        plt.show()

    # Repair gray
    preprocessed_image = repair_gray(image, print_img=print_img)
    if print_img:
        plt.imshow(preprocessed_image, cmap='gray')
        plt.axis('off')
        plt.title('After open CAPTCHA Image')
        plt.show()

    # Save the preprocessed image
    # cv2.imwrite('preprocessed_image.png', preprocessed_image)

    # Column scan method to segment characters
    # Turn the image into binary format
    preprocessed_image = cv2.cvtColor(preprocessed_image, cv2.COLOR_BGR2GRAY)
    # characters = segment_using_column_scan(preprocessed_image, captcha_text, print_msg=print_img)
    characters = None
    
    return characters, preprocessed_image

#### (2.2.2) YoloBoundingCreator

In [9]:
class YoloBoundingCreator:
    def __init__(self):
        self._weights = "yolov9_2\\runs\\train\\exp\\weights\\best.pt"  # 模型權重文件
        self._imgsz = (640, 640)
        self._device = select_device(device='cpu')
        self._dnn = False
        self._data = 'data\datasets.yaml'
        self._half = False
        self._model = DetectMultiBackend(self._weights, device=self._device,  dnn=self._dnn, data=self._data, fp16=self._half)
        self._stride, self._names, self._pt = self._model.stride, self._model.names, self._model.pt
        self._imgsz = check_img_size(self._imgsz, s=self._stride)

    def bouding_generator(self, source_image):
        im0 = source_image
        im = letterbox(im0, self._imgsz, stride=self._stride, auto=self._pt)[0]  # padded resize
        im = im.transpose((2, 0, 1))[::-1]
        im = np.ascontiguousarray(im)

        img = torch.from_numpy(im).to(self._device)
        img = img.half() if self._model.fp16 else img.float()
        img /= 255.0
        if len(img.shape) == 3:
            img = img[None]

        pred = self._model(img)
        pred = pred[0][1]
        pred = non_max_suppression(pred, 0.1, 0.45, classes=None, agnostic=False)

        s = ''
        for i, det in enumerate(pred):  # per image
            s += '%gx%g ' % img.shape[2:]
            annotator = Annotator(im0, line_width=1, example=str(self._names))
            if len(det):
                det[:, :4] = scale_boxes(img.shape[2:], det[:, :4], im0.shape).round()
                for c in det[:, 5].unique():
                    n = (det[:, 5] == c).sum()
                    s += f"{n} {self._names[int(c)]}{'s' * (n > 1)}, "
                hide_labels = True
                hide_conf = True
                for *xyxy, conf, cls in reversed(det):
                    c = int(cls)
                    label = None if hide_labels else (self._names[c] if hide_conf else f'{self._names[c]} {conf:.2f}')
                    annotator.box_label(xyxy, label, color=colors(c, True))

        im0 = annotator.result()
        bouding_box = det[:, :4].cpu().numpy()

        # sort the bounding box by x1
        bouding_box = bouding_box[bouding_box[:, 0].argsort()]

        return bouding_box

def draw_bounding_box(image, bounding_box):
    
    for i in range(len(bounding_box)):
        x1, y1, x2, y2 = bounding_box[i]
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 1) # (0, 255, 0) is the color of the bounding box, 2 is the thickness of the bounding box

    return image

yolo_bounding_creator = YoloBoundingCreator()
                
# Testing
source_path = "dataset\\overlap\\val\\preprocess\\0.png"
source_image = cv2.imread(source_path)
bouding_box = yolo_bounding_creator.bouding_generator(source_image)
print(bouding_box)
image_with_bounding_box = draw_bounding_box(source_image, bouding_box)
# Save the image with bounding box
cv2.imwrite("image_with_bounding_box.jpg", image_with_bounding_box)



YOLO  2024-12-1 Python-3.10.15 torch-2.5.1+cpu CPU

  ckpt = torch.load(attempt_download(w), map_location='cpu')  # load
Fusing layers... 
yolov9-c summary: 604 layers, 50698278 parameters, 0 gradients, 236.6 GFLOPs


[[          8          27          39          76]
 [         66          22          92          75]
 [        144          21         170          76]
 [        172          23         200          71]]


True

#### (2.2.3) Prediction Function

In [10]:
def predict_with_fixed_stride(image, model, input_shape, window_width, stride, confidence_threshold, index_to_char, verbose=0):
    """
    Predict characters using fixed stride and select top 4 predictions with highest confidence and no overlapping regions.
    Args:
        image: np.array, input image.
        model: trained model for prediction.
        input_shape: tuple, (height, width) required by the model.
        window_width: int, width of the sliding window.
        stride: int, step size for sliding window horizontally.
        confidence_threshold: float, minimum confidence to accept prediction.
        index_to_char: dict, mapping from class index to character.
        verbose: int, 0 or 1 for verbose.

    Returns:
        str, predicted text for the image.
    """
    image_height, image_width = image.shape
    predictions = []

    # Sliding window with fixed stride
    x_start = 0
    while x_start + window_width <= image_width:
        # Crop region
        cropped_region = image[0:image_height, x_start:x_start + window_width]
        region = preprocess_image(cropped_region, input_shape)

        # Predict with model
        probs = model.predict(region, verbose=verbose)
        max_prob = np.max(probs)
        if max_prob >= confidence_threshold:
            predicted_class = np.argmax(probs)
            predicted_char = index_to_char.get(predicted_class, '?')
            predictions.append((predicted_char, max_prob, x_start, x_start + window_width))  # Save char, prob, start, end

        x_start += stride  # Fixed stride

    # Sort predictions by confidence (descending)
    predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

    # Select top 4 predictions with no overlapping regions
    selected_predictions = []
    for char, prob, start, end in predictions:
        # Check for overlap with already selected regions
        if all(end <= sel_start or start >= sel_end for _, _, sel_start, sel_end in selected_predictions):
            selected_predictions.append((char, prob, start, end))
            if len(selected_predictions) == 4:  # Stop after selecting 4 predictions
                break

    # Sort selected predictions by position (start)
    selected_predictions = sorted(selected_predictions, key=lambda x: x[2])

    # Combine selected characters into a string
    predicted_text = ''.join([char for char, _, _, _ in selected_predictions])

    return predicted_text

def get_char_region(image, bounding_box):
    """
    Get the region of the character from the bounding box.

    Args:
        image: np.array, input image.
        bounding_box: list, bounding box coordinates.

    Returns:
        np.array, region of the character.
    """
    x1, y1, x2, y2 = bounding_box
    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
    # char_region = image[y1:y2, x1:x2] 
    # Keep the original height of the character
    char_region = image[:, x1:x2]
    return char_region

def sliding_predict_and_validate(image_dir, labels_csv, model_path, input_shape, yolo_model ,confidence_threshold, index_to_char, print_img=False, predict_size='None'):
    """
    Process multiple images, predict characters, and validate against ground truth.
    We have two methods to predict characters:
    1. Use the preprocess_captcha_image function to segment characters, and then predict each character.
    2. Use the predict_with_multiple_windows or predict_with_fixed_stride function to predict characters directly.

    We Use 2. if the characters are not segmented properly.

    Args:
        image_dir: str, directory containing image files.
        labels_csv: str, path to the CSV file containing labels for each character.
        model: trained model for prediction.
        input_shape: tuple, (height, width) required by the model.
        yolo_model: YoloBoundingCreator, the YOLO model for bounding box prediction.
        confidence_threshold: float, threshold to accept predictions.
        index_to_char: dict, mapping from class index to character.

    Returns:
        None (Displays validation results).
    """
    # Predicition labels list
    prediction_labels = []
    # Ground truth labels list
    ground_truth_labels = []

    # Output path
    output_path = "prediction.txt"

    # Load labels
    labels_df = pd.read_csv(labels_csv)
    if not {'image', 'label'}.issubset(labels_df.columns):
        raise ValueError("CSV file must contain 'image' and 'label' columns.")
    
    # Load the model
    model = load_model(model_path)

    correct = 0
    correct_combined = 0
    total = 0
    count_images = 0
    total_images = predict_size
    image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

    # Progress bar start
    progress_bar = tqdm.tqdm(total=total_images, position=0, leave=True)

    # Write predictions to a file
    f = open(output_path, 'w')
    for image_path in image_paths:
        image_id = os.path.basename(image_path)
        if image_id not in labels_df['image'].values:
            print(f"Warning: No label found for image ID {image_id}. Skipping.")
            continue

        ground_truth = labels_df.loc[labels_df['image'] == image_id, 'label'].values[0]
        
        # # Load the image
        # image = cv2.imread(image_path) # Load image in BGR format
        # # To black and white
        # image = to_black_and_white(image, threshold=200, print_img=print_img)

        # if image is None:
        #     print(f"Warning: Failed to load image {image_path}. Skipping.")
        #     continue

        # <Method 1> Preprocess image and segment characters
        characters, preprocessed_image = preprocess_captcha_image(image_path, ground_truth, print_img=print_img)

        if print_img:
            print(f"Number of segmented characters: {len(characters)}")

        # if len(characters) == 4: # The characters are segmented properly
        #     predicted_text = ""
        #     for char_image in characters:
        #         if print_img:
        #             plt.imshow(char_image, cmap='gray')
        #             plt.axis('off')
        #             plt.title('Segmented Character')
        #             plt.show()
        #         region = preprocess_image(char_image, input_shape)
        #         probs = model.predict(region, verbose=0)
        #         predicted_class = np.argmax(probs)
        #         predicted_char = index_to_char.get(predicted_class, '?')
        #         predicted_text += predicted_char
                
        # else:
        #     # <Method 2> Predict with fixed stride
        #     predicted_text = predict_with_fixed_stride(
        #         preprocessed_image, model, input_shape, window_width=20, stride=1, confidence_threshold=confidence_threshold, index_to_char=index_to_char
        #     )

        # <Method 3> Use the yolov9 model to get the bounding box of the characters, and then predict each character
        # Remember to convert the image to BGR format
        preprocessed_image = cv2.cvtColor(preprocessed_image, cv2.COLOR_GRAY2BGR)
        bounding_boxes = yolo_model.bouding_generator(preprocessed_image)

        predicted_text = ""
        if len(bounding_boxes) > 0:
            # Keep only the 4 bounding boxes with the highest confidence
            if len(bounding_boxes) > 4:
                bounding_boxes = bounding_boxes[:4]

            for i, bounding_box in enumerate(bounding_boxes):
                char_region = get_char_region(preprocessed_image, bounding_box)
                char_region = cv2.cvtColor(char_region, cv2.COLOR_BGR2GRAY)

                # if print_img:
                #     # Save the character region
                #     cv2.imwrite(f"char_region_{i}.jpg", char_region)

                region = preprocess_image(char_region, input_shape)

                # if print_img:
                #     # Save the character region
                #     cv2.imwrite(f"char_region_p_{i}.jpg", region)

                probs = model.predict(region, verbose=0)
                predicted_class = np.argmax(probs) # Get the class with the highest probability
                predicted_char = index_to_char.get(predicted_class, '?')
                predicted_text += predicted_char

        # If the predicted text is 0 characters, that means we over process the image and the preprocessed image maybe all white
        # We should use the original image to predict
        else:
            print("Warning: No bounding box found. Using the original image to predict.")
            image = cv2.imread(image_path)
            image = to_black_and_white(image, threshold=200, print_img=print_img)
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
            bounding_boxes = yolo_model.bouding_generator(image)

            predicted_text = ""
            # Keep only the 4 bounding boxes with the highest confidence
            if len(bounding_boxes) > 4:
                bounding_boxes = bounding_boxes[:4]

            for i, bounding_box in enumerate(bounding_boxes):
                char_region = get_char_region(image, bounding_box)
                char_region = cv2.cvtColor(char_region, cv2.COLOR_BGR2GRAY)

                # if print_img:
                #     # Save the character region
                #     cv2.imwrite(f"char_region_{i}.jpg", char_region)

                region = preprocess_image(char_region, input_shape)

                # if print_img:
                #     # Save the character region
                #     cv2.imwrite(f"char_region_p_{i}.jpg", region)

                probs = model.predict(region, verbose=0)
                predicted_class = np.argmax(probs)
                predicted_char = index_to_char.get(predicted_class, '?')
                predicted_text += predicted_char

        # Compare predictions with the ground truth
        for pred_char, gt_char in zip_longest(predicted_text, ground_truth, fillvalue=' '):
            prediction_labels.append(pred_char) # if the prediction is empty, we add a space
            ground_truth_labels.append(gt_char)
            if pred_char == gt_char:
                correct += 1

        total += len(ground_truth)
        count_images += 1
        progress_bar.update(1)

        # print(f"Image: {image_id}, Ground Truth: '{ground_truth}', Prediction: '{predicted_text}'")
        # Write the prediction to a file
        f.write(f"{image_id}, Ground Truth: '{ground_truth}', Prediction: '{predicted_text}'\n")
        if predicted_text == ground_truth:
            correct_combined += 1

        if type(predict_size) == int and count_images >= predict_size:
            break
        
        if count_images % 100 == 0:
            # Accuracy for every 100 images
            accuracy = (correct / total) * 100
            print(f"Validation Accuracy: {accuracy:.2f}%")
            accuracy_combined = (correct_combined / count_images) * 100
            print(f"Combined Accuracy: {accuracy_combined:.2f}%")

    progress_bar.close()
    f.close()
    # Display overall accuracy
    if total > 0:
        accuracy = (correct / total) * 100
        print(f"Validation Accuracy: {accuracy:.2f}%")
    else:
        print("No valid images processed.")

    # Display overall accuracy
    if count_images > 0:
        combined_accuracy = (correct_combined / count_images) * 100
        print(f"Combined Accuracy: {combined_accuracy:.2f}%")
    else:
        print("No valid images processed.")

    return prediction_labels, ground_truth_labels

#### (2.2.3) Parallel Prediction (DO NOT USE THIS)

In [8]:
def sliding_predict_and_validate_parallel(image_dir, labels_csv, model_path, input_shape, windows, confidence_threshold, index_to_char, print_img=False, predict_size='None'):
    """
    Parallel processing of multiple images for prediction and validation.
    Args:
        image_dir: str, directory containing image files.
        labels_csv: str, path to the CSV file containing labels for each character.
        model_path: str, path to the trained model file.
        input_shape: tuple, (height, width) required by the model.
        windows: list of tuples, each containing (window_width, stride).
        confidence_threshold: float, threshold to accept predictions.
        index_to_char: dict, mapping from class index to character.
        print_img: bool, whether to display images during processing.

    Returns:
        None (Displays validation results).
    """
    # Load labels
    labels_df = pd.read_csv(labels_csv)
    if not {'image', 'label'}.issubset(labels_df.columns):
        raise ValueError("CSV file must contain 'image' and 'label' columns.")

    image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

    # Prepare arguments for each image
    args_list = []
    count_images = 0
    for image_path in image_paths:
        args = (image_path, labels_df, model_path, input_shape, windows, confidence_threshold, index_to_char, print_img)
        args_list.append(args)
        count_images += 1
        if type(predict_size) == int and count_images >= predict_size:
            break

    # 使用多进程池
    num_processes = 2
    results = []
    with multiprocessing.Pool(processes=num_processes, initializer=init_process, initargs=(model_path,)) as pool:
        results = pool.map(process_single_image, args_list)

    # 汇总结果
    total_correct = 0
    total_characters = 0
    for result in results:
        if result is not None:
            _, _, _, correct, total = result
            total_correct += correct
            total_characters += total

    # Display the results
    

    # Display overall accuracy
    if total_characters > 0:
        accuracy = (total_correct / total_characters) * 100
        print(f"Validation Accuracy: {accuracy:.2f}%")
    else:
        print("No valid images processed.")

#### (2.2.4) Prediction

In [11]:
# Testing
test_image_paths = "dataset\\overlap\\test\\original"  # Directory containing images
test_labels_csv = "dataset\\overlap\\test\\labels\\labels.csv"  # Path to the CSV file with labels

# Parameters for sliding window
confidence_threshold = 0.8 # Minimum confidence required to accept a prediction
input_shape = (90, 30)      # Model's expected input shape

# Define your index-to-character mapping
index_to_char = {i: chr(i + ord('0')) for i in range(10)}  # 0-9
index_to_char.update({i + 10: chr(i + ord('a')) for i in range(26)})  # 10-35
index_to_char.update({i + 36: chr(i + ord('A')) for i in range(26)})  # 36-61

prediction_labels, ground_truth_labels = sliding_predict_and_validate(test_image_paths, test_labels_csv, "captcha_model.h5", input_shape, yolo_bounding_creator, confidence_threshold, index_to_char, print_img=False, predict_size=5000)
# sliding_predict_and_validate_parallel(image_paths, labels_csv, "captcha_model.h5", input_shape, windows, confidence_threshold, index_to_char, print_img=False, predict_size=100)


  2%|▏         | 100/5000 [01:37<1:13:36,  1.11it/s]

Validation Accuracy: 93.00%
Combined Accuracy: 76.00%


  4%|▍         | 200/5000 [03:05<1:10:31,  1.13it/s]

Validation Accuracy: 91.25%
Combined Accuracy: 75.00%


  6%|▌         | 300/5000 [04:30<1:03:26,  1.23it/s]

Validation Accuracy: 91.50%
Combined Accuracy: 75.67%


  8%|▊         | 400/5000 [05:50<1:01:09,  1.25it/s]

Validation Accuracy: 92.25%
Combined Accuracy: 77.00%


 10%|█         | 500/5000 [07:11<59:12,  1.27it/s]  

Validation Accuracy: 92.50%
Combined Accuracy: 78.20%


 12%|█▏        | 600/5000 [08:32<57:47,  1.27it/s]  

Validation Accuracy: 92.50%
Combined Accuracy: 78.33%


 14%|█▍        | 700/5000 [09:51<57:23,  1.25it/s]  

Validation Accuracy: 92.25%
Combined Accuracy: 78.00%


 16%|█▌        | 800/5000 [11:12<54:57,  1.27it/s]  

Validation Accuracy: 92.12%
Combined Accuracy: 77.00%


 18%|█▊        | 900/5000 [12:31<53:26,  1.28it/s]

Validation Accuracy: 92.36%
Combined Accuracy: 77.67%


 20%|██        | 1000/5000 [13:50<52:59,  1.26it/s]

Validation Accuracy: 92.30%
Combined Accuracy: 77.50%


 22%|██▏       | 1100/5000 [15:09<51:08,  1.27it/s]

Validation Accuracy: 92.36%
Combined Accuracy: 77.64%


 24%|██▍       | 1200/5000 [16:28<50:17,  1.26it/s]

Validation Accuracy: 92.42%
Combined Accuracy: 77.75%


 26%|██▌       | 1300/5000 [17:46<48:02,  1.28it/s]

Validation Accuracy: 92.40%
Combined Accuracy: 77.46%


 28%|██▊       | 1400/5000 [19:06<50:11,  1.20it/s]

Validation Accuracy: 92.39%
Combined Accuracy: 77.79%


 30%|███       | 1500/5000 [20:26<46:13,  1.26it/s]

Validation Accuracy: 92.42%
Combined Accuracy: 77.67%


 32%|███▏      | 1600/5000 [21:46<44:33,  1.27it/s]

Validation Accuracy: 92.48%
Combined Accuracy: 77.88%


 34%|███▍      | 1700/5000 [23:10<43:43,  1.26it/s]

Validation Accuracy: 92.62%
Combined Accuracy: 78.29%


 36%|███▌      | 1800/5000 [24:29<41:59,  1.27it/s]

Validation Accuracy: 92.79%
Combined Accuracy: 78.78%


 38%|███▊      | 1900/5000 [25:49<40:51,  1.26it/s]

Validation Accuracy: 92.79%
Combined Accuracy: 78.79%


 40%|████      | 2000/5000 [27:08<39:36,  1.26it/s]

Validation Accuracy: 92.73%
Combined Accuracy: 78.60%


 42%|████▏     | 2100/5000 [28:28<38:21,  1.26it/s]

Validation Accuracy: 92.67%
Combined Accuracy: 78.43%


 44%|████▍     | 2200/5000 [29:48<36:34,  1.28it/s]

Validation Accuracy: 92.60%
Combined Accuracy: 78.27%


 46%|████▌     | 2300/5000 [31:07<35:52,  1.25it/s]

Validation Accuracy: 92.70%
Combined Accuracy: 78.57%


 48%|████▊     | 2400/5000 [32:27<34:06,  1.27it/s]

Validation Accuracy: 92.70%
Combined Accuracy: 78.62%


 50%|█████     | 2500/5000 [33:47<32:58,  1.26it/s]

Validation Accuracy: 92.75%
Combined Accuracy: 78.84%


 52%|█████▏    | 2600/5000 [35:06<32:36,  1.23it/s]

Validation Accuracy: 92.72%
Combined Accuracy: 78.73%


 54%|█████▍    | 2700/5000 [36:25<30:28,  1.26it/s]

Validation Accuracy: 92.61%
Combined Accuracy: 78.48%


 56%|█████▌    | 2800/5000 [37:46<29:04,  1.26it/s]

Validation Accuracy: 92.57%
Combined Accuracy: 78.43%


 58%|█████▊    | 2900/5000 [39:05<27:47,  1.26it/s]

Validation Accuracy: 92.47%
Combined Accuracy: 78.17%


 60%|██████    | 3000/5000 [40:20<22:03,  1.51it/s]

Validation Accuracy: 92.42%
Combined Accuracy: 77.97%


 62%|██████▏   | 3100/5000 [41:29<23:01,  1.38it/s]

Validation Accuracy: 92.41%
Combined Accuracy: 78.00%


 64%|██████▍   | 3200/5000 [42:31<19:24,  1.55it/s]

Validation Accuracy: 92.45%
Combined Accuracy: 78.03%


 66%|██████▌   | 3300/5000 [43:37<19:50,  1.43it/s]

Validation Accuracy: 92.50%
Combined Accuracy: 78.18%


 68%|██████▊   | 3400/5000 [44:43<16:56,  1.57it/s]

Validation Accuracy: 92.50%
Combined Accuracy: 78.24%


 70%|███████   | 3500/5000 [45:42<14:51,  1.68it/s]

Validation Accuracy: 92.48%
Combined Accuracy: 78.20%


 72%|███████▏  | 3600/5000 [46:42<13:55,  1.68it/s]

Validation Accuracy: 92.48%
Combined Accuracy: 78.22%


 74%|███████▍  | 3700/5000 [47:41<12:35,  1.72it/s]

Validation Accuracy: 92.50%
Combined Accuracy: 78.27%


 76%|███████▌  | 3800/5000 [48:40<11:43,  1.71it/s]

Validation Accuracy: 92.51%
Combined Accuracy: 78.37%


 78%|███████▊  | 3900/5000 [49:39<10:35,  1.73it/s]

Validation Accuracy: 92.45%
Combined Accuracy: 78.23%


 80%|████████  | 4000/5000 [50:42<11:50,  1.41it/s]

Validation Accuracy: 92.42%
Combined Accuracy: 78.15%


 82%|████████▏ | 4100/5000 [51:33<07:39,  1.96it/s]

Validation Accuracy: 92.46%
Combined Accuracy: 78.20%


 84%|████████▍ | 4200/5000 [52:24<06:44,  1.98it/s]

Validation Accuracy: 92.52%
Combined Accuracy: 78.29%


 86%|████████▌ | 4300/5000 [53:14<05:45,  2.03it/s]

Validation Accuracy: 92.56%
Combined Accuracy: 78.33%


 88%|████████▊ | 4400/5000 [54:04<04:55,  2.03it/s]

Validation Accuracy: 92.57%
Combined Accuracy: 78.41%


 90%|█████████ | 4500/5000 [54:56<04:58,  1.68it/s]

Validation Accuracy: 92.56%
Combined Accuracy: 78.27%


 92%|█████████▏| 4600/5000 [55:46<03:20,  2.00it/s]

Validation Accuracy: 92.56%
Combined Accuracy: 78.17%


 94%|█████████▍| 4700/5000 [56:36<02:27,  2.03it/s]

Validation Accuracy: 92.59%
Combined Accuracy: 78.15%


 96%|█████████▌| 4800/5000 [57:26<01:54,  1.74it/s]

Validation Accuracy: 92.55%
Combined Accuracy: 78.06%


 98%|█████████▊| 4900/5000 [58:16<00:50,  1.96it/s]

Validation Accuracy: 92.57%
Combined Accuracy: 78.14%


100%|██████████| 5000/5000 [59:05<00:00,  1.41it/s]

Validation Accuracy: 92.53%
Combined Accuracy: 78.04%





#### (2.2.5) Evaluation

In [13]:
def F1_score(records):
    """
    Calculate the F1 score for a prediction compared to ground truth.
    Args:
        records: A txt file containing the prediction and ground truth for each image.
                 Each line: "image_id, Ground Truth: 'XXXX', Prediction: 'XXXX'"
    Returns:    
        F1 score.
    """

    predictions = []
    ground_truths = []

    with open(records, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                fields = line.split(',')
                ground_truth = fields[1].split('\'')[1]
                prediction = fields[2].split('\'')[1]
                
                if len(prediction) == 4:
                    for i in range(4):
                        predictions.append(prediction[i])
                        ground_truths.append(ground_truth[i])
                else:
                    for i in range(4):
                        predictions.append(' ') # Add a space for missing characters
                        ground_truths.append(ground_truth[i])

    # Calculate F1 score
    classes = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ')
    le = LabelEncoder()
    le.fit(classes)

    y_true = le.transform(ground_truths)
    y_pred = le.transform(predictions)

    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    return f1_micro, f1_macro, f1_weighted

def F1_score_with_labels(predictions, ground_truths):
    """
    Calculate the F1 score for a prediction compared to ground truth.
    Args:
        records: A txt file containing the prediction and ground truth for each image.
                 Each line: "image_id, Ground Truth: 'XXXX', Prediction: 'XXXX'"
    Returns:    
        F1 score.
    """

    assert len(predictions) == len(ground_truths), "Number of predictions and ground truths must be the same."
    # Calculate F1 score
    classes = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ')
    le = LabelEncoder()
    le.fit(classes)

    y_true = le.transform(ground_truths)
    y_pred = le.transform(predictions)

    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    return f1_micro, f1_macro, f1_weighted

# Testing
# records = "prediction.txt"
# f1_micro, f1_macro, f1_weighted = F1_score(records)
f1_micro, f1_macro, f1_weighted = F1_score_with_labels(prediction_labels, ground_truth_labels)
print(f"F1 Micro: {f1_micro:.4f}")
print(f"F1 Macro: {f1_macro:.4f}")
print(f"F1 Weighted: {f1_weighted:.4f}")

                

F1 Micro: 0.9253
F1 Macro: 0.9251
F1 Weighted: 0.9256


In [None]:
# Testing
image_paths = "dataset\\overlap\\test\\original"
labels_csv = "dataset\\overlap\\test\\labels\\labels.csv"

# Parameters for sliding window
confidence_threshold = 0.8 # Minimum confidence required to accept a prediction
input_shape = (90, 30)      # Model's expected input shape

# Define your index-to-character mapping
index_to_char = {i: chr(i + ord('0')) for i in range(10)}  # 0-9
index_to_char.update({i + 10: chr(i + ord('a')) for i in range(26)})  # 10-35
index_to_char.update({i + 36: chr(i + ord('A')) for i in range(26)})  # 36-61

sliding_predict_and_validate(image_paths, labels_csv, "captcha_model.h5", input_shape, yolo_bounding_creator, confidence_threshold, index_to_char, print_img=False, predict_size=10)