In [100]:
import os
import cv2
import torch
import easyocr
from ultralytics import YOLO
from pathlib import Path
from difflib import SequenceMatcher
import numpy as np

In [101]:
# Пути
test_images_folder = Path(r"C:\Users\abram\DataspellProjects\FinetuningEasyOCR\dataset\test")
test_labels_folder = Path(r"C:\Users\abram\DataspellProjects\FinetuningEasyOCR\dataset\ch4_test_localization_transcription_gt")
weights_path = Path(r"C:\Users\abram\DataspellProjects\FinetuningEasyOCR\runs\yolo8_experiment4\weights\best.pt")

# Загрузка модели YOLOv8
model = YOLO(weights_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Инициализация EasyOCR
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())

In [102]:
# Функция для загрузки ground truth
def load_ground_truths(labels_folder):
    ground_truths = {}
    bboxes = {}
    for label_file in Path(labels_folder).iterdir():
        if label_file.suffix == ".txt":
            image_name = label_file.stem.replace("gt_", "") + ".jpg"
            with label_file.open("r", encoding="utf-8") as f:
                gt_texts = []
                gt_bboxes = []
                for line in f.readlines():
                    parts = line.strip().split(",")
                    if parts[-1] != "###":
                        x_min, y_min, x_max, y_max = map(float, parts[:4])
                        text = parts[-1].lower()
                        gt_texts.append(text)
                        gt_bboxes.append([x_min, y_min, x_max, y_max])
                ground_truths[image_name] = gt_texts
                bboxes[image_name] = gt_bboxes
    return ground_truths, bboxes

# Функция для получения предсказаний YOLO + EasyOCR
def get_yolo_easyocr_predictions(image_folder, model, reader):
    predictions = {}
    bboxes = {}
    for image_file in Path(image_folder).iterdir():
        if image_file.suffix.lower() in [".jpg", ".png", ".jpeg"]:
            # Загрузка изображения
            image = cv2.imread(str(image_file))
            # YOLO: детекция текстовых областей
            results = model.predict(image, conf=0.5)
            predicted_words = []
            predicted_bboxes = []
            for box in results[0].boxes.xyxy:  # Координаты детекции
                x_min, y_min, x_max, y_max = map(int, box.tolist())
                cropped_image = image[y_min:y_max, x_min:x_max]
                ocr_results = reader.readtext(cropped_image)
                for _, text, _ in ocr_results:
                    predicted_words.append(text.lower())
                    predicted_bboxes.append([x_min, y_min, x_max, y_max])
            predictions[image_file.name] = predicted_words
            bboxes[image_file.name] = predicted_bboxes
    return predictions, bboxes

In [103]:
# Метрики CER и WER
def calculate_cer(gt_texts, pred_texts):
    cer_scores = []
    for gt, pred in zip(gt_texts, pred_texts):
        sm = SequenceMatcher(None, gt, pred)
        edit_distance = sum(
            (i2 - i1) if tag != 'insert' else (j2 - j1)  # Размер изменения
            for tag, i1, i2, j1, j2 in sm.get_opcodes()  # Кортеж из get_opcodes()
            if tag != 'equal'  # Игнорируем совпадения
        )
        cer = edit_distance / max(len(gt), 1)  # CER = edit_distance / длина GT
        cer_scores.append(cer)
    return np.mean(cer_scores)

# WER calculation
def calculate_wer(gt_texts, pred_texts):
    wer_scores = []
    for gt, pred in zip(gt_texts, pred_texts):
        gt_words = gt.split()  # Разделяем GT на слова
        pred_words = pred.split()  # Разделяем предсказание на слова
        sm = SequenceMatcher(None, gt_words, pred_words)
        edit_distance = sum(
            (i2 - i1) if tag != 'insert' else (j2 - j1)  # Размер изменения
            for tag, i1, i2, j1, j2 in sm.get_opcodes()  # Кортеж из get_opcodes()
            if tag != 'equal'  # Игнорируем совпадения
        )
        wer = edit_distance / max(len(gt_words), 1)  # WER = edit_distance / количество слов в GT
        wer_scores.append(wer)
    return np.mean(wer_scores)

# Метрика IoU
def calculate_iou(gt_bboxes, pred_bboxes):
    def iou(box1, box2):
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        inter_area = max(0, x2 - x1) * max(0, y2 - y1)
        box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
        box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = box1_area + box2_area - inter_area
        return inter_area / union_area if union_area > 0 else 0

    iou_scores = []
    for gt_boxes, pred_boxes in zip(gt_bboxes.values(), pred_bboxes.values()):
        for gt, pred in zip(gt_boxes, pred_boxes):
            iou_scores.append(iou(gt, pred))
    return np.mean(iou_scores)

In [104]:
# Основная логика
if __name__ == "__main__":
    # Загрузка ground truth для оценки
    ground_truths, gt_bboxes = load_ground_truths(test_labels_folder)

    # Получение предсказаний YOLO + EasyOCR
    yolo_predictions, pred_bboxes = get_yolo_easyocr_predictions(test_images_folder, model, reader)

    # Вычисление метрик
    cer = calculate_cer(
        [" ".join(gt) for gt in ground_truths.values()],
        [" ".join(pred) for pred in yolo_predictions.values()]
    )
    wer = calculate_wer(
        [" ".join(gt) for gt in ground_truths.values()],
        [" ".join(pred) for pred in yolo_predictions.values()]
    )
    iou = calculate_iou(gt_bboxes, pred_bboxes)

    print(f"CER: {cer:.4f}")
    print(f"WER: {wer:.4f}")
    print(f"IoU: {iou:.4f}")


0: 384x640 (no detections), 21.9ms
Speed: 20.0ms preprocess, 21.9ms inference, 13.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 texts, 12.5ms
Speed: 1.5ms preprocess, 12.5ms inference, 19.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 texts, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 texts, 7.5ms
Speed: 1.0ms preprocess, 7.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 texts, 7.5ms
Speed: 1.0ms preprocess, 7.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 texts, 7.5ms
Speed: 1.0ms preprocess, 7.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 texts, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 texts, 8.5ms
Speed: 1.0ms preprocess, 8.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



In [105]:
# Функция для загрузки истинных текстов (ground truth)
def load_ground_truths(labels_folder):
    ground_truths = {}
    gt_bboxes = {}
    for label_file in Path(labels_folder).iterdir():
        if label_file.suffix == ".txt":
            image_name = label_file.stem.replace("gt_", "") + ".jpg"
            with label_file.open("r", encoding="utf-8") as f:
                gt_texts = []
                gt_boxes = []
                for line in f.readlines():
                    parts = line.strip().split(",")
                    if parts[-1] != "###":
                        x_min, y_min, x_max, y_max = map(float, parts[:4])
                        text = parts[-1].lower()
                        gt_texts.append(text)
                        gt_boxes.append([x_min, y_min, x_max, y_max])
                ground_truths[image_name] = gt_texts
                gt_bboxes[image_name] = gt_boxes
    return ground_truths, gt_bboxes

# Функция для получения предсказаний EasyOCR
def get_easyocr_predictions(image_folder, reader):
    predictions = {}
    bboxes = {}
    for image_file in Path(image_folder).iterdir():
        if image_file.suffix.lower() in [".jpg", ".png", ".jpeg"]:
            results = reader.readtext(str(image_file))
            predicted_words = [result[1].lower().strip() for result in results]
            predicted_bboxes = [result[0] for result in results]  # Координаты bounding boxes
            predictions[image_file.name] = predicted_words
            bboxes[image_file.name] = predicted_bboxes
    return predictions, bboxes

In [106]:
# Метрики CER и WER
def calculate_cer_easy(gt_texts, pred_texts):
    cer_scores = []
    for gt, pred in zip(gt_texts, pred_texts):
        sm = SequenceMatcher(None, gt, pred)
        edit_distance = sum(
            (i2 - i1) if tag != 'insert' else (j2 - j1)  # Размер изменения
            for tag, i1, i2, j1, j2 in sm.get_opcodes()  # Кортеж из get_opcodes()
            if tag != 'equal'  # Игнорируем совпадения
        )
        cer = edit_distance / max(len(gt), 1)  # CER = edit_distance / длина GT
        cer_scores.append(cer)
    return np.mean(cer_scores)

def calculate_wer_easy(gt_texts, pred_texts):
    wer_scores = []
    for gt, pred in zip(gt_texts, pred_texts):
        gt_words = gt.split()  # Разделяем GT на слова
        pred_words = pred.split()  # Разделяем предсказание на слова
        sm = SequenceMatcher(None, gt_words, pred_words)
        edit_distance = sum(
            (i2 - i1) if tag != 'insert' else (j2 - j1)  # Размер изменения
            for tag, i1, i2, j1, j2 in sm.get_opcodes()  # Кортеж из get_opcodes()
            if tag != 'equal'  # Игнорируем совпадения
        )
        wer = edit_distance / max(len(gt_words), 1)  # WER = edit_distance / количество слов в GT
        wer_scores.append(wer)
    return np.mean(wer_scores)

# Метрика IoU
def calculate_iou_easy(gt_bboxes, pred_bboxes):
    def iou(box1, box2):
        # Преобразуем bounding boxes в формат [x_min, y_min, x_max, y_max]
        def convert_box(box):
            if isinstance(box, list) and all(isinstance(point, list) for point in box):
                # Если bounding box представлен как полигон (список точек)
                x_coords = [point[0] for point in box]
                y_coords = [point[1] for point in box]
                return [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
            else:
                # Если bounding box уже в формате [x_min, y_min, x_max, y_max]
                return box

        box1 = convert_box(box1)
        box2 = convert_box(box2)

        # Вычисляем площадь пересечения
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        inter_area = max(0, x2 - x1) * max(0, y2 - y1)

        # Вычисляем площадь объединения
        box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
        box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = box1_area + box2_area - inter_area

        return inter_area / union_area if union_area > 0 else 0

    iou_scores = []
    for image_name, gt_boxes in gt_bboxes.items():
        pred_boxes = pred_bboxes.get(image_name, [])
        for gt_box in gt_boxes:
            best_iou = 0
            for pred_box in pred_boxes:
                best_iou = max(best_iou, iou(gt_box, pred_box))
            iou_scores.append(best_iou)
    return np.mean(iou_scores)

In [107]:
# Основная логика
if __name__ == "__main__":
    # Загрузка ground truth для оценки
    ground_truths, gt_bboxes = load_ground_truths(test_labels_folder)

    # Получение предсказаний EasyOCR
    easyocr_predictions, pred_bboxes = get_easyocr_predictions(test_images_folder, reader)

    # Вычисление метрик
    cer = calculate_cer_easy(
        [" ".join(gt) for gt in ground_truths.values()],
        [" ".join(pred) for pred in easyocr_predictions.values()]
    )
    wer = calculate_wer_easy(
        [" ".join(gt) for gt in ground_truths.values()],
        [" ".join(pred) for pred in easyocr_predictions.values()]
    )
    iou = calculate_iou_easy(gt_bboxes, pred_bboxes)

    print("Оценка работы EasyOCR:")
    print(f"CER: {cer:.4f}")
    print(f"WER: {wer:.4f}")
    print(f"IoU: {iou:.4f}")

Оценка работы EasyOCR:
CER: 0.8004
WER: 0.9725
IoU: 0.0410


yolov8n.pt (nano) -> Оценка работы YOLO + EasyOCR: Precision: 0.12, Recall: 0.01, F1: 0.02

yolov8s.pt (small) -> Оценка работы YOLO + EasyOCR: Precision: 0.09, Recall: 0.01, F1: 0.01

yolov8m.pt (medium) -> Оценка работы YOLO + EasyOCR: Precision: 0.11, Recall: 0.01, F1: 0.02

yolov8l.pt (large) -> Оценка работы YOLO + EasyOCR: Precision: 0.09, Recall: 0.01, F1: 0.01

yolov8x.pt (extra large) -> Оценка работы YOLO + EasyOCR: Precision: 0.12, Recall: 0.01, F1: 0.02

yolo11n.pt (nano) -> Оценка работы YOLO + EasyOCR: Precision: 0.14, Recall: 0.01, F1: 0.02

best.pt -> Оценка работы YOLO + EasyOCR: Precision: 0.28, Recall: 0.25, F1: 0.27