In [1]:
# run the OCR outputs against a coherence detection model
    # create Markov chains of the letter transitions
    # create frequency distributions of letter transitions in English
    # strip out any numeric characters not attached to alpha characters, but fail the string if it only contains numeric characters
    # score the OCR strings based on how many 'common' or 'real' letter transitions they have
    # the highest scoring string is the most likely to have originated from the properly oriented image
# use the string with the highest score

In [2]:
from PIL import Image
from ultralytics import YOLO
import numpy as np
from shapely import Polygon
import easyocr
from scipy.ndimage import rotate as scipy_rotate
import rasterio.features

def get_rotate_to_flat_angle(mask_polygon: Polygon) -> float:
    """Return the rotation (in degrees) derived from the polygon's second moments.

    Every exterior edge contributes a weight (the cross product of its two
    endpoints) multiplied by the squared or mixed offsets of the edge midpoint
    from the polygon centroid. The three accumulated sums are combined as
    ``0.5 * arctan2(2 * Ixy, Ix - Iy)`` and the negated angle is returned.

    Args:
        mask_polygon: Polygon whose ``centroid`` and ``exterior.coords`` are read.

    Returns:
        The negated axis angle, converted from radians to degrees.
    """
    center = mask_polygon.centroid.coords[0]
    cx, cy = center[0], center[1]

    # Read the ring once; consecutive entries form the edges.
    ring = list(mask_polygon.exterior.coords)

    moment_x = 0
    moment_y = 0
    moment_xy = 0

    for (x0, y0), (x1, y1) in zip(ring, ring[1:]):
        # NOTE(review): this weight flips sign when the vertex order is
        # reversed, which shifts the resulting angle by 90 degrees.
        weight = x0 * y1 - x1 * y0
        dx = (x0 + x1) / 2 - cx
        dy = (y0 + y1) / 2 - cy

        moment_x += weight * dy ** 2
        moment_y += weight * dx ** 2
        moment_xy += weight * dx * dy

    axis_angle_rad = 0.5 * np.arctan2(2 * moment_xy, moment_x - moment_y)
    return -np.degrees(axis_angle_rad)

def get_polygon_bounds_for_slicing(mask_polygon):
    """Return the polygon's bounding box as integers usable as slice indices.

    ``mask_polygon.bounds`` is read as ``(min_x, min_y, max_x, max_y)`` and each
    value is truncated toward zero with ``int``.

    Returns:
        A ``(min_x, min_y, max_x, max_y)`` tuple of ints.
    """
    left, top, right, bottom = (int(edge) for edge in mask_polygon.bounds)
    return left, top, right, bottom

def extract_mask_array_from_image(input_image: Image.Image, mask_polygon: Polygon) -> np.ndarray:
    """Cut the area covered by ``mask_polygon`` out of ``input_image`` as a greyscale array.

    The polygon is rasterized to a bitmap with the image's height and width,
    the image is converted to single-channel ("L") mode, the two are multiplied
    element-wise, and the product is cropped to the polygon's integer bounding
    box.

    Args:
        input_image: Source image; its ``height`` and ``width`` size the bitmap.
        mask_polygon: Polygon in the image's pixel coordinates.

    Returns:
        2-D array holding the masked greyscale pixels inside the bounding box.
    """
    mask_bitmap = rasterio.features.rasterize(
        [mask_polygon], out_shape=(input_image.height, input_image.width)
    )
    # Convert explicitly instead of relying on the implicit PIL-image-to-array
    # coercion that happens when an Image is multiplied by an ndarray.
    grey_pixels = np.asarray(input_image.convert("L"))
    masked_pixels = grey_pixels * mask_bitmap

    # Negative indices would wrap around to the far edge of the array rather
    # than clip at the border, so clamp the crop window at zero.
    min_x, min_y, max_x, max_y = (
        max(bound, 0) for bound in get_polygon_bounds_for_slicing(mask_polygon)
    )
    return masked_pixels[min_y:max_y, min_x:max_x]

def get_ocr_results_all_rotations(book_horizontal_array, reader) -> dict:
    """Run OCR on the array at 0, 90, 180 and 270 degrees of rotation.

    Args:
        book_horizontal_array: Image array passed to ``scipy_rotate``.
        reader: Object exposing ``readtext(array, detail=0)`` that returns an
            iterable of strings.

    Returns:
        Dict mapping each rotation angle to the recognised strings joined by
        single spaces.
    """
    def read_text_at(rotation_deg):
        # Rotate with reshape so the whole rotated image stays in frame.
        turned = scipy_rotate(book_horizontal_array, rotation_deg, reshape=True)
        return ' '.join(reader.readtext(turned, detail=0))

    return {rotation_deg: read_text_at(rotation_deg) for rotation_deg in (0, 90, 180, 270)}

In [5]:
mask_num = 20  # index of the detected book-spine mask to run through OCR

# Forward slashes resolve on Windows as well as POSIX systems, unlike "..\\".
input_image = Image.open("../test-data/book_shelf.jpg")
model = YOLO(r"yolov8_n_24aug2023.pt")
reader = easyocr.Reader(['en'])

# Segment the shelf photo; each mask outlines one detected book spine.
results = model.predict(source=input_image, save=True, show_labels=False, show_conf=False, boxes=False)
detected_masks = results[0].masks
if detected_masks is None:
    # Fail with a clear message instead of an AttributeError on `.xy`.
    raise RuntimeError("No book-spine masks were detected in the input image.")
bookspine_mask = detected_masks.xy[mask_num]
bookspine_mask_polygon = Polygon(bookspine_mask)

# Isolate the chosen spine, rotate it by the computed angle, then OCR it at
# all four right-angle rotations.
bookspine_isolated_np = extract_mask_array_from_image(input_image, bookspine_mask_polygon)
rotate_to_flat_angle = get_rotate_to_flat_angle(bookspine_mask_polygon)
bookspine_isolated_rotated_to_flat_np = scipy_rotate(bookspine_isolated_np, rotate_to_flat_angle, reshape=True)
ocr_results = get_ocr_results_all_rotations(bookspine_isolated_rotated_to_flat_np, reader)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.

0: 480x640 32 book-spines, 115.7ms
Speed: 4.0ms preprocess, 115.7ms inference, 30.9ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1me:\Code\shelfscan\runs\segment\predict17[0m


In [6]:
# Show the OCR text recorded for each rotation angle (0, 90, 180, 270).
ocr_results

{0: 'doom Nvid]Mhon Uwiavunw mXnevh',
 90: '1 1 1 8',
 180: 'Haruki MURAKAMI Norwegian Wood',
 270: '8 1 1 1'}