In [39]:
from PIL import Image
from ultralytics import YOLO
import numpy as np
from shapely import Polygon
import easyocr
from scipy.ndimage import rotate as scipy_rotate
import rasterio.features
import pandas as pd

def get_rotate_to_flat_angle(mask_polygon: Polygon) -> float:
    centroid = mask_polygon.centroid.coords[0]

    Ix = 0
    Iy = 0
    Ixy = 0

    for i in range(len(mask_polygon.exterior.coords) - 1):
        xi, yi = mask_polygon.exterior.coords[i]
        xi1, yi1 = mask_polygon.exterior.coords[i + 1]

        Ai = xi * yi1 - xi1 * yi
        xi_avg = (xi + xi1) / 2
        yi_avg = (yi + yi1) / 2

        Ix += Ai * (yi_avg - centroid[1]) ** 2
        Iy += Ai * (xi_avg - centroid[0]) ** 2
        Ixy += Ai * (xi_avg - centroid[0]) * (yi_avg - centroid[1])
        
    theta_rad = 0.5 * np.arctan2(2 * Ixy, Ix - Iy)
    theta_deg = np.degrees(theta_rad)
    rotate_to_flat = -theta_deg
    return rotate_to_flat

def get_polygon_bounds_for_slicing(mask_polygon):
    bounds_int = [int(_) for _ in mask_polygon.bounds] # .bounds returns floats, convert to int for slicing
    min_x, min_y, max_x, max_y = bounds_int
    return  min_x, min_y, max_x, max_y

def extract_mask_array_from_image(input_image: Image, mask_polygon: Polygon) -> np.array:
    mask_bitmap = rasterio.features.rasterize([mask_polygon], out_shape=(input_image.height, input_image.width))
    input_image_bw = input_image.convert("L")
    input_image_masked = input_image_bw * mask_bitmap
    min_x, min_y, max_x, max_y = get_polygon_bounds_for_slicing(mask_polygon)
    input_image_masked_bounded_array = input_image_masked[min_y:max_y,min_x:max_x]

    return input_image_masked_bounded_array

def get_ocr_results_all_rotations(book_horizontal_array, reader) -> dict:
    angles = [0, 90, 180, 270]
    ocr_results = {}
    for angle in angles:
        rotated_array = scipy_rotate(book_horizontal_array, angle, reshape=True)
        bookspine_ocr = reader.readtext(rotated_array, detail=0)
        ocr_results[angle] = ' '.join([_.upper() for _ in bookspine_ocr])

    return ocr_results

In [108]:
mask_num = 24

input_image = Image.open(r"..\test-data\book_shelf.jpg")
model = YOLO(r"yolov8_n_24aug2023.pt")
reader = easyocr.Reader(['en'])

results = model.predict(source=input_image, save=True, show_labels=False, show_conf=False, boxes=False)
bookspine_mask = results[0].masks.xy[mask_num]
bookspine_mask_polygon = Polygon(bookspine_mask)
bookspine_isolated_np = extract_mask_array_from_image(input_image, bookspine_mask_polygon)
rotate_to_flat_angle = get_rotate_to_flat_angle(bookspine_mask_polygon)
bookspine_isolated_rotated_to_flat_np = scipy_rotate(bookspine_isolated_np, rotate_to_flat_angle, reshape=True)
ocr_results = get_ocr_results_all_rotations(bookspine_isolated_rotated_to_flat_np, reader)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.

0: 480x640 32 book-spines, 122.7ms
Speed: 6.0ms preprocess, 122.7ms inference, 30.9ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1me:\Code\shelfscan\runs\segment\predict25[0m


In [109]:
ocr_results

{0: 'BRAM STOKER DBACULA ',
 90: '3 8 3',
 180: '{ MDIDD #1XOIS WVAA',
 270: '2 3 2'}

***
research begins here

In [110]:
#### run the OCR outputs against a coherence detection model
    # create markov chains of the letter transitions (start with just trigrams)
    # create frequency distributions of letter transitions in English
    # strip out any numeric characters not attached to alpha characters, but fail the string if it only contains numeric characters
    # score the OCR strings based on how many 'common' or 'real' letter transitions they have
    # the highest scoring string is the most likely to have originated from the properly oriented image
##### use the string with the highest score

In [111]:
# get trigrams from ocr results
ngrams_dict = {}
ngram_len = 3
alpha_chars = 'abcdefghijklmnopqrstuvwxyz'
for angle, ocr_text_result in ocr_results.items():
    ngrams_ = []
    for word in ocr_text_result.split(' '):
        for start in range(0, len(word)-ngram_len-1):
            ngram = word[start:start+ngram_len]
            ngrams_.append(ngram)

    ngrams_dict[angle] = ngrams_

In [112]:
ngrams_dict

{0: ['STO', 'TOK', 'DBA', 'BAC', 'ACU'],
 90: [],
 180: ['MDI', '#1X', '1XO'],
 270: []}

In [113]:
ngram_data_df = pd.read_csv('../3grams.csv')
ngram_data_df.head()

Unnamed: 0,3-gram,*/*
0,THE,69221160871
1,AND,26468697834
2,ING,21289988294
3,ION,20449402244
4,TIO,16819216214


In [114]:
# ngram_data_df[ ngram_data_df['3-gram'] == ngrams[0][0] ]['*/*'].values[0]

In [115]:
ngram_data_df = pd.read_csv('../3grams.csv')
ocr_results_coherence_scores = {}
for angle, ngrams in ngrams_dict.items():
    sum = 0
    for ngram in ngrams:
        df_lookup = ngram_data_df[ ngram_data_df['3-gram'] == ngram ]['*/*'].values
        if len(df_lookup) > 0:
            sum += df_lookup[0]
    ocr_results_coherence_scores[angle] = (sum / len(ngrams)) if ngrams else 0 # protect against divide by 0 error on empty lists

In [116]:
print(ocr_results_coherence_scores[0])
print(ocr_results_coherence_scores[180])

598444680.8
136832.0


In [117]:
most_coherent_angle =  max(ocr_results_coherence_scores, key=ocr_results_coherence_scores.get)
print(f'best scoring orientation/most likely real entry: {most_coherent_angle}')
print(f'bookspine text: {ocr_results[most_coherent_angle]}')

best scoring orientation/most likely real entry: 0
bookspine text: BRAM STOKER DBACULA 


***
whole process in 1 cell

In [2]:
from PIL import Image
from ultralytics import YOLO
import numpy as np
from shapely import Polygon
import easyocr
from scipy.ndimage import rotate as scipy_rotate
import rasterio.features
import pandas as pd

def get_rotate_to_flat_angle(mask_polygon: Polygon) -> float:
    centroid = mask_polygon.centroid.coords[0]

    Ix = 0
    Iy = 0
    Ixy = 0

    for i in range(len(mask_polygon.exterior.coords) - 1):
        xi, yi = mask_polygon.exterior.coords[i]
        xi1, yi1 = mask_polygon.exterior.coords[i + 1]

        Ai = xi * yi1 - xi1 * yi
        xi_avg = (xi + xi1) / 2
        yi_avg = (yi + yi1) / 2

        Ix += Ai * (yi_avg - centroid[1]) ** 2
        Iy += Ai * (xi_avg - centroid[0]) ** 2
        Ixy += Ai * (xi_avg - centroid[0]) * (yi_avg - centroid[1])
        
    theta_rad = 0.5 * np.arctan2(2 * Ixy, Ix - Iy)
    theta_deg = np.degrees(theta_rad)
    rotate_to_flat = -theta_deg
    return rotate_to_flat

def get_polygon_bounds_for_slicing(mask_polygon):
    bounds_int = [int(_) for _ in mask_polygon.bounds] # .bounds returns floats, convert to int for slicing
    min_x, min_y, max_x, max_y = bounds_int
    return  min_x, min_y, max_x, max_y

def extract_mask_array_from_image(input_image: Image, mask_polygon: Polygon) -> np.array:
    mask_bitmap = rasterio.features.rasterize([mask_polygon], out_shape=(input_image.height, input_image.width))
    input_image_bw = input_image.convert("L")
    input_image_masked = input_image_bw * mask_bitmap
    min_x, min_y, max_x, max_y = get_polygon_bounds_for_slicing(mask_polygon)
    input_image_masked_bounded_array = input_image_masked[min_y:max_y,min_x:max_x]

    return input_image_masked_bounded_array

def get_ocr_results_all_rotations(book_horizontal_array, reader) -> dict:
    angles = [0, 90, 180, 270]
    ocr_results = {}
    for angle in angles:
        rotated_array = scipy_rotate(book_horizontal_array, angle, reshape=True)
        bookspine_ocr = reader.readtext(rotated_array, detail=0)
        ocr_results[angle] = ' '.join([_.upper() for _ in bookspine_ocr])

    return ocr_results

def generate_ngrams_dict(ocr_results) -> dict:
    ngrams_dict = {}
    ngram_len = 3
    for angle, ocr_text_result in ocr_results.items():
        ngrams_ = []
        for word in ocr_text_result.split(' '):
            for start in range(0, len(word)-ngram_len-1):
                ngram = word[start:start+ngram_len]
                ngrams_.append(ngram)

        ngrams_dict[angle] = ngrams_

    return ngrams_dict

def calculate_ocr_coherence_scores(ngrams_dict, ngram_data_df) -> dict:
    ocr_results_coherence_scores = {}
    for angle, ngrams in ngrams_dict.items():
        sum = 0
        for ngram in ngrams:
            df_lookup = ngram_data_df[ ngram_data_df['3-gram'] == ngram ]['*/*'].values
            if len(df_lookup) > 0:
                sum += df_lookup[0]
        ocr_results_coherence_scores[angle] = (sum / len(ngrams)) if ngrams else 0 # protect against divide by 0 error on empty lists

    return ocr_results_coherence_scores

def get_most_coherent_text(ocr_results_coherence_scores):
    most_coherent_angle =  max(ocr_results_coherence_scores, key=ocr_results_coherence_scores.get)
    return ocr_results[most_coherent_angle]

input_image = Image.open(r"..\test-data\book_shelf3.jpg")
model = YOLO(r"yolov8_n_24aug2023.pt")
reader = easyocr.Reader(['en'])
ngram_data_df = pd.read_csv('../3grams.csv')

results = model.predict(source=input_image, save=True, show_labels=False, show_conf=False, boxes=False)
bookspine_masks = [_ for _ in results[0].masks.xy if len(_) > 0] #filter empty masks
for bookspine_mask in bookspine_masks:
    bookspine_mask_polygon = Polygon(bookspine_mask)
    bookspine_isolated_np = extract_mask_array_from_image(input_image, bookspine_mask_polygon)
    rotate_to_flat_angle = get_rotate_to_flat_angle(bookspine_mask_polygon)
    bookspine_isolated_rotated_to_flat_np = scipy_rotate(bookspine_isolated_np, rotate_to_flat_angle, reshape=True)
    ocr_results = get_ocr_results_all_rotations(bookspine_isolated_rotated_to_flat_np, reader)
    ngrams_dict = generate_ngrams_dict(ocr_results)
    ocr_results_coherence_scores = calculate_ocr_coherence_scores(ngrams_dict, ngram_data_df)
    most_coherent_text = get_most_coherent_text(ocr_results_coherence_scores)
    print(most_coherent_text)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.

0: 480x640 17 book-spines, 140.6ms
Speed: 5.0ms preprocess, 140.6ms inference, 17.0ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1me:\Code\shelfscan\runs\segment\predict28[0m


SONIA SOTOMAVOR BELOVED WORLD MY
THE TRAIN TO CRYSTAL CITY JAN JARBOE RUSSELL
UI S13IY 1431[ AN UCLINEM XWOD
NEZHUKUMATATHIL MURLLR
QU INILGAU T. (
AM PUSHEEN THE CAT CIALEEELON
NNOIVD DLVM NOSTIM
WUTHERING HEIGHTS BRONTE EMILY 

60 BACK TO WHE SSHA POLAKOW- YOU CAME FRO SURANSKY
GLORIA STEINEM MY LIFE ON THE ROAD
HBR PRREALB  EA
OBSTER IS THE BEST MEDICINE CLIMO
OLIVER BEFORE FALL
KAUR MILK AND HONEY
1 HLIANAT K _ ] "04N
