In [None]:
# System packages: the Tesseract OCR engine and its Russian language data
# (the OCR settings below call tesseract with `-l rus`).
!sudo apt install tesseract-ocr
!sudo apt-get install tesseract-ocr-rus

# Python packages installed for this notebook.
!pip install timm
!pip install transformers
!pip install pytesseract
!pip install pillow

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 15 not upgraded.
Need to get 4,850 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr amd64 4.1.1-2build2 [262 kB]
Fetched 4,850 kB in 0s (12.7 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fron

In [None]:
# импорты сторонних библиотек
import numpy as np
import pandas as pd
import os
import cv2
import torch
import pytesseract
import subprocess


# импорты модулей текущего проекта
from google.colab.patches import cv2_imshow
from google.colab import drive
from transformers import (
    AutoImageProcessor,
    TableTransformerForObjectDetection,
    TableTransformerModel,
    DetrImageProcessor,
    DetrForObjectDetection,
)
from PIL import Image
from tqdm import trange
from os import listdir

# settings
# NOTE(review): this is a Windows path, while the notebook mounts Google Drive
# through google.colab; get_result_from_tersseract calls the `tesseract`
# command directly and does not read this value — confirm it is still needed.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Arguments appended verbatim after the image path on the tesseract command line.
ocr_settings = ' - -l rus --oem 3 --psm 7 --dpi 72 -c tessedit_char_whitelist="йцукенгшщзхъфывапролджэячсмитьбю/ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ0123456789().calmg* "'
drive.mount("/content/drive")


# constants
FILE_PATH = r"/content/drive/MyDrive/213950.jpg"
CSV_PATH = "/content/drive/MyDrive/213950.csv"
DIRECTORY = "/content/drive/MyDrive/workshop/"
SLICES_FOLDER = "/content/drive/MyDrive/ocr_slices/"
PRETRAINED_MODEL = "TahaDouaji/detr-doc-table-detection"

Mounted at /content/drive


# Модель

In [None]:
def order_points(pts):
    """Return the four corner points in a fixed order as a float32 (4, 2) array.

    Row 0 holds the point with the smallest x + y, row 2 the one with the
    largest x + y, row 1 the one with the smallest y - x and row 3 the one
    with the largest y - x.

    Args:
        pts: Array holding exactly four 2-D points (reshaped to (4, 2)).

    Returns:
        A new float32 array of shape (4, 2).
    """
    corners = pts.reshape(4, 2)
    coordinate_sums = corners.sum(axis=1)
    coordinate_diffs = np.diff(corners, axis=1)

    ordered = np.zeros((4, 2), dtype="float32")
    ordered[0] = corners[np.argmin(coordinate_sums)]
    ordered[1] = corners[np.argmin(coordinate_diffs)]
    ordered[2] = corners[np.argmax(coordinate_sums)]
    ordered[3] = corners[np.argmax(coordinate_diffs)]
    return ordered


def calculateDistanceBetween2Points(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        p1: First point; index 0 is x, index 1 is y.
        p2: Second point, same layout.
    """
    delta_x = p2[0] - p1[0]
    delta_y = p2[1] - p1[1]
    return (delta_x ** 2 + delta_y ** 2) ** 0.5


def get_mean_height_of_bounding_boxes(bounding_boxes):
    """Return the mean height of ``(x, y, w, h)`` bounding boxes.

    Args:
        bounding_boxes: Iterable of 4-item boxes; the last item is the height.

    Returns:
        ``np.mean`` of all heights.
    """
    return np.mean([height for _, _, _, height in bounding_boxes])


def get_result_from_tersseract(image_path):
    """Run the ``tesseract`` command on one image file and return its output.

    The command line is ``tesseract <image_path>`` followed by the module-level
    ``ocr_settings`` string and is executed through the shell.
    ``subprocess.getoutput`` returns stdout and stderr combined, so any
    diagnostics printed by the command are part of the returned text.

    Args:
        image_path: Path of the image to recognise.

    Returns:
        The command output with leading and trailing whitespace removed.
    """
    import shlex

    # Quote the path for the (POSIX) shell so that spaces or shell
    # metacharacters in it cannot split or alter the command line.
    command = "tesseract " + shlex.quote(str(image_path)) + ocr_settings
    return subprocess.getoutput(command).strip()

In [None]:
def preprocessor(file_path, top_margin=75, threshold=0.9):
    """Detect the table on a page image and return a binarised crop of it.

    The page is run through the DETR detector named by ``PRETRAINED_MODEL``.
    Every detection is printed. The highest-confidence box decides where the
    crop starts vertically: the crop spans the full page width, from
    ``top_margin`` pixels above the detected box down to the bottom of the page.

    Args:
        file_path: Path of the page image.
        top_margin: Pixels kept above the top edge of the detected box.
        threshold: Minimum detection confidence passed to the post-processor.

    Returns:
        A ``(dilated_image, np_image)`` tuple: the inverted, Otsu-thresholded
        and dilated single-channel crop, and the RGB crop as a NumPy array.

    Raises:
        ValueError: If no detection reaches ``threshold``.
    """
    image = Image.open(file_path).convert("RGB")

    # NOTE: processor and weights are loaded again on every call.
    processor = DetrImageProcessor.from_pretrained(PRETRAINED_MODEL)
    model = DetrForObjectDetection.from_pretrained(PRETRAINED_MODEL)

    inputs = processor(images=image, return_tensors="pt")
    # Inference only, so there is no need to record the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    best_box = None
    best_score = None
    for score, label, box in zip(
        results["scores"], results["labels"], results["boxes"]
    ):
        box = [round(i, 2) for i in box.tolist()]
        print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
        )
        # Keep the most confident detection instead of whichever came last.
        if best_score is None or score.item() > best_score:
            best_score = score.item()
            best_box = box

    if best_box is None:
        raise ValueError(
            f"No table detected in {file_path!r} with confidence >= {threshold}"
        )

    page_width, page_height = image.size
    # Clamp the top edge at 0: a negative coordinate would make PIL pad the
    # crop with black pixels.
    crop_top = max(best_box[1] - top_margin, 0)
    image = image.crop((0, crop_top, page_width, page_height))
    np_image = np.asarray(image)

    # Convert to grayscale; the array comes from PIL, so its channels are RGB.
    grayscale_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)
    thresholded_image = cv2.threshold(
        grayscale_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
    # Swap black and white.
    inverted_image = cv2.bitwise_not(thresholded_image)
    # Make the text and the table borders thicker.
    dilated_image = cv2.dilate(inverted_image, None, iterations=5)
    return dilated_image, np_image

In [None]:
def table_extractor(dilated_image, np_image):
    """Find the largest four-cornered contour and warp it to an upright rectangle.

    Args:
        dilated_image: Single-channel binary image in which contours are searched.
        np_image: Image of the same size; warped with the same transform.

    Returns:
        A ``(perspective_corrected_image, perspective_corrected_orig_image)``
        tuple: ``dilated_image`` and ``np_image`` after the perspective warp.
        The output width is 90% of the width of ``np_image``; the height
        follows the aspect ratio of the detected quadrilateral.

    Raises:
        ValueError: If no four-cornered contour with a positive area exists,
            or if its ordered corners are degenerate (zero-length top edge).
    """
    contours, _ = cv2.findContours(
        dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )

    # Keep only contours whose polygonal approximation has exactly 4 vertices.
    rectangular_contours = []
    for contour in contours:
        peri = cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
        if len(approx) == 4:
            rectangular_contours.append(approx)

    # max() returns the first of several equally large contours, which matches
    # a strict "area > best so far" scan.
    contour_with_max_area = max(
        rectangular_contours, key=cv2.contourArea, default=None
    )
    if contour_with_max_area is None or cv2.contourArea(contour_with_max_area) <= 0:
        raise ValueError(
            "No four-cornered contour with a positive area was found; "
            "cannot locate the table outline."
        )

    corners = order_points(contour_with_max_area)
    top_edge = calculateDistanceBetween2Points(corners[0], corners[1])
    left_edge = calculateDistanceBetween2Points(corners[0], corners[3])
    if top_edge == 0:
        # order_points can pick the same point for two corners on tilted
        # shapes; dividing by a zero-length edge would give an infinite ratio.
        raise ValueError("Degenerate table outline: top edge has zero length.")
    aspect_ratio = left_edge / top_edge

    new_image_width = int(np_image.shape[1] * 0.9)
    new_image_height = int(new_image_width * aspect_ratio)

    pts1 = np.float32(corners)
    pts2 = np.float32(
        [
            [0, 0],
            [new_image_width, 0],
            [new_image_width, new_image_height],
            [0, new_image_height],
        ]
    )
    matrix = cv2.getPerspectiveTransform(pts1, pts2)
    output_size = (new_image_width, new_image_height)
    perspective_corrected_image = cv2.warpPerspective(
        dilated_image, matrix, output_size
    )
    perspective_corrected_orig_image = cv2.warpPerspective(
        np_image, matrix, output_size
    )
    return perspective_corrected_image, perspective_corrected_orig_image

In [None]:
def text_recognizer(perspective_corrected_image, perspective_corrected_orig_image):
    """Split the warped table into text blobs, OCR each one and return the rows.

    Steps:
        1. Isolate long horizontal and vertical strokes with directional
           erode/dilate passes and subtract them from the binary image.
        2. Remove small noise, then dilate so neighbouring glyphs merge.
        3. Take the bounding box of every contour, group boxes into rows by
           their y coordinate and sort each row by x.
        4. Save every box as ``img_<n>.jpg`` in ``SLICES_FOLDER`` (cropped from
           the original-colour image) and run tesseract on it.

    Args:
        perspective_corrected_image: Binary warped table image.
        perspective_corrected_orig_image: Warped original image used for crops.

    Returns:
        A list of rows, each a list of OCR strings; ``[]`` when no text blob
        is found.
    """
    # 1. Grid removal. Eroding with a 1-pixel-high kernel keeps only long
    # horizontal strokes, and a 1-pixel-wide kernel only long vertical ones.
    hor = np.array([[1, 1, 1, 1, 1, 1]])
    horizontal_strokes = cv2.erode(perspective_corrected_image, hor, iterations=100)
    horizontal_strokes = cv2.dilate(horizontal_strokes, hor, iterations=100)
    ver = np.array([[1], [1], [1], [1], [1], [1], [1]])
    vertical_strokes = cv2.erode(perspective_corrected_image, ver, iterations=100)
    vertical_strokes = cv2.dilate(vertical_strokes, ver, iterations=100)

    grid = cv2.add(horizontal_strokes, vertical_strokes)
    small_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    grid_thickened = cv2.dilate(grid, small_kernel, iterations=7)
    image_without_lines = cv2.subtract(perspective_corrected_image, grid_thickened)

    # 2. Erode then dilate to drop specks, then dilate again to merge glyphs.
    image_without_lines_noise_removed = cv2.erode(
        image_without_lines, small_kernel, iterations=5
    )
    image_without_lines_noise_removed = cv2.dilate(
        image_without_lines_noise_removed, small_kernel, iterations=5
    )
    simple_kernel = np.ones((5, 5), np.uint8)
    dilated_image = cv2.dilate(
        image_without_lines_noise_removed, simple_kernel, iterations=5
    )

    # 3. Bounding boxes, sorted top to bottom.
    contours = cv2.findContours(
        dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )[0]
    bounding_boxes = sorted(
        (cv2.boundingRect(contour) for contour in contours), key=lambda box: box[1]
    )
    if not bounding_boxes:
        # Nothing to recognise; indexing bounding_boxes[0] below would fail.
        return []

    # Boxes whose tops are within half the mean height of the previous box
    # belong to the same row.
    half_of_mean_height = get_mean_height_of_bounding_boxes(bounding_boxes) / 2
    rows = []
    current_row = [bounding_boxes[0]]
    for bounding_box in bounding_boxes[1:]:
        if abs(bounding_box[1] - current_row[-1][1]) <= half_of_mean_height:
            current_row.append(bounding_box)
        else:
            rows.append(current_row)
            current_row = [bounding_box]
    rows.append(current_row)
    for row in rows:
        row.sort(key=lambda box: box[0])

    # 4. OCR every slice. cv2.imwrite only returns False when the target
    # folder is missing, so make sure it exists first.
    os.makedirs(SLICES_FOLDER, exist_ok=True)
    table = []
    image_number = 0
    for row in rows:
        recognized_row = []
        for x, y, w, h in row:
            cropped_image = perspective_corrected_orig_image[y : y + h, x : x + w]
            image_slice_path = SLICES_FOLDER + "img_" + str(image_number) + ".jpg"
            cv2.imwrite(image_slice_path, cropped_image)
            recognized_row.append(get_result_from_tersseract(image_slice_path))
            image_number += 1
        table.append(recognized_row)
    return table

# Обработка выходных данных

In [None]:
def delete_redundant_elements(table, iter=5):
    """Return a copy of ``table`` keeping only items of 6 to 12 characters.

    The result is built from scratch, so the caller's rows are never modified
    and every out-of-range item is dropped in a single pass. (Removing items
    from a list while iterating over it skips the element that follows each
    removal.)

    Args:
        table: List of rows, each a list of strings.
        iter: Kept for backward compatibility. Any positive value filters the
            table completely; zero or a negative value returns an unfiltered
            copy.

    Returns:
        A new list of new row lists.
    """
    if iter <= 0:
        return [list(row) for row in table]
    return [[item for item in row if 6 <= len(item) <= 12] for row in table]


def get_max_row_lenght(table):
    """Return the length of the longest row in ``table`` (0 for an empty table)."""
    return max((len(row) for row in table), default=0)


def delete_redundant_rows(table, iter=3):
    """Return ``table`` without its short rows.

    A row is dropped when it has at most 2 items or at most a third of the
    items of the longest row. The result is built in one pass; removing rows
    from a list while iterating over it skips the row after each removal, so
    runs of short rows would survive.

    Args:
        table: List of rows.
        iter: Kept for backward compatibility. Any positive value filters the
            table completely; zero or a negative value returns an unfiltered
            shallow copy.

    Returns:
        A new list holding the surviving row objects in their original order.
    """
    if iter <= 0:
        return list(table)
    longest = get_max_row_lenght(table)
    return [row for row in table if len(row) > longest / 3 and len(row) > 2]


def split_don_type(table):
    """Split every item of every row on whitespace and flatten the pieces.

    Args:
        table: List of rows, each a list of strings.

    Returns:
        A new table in which each row holds the whitespace-separated tokens
        of its items, in order.
    """
    result = []
    for row in table:
        tokens = []
        for item in row:
            tokens.extend(item.split())
        result.append(tokens)
    return result


def split_long_row(table):
    """Cut every row longer than 10 items into two halves.

    The first half gets ``len(row) // 2`` items and the second the rest; rows
    of 10 items or fewer are passed through unchanged.

    Args:
        table: List of rows.

    Returns:
        A new list of rows.
    """
    result = []
    for row in table:
        if len(row) <= 10:
            result.append(row)
            continue
        middle = len(row) // 2
        result.extend([row[:middle], row[middle:]])
    return result


def change_values(value: str, values: dict) -> str:
    """Return ``values[value]`` when the key exists, otherwise ``value`` itself."""
    return values.get(value, value)


def raw_table_filter(raw_pred):
    """Clean a raw OCR table.

    Applies, in order: ``delete_redundant_elements``, ``delete_redundant_rows``
    and ``split_don_type``.

    Args:
        raw_pred: Raw table as a list of rows of OCR strings.

    Returns:
        The filtered table with whitespace-separated tokens.
    """
    return split_don_type(delete_redundant_rows(delete_redundant_elements(raw_pred)))

In [None]:
# Lookup dictionaries that expand OCR tokens into full labels.

# OCR token -> label stored in the "Класс крови" column by get_predictions_df.
# NOTE(review): "крид" and "кри" are presumably OCR misreadings of "кр/д" — confirm.
don_type = {
    "кр/д": "Цельная кровь",
    "крид": "Цельная кровь",
    "кри": "Цельная кровь",
    "т/ф": "Тромбоциты",
    "п/ф": "Плазма",
    "пл/д": "Плазма",
}

# OCR token -> label stored in the "Тип донации" column by get_predictions_df.
# NOTE(review): "(6в)" is presumably an OCR misreading of "(бв)" — confirm.
pay_type = {"(бв)": "Безвозмездно", "(6в)": "Безвозмездно", "(пл)": "Платно"}


def get_predictions_df(filtered_table, don_type, pay_type):
    """Arrange the filtered OCR tokens into a three-column DataFrame.

    Each token of ``filtered_table`` is tested in turn as a date
    (``%d.%m.%Y`` after stripping dots from both ends), as a key of
    ``don_type`` and as a key of ``pay_type``. A match is written to column
    0, 1 or 2 of the current output row; a token matching none is skipped.

    Args:
        filtered_table: List of rows, each a list of string tokens.
        don_type: Mapping from a token to the value stored in column 1.
        pay_type: Mapping from a token to the value stored in column 2.

    Returns:
        A DataFrame with the columns "Дата донации", "Класс крови" and
        "Тип донации"; cells that were never filled hold 0.

    NOTE(review): ``row_counter`` is never bounds-checked. The indexing in the
    ``don_type`` / ``pay_type`` branches is outside any ``try`` and can raise
    IndexError once ``row_counter`` passes the pre-allocated rows.
    """
    max_len = get_max_row_lenght(filtered_table)
    row_len = 3
    # Pre-allocate len(filtered_table) * int(max_len / 3) output rows, each
    # filled with the placeholder 0.
    new_table = []
    for i in range(len(filtered_table) * int(max_len / 3)):
        new_row = [0 for _ in range(row_len)]
        new_table.append(new_row)
    counter = 0
    row_counter = 0
    # A longest row of 8 tokens is handled like one of 9.
    if max_len == 8:
        max_len += 1

    for i in range(len(filtered_table)):
        # Before each source row, optionally realign row_counter.
        if max_len == 6:
            pass
        elif (
            max_len == 9
            and new_table[row_counter][2] == 0
            and new_table[row_counter].count(0) < 3
        ):
            # Current output row is partly filled: stay on it (no-op).
            row_counter += 0
        elif max_len == 9 and new_table[row_counter].count(0) == 3:
            # Current output row is still empty: move up to the next
            # multiple of 3.
            if row_counter % 3 == 1:
                row_counter += 2
            elif row_counter % 3 == 2:
                row_counter += 1
        for j in range(len(filtered_table[i])):
            # counter is the target column: 0 = date, 1 = don_type, 2 = pay_type.
            counter = 0
            try:
                # Only used to test whether the token parses as a date; the
                # parsed value itself is not stored.
                datetime_object = pd.to_datetime(
                    filtered_table[i][j].strip("."), format="%d.%m.%Y"
                )
                try:
                    # Skip ahead when the previous output row has no value
                    # in its third column.
                    if new_table[row_counter - 1][2] == 0 and row_counter != 0:
                        row_counter += 1
                except:
                    pass
                # Never overwrite a date that is already stored.
                if new_table[row_counter][counter] != 0:
                    row_counter += 1
                new_table[row_counter][counter] = filtered_table[i][j].strip(".")
                continue

            except:
                # Reached for any exception raised above: a failed date parse
                # as well as an out-of-range row_counter.
                counter += 1

            if filtered_table[i][j] in don_type.keys():
                # Never overwrite a value that is already stored.
                if new_table[row_counter][counter] != 0:
                    row_counter += 1
                new_table[row_counter][counter] = change_values(
                    filtered_table[i][j], don_type
                )
                continue
            else:
                counter += 1

            if filtered_table[i][j] in pay_type.keys():
                new_table[row_counter][counter] = change_values(
                    filtered_table[i][j], pay_type
                )
                # A pay_type token completes the output row.
                row_counter += 1
                continue
            else:
                counter += 1

    new_table = pd.DataFrame(
        new_table, columns=["Дата донации", "Класс крови", "Тип донации"]
    )

    return new_table


def reshape(table, preds):
    """Reorder the rows of ``table`` by de-interleaving them.

    With a stride of 2 (when the longest row of ``preds`` has 6 items) or 3
    (otherwise), the rows at offsets 0, 1, ... of each stride are collected
    into separate groups, the groups are concatenated in offset order and
    the index is reset.

    Args:
        table: DataFrame whose rows are reordered.
        preds: Table (list of rows) that selects the stride.

    Returns:
        The reordered DataFrame with a fresh 0..n-1 index.
    """
    stride = 2 if get_max_row_lenght(preds) == 6 else 3
    groups = [table.iloc[offset::stride, :] for offset in range(stride)]
    return pd.concat(groups).reset_index(drop=True)

# Расчет accuracy_score

In [None]:
def accuracy_score(table_pred, table_true):
    """Return the share of cells that are equal in two same-shaped DataFrames.

    Cells are compared by position, not by label.

    Args:
        table_pred: Predicted table.
        table_true: Reference table.

    Returns:
        ``correct / total`` as a float, or ``None`` (after printing a message)
        when the shapes differ.
    """
    if table_pred.shape != table_true.shape:
        print("Shapes of table_pred and table_true does not match!")
        return None

    n_rows = int(table_pred.shape[0])
    n_cols = int(table_pred.shape[1])
    matches = sum(
        1
        for r in range(n_rows)
        for c in range(n_cols)
        if table_pred.iloc[r, c] == table_true.iloc[r, c]
    )
    return matches / (n_rows * n_cols)


def accuracy_check(table_pred, csv_orig_path):
    """Score predictions against a reference CSV on the three donation columns.

    Args:
        table_pred: DataFrame with the predictions.
        csv_orig_path: Path of the CSV file holding the reference table.

    Returns:
        The result of ``accuracy_score`` on the selected columns.
    """
    compared_columns = ["Дата донации", "Класс крови", "Тип донации"]
    reference = pd.read_csv(csv_orig_path)
    predicted = table_pred[compared_columns]
    return accuracy_score(predicted, reference[compared_columns])

# Расчет ACCURACY SCORE

In [None]:
# All .jpg and .csv file names found in DIRECTORY.
images = [img for img in os.listdir(DIRECTORY) if img.endswith(".jpg")]
csvs = [csv for csv in os.listdir(DIRECTORY) if csv.endswith(".csv")]

# Hand-picked subsets of image file names; some names contain a space
# before the extension.
images_test = ["213950.jpg", "225629 .jpg", "233749 .jpg", "238716.jpg"]

images_test_2 = [
    "213950.jpg",
    "225629 .jpg",
    "227414.jpg",
    "231820 .jpg",
    "233749 .jpg",
    "238716.jpg",
    # disabled entry:
    #'243478 .jpg',
    "254586 .jpg",
]

In [None]:
# Run the whole pipeline on every test image and print its accuracy.

# to_csv does not create missing directories, so make sure the target exists.
csv_output_dir = "./csv/"
os.makedirs(csv_output_dir, exist_ok=True)

for img in images_test_2:
    # print the image name
    print(f"IMAGE: {img}")

    # Base name without the extension; some file names have a space before
    # ".jpg", which the matching .csv name does not have.
    image_stem = os.path.splitext(img)[0].strip()

    # preprocess the image
    preprocessed_image, np_image = preprocessor(DIRECTORY + img)

    # extract the table
    table_image, table_image_orig = table_extractor(preprocessed_image, np_image)

    # pull the text out of the table
    table_text = text_recognizer(table_image, table_image_orig)

    # filter the text
    filtered_text = raw_table_filter(table_text)

    # turn the predictions into a DataFrame
    df_pred = get_predictions_df(filtered_text, don_type, pay_type)

    # reorder the records
    df_pred = reshape(df_pred, filtered_text)

    # save the predictions as csv
    df_pred.to_csv(csv_output_dir + image_stem + ".csv")

    # compute the accuracy
    acc = accuracy_check(df_pred, DIRECTORY + image_stem + ".csv")
    print(f"ACCURACY = {acc}")
    print("-" * 84)

IMAGE: 213950.jpg
Detected table with confidence 0.985 at location [248.68, 2877.95, 3178.29, 4203.68]
ACCURACY = 1.0
------------------------------------------------------------------------------------
IMAGE: 225629 .jpg
Detected table with confidence 0.992 at location [238.97, 2099.19, 2749.85, 2843.4]
ACCURACY = 0.7407407407407407
------------------------------------------------------------------------------------
IMAGE: 227414.jpg
Detected table with confidence 0.988 at location [163.26, 1326.01, 1267.37, 1679.61]
Shapes of table_pred and table_true does not match!
ACCURACY = None
------------------------------------------------------------------------------------
IMAGE: 231820 .jpg
Detected table with confidence 0.994 at location [90.11, 1041.39, 1102.32, 1279.97]
Shapes of table_pred and table_true does not match!
ACCURACY = None
------------------------------------------------------------------------------------
IMAGE: 233749 .jpg
Detected table with confidence 0.989 at location

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


IndexError: ignored

# DEBUG CODE BELOW

In [None]:
# print the image name
print("IMAGE: 238716.jpg")

# preprocess the image
preprocessed_image, np_image = preprocessor(DIRECTORY + "238716.jpg")

# extract the table
table_image, table_image_orig = table_extractor(preprocessed_image, np_image)

# pull the text out of the table; the last line displays the raw OCR table
table_text = text_recognizer(table_image, table_image_orig)
table_text

IMAGE: 238716.jpg
Detected table with confidence 0.992 at location [430.39, 2345.7, 2589.67, 3391.42]


[['', '1', 'к', 'Колво', 'Дата', 'а', 'Колво', 'Дата', 'а', 'Колво'],
 ['1', '2', '3', '4', '5', '6', 'Й', '8', '9'],
 ['кр/д (бв)',
  '400',
  '10.08.2017',
  'кр/д (бв)',
  '450',
  '20.10.2020',
  'кр/д (бв)',
  '450'],
 ['крид (бв)',
  '413',
  '16.12.2017',
  'кр/д (бв)',
  '400',
  '23.12.2020',
  'кр/д (бв)',
  '450'],
 ['кр/д (бв)',
  '413',
  '26.04.2018',
  'кр/д (бв)',
  '450',
  '17.02.2021',
  'пл/д (бв)',
  '600'],
 ['кр/д (бв)',
  '400',
  '12.07.2018',
  'кр/д (бв)',
  '400',
  '03.03.2021',
  'кр/д (бв)',
  '450'],
 ['кр/д (бв)',
  '400',
  '15.11.2018',
  'кр/д (бв)',
  '450',
  '04.05.2021',
  'кр/д (бв)',
  '450'],
 ['кр/д (бв)',
  '400',
  '27.03.2019',
  'кр/д (бв)',
  '370',
  '04.08.2021',
  'кр/д (бв)',
  '450'],
 ['кр/д (бв)',
  'кр/д (бв)',
  '400',
  '400',
  '06.06.2019',
  '17.08.2019',
  'кр/д (бв)',
  'кр/д (бв)',
  '450',
  '450',
  '29.10.2021',
  '29.12.2021',
  'кр/д (бв)',
  'кр/д (бв)',
  '450',
  '450',
  '1',
  '1',
  '1'],
 ['кр/д (бв)',
  '450'

In [None]:
# filter the text, then split rows longer than 10 tokens in two
filtered_text = raw_table_filter(table_text)
filtered_text = split_long_row(filtered_text)
filtered_text


[['кр/д', '(бв)', '10.08.2017', 'кр/д', '(бв)', '20.10.2020', 'кр/д', '(бв)'],
 ['крид', '(бв)', '16.12.2017', 'кр/д', '(бв)', '23.12.2020', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '26.04.2018', 'кр/д', '(бв)', '17.02.2021', 'пл/д', '(бв)'],
 ['кр/д', '(бв)', '12.07.2018', 'кр/д', '(бв)', '03.03.2021', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '15.11.2018', 'кр/д', '(бв)', '04.05.2021', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '27.03.2019', 'кр/д', '(бв)', '04.08.2021', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', 'кр/д', '(бв)', '06.06.2019', '17.08.2019', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '29.10.2021', '29.12.2021', 'кр/д', '(бв)', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '12.11.2019', 'кр/д', '(бв)', '22.03.2022', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '29.01.2020', 'кр/д', '(бв)', '26.05.2022', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '01.04.2020', 'кр/д', '(бв)', '10.08.2022', 'кр/д', '(бв)'],
 ['кр/д', '(бв)', '03.08.2020', 'кр/д', '(бв)', '14.12.2022', 'кр/д', '(бв)']]

## формируется много лишних нулевых строк для этой картинки

In [None]:
# turn the predictions into a DataFrame
df_pred = get_predictions_df(filtered_text, don_type, pay_type)
df_pred


IndexError: ignored

In [None]:
# reorder the records
df_pred = reshape(df_pred, filtered_text)

# save the predictions as csv
df_pred.to_csv(f"./csv/" + "238716.csv")

# compute the accuracy
acc = accuracy_check(df_pred, DIRECTORY + "238716.csv")
print(f"ACCURACY = {acc}")
print("-" * 84)