In [None]:
import typing as t
from img2table.document import Image, PDF
import numpy as np
import cv2
from io import BytesIO
from dataclasses import dataclass
from pdf2image import convert_from_path

from img2table.ocr import EasyOCR
from img2table.tables.objects.extraction import ExtractedTable

import pandas as pd

@dataclass
class Report:
    """Pair a parsed report table with the code identifying the report.

    NOTE(review): nothing in the visible code instantiates this class yet;
    the field meanings below are inferred from their names - confirm.
    """

    # Table contents of the report.
    report_table: pd.DataFrame
    # Code of the report; presumably one of CODE_VALUES - TODO confirm.
    report_code: str

# Cell labels that introduce the form code; the code itself is read from the
# cell immediately to the right of the label.
CODE_KEYS = ["Форма по ОКУД", "Форма"]
# Form codes that are recognised when they appear verbatim in a table cell.
CODE_VALUES = ["0710001", "0710002", "0710003", "0710004", "0710005"]
# Cell text(s) that mark the header row of the data table.
HEADER_KEY = ["Наименование показателя"]


class ParseTablePipeline:
    """Extract tables from a scanned document image with img2table + EasyOCR.

    On construction the image is loaded, every page image is passed through
    :meth:`whiten_stamp`, and the cleaned images replace the document's own.
    Calling the instance runs table extraction and returns the DataFrames.
    """

    def __init__(self, doc_path: t.Optional[str] = None, use_gpu: bool = False):
        """Load the document and pre-process its images.

        Args:
            doc_path: Source passed to ``img2table.document.Image`` as ``src``.
            use_gpu: Forwarded to EasyOCR as the ``gpu`` keyword.
        """
        self.ocr = EasyOCR(lang=["ru"], kw={"gpu": use_gpu})
        self.document = Image(src=doc_path, detect_rotation=True)

        self.preprocess_images()

    def preprocess_images(self) -> None:
        """Replace the document images with stamp-free versions."""
        self.document.images = [
            self.whiten_stamp(image_array) for image_array in self.document.images
        ]

    def whiten_stamp(self, image_array: np.ndarray) -> np.ndarray:
        """Paint blue regions of the image white.

        NOTE(review): the input is treated as RGB and the result is returned
        in BGR channel order; confirm which order img2table supplies and
        expects for ``document.images``.

        Args:
            image_array: A single 3-channel image.

        Returns:
            The channel-swapped image with every pixel inside the blue HSV
            range set to white.
        """
        img_cv2 = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)

        # Work in HSV so that "blue" is a simple hue interval.
        hsv = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2HSV)

        # Hue/saturation/value bounds used to select blue pixels.
        lower_blue = np.array([90, 50, 50])
        upper_blue = np.array([130, 255, 255])

        # Non-zero where the pixel falls inside the blue range.
        mask = cv2.inRange(hsv, lower_blue, upper_blue)

        # Overwrite the selected pixels with white.
        img_cv2[mask != 0] = [255, 255, 255]

        return img_cv2

    def merge_header(self, header_rows) -> t.List[str]:
        """Join header cells that were split across several rows.

        Args:
            header_rows: Iterable of rows (each an iterable of cells).

        Returns:
            One string per column: the non-missing cells of that column,
            stripped and joined with single spaces, top to bottom.
        """
        return [
            " ".join(str(cell).strip() for cell in column if pd.notna(cell))
            for column in zip(*header_rows)
        ]

    @staticmethod
    def _is_header_row(row: pd.Series) -> bool:
        """Return True when any cell of *row* equals one of HEADER_KEY."""
        return any(cell in HEADER_KEY for cell in row.values)

    @staticmethod
    def _find_code(df: pd.DataFrame) -> t.Optional[t.Any]:
        """Locate the form code in *df*, or return None when there is none.

        A cell equal to one of CODE_VALUES wins. Otherwise the first non-empty
        cell to the right of a CODE_KEYS label is used, trying the keys in
        order and scanning rows top to bottom.
        """
        for value in CODE_VALUES:
            if (df == value).any().any():
                return value

        n_cols = df.shape[1]
        for key in CODE_KEYS:
            # np.nonzero yields matches in row-major order, i.e. top to bottom
            # and left to right, as purely positional indices.
            hit_rows, hit_cols = np.nonzero((df == key).to_numpy())
            for row_pos, col_pos in zip(hit_rows, hit_cols):
                if col_pos + 1 >= n_cols:
                    continue  # label sits in the last column: nothing to read
                candidate = df.iloc[row_pos, col_pos + 1]
                if pd.notna(candidate):
                    # Stop at the first hit so a later, less specific key
                    # cannot overwrite it.
                    return candidate
        return None

    def postprocess_tables(
        self, tables: t.List["ExtractedTable"]
    ) -> t.List[t.Dict[t.Any, t.Dict[str, t.Any]]]:
        """Turn extracted tables into a list of ``{row_key: {header: value}}``.

        The tables are scanned in order. First the form code is searched for;
        once it is known, the header row is searched for; once the header is
        known, every non-header row of the current and following tables is
        emitted, keyed by its first cell.

        Args:
            tables: Objects exposing the table as a DataFrame in ``.df``.

        Returns:
            One single-entry dict per data row. Rows narrower than the header
            contribute only the columns they actually have.
        """
        found_code = None
        headers_found = False
        extracted_data: t.List[t.Dict[t.Any, t.Dict[str, t.Any]]] = []
        column_headers: t.List[str] = []

        for table in tables:
            df = table.df

            if found_code is None:
                found_code = self._find_code(df)
                if found_code is not None:
                    print(f"Code found: {found_code}")

            if found_code and not headers_found:
                # enumerate gives the positional row number that iloc needs;
                # the iterrows label is only equal to it for a default index.
                for row_pos, (_, row) in enumerate(df.iterrows()):
                    if self._is_header_row(row):
                        headers_found = True
                        # NOTE(review): every row from the header row to the
                        # end of this table is merged into the header, and
                        # rows above the header are still emitted as data
                        # below. Kept as in the original; confirm how many
                        # rows the header really spans.
                        column_headers = self.merge_header(df.iloc[row_pos:].values)
                        break

            if not headers_found:
                continue

            for _, row in df.iterrows():
                if row.empty or self._is_header_row(row):
                    continue
                row_key = row.iloc[0]  # first column is the row key
                # Clip the headers to the row width so a narrower table does
                # not raise IndexError.
                usable_headers = column_headers[1:len(row)]
                row_data = {
                    header: row.iloc[col_pos]
                    for col_pos, header in enumerate(usable_headers, start=1)
                }
                extracted_data.append({row_key: row_data})

        return extracted_data

    def __call__(self) -> t.List[pd.DataFrame]:
        """Run table extraction and return one DataFrame per detected table."""
        extracted_tables = self.document.extract_tables(
            ocr=self.ocr,
            implicit_rows=True,
            implicit_columns=True,
            borderless_tables=True,
            min_confidence=50,
        )
        print('Tables extracted')

        # NOTE: postprocess_tables is not applied here; the raw DataFrames
        # are returned.
        return [table.df for table in extracted_tables]

