In [1]:
import cv2
import pytesseract
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
def preprocess(image):
    """Derive the two working images used by the rest of the pipeline.

    Args:
        image: Colour image in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).

    Returns:
        tuple: ``(threshold_image, gaussianBlur_img)`` where the first item is
        the Otsu-binarised grayscale image (used for OCR) and the second is
        the grayscale image smoothed with a 3x3 Gaussian kernel (used for
        edge detection).
    """
    # Both outputs are computed from the same grayscale conversion.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Light smoothing to suppress noise before edge detection.
    blurred = cv2.GaussianBlur(grayscale, (3, 3), 0)

    # cv2.threshold returns (threshold_used, image); only the image is kept.
    binarised = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    return binarised, blurred

In [3]:
def _draw_lines(img, lines):
    """Draw every detected segment on a blank canvas shaped like ``img``."""
    canvas = np.zeros_like(img)
    for line in lines:
        # Each entry holds one segment's four endpoint coordinates.
        x1, y1, x2, y2 = (int(v) for v in line.reshape(4))
        cv2.line(canvas, (x1, y1), (x2, y2), (255, 0, 0), 5)
    return canvas


def extract_cells(gaussianBlur_img, show_lines=True):
    """Locate table cells by detecting ruling lines and contouring them.

    The blurred image is edge-detected with Canny, straight segments are found
    with the probabilistic Hough transform, the segments are redrawn on a blank
    canvas, and the bounding boxes of the contours of that canvas are returned.

    Args:
        gaussianBlur_img: Blurred grayscale image, as returned by
            ``preprocess``.
        show_lines: When True (the default, matching the original behaviour)
            the line canvas is shown in an OpenCV window for up to 10 seconds.
            Pass False in headless environments or batch runs.

    Returns:
        list: ``(x, y, w, h)`` bounding boxes ordered top-to-bottom and then
        left-to-right, with the first box dropped. An empty list is returned
        when no line segment is detected.
    """
    # Canny edge detection
    canny = cv2.Canny(gaussianBlur_img, 50, 100)

    # Detect line segments in the edge map
    lines = cv2.HoughLinesP(
        canny, 2, np.pi / 180, 100, np.array([]), minLineLength=180, maxLineGap=10
    )

    # HoughLinesP yields None (not an empty array) when nothing is found;
    # iterating over it would raise TypeError.
    if lines is None:
        return []

    lines_img = _draw_lines(gaussianBlur_img, lines)

    if show_lines:
        cv2.imshow('lines', lines_img)
        cv2.waitKey(10000)
        cv2.destroyAllWindows()
        # Extra waitKey lets the GUI event loop actually close the window.
        cv2.waitKey(1)

    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; contours is always
    # second-to-last.
    contours = cv2.findContours(lines_img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    cordinates = [cv2.boundingRect(cnt) for cnt in contours]

    # Row-major order: by y first, then by x within the same y. Equivalent to
    # the two successive stable sorts (x, then y) used previously.
    cordinates.sort(key=lambda cell: (cell[1], cell[0]))

    # NOTE(review): the first box is discarded - presumably the contour of the
    # table's outer border; confirm against a sample image.
    return cordinates[1:]

def extract_text_from_cells(threshold_img, cell_regions):
    """OCR each cell region of the thresholded image.

    Args:
        threshold_img: Image that supports 2-D slicing; every cell is cropped
            as ``threshold_img[y:y+h, x:x+w]``.
        cell_regions: Iterable of ``(x, y, w, h)`` boxes.

    Returns:
        list: One ``pytesseract.image_to_string`` result per region, in the
        same order as ``cell_regions``.
    """
    return [
        pytesseract.image_to_string(
            threshold_img[top:top + height, left:left + width]
        )
        for left, top, width, height in cell_regions
    ]

def extract_text_excluding_width(img, cordinates, excluded_width=692):
    """OCR the cells of ``img``, skipping boxes of one specific width.

    This replaces a loose module-level loop that read ``cordinates``, ``img``
    and ``table_text`` from the kernel namespace (none of which are defined at
    module scope, so it failed on a fresh kernel), indexed the tuple returned
    by ``preprocess`` as if it were an image, and re-ran ``preprocess`` for
    every cell.

    Args:
        img: BGR image, passed to ``preprocess``.
        cordinates: Sequence of ``(x, y, w, h)`` boxes. The first entry is
            skipped, as the original loop started at index 1.
        excluded_width: Boxes whose ``w`` equals this value are ignored.
            NOTE(review): 692 is presumably the pixel width of a full-width
            (header/merged) row in the author's sample image - confirm.

    Returns:
        list: OCR text for each remaining box, in input order.
    """
    # preprocess returns (threshold_image, blurred_image); only the binarised
    # image is needed for OCR, and it is identical for every cell.
    threshold_img, _ = preprocess(img)

    kept_cells = [cell for cell in cordinates[1:] if cell[2] != excluded_width]
    return extract_text_from_cells(threshold_img, kept_cells)

In [4]:

def create_dataframe(cell_data, rows, cols):
    """Arrange a flat sequence of cell texts into a ``rows`` x ``cols`` table.

    Args:
        cell_data: Flat sequence of cell values in row-major order.
        rows: Number of table rows.
        cols: Number of table columns.

    Returns:
        pandas.DataFrame with default integer labels, or ``None`` when
        ``cell_data`` cannot be reshaped to ``(rows, cols)``. Callers in this
        notebook already test the result against ``None``.
    """
    try:
        data_matrix = np.reshape(cell_data, (rows, cols))
    except ValueError:
        # Element count does not match rows * cols: report failure to the
        # caller instead of aborting the whole run.
        return None
    return pd.DataFrame(data_matrix)

def save_to_excel(df, filename):
    """Write ``df`` to the Excel file ``filename`` without the index column.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this notebook).
        filename: Destination passed straight through to ``to_excel``.
    """
    export_options = {"index": False}
    df.to_excel(filename, **export_options)

def detect_and_extract_table(image_path, output_excel):
    """Detects and extracts table layout and text from an image.

    The image is preprocessed, cell boxes are located, each cell is OCR'd and
    the resulting texts are written to an Excel file. The table shape is
    guessed from the number of detected cells; when the cells do not fill a
    complete grid nothing is written and a failure message is printed.

    Args:
        image_path: The path to the input image.
        output_excel: The filename for the Excel output.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising, which
    # would otherwise surface later as an obscure cvtColor error.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(image_path))

    threshold_img, gaussianBlur_img = preprocess(img)

    cell_regions = extract_cells(gaussianBlur_img)
    cell_data = extract_text_from_cells(threshold_img, cell_regions)

    # Infer table dimensions from the cell count: rows is the integer square
    # root and cols the matching quotient. Guard the division for zero cells.
    n_cells = len(cell_regions)
    rows = int(np.sqrt(n_cells))
    cols = n_cells // rows if rows else 0

    print(rows, cols, n_cells, len(cell_data))

    # Only build the frame when the guessed grid accounts for every cell;
    # otherwise fall through to the failure message below.
    df = None
    if rows and rows * cols == len(cell_data):
        df = create_dataframe(cell_data, rows, cols)

    if df is not None:
        save_to_excel(df, output_excel)
        print("Excel file created successfully.")
    else:
        print("Failed to create DataFrame. No data saved to Excel.")

In [5]:

# Example usage: run the full pipeline on one screenshot and write the
# extracted table to an Excel workbook. Both paths are relative, so they are
# resolved against the notebook's current working directory.
image_path = 'Screenshot 2024-03-29 at 2.38.19 AM.png'
output_excel = 'output_table.xlsx'
detect_and_extract_table(image_path, output_excel)


6 7 42 42
Excel file created successfully.
