In [None]:
pip install pytesseract opencv-python pillow pdf2image

In [None]:
!apt-get install poppler-utils

In [None]:
!apt-get install tesseract-ocr

In [None]:
# Detectron2 + Tesseract OCR Table Extractor (Cell-wise for Proper Output)

# !pip install detectron2 opencv-python pytesseract pandas layoutparser scikit-learn pdf2image
# !apt-get install poppler-utils

import cv2
import pytesseract
import pandas as pd
import numpy as np
from PIL import Image
import layoutparser as lp
from IPython.display import display
import os
import urllib.request
from sklearn.cluster import DBSCAN
from pdf2image import convert_from_path

# Optional: If tesseract not in PATH (Windows)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Step 1: Upload and load image
# `uploaded` maps each uploaded file name to its contents; the first entry is used.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Fail with a readable message instead of a bare StopIteration from next().
    raise ValueError("No file was uploaded; upload a PDF or an image to continue.")
image_path = next(iter(uploaded))

# Handle PDF or image loading properly.
# Result: `image` is an H x W x 3 uint8 array in BGR channel order (OpenCV convention).
if image_path.lower().endswith(".pdf"):
    # Only the first page is processed downstream, so only that page is
    # rasterised instead of rendering the entire document.
    images = convert_from_path(image_path, first_page=1, last_page=1)
    rgb_page = np.array(images[0].convert("RGB"))
else:
    # The context manager closes the file handle once the pixels are copied out.
    with Image.open(image_path) as pil_image:
        rgb_page = np.array(pil_image.convert("RGB"))
image = cv2.cvtColor(rgb_page, cv2.COLOR_RGB2BGR)

# Step 2: Download model weights manually and use local path.
# This URL is LayoutParser's model-zoo checkpoint for
# PubLayNet / faster_rcnn_R_50_FPN_3x (five layout classes). It must be paired
# with the PubLayNet config: with a config for a different number of classes,
# Detectron2 skips the `roi_heads.box_predictor.*` weights and the model
# detects nothing.
model_url = "https://www.dropbox.com/s/dgy9c10wykk4lq4/model_final.pth?dl=1"
os.makedirs("./models", exist_ok=True)
model_path = "./models/model_final.pth"
if not os.path.exists(model_path):
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete cached checkpoint.
    partial_path = model_path + ".part"
    urllib.request.urlretrieve(model_url, partial_path)
    os.replace(partial_path, model_path)

# Tunable parameters (pixel values assume the page resolution produced in Step 1).
SCORE_THRESHOLD = 0.8   # minimum detection confidence kept by the model
ROW_EPS_PX = 15         # words whose vertical centres chain within this distance share a row
CELL_GAP_FACTOR = 1.0   # horizontal gap (in median word heights) that separates two cells

# Step 3: Detect layout regions with the Detectron2 model from LayoutParser.
# PubLayNet predicts whole-table regions (label 3), not individual cells, so
# cell boundaries are recovered from word positions in the steps below.
model = lp.Detectron2LayoutModel(
    config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    model_path=model_path,
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", SCORE_THRESHOLD],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

layout = model.detect(image)
table_blocks = [b for b in layout if b.type == "Table"]

# Step 4: OCR each detected table and keep every recognised word together
# with its position in page coordinates (psm 6: a uniform block of text).
ocr_results = []
page_height, page_width = image.shape[:2]
for block in sorted(table_blocks, key=lambda b: (b.coordinates[1], b.coordinates[0])):
    x1, y1, x2, y2 = map(int, block.coordinates)
    # Clamp the box to the page and skip degenerate crops, which would make
    # cv2.cvtColor fail on an empty array.
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, page_width), min(y2, page_height)
    if x2 <= x1 or y2 <= y1:
        continue
    gray = cv2.cvtColor(image[y1:y2, x1:x2], cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        gray, config="--psm 6", output_type=pytesseract.Output.DICT
    )
    for text, left, top, width, height in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"]
    ):
        text = text.strip()
        if not text:
            # Page/block/line-level entries carry no text of their own.
            continue
        ocr_results.append({
            "text": text,
            "x": x1 + left,                # left edge, page coordinates
            "y": y1 + top + height / 2,    # vertical centre, page coordinates
            "right": x1 + left + width,    # right edge, page coordinates
            "height": height,
        })

# Step 5: Group words into rows based on Y-coordinates (clustering)
if not ocr_results:
    if table_blocks:
        print("⚠️ Tables were detected, but no text could be read from them.")
    else:
        print("⚠️ No tables detected in the document.")
    df = pd.DataFrame()
else:
    coords = np.array([[r["y"]] for r in ocr_results])
    clustering = DBSCAN(eps=ROW_EPS_PX, min_samples=1).fit(coords)

    rows_dict = {}
    for word, label in zip(ocr_results, clustering.labels_):
        rows_dict.setdefault(label, []).append(word)

    # Step 6: Order rows top-to-bottom, sort each row left-to-right, and merge
    # neighbouring words into one cell when the gap between them is small.
    final_rows = []
    rows_top_to_bottom = sorted(
        rows_dict.values(), key=lambda words: np.mean([w["y"] for w in words])
    )
    for row_words in rows_top_to_bottom:
        row_words = sorted(row_words, key=lambda w: w["x"])
        gap_limit = CELL_GAP_FACTOR * np.median([w["height"] for w in row_words])
        cells = [[row_words[0]]]
        for word in row_words[1:]:
            if word["x"] - cells[-1][-1]["right"] <= gap_limit:
                cells[-1].append(word)   # same cell: ordinary inter-word space
            else:
                cells.append([word])     # wide gap: start of the next column
        final_rows.append([" ".join(w["text"] for w in cell) for cell in cells])

    # Step 7: Save and show structured table
    # Rows with fewer cells than the widest row are padded with missing values.
    df = pd.DataFrame(final_rows)
    display(df)
    df.to_csv("structured_table.csv", index=False)
    print("✅ Structured table saved as structured_table.csv")

Saving Barclays_uk_bank_statement.pdf to Barclays_uk_bank_statement (2).pdf


roi_heads.box_predictor.bbox_pred.{bias, weight}
roi_heads.box_predictor.cls_score.{bias, weight}


⚠️ No table cells detected in the document.
