In [3]:
pip install pytesseract


Note: you may need to restart the kernel to use updated packages.


In [4]:
# OCR initialization
# Imported in this cell as well: it sits before the notebook's main import
# cell, so on a fresh kernel `pytesseract` and `cv2` would be undefined here.
import os

import cv2
import pytesseract

# Use the Windows install location only when it is actually present; otherwise
# keep pytesseract's default command so a `tesseract` found on PATH is used.
TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# NOTE(review): `contours` and `gray` are not defined in this cell; they are
# produced by the segmentation cell further down, which must have been run
# first. That cell also rebuilds `extracted_text` with its own OCR loop.
extracted_text = []
for contour in contours:
    # OCR the grayscale patch under the contour's bounding rectangle.
    x, y, w, h = cv2.boundingRect(contour)
    roi = gray[y:y+h, x:x+w]
    text = pytesseract.image_to_string(roi, lang='eng')
    extracted_text.append({"text": text.strip(), "bbox": (x, y, w, h)})

In [7]:
import cv2
import pytesseract
import numpy as np
import json

# Input and Output File Paths
input_image_path = 'big_aa74620d2b7f8754c17a894cca921a9075843bbd.jpg'
output_image_path = 'redacted_image.jpg'
output_json_path = 'extracted_data.json'

# Segmentation parameters
BINARY_THRESHOLD = 150          # gray levels above this become background (0) after inversion
DILATION_KERNEL_SIZE = (7, 7)   # small kernel so neighbouring regions are not merged too eagerly

# Load the image
image = cv2.imread(input_image_path)
if image is None:
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising, which would otherwise surface as an obscure
    # cvtColor error on the next line.
    raise FileNotFoundError(f"Could not read image: {input_image_path!r}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Threshold the image to binary (inverted: dark pixels become white foreground)
_, thresh = cv2.threshold(gray, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY_INV)

# Dilate the foreground so nearby strokes join into one connected region
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, DILATION_KERNEL_SIZE)
dilated = cv2.dilate(thresh, kernel, iterations=1)

# Find the outer contours of the dilated regions
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Function to split wide contours
def split_wide_contour(contour, gray_image, max_width_ratio=1.5,
                       gap_ratio=0.2, min_piece_width=10):
    """Return bounding boxes for ``contour``, split at vertical gaps if it is wide.

    The contour's bounding rectangle is returned unchanged when its
    width/height ratio is at most ``max_width_ratio``. Otherwise the matching
    region of ``gray_image`` is scanned column by column and cut wherever a
    run of (almost) ink-free columns separates two pieces.

    "Ink" is measured as the distance of each pixel from the brightest pixel
    in the region, i.e. dark content on a lighter background. This matches the
    inverted threshold used to find the contours.

    Args:
        contour: Contour as accepted by ``cv2.boundingRect``.
        gray_image: 2-D grayscale image the contour was found in.
        max_width_ratio: Width/height ratio above which splitting is attempted.
        gap_ratio: A column counts as a gap when its ink is below this
            fraction of the inkiest column in the region.
        min_piece_width: A piece is only cut off once it is wider than this
            many columns; narrower pieces are merged with what follows.

    Returns:
        List of ``(x, y, w, h)`` tuples of plain Python ints (so they can be
        serialized with ``json``). Always contains at least one box.
    """
    x, y, w, h = cv2.boundingRect(contour)
    whole_box = (int(x), int(y), int(w), int(h))

    if w / h <= max_width_ratio:
        return [whole_box]

    roi = gray_image[y:y + h, x:x + w]
    if roi.size == 0:
        # Rectangle lies outside the image; nothing to analyse.
        return [whole_box]

    # Column-wise amount of ink. Summing raw gray values instead would make
    # the *darkest* columns look like gaps, because whitespace is bright.
    background = int(roi.max())
    ink_per_column = (background - roi.astype(np.int64)).sum(axis=0)
    is_gap = (ink_per_column < ink_per_column.max() * gap_ratio).tolist()

    boxes = []
    piece_start = None   # first column of the piece currently being collected
    ink_end = 0          # one past the last ink column seen in that piece
    for col, gap in enumerate(is_gap):
        if not gap:
            if piece_start is None:
                piece_start = col
            ink_end = col + 1
        elif piece_start is not None and ink_end - piece_start > min_piece_width:
            # Close the piece at its last ink column so the box holds no gap,
            # and start the next one only when ink appears again.
            boxes.append((whole_box[0] + piece_start, whole_box[1],
                          ink_end - piece_start, whole_box[3]))
            piece_start = None

    if piece_start is not None:
        boxes.append((whole_box[0] + piece_start, whole_box[1],
                      ink_end - piece_start, whole_box[3]))

    return boxes or [whole_box]

# Process contours and redact sensitive words
# Loop-invariant settings, defined once instead of once per region.
# Matching below is by substring on the lower-cased OCR text, so "the" also
# matches inside longer words such as "other".
sensitive_words = ["sensitive", "redact", "example","buddy","the"]  # Add your sensitive words
custom_config = r'--oem 3 --psm 6'

redacted_image = image.copy()
extracted_text = []
for contour in contours:
    bounding_boxes = split_wide_contour(contour, gray)  # Split wide contours if needed

    for box in bounding_boxes:
        # Force plain Python ints: box values derived from NumPy indices are
        # np.int64, which json.dump cannot serialize when the results are saved.
        x, y, w, h = (int(v) for v in box)
        if w > 10 and h > 10:  # Filter out small noise
            # Extract text
            roi = gray[y:y+h, x:x+w]
            try:
                text = pytesseract.image_to_string(roi, lang='eng', config=custom_config).strip()
                extracted_text.append({"text": text, "bbox": (x, y, w, h)})

                # Example redaction logic
                if any(word in text.lower() for word in sensitive_words):
                    cv2.rectangle(redacted_image, (x, y), (x + w, y + h), (0, 0, 0), -1)  # Black out sensitive word
                else:
                    cv2.rectangle(redacted_image, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Debug (Green box for non-sensitive)
            except Exception as e:
                # Best effort: report the failing region and continue with the rest.
                print(f"Error processing ROI: {e}")

# Save the redacted image
# cv2.imwrite reports failure through its return value rather than raising,
# so check it before claiming success below.
if not cv2.imwrite(output_image_path, redacted_image):
    raise OSError(f"Could not write redacted image to {output_image_path}")

# Save extracted text to a JSON file
with open(output_json_path, 'w') as file:
    json.dump(extracted_text, file)

# Optional: Display images
# Opens native OpenCV windows; waitKey(0) blocks until a key is pressed.
cv2.imshow("Original Image", image)
cv2.imshow("Redacted Image", redacted_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

print(f"Redacted image saved to {output_image_path}")
print(f"Extracted text saved to {output_json_path}")


Redacted image saved to redacted_image.jpg
Extracted text saved to extracted_data.json
