# Parsing using BeautifulSoup and GROBID

In [None]:
import PyPDF2
import pytesseract
import cv2
import pandas as pd
import numpy as np
import fitz
from matplotlib import pyplot as plt
import io
import os
from PIL import Image
from pdf2image import convert_from_bytes
import pypdfium2 as pdfium
import requests
from bs4 import BeautifulSoup

In [None]:
# Render every page of the scanned PDF to a PNG file, then load one of the
# rendered pages as the working image for the OpenCV / OCR cells that follow.
SCANNED_FILE = "example.pdf"
# NOTE(review): cv2.imread does not decode PDF files, so `img` is expected to
# be None here. Nothing in the visible cells reads it; the assignment is kept
# only in case another cell refers to the name.
img = cv2.imread(SCANNED_FILE)


zoom_x = 2.0 # horizontal zoom
zoom_y = 2.0 # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y)

# Common prefix of every generated PNG. There is deliberately no path
# separator after "pages": the files are named "pages<stem>page-<n>.png"
# inside Circuit_Benchmarking, and other cells read them under that name.
page_png_prefix = 'C:\\Repositories\\Circuit_Benchmarking\\pages' + SCANNED_FILE.split('\\')[-1].split('.')[0]

# Page whose render is used as the working image (matches the "-11" suffix
# produced from page.number in the loop below).
ocr_page_number = 11

doc = fitz.open(SCANNED_FILE)

print("Generated pages: ")
for page in doc:
    pix = page.get_pixmap(matrix=mat)
    png = page_png_prefix + 'page-%i.png' % page.number
    print(png)
    pix.save(png)

ocr_page_png = page_png_prefix + 'page-%i.png' % ocr_page_number
original_image = cv2.imread(ocr_page_png)
# cv2.imread signals failure by returning None instead of raising; stop here
# with a clear message rather than failing later inside cv2.cvtColor.
if original_image is None:
    raise FileNotFoundError(f"Could not load rendered page image: {ocr_page_png}")

In [None]:
# Reduce the BGR page render to a single intensity channel and preview it.
gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)

preview_size = (25, 15)
plt.figure(figsize=preview_size)
plt.imshow(gray_image, cmap='gray')
plt.show()

In [None]:
# Binarise the grayscale page. THRESH_OTSU asks OpenCV to choose the threshold
# itself, and THRESH_BINARY_INV inverts the result so that pixels at or below
# the threshold become 255 and the rest become 0.
otsu_inverted = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
ret, threshold_image = cv2.threshold(gray_image, 0, 255, otsu_inverted)

preview_size = (25, 15)
plt.figure(figsize=preview_size)
plt.imshow(threshold_image, cmap='gray')
plt.show()

In [None]:
# Dilate the binarised page with a large square kernel, take the contours of
# the resulting blobs, OCR the bounding box of each contour and store the
# recognised text in a file. The contours are also filled into a mask image
# that is displayed at the end for visual inspection.
rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (66, 66))

# Applying dilation on the threshold image
dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations = 1)
plt.figure(figsize=(25, 15))
plt.imshow(dilated_image)
plt.show()

# Finding contours
contours, hierarchy = cv2.findContours(dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# Creating a copy of the image
copied_image = original_image.copy()

ocr_output_path = "C:\\Repositories\\Circuit_Benchmarking\\pages\\kernel66-66"

mask = np.zeros(original_image.shape, np.uint8)
# Bind `masked` before the loop so the final plot still works (showing an
# empty mask) when no contours were found.
masked = mask

# Open the output once for the whole loop. "w" truncates any previous run,
# and UTF-8 is explicit so OCR output with non-ASCII characters cannot fail
# on a narrow platform default encoding.
with open(ocr_output_path, "w", encoding="utf-8") as f:
    # Looping through the identified contours
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)

        # Cropping the text block for giving input to OCR
        cropped = copied_image[y:y + h, x:x + w]

        # Apply OCR on the cropped image
        text = pytesseract.image_to_string(cropped, lang='eng', config='--oem 3 --psm 1')
        print(text)
        # Persist the text as well; previously it was only printed and the
        # output file stayed empty.
        f.write(text)

        # Fill this contour in white on the mask (thickness -1 = filled).
        masked = cv2.drawContours(mask, [cnt], 0, (255, 255, 255), -1)

plt.figure(figsize=(25, 15))
plt.imshow(masked, cmap='gray')
plt.show()

In [None]:
# Function to extract tables from the TEI XML
def extract_tables(tei_xml):
    """Return the tables of a TEI XML document as nested lists of cell text.

    Args:
        tei_xml: TEI XML markup (for example the body of a GROBID full-text
            response).

    Returns:
        A list with one entry per ``<table>`` element. Each entry is a list
        of rows, and each row is a list holding the whitespace-stripped text
        of its ``<cell>`` elements. The list is empty when the input is
        empty or contains no tables.
    """
    if not tei_xml:
        return []

    # Parse as XML rather than HTML. The HTML parsers lower-case tag names,
    # so the previous lookup for 'Table' could never match anything; the TEI
    # element names searched here are the lower-case 'table', 'row', 'cell'.
    soup = BeautifulSoup(tei_xml, 'xml')

    tables = []
    for table in soup.find_all('table'):
        rows = [
            [cell.get_text(strip=True) for cell in row.find_all('cell')]
            for row in table.find_all('row')
        ]
        tables.append(rows)

    return tables

# Send the PDF to a local GROBID service, extract its tables, OCR one rendered
# page block by block, and write the OCR text followed by the tables to a
# single text file.

# Set up GROBID service
grobid_url = 'http://localhost:8070/api'
# NOTE(review): `headers` is not passed to the request below; it is kept only
# so the name stays defined for any other cell that may use it.
headers = {'Accept': 'application/x-bibtex'}

# Path to the PDF file (raw string: same value as before, without relying on
# invalid escape sequences such as "\R" or "\C")
pdf_path = r"C:\Repositories\Circuit_Benchmarking\example.pdf"

# Path to the output text file
output_file = r"C:\Repositories\Circuit_Benchmarking\output_test.txt"

# Send PDF to GROBID service for processing
with open(pdf_path, 'rb') as pdf:
    # Upload the open file object. Passing the path string here would send
    # the path text itself as the document content.
    response = requests.post(f'{grobid_url}/processFulltextDocument', files={'input': pdf})
# Surface HTTP errors instead of parsing an error page as if it were TEI.
response.raise_for_status()
tei_xml = response.text

# Extract tables from GROBID output
tables = extract_tables(tei_xml)

# Process image using OpenCV
page_image_path = 'C:\\Repositories\\Circuit_Benchmarking\\pagesexamplepage-11.png'
original_image = cv2.imread(page_image_path)
# cv2.imread returns None on failure instead of raising.
if original_image is None:
    raise FileNotFoundError(f"Could not load rendered page image: {page_image_path}")
gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
threshold_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (66, 66))
dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations=1)

contours, hierarchy = cv2.findContours(dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
copied_image = original_image.copy()

mask = np.zeros(original_image.shape, np.uint8)
# Bind `masked` before the loop so the final plot still works when no
# contours were found.
masked = mask

# One handle for the whole run: "w" truncates the previous output, the OCR
# text is written first and the tables are appended after it. UTF-8 is
# explicit so non-ASCII OCR output cannot fail on the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        cropped = copied_image[y:y + h, x:x + w]

        # Apply OCR on the cropped image
        text = pytesseract.image_to_string(cropped, lang='eng', config='--oem 3 --psm 1')
        f.write(text)

        # Fill this contour in white on the mask (thickness -1 = filled).
        masked = cv2.drawContours(mask, [cnt], 0, (255, 255, 255), -1)

    # Append extracted tables: tab-separated cells, one row per line, and a
    # blank line after each table.
    for table in tables:
        for row in table:
            f.write('\t'.join(row))
            f.write('\n')
        f.write('\n')

# Show the processed image
plt.figure(figsize=(25, 15))
plt.imshow(masked, cmap='gray')
plt.show()

# The cells below this point are known to work.

In [None]:
from PIL import Image
from pdf2image import convert_from_path

def get_two_columned_page(im1, im2):
    """Stack two images vertically into one new RGB image.

    Args:
        im1: Image placed at the top, with its top-left corner at (0, 0).
        im2: Image placed directly below ``im1``.

    Returns:
        A new RGB image as wide as the wider of the two inputs and as tall
        as both inputs combined. Any area not covered by an input keeps the
        default fill of ``Image.new``.
    """
    # Size the canvas to the wider input. paste() clips anything outside the
    # canvas, so using only im1.width dropped the last pixel column of im2
    # whenever a page with an odd width was split in half.
    page_width = max(im1.width, im2.width)
    page = Image.new('RGB', (page_width, im1.height + im2.height))
    page.paste(im1, (0, 0))
    page.paste(im2, (0, im1.height))
    return page

# Rasterise the source PDF at 500 DPI, cut every page down the middle, stack
# the two halves into one tall single-column page, and save each result both
# as its own PNG and as a page of a combined PDF.
pdf_images = convert_from_path("example.pdf", 500, poppler_path=r'C:\\bin\\poppler-0.68.0\\bin')
output_images = []

for page_number, page_image in enumerate(pdf_images):
    full_width, full_height = page_image.size
    middle = full_width // 2

    # Left half, then right half, of the rendered page.
    pil_image_1 = page_image.crop((0, 0, middle, full_height))
    pil_image_2 = page_image.crop((middle, 0, full_width, full_height))

    singular_page = get_two_columned_page(pil_image_1, pil_image_2)
    singular_page.save(f"image_{page_number+1}.png")
    output_images.append(singular_page)

# Create a PDF from the saved images: the first page drives the save call and
# the remaining pages are appended to it.
first_page = output_images[0]
remaining_pages = output_images[1:]
first_page.save("output.pdf", "PDF", resolution=100.0, save_all=True, append_images=remaining_pages)

In [None]:
import PyPDF2
import pytesseract
from PIL import Image
from pdf2image import convert_from_path

# OCR every page of output.pdf and store the concatenated text in output.txt.

# Update this path to your Tesseract installation path
pytesseract.pytesseract.tesseract_cmd = r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

# Convert the PDF file to a list of images
pdf_images = convert_from_path("output.pdf", poppler_path=r'C:\\bin\\poppler-0.68.0\\bin')

# Extract text from each image in the list, in page order
page_texts = [pytesseract.image_to_string(page_image, lang="eng") for page_image in pdf_images]
text = ''.join(page_texts)

# Write the text as UTF-8: OCR output can contain characters that the
# platform default encoding cannot represent, which would make write() raise
# UnicodeEncodeError.
with open('output.txt', 'w', encoding='utf-8') as fp:
    fp.write(text)