In [4]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTLine
import json
from unidecode import unidecode

In [5]:
def extract_text_by_columns(file_name, output_file_name, breakpoints, y_value_range, x_value_range):
    '''
    breakpoints: In case it is a multi column document, breakpoints should be x - coordinate between coordinates (coordinates can be obtained using pdfminer)
    y_value_range: Range of y coordinates excluding header footer
    x_value_range: Range of x coordinates to include specific horizontal sections
    '''
    # Generate column boundaries based on breakpoints
    column_boundaries = [(0, breakpoints[0])] if breakpoints else [(0, float('inf'))]
    for i in range(1, len(breakpoints)):
        column_boundaries.append((breakpoints[i-1] + 1, breakpoints[i]))
    if breakpoints:
        column_boundaries.append((breakpoints[-1] + 1, float('inf')))  # Last column to infinity

    def get_column_index(x):
        """Determine the column index based on the x-coordinate."""
        for index, (start, end) in enumerate(column_boundaries):
            if start <= x <= end:
                return index
        return None  # For elements outside the defined columns

    text_blocks = []
    new_sublist_flag = False

    # Process each page
    for page_layout in extract_pages(file_name):
        # Initialize a list to hold text elements for each column
        columns = [[] for _ in range(len(column_boundaries))]
        
        for element in page_layout:
            print(element)
            if isinstance(element, LTLine):
                new_sublist_flag = True  # Next text element should start a new sublist
            elif isinstance(element, LTTextContainer):
                column_index = get_column_index(element.x0)
                if column_index is not None:
                    # Check if y_value_range and x_value_range are given and element's coordinates are within them
                    if (y_value_range is None or (y_value_range[0] < element.y0 < y_value_range[1])) and \
                       (x_value_range is None or (x_value_range[0] < element.x0 < x_value_range[1])):
                        if new_sublist_flag or not columns[column_index]:
                            # Start a new sublist for this column
                            columns[column_index].append([])
                            new_sublist_flag = False
                        # Add the text to the last sublist in the appropriate column, removing '•'
                        text = element.get_text().strip().replace('•', '')
                        text = unidecode(text)
                        text = text.replace('SAFETY\\n', '')
                        text = text.replace('-\\n', '')
                        text = text.replace('\\n', ' ')
                        text = text.replace('  ', ' ')
                        text = text.replace('\"', '\'')
                        text = text.strip()
                        if text:  # Check if text is not an empty string after replacing
                            columns[column_index][-1].append((element.y0, text))        
        # Sort and process text within each column and sublist
        for column in columns:
            for sublist in column:
                sublist.sort(reverse=True, key=lambda x: x[0])  # Sort by y-coordinate, descending
                text_blocks.append([text for _, text in sublist])
    
    i = 0
    while i < len(text_blocks) - 1:
        if text_blocks[i] and text_blocks[i + 1]:
            if not text_blocks[i][-1].endswith('.') and text_blocks[i+1][0].endswith('.'):
                text_blocks[i][-1] += ' ' + text_blocks[i + 1].pop(0)
                if not text_blocks[i + 1]:
                    del text_blocks[i + 1]
                    continue
        i += 1

    # Exporting the adjusted text to a JSON file
    with open(f'{output_file_name}.json', 'w') as outfile:
        json.dump(text_blocks, outfile, indent=4)

In [6]:
extract_text_by_columns("sumo_gold.pdf", 'uwu', [318], [37, 363], [56, 388])

<LTFigure(Im0) 0.000,0.000,595.680,380.640 matrix=[595.68,0.00,0.00,380.64, (0.00,0.00)]>
<LTFigure(Im0) 0.000,0.000,597.600,382.080 matrix=[597.60,0.00,0.00,382.08, (0.00,0.00)]>
<LTTextBoxHorizontal(0) 191.160,263.386,432.581,307.044 'TATATATATATTTTTA SUMO GOLD\nA SUMO GOLD\nA SUMO GOLD\nA SUMO GOLD\nA SUMO GOLD\nCR-4 (BS-IV) & 4SPTC (BS-III)\n'>
<LTTextBoxHorizontal(1) 144.960,209.508,478.574,233.508 'Owner’s Manual & Service Book\n'>
<LTTextBoxHorizontal(2) 270.600,86.390,353.080,98.500 '•   Mumbai •   Pune •\n'>
<LTTextBoxHorizontal(3) 108.240,57.292,517.732,65.292 'The  contents  given  in  this  book  are  not  binding,  subject  to  change  without  notice  and  are  for  illustration  purpose  only.\n'>
<LTTextBoxHorizontal(4) 562.080,27.801,567.084,36.801 '1\n'>
<LTRect 583.950,0.000,623.700,411.000>
<LTLine 609.720,2.580,609.720,12.780>
<LTLine 610.320,13.860,622.080,13.860>
<LTLine 2.040,397.380,11.760,397.380>
<LTLine 13.680,399.300,13.680,408.060>
<LTLine 14.040,2.220,14.