In [1]:
import pdf2image
import spacy

import pytesseract
from pytesseract import Output

import numpy as np

import matplotlib.pyplot as plt

import cv2
from PIL import Image

In [2]:
def mark_entities(image, doc, data, num_parts, nlp):
    
    org_image = image[:, :, ::-1].copy() 
    
    data = {key: [value for index, value in enumerate(values) if (data['text'][index] != '') & (data['level'][index] == 5)] for key, values in data.items()}
    
    num_boxes = len(d['level'])
    
    cmap = plt.get_cmap('tab20')
    labels = nlp.components[3][1].labels
    colors = cmap(np.linspace(0, 1, len(labels)))
    
    color_dict = dict(zip(labels, colors[:,:3] * 255))
    
    for box in range(num_boxes):
        
        for current_parts in range(1, num_parts + 1):
            
            entities_org = [entity for entity in list(doc.ents) if len(entity.text.split()) == current_parts]
            entities = [entity.text for entity in list(doc.ents) if len(entity.text.split()) == current_parts]
            print(color_dict)
            string = ' '.join(data['text'][box:box + current_parts])
            
            if string.strip('!?@#$.,') in entities:
                
                label = entities_org[entities.index(string.strip('!?@#$.,'))].label_
                
                rectangles = {key: values[box:box + current_parts] for key, values in data.items()}
                
                rectangles = find_min_rect(rectangles)
                
                for rectangle in rectangles:
                
                    x1, y1, x2, y2 = rectangle
                    cv2.rectangle(image, (x1, y1), (x2, y2), color_dict[label], -1)
            
    alpha = .4       
    return cv2.addWeighted(image, alpha, org_image, 1 - alpha, 0)

In [3]:
def find_min_rect(rectangles):
    

    
    rectangles = [{'y1': rectangles['top'][index], 'x1': rectangles['left'][index], 'y2': rectangles['top'][index] + 
                   rectangles['height'][index], 'x2': rectangles['left'][index] + rectangles['width'][index]}
                  for index in range(len(rectangles['top']))]
    
    rows = [[rectangles[0]]]
    last_rectangle = rectangles[0]
    
    for rectangle in rectangles[1:]:
        
        if (rectangle['x1'] < last_rectangle['x1']):
            rows.append([])
            
        rows[len(rows) - 1].append(rectangle) 
        last_rectangle = rectangle
    
    final_rects = []
    
    for row in rows:
        final_rect = \
        [min([rectangle['x1'] for rectangle in row]),
         min([rectangle['y1'] for rectangle in row]),
         max([rectangle['x2'] for rectangle in row]),
         max([rectangle['y2'] for rectangle in row])]
        final_rects.append(final_rect)
        
    #x_coordinates = rectangles['top'] + [rectangles['top'][index] + rectangles['height'][index] for index, _ in enumerate(rectangles['height'])]
    #y_coordinates = rectangles['left'] + [rectangles['left'][index] + rectangles['width'][index] for index, _ in enumerate(rectangles['width'])]
    
    return  final_rects

In [4]:
pdf = 'example'

In [5]:
nlp = spacy.load("en_core_web_trf", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "tokenizer"])

In [6]:
nlp._config['components']['ner']

{'factory': 'ner',
 'incorrect_spans_key': None,
 'moves': None,
 'update_with_oracle_cut_size': 100,
 'model': {'@architectures': 'spacy.TransitionBasedParser.v2',
  'state_type': 'ner',
  'extra_state_tokens': False,
  'hidden_width': 64,
  'maxout_pieces': 2,
  'use_upper': False,
  'nO': None,
  'tok2vec': {'@architectures': 'spacy-transformers.TransformerListener.v1',
   'grad_factor': 1.0,
   'upstream': 'transformer',
   'pooling': {'@layers': 'reduce_mean.v1'}}}}

In [8]:
%%time

images = pdf2image.convert_from_path(f'input/{pdf}.pdf')

result_images = []

for image in images:
    d = pytesseract.image_to_data(image, output_type=Output.DICT)

    text = pytesseract.image_to_string(image)
    text = ' '.join(text.split())

    doc = nlp(text)

    image = np.array(image) 
    image = image[:, :, ::-1].copy() 
    
    image = mark_entities(image, doc, d, 6, nlp)
    
    result_images.append(Image.fromarray(image))

    
result_images[0].save(f'output/{pdf}-result.pdf', "PDF" ,resolution=100.0, save_all=True, append_images=result_images[1:])


KeyError: 'PERSON'