In [3]:
#poppler - http://blog.alivate.com.au/poppler-windows/   

from pdf2image import convert_from_path

pages = convert_from_path('./pdf/sample_papers.pdf')

for i,page in enumerate(pages):
    page.save(f'./image/pdf_images/{i+1}.jpg', 'JPEG')

In [4]:
import os

PATH = './image/pdf_images/'
fnames = sorted(os.listdir(PATH), key = lambda s:int(os.path.splitext(s)[0]))
fnames

['1.jpg',
 '2.jpg',
 '3.jpg',
 '4.jpg',
 '5.jpg',
 '6.jpg',
 '7.jpg',
 '8.jpg',
 '9.jpg',
 '10.jpg']

In [5]:
from pytesseract import pytesseract
import pandas as pd
import numpy as np
from itertools import chain
import cv2

def load_image_with_cv2(path, mode = 'rgb'):
    if mode=='grayscale':
        image = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
    else:
        image = cv2.imread(path)
    return image


def run_pytesseract(image, path_engine, config= '-l eng'):
    pytesseract.tesseract_cmd = path_engine
    result = pytesseract.image_to_string(image, config=config)
    return result


def image2data(path_img):
    data = pytesseract.image_to_data(path_img, config='-l eng+kor --oem 1 --psm 11')
    lines = data.split('\n')
    columns, df_data = None, None
    for i,line in enumerate(lines):
        line = line.split('\t')
        if i==0:
            columns = line.copy()
            df_data = {col:[] for col in line}
            continue
        for col, l in zip(columns,line):
            df_data[col]+=[l]
    return pd.DataFrame(df_data)


def get_bounding_box(img_data, padding=50):
    valids = []
    for le, to, wi, he, te in image_data[['left','top', 'width', 'height', 'text']].values.tolist():
        if te.strip()!='':
            valids.append((max(0,int(le)-padding), max(0,int(to)-padding), int(wi)+padding*2, int(he)+padding*2))
    return valids


def mark_region(bbox_points, image_shape, flag='left-top-width-height'):
    if flag=='left-top-width-height':
        region = np.zeros(image_shape)
        for l,t,w,h in bbox_points:
            region[t:t+h, l:l+w] = 255
        return region
    
def merge_region(bbox_points, image_shape, flag='left-top-width-height', get_bb_ratio=False):
    if flag=='left-top-width-height':
        region = np.zeros(image_shape)
        for l,t,w,h in bbox_points:
            region[t:t+h, l:l+w] = 255
        top_bottom = []
        has_top = False
        for i,line in enumerate(region):
            if sum(line) > 0.:
                if not has_top:
                    top_bottom.append([i])
                    has_top=True
            else:
                if has_top:
                    top_bottom[-1].append(i)
                    has_top = False
        tblrs = []
        for top, bottom in top_bottom:
            left_right = []
            has_left = False
            obj = region[top:bottom, :]
            for j in range(region.shape[-1]):
                line = obj[:,j]
                if sum(line) > 0.:
                    if not has_left:
                        left_right.append([j])
                        has_left=True
                else:
                    if has_left:
                        left_right[-1].append(j)
                        has_left = False
            
            tbs = [[top,bottom]]*len(left_right)
            if get_bb_ratio:
                pairs = [(lr[0]/image_shape[1], tb[0]/image_shape[0], max(0,lr[1]-lr[0])/image_shape[1], (max(0,tb[1]-tb[0]))/image_shape[0]) for tb,lr in zip(tbs, left_right)]
            else:
                pairs = [(lr[0], tb[0],max(0,lr[1]-lr[0]), tb[1]-tb[0]) for tb,lr in zip(tbs, left_right)]
                
            tblrs.append(pairs)
        tblrs = list(chain(*tblrs))
        return tblrs
    
def fast_way(image, flag='left-top-width-height', padding=10,get_bb_ratio=False):
    image_shape = image.shape
    image = 255-image
    if flag=='left-top-width-height':
        region = np.zeros(image_shape)
        top_bottom = []
        has_top = False
        for i,line in enumerate(image):
            if sum(line) > 0.:
                if not has_top:
                    top_bottom.append([i])
                    has_top=True
            else:
                if has_top:
                    top_bottom[-1].append(i)
                    has_top = False
        tblrs = []
        for top, bottom in top_bottom:
            left_right = []
            has_left = False
            obj = image[top:bottom, :]
            for j in range(image.shape[-1]):
                line = obj[:,j]
                if sum(line) > 0.:
                    if not has_left:
                        left_right.append([j])
                        has_left=True
                else:
                    if has_left:
                        left_right[-1].append(j)
                        has_left = False
            
            tbs = [[top,bottom]]*len(left_right)
            if get_bb_ratio:
                pairs = [(lr[0]/image_shape[1], tb[0]/image_shape[0], max(0,lr[1]-lr[0])/image_shape[1], (max(0,tb[1]-tb[0]))/image_shape[0]) for tb,lr in zip(tbs, left_right)]
            else:
                pairs = [(lr[0]-padding, tb[0]-padding,max(0,lr[1]-lr[0])+padding*2, max(0,tb[1]-tb[0])+padding*2) for tb,lr in zip(tbs, left_right)]
                
            tblrs.append(pairs)
        tblrs = list(chain(*tblrs))
        return tblrs

In [6]:
import time

path_engine = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
get_region = lambda image,l,t,w,h:image[t:t+h, l:l+w] 

bboxes = []
start = time.time()
for fname in fnames:
    print(fname)
    path = os.path.join(PATH,fname)
    image = load_image_with_cv2(path, mode='grayscale')
    image_data = image2data(path)
    bb = get_bounding_box(image_data, padding=10)
    merged_bbox = merge_region(bb, image.shape, get_bb_ratio=False)
    new_region = mark_region(merged_bbox, image.shape)
    bboxes.append(merged_bbox)
print(f'{len(fnames)}개의 이미지: {time.time()-start}초 소요됨.')

1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
10개의 이미지: 377.56705951690674초 소요됨.


In [8]:
import time

#해당 방법은 배경이 흰색(255)인 경우 적용 가능합니다.

path_engine = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
get_region = lambda image,l,t,w,h:image[t:t+h, l:l+w] 

bboxes = []
start = time.time()
for fname in fnames:
    print(fname)
    path = os.path.join(PATH,fname)
    image = load_image_with_cv2(path, mode='grayscale')
    bb = fast_way(image, padding=10)
    merged_bbox = merge_region(bb, image.shape, get_bb_ratio=False)
    new_region = mark_region(merged_bbox, image.shape)
    bboxes.append(merged_bbox)
print(f'{len(fnames)}개의 이미지: {time.time()-start}초 소요됨.')

1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
10개의 이미지: 29.925760746002197초 소요됨.


In [10]:
bboxes

[[(133, 118, 1382, 101),
  (390, 254, 868, 76),
  (398, 342, 861, 117),
  (133, 485, 1382, 38),
  (174, 534, 1300, 100),
  (357, 654, 934, 173),
  (198, 838, 1260, 468),
  (197, 1317, 710, 54),
  (133, 1429, 678, 325),
  (846, 1429, 660, 325),
  (142, 1774, 1372, 268),
  (149, 2085, 550, 54),
  (1253, 2085, 246, 54)],
 [(134, 101, 1381, 86),
  (142, 206, 668, 1828),
  (837, 206, 678, 1828),
  (149, 2085, 621, 54)],
 [(134, 101, 1381, 86),
  (142, 206, 164, 52),
  (846, 206, 156, 52),
  (142, 294, 668, 1740),
  (838, 294, 676, 1740),
  (1253, 2085, 262, 54)],
 [(134, 101, 1381, 86),
  (142, 206, 668, 668),
  (846, 206, 660, 668),
  (133, 902, 1382, 1116),
  (149, 2085, 621, 54)],
 [(134, 101, 1381, 86),
  (133, 206, 1382, 1196),
  (142, 1454, 668, 588),
  (846, 1454, 660, 588),
  (1253, 2078, 262, 61)],
 [(134, 101, 1381, 86),
  (133, 206, 1382, 972),
  (142, 1221, 668, 813),
  (837, 1221, 678, 813),
  (149, 2085, 621, 54)],
 [(134, 101, 1381, 86),
  (133, 206, 1382, 524),
  (142, 758, 

In [11]:
import pdfplumber
texts = ''
image
with pdfplumber.open("./pdf/sample_papers.pdf") as pdf:
    for page, bbox in zip(pdf.pages,bboxes):
        i_height, i_width = image.shape 
        r = page.height/i_height
        plumber_region = lambda bb: (bb[0]*r, bb[1]*r, bb[2]*r+bb[0]*r, bb[3]*r+bb[1]*r)
        for bb in bbox:
            bb = plumber_region(bb)
            boxed = page.within_bbox(bb)
            text = boxed.extract_text()
            if isinstance(text, str):
                texts+=(text+'\n')

In [12]:
texts

"지역사회간호학회지 제24권 제1호, 2013년 3월 ISSN 1225-9594\nJ Korean Acad Community Health Nurs Vol. 24 No. 1, 1-10, March 2013 http://dx.doi.org/10.12799/jkachn.2013.24.1.1\n지역사회거주 노인의 황반변성 관련 요인\n김철규1·박윤경2·박승미3\n‧\n청주대학교 간호학과1, 한국실명예방재단2, 호서대학교 간호학과 기초과학연구소3\nFactors Associated with Senile Macular Degeneration in Elders within\nCommunities\nKim, Chul-Gyu1 · Park, Yungeong2 · Park, Seungmi3\n1Department of Nursing, Cheongju University, Cheongju\n2Korean‧ Foundation for the Prevention of Blindness, Seoul\n3Department of Nursing Research Institute for Basic Sciences, Hoseo University, Asan, Korea\nPurpose: This study was conducted to examine the degree of senile macular degeneration in elders aged 65 or \nolder and identify factors associated with senile macular degeneration in elders within communities. Methods: \nParticipants in this cross-sectional descriptive study were 388 elders without cataract, glaucoma, and diabetic \nretinopathy. Data were collected through face to face interviews using a q