-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfocr
executable file
·41 lines (31 loc) · 1.03 KB
/
pdfocr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#! /usr/bin/env python
#!-*- coding: utf8 -*-
from wand.image import Image
from PIL import Image as PI
from pytesseract import *
import io
import sys
def _render_pages(pdf_path, resolution=300):
    """Yield every page of the PDF at *pdf_path* as an in-memory JPEG blob.

    The Wand images are opened as context managers so the underlying
    ImageMagick resources are released as soon as each one is done with,
    even if the consumer raises part-way through.
    """
    with Image(filename=pdf_path, resolution=resolution) as pdf_image:
        with pdf_image.convert('jpeg') as jpeg_image:
            for frame in jpeg_image.sequence:
                with Image(image=frame) as page:
                    yield page.make_blob('jpeg')


def _ocr_page(jpeg_blob, lang='por'):
    """Run Tesseract on one JPEG blob and return the recognised text.

    The result is always text (``unicode`` on Python 2, ``str`` on
    Python 3): if pytesseract hands back a byte string it is decoded as
    UTF-8 here, so encoding happens exactly once, at the file boundary.
    """
    text = pytesseract.image_to_string(PI.open(io.BytesIO(jpeg_blob)),
                                       lang=lang)
    if isinstance(text, bytes):
        # Calling .encode() on such a byte string would trigger an implicit
        # ASCII decode on Python 2 and fail on accented characters.
        text = text.decode('utf-8', 'ignore')
    return text


def _write_text(path, pages):
    """Write the fixed header followed by every page's text to *path*.

    The file is opened once, inside a ``with`` block, so the handle is
    closed even if a write fails. ``errors="ignore"`` drops characters that
    cannot be encoded as UTF-8 instead of raising.
    """
    with io.open(path, "w", encoding="utf-8", errors="ignore") as text_file:
        text_file.write(u"TEXTO CONVERTIDO VIA PDFOCR.PY")
        text_file.writelines(pages)


def main(argv):
    """Convert a PDF to text with OCR and save the result.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the input PDF path
            and ``argv[2]`` is the output text file path.

    Returns:
        Process exit status: 1 when too few arguments were given (after
        printing the usage line), 0 once the output file has been written.
    """
    if len(argv) < 3:
        print("USAGE: pdfocr.py input.pdf output.txt")
        return 1

    pdf_file = argv[1]
    text_file_path = argv[2]

    print("Criando imagens a partir do PDF...")

    # The text is collected first and written at the end, so a failure while
    # rendering or recognising a page never truncates an existing output file.
    final_text = []
    for idx, jpeg_blob in enumerate(_render_pages(pdf_file), start=1):
        print("\n>>> Decodificando página " + str(idx) + "... \n")
        txt = _ocr_page(jpeg_blob)
        print(txt)
        final_text.append(txt)

    _write_text(text_file_path, final_text)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))