In [48]:
try:
    from PIL import Image
except ImportError:
    import Image
import cv2
import pytesseract
from pytesseract import Output
import os
import numpy as np
import pandas as pd
import re
from pdf2image import convert_from_bytes

In [45]:
PATH = os.getcwd()
file_list = [f for f in os.listdir(path=PATH) if f.endswith('.pdf') or f.endswith('.PDF')]
print(file_list)

['20231108170040648.pdf']


In [76]:
for file in file_list:
    pdf_file = convert_from_bytes(pdf_file=open(os.path.join(PATH,file), 'rb').read(),output_folder="./",fmt="png",output_file="test", paths_only=True)
    print(pdf_file)
    img = cv2.imread(pdf_file[0])
    # d = pytesseract.image_to_data(img, output_type=Output.DICT)
    # cv2.rectangle(img, (1232,220),(1232+352, 220+152),(0, 255, 0), 2)
    rgb = cv2.cvtColor(img[220:220+152,1232:1232+352], cv2.COLOR_BGR2RGB)
    text = pytesseract.image_to_string(rgb,config='--psm 12')
    print(text)
    
    # [80:280, 150:330]
    # im[y:y+h,x:x+w]
    #  [x, y, w, h]

# (1232, 220, 352, 152)

['./test0001-1.png']
TRANSACTION TYPE

Invoice

INVOICE NO

3002686



In [54]:

# Some help functions 
def get_conf(page_gray):
    '''return a average confidence value of OCR result '''
    df = pytesseract.image_to_data(page_gray,output_type='data.frame')
    df.drop(df[df.conf==-1].index.values,inplace=True)
    df.reset_index()
    return df.conf.mean()
  
def deskew(image):
    '''deskew the image'''
    gray = cv2.bitwise_not(image)
    temp_arr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(temp_arr > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated
  
'''
Main part of OCR:
pages_df: save eextracted text for each pdf file, index by page
OCR_dic : dict for saving df of each pdf, filename is the key
'''
PATH = os.getcwd()
file_list = [f for f in os.listdir(path=PATH) if f.endswith('.pdf') or f.endswith('.PDF')]
print(file_list)

OCR_dic={} 
for file in file_list:
    # convert pdf into image
    pdf_file = convert_from_bytes(pdf_file=open(os.path.join(PATH,file), 'rb').read(),output_folder="./",fmt="jpg")
    # create a df to save each pdf's text
    pages_df = pd.DataFrame(columns=['conf','text'])
    for (i,page) in enumerate(pdf_file) :
        try:
            # transfer image of pdf_file into array
            page_arr = np.asarray(page)
            # transfer into grayscale
            page_arr_gray = cv2.cvtColor(page_arr,cv2.COLOR_BGR2GRAY)
            # deskew the page
            page_deskew = deskew(page_arr_gray)
            # cal confidence value
            page_conf = get_conf(page_deskew)
            # extract string 
            pages_df = pages_df._append({'conf': page_conf,'text': pytesseract.image_to_string(page_deskew, config='--psm 5')}, ignore_index=True)
        except:
            # if can't extract then give some notes into df
            pages_df = pages_df._append({'conf': -1,'text': 'N/A'}, ignore_index=True)
            continue
    # save df into a dict with filename as key        
    OCR_dic[file]=pages_df
    print('{} is done'.format(file))
    print(OCR_dic)

['20231108170040648.pdf']
20231108170040648.pdf is done
{'20231108170040648.pdf':         conf                                               text
0  91.376072  LEVEL 4, TOWER BLOCK SENeRue\nMINISTRY OF FINA...}


  pages_df = pages_df._append({'conf': page_conf,'text': pytesseract.image_to_string(page_deskew, config='--psm 5')}, ignore_index=True)


In [28]:
PATH = os.getcwd()
file_list = [f for f in os.listdir(path=PATH) if f.endswith('.pdf') or f.endswith('.PDF')]

def combine_texts(list_of_text):
    '''Taking a list of texts and combining them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

 
OCR_dic={} 
for file in file_list:
    # convert pdf into image
    pdf_file = convert_from_bytes(open(os.path.join(PATH,file), 'rb').read())
    # create a df to save each pdf's text
    pages_df = pd.DataFrame(columns=['conf','text'])
    for (i,page) in enumerate(pdf_file) :
        try:
            # transfer image of pdf_file into array
            page_arr = np.asarray(page)
            # transfer into grayscale
            page_arr_gray = cv2.cvtColor(page_arr,cv2.COLOR_BGR2GRAY)
            page_arr_gray = cv2.fastNlMeansDenoising(page_arr_gray,None,3,7,21)
            page_deskew = deskew(page_arr_gray)
            # cal confidence value
            page_conf = get_conf(page_deskew)
            # extract string 
            d = pytesseract.image_to_data(page_deskew,output_type=pytesseract.Output.DICT)
            d_df = pd.DataFrame.from_dict(d)
            # get block number
            block_num = int(d_df.loc[d_df['level']==2,['block_num']].max())
            # drop header and footer by index
            head_index = d_df[d_df['block_num']==1].index.values
            foot_index = d_df[d_df['block_num']==block_num].index.values
            d_df.drop(head_index,inplace=True)
            d_df.drop(foot_index,inplace=True)
            # combine text in dataframe
            text = combine_texts(d_df.loc[d_df['level']==5,'text'].values)
            pages_df = pages_df._append({'conf': page_conf,'text': text}, ignore_index=True)
        except Exception as e:
            # if can't extract then give some notes into df
            if hasattr(e,'message'):
                pages_df = pages_df._append({'conf': -1,'text': e.message}, ignore_index=True)
            else:
                pages_df = pages_df._append({'conf': -1,'text': e}, ignore_index=True)
            continue
    # save df into a dict with filename as key        
    OCR_dic[file]=pages_df
    # print('{} is done'.format(file))
    print(pages_df["text"])

  block_num = int(d_df.loc[d_df['level']==2,['block_num']].max())
  pages_df = pages_df._append({'conf': page_conf,'text': text}, ignore_index=True)


0    Te te ee et SINGAPORE 369977 Singapore       R...
Name: text, dtype: object
0                                        LEVEL 4, T...
Name: text, dtype: object


  block_num = int(d_df.loc[d_df['level']==2,['block_num']].max())
  pages_df = pages_df._append({'conf': page_conf,'text': text}, ignore_index=True)
