In [None]:
#Installing necessary packages
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install pdf2image
!apt-get install poppler-utils 

In [149]:
import pytesseract as py
import shutil
import os
try:
    from PIL import Image, ImageOps
except ImportError:
    import Image


# Converting PDF to Image to pass through Tesseract

In [175]:

from pdf2image import convert_from_path
# Explicit import instead of `from pytesseract import *`: `image_to_string` is the
# only pytesseract name the visible cells call unqualified.
from pytesseract import image_to_string

# Render the PDF at 500 DPI; convert_from_path returns one image per page.
pages = convert_from_path('Doc2.pdf', 500)
# Alternative input document:
#pages = convert_from_path('Doc1.pdf', 500)

# Fail here with a clear message instead of leaving a missing/stale 'out.png'
# for the next cell to trip over.
if not pages:
    raise ValueError("No pages were rendered from the PDF")

# Every page used to be written to the same 'out.png', so only the last page
# ever survived. Write just that page and skip the discarded encodes.
page = pages[-1]
page.save('out.png', 'PNG')


# Basic preprocessing of the image to improve the text quality

In [176]:
# Preprocessing: binarise the rendered page and close small gaps before OCR.
import numpy as np
import cv2

# Load the page image written by the PDF conversion cell.
image = cv2.imread('out.png')
# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise surface as a cryptic cvtColor assertion error.
if image is None:
    raise FileNotFoundError("Could not read 'out.png'")

# Create grayscale (cv2.imread yields channels in BGR order).
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Perform threshold. With THRESH_OTSU the threshold is computed from the image
# and returned as `retr`; the literal 120 is not used in that mode.
retr, mask = cv2.threshold(gray_image, 120, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Remove noise / close gaps with a 2x2 morphological closing.
kernel = np.ones((2, 2), np.uint8)
result = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

# Save image. Kept as the last expression so imwrite's boolean return value
# is shown as the cell output.
cv2.imwrite("Result.png", result)


True

In [None]:
# Getting the contents of the page as a single string.
# `image` was loaded with cv2.imread and is therefore in BGR channel order,
# while pytesseract assumes RGB, so the channels are swapped before OCR.
text = image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# One entry per OCR line. NOTE: this rebinds `result`, which held the
# preprocessed image up to this point (already saved as 'Result.png').
result = text.split("\n")

# Extracting Total Invoice Value and Total VAT
Since these values are often located outside the table structure, it is convenient to extract the contents of the image as a single string and then parse that string.


In [177]:
# Extracting the total amount.
# Assumption: the largest floating-point number in the text is the total amount.
import re

# Remove whitespace that follows a digit, so a number split by spaces is rejoined.
# NOTE(review): this also joins two separate numbers that are separated only by
# whitespace; confirm this is acceptable for the target documents.
strip_spaces = re.sub(r'(?<=\d)\s+', '', text)
# Pad every run of letters with spaces so numbers attached to words are not missed.
strip_char = re.sub(r"[A-Za-z]+", lambda ele: " " + ele[0] + " ", strip_spaces)

# Drop commas (e.g. thousands separators) so "1,038.79" parses as one number.
remove_cmma = strip_char.replace(",", "")

# Raw string: "\d" in a plain string literal is an invalid escape sequence.
all_no = re.findall(r"\d+\.\d+", remove_cmma)
if not all_no:
    # Without this guard the indexing below fails with a bare IndexError.
    raise ValueError("No decimal amounts were found in the OCR text")

# Sorted ascending by numeric value; the entries themselves stay strings.
sorted_num = sorted(all_no, key=float)

total_inv_amt = sorted_num[-1]
print(total_inv_amt)




1038.79
180.29


In [None]:
# Extracting the total VAT.
# Assumption: the second-largest amount in the document is the total excluding
# VAT, so the gap between the two largest amounts is the total VAT.
gross_amount = float(sorted_num[-1])
net_amount = float(sorted_num[-2])
total_vat = round(gross_amount - net_amount, 2)
print(total_vat)

# Extracting Quantity and VAT for each line item
Since these values are usually located inside a table structure, we first try to identify the table and then extract the values from it.

In [233]:
# Run Tesseract on the preprocessed image and collect its layout output
# (box position, size and recognised text) as a pandas DataFrame.
# The config string passes the Tesseract variable textord_tabfind_find_tables=0.
# The context manager closes the image file once OCR has finished.
with Image.open(r'Result.png') as ocr_input:
    df = py.image_to_data(ocr_input, config="-c textord_tabfind_find_tables=0 ", output_type='data.frame')


In [None]:
# Display the OCR output table for inspection.
df

In [237]:
# Bottom-right corner (x2, y2) of every box
df['x2'] = df['left'] + df['width']
df['y2'] = df['top'] + df['height']

# Midpoint of every box
df['x_center'] = (df['left'] + df['x2']) / 2
df['y_center'] = (df['top'] + df['y2']) / 2

# Row masks for the keywords; in each lookup below the first matching row is used
qty_keys = ['Quantity', 'Qty']
is_qty = df['text'].isin(qty_keys)
is_vat = df['text'] == 'VAT'
is_total = df['text'] == 'Total'

# Centre points of the quantity and VAT keywords
qty_x_center = df.loc[is_qty, 'x_center'].iloc[0]
qty_y_center = df.loc[is_qty, 'y_center'].iloc[0]
vat_x_center = df.loc[is_vat, 'x_center'].iloc[0]
vat_y_center = df.loc[is_vat, 'y_center'].iloc[0]

# The quantity and 'Total' keywords usually mark the beginning and end of the
# table; only values between them are picked up for the analysis.
total_top = df.loc[is_total, 'top'].iloc[0]
qty_top = df.loc[is_qty, 'top'].iloc[0]

In [238]:

# Show the keyword reference coordinates, one value per line
print(
    qty_x_center,
    qty_y_center,
    vat_x_center,
    vat_y_center,
    total_top,
    qty_top,
    sep="\n",
)


376.5
2192.0
3205.5
2187.5
3819
2166


In [239]:
# Remove rows without text
df = df.dropna()

# To reset the indices
df = df.reset_index(drop=True)

# Signed offset of each box centre from the centre of the keywords
df['qty_x_cen_diff'] = df['x_center'] - qty_x_center
df['qty_y_cen_diff'] = df['y_center'] - qty_y_center
df['vat_x_cen_diff'] = df['x_center'] - vat_x_center
df['vat_y_cen_diff'] = df['y_center'] - vat_y_center

# Keep only rows whose top edge lies strictly between the quantity keyword and
# the word 'Total', i.e. below the table header and above the totals line.
df = df[(df.top > qty_top) & (df.top < total_top)]

# Two separate frames: integer-only tokens are quantity candidates, tokens that
# contain a decimal number are VAT candidates. The patterns are raw strings
# because "\d" in a plain string literal is an invalid escape sequence. The
# `top < total_top` bound is already enforced by the filter above.
df_qty = df[df['text'].str.contains(r'^\d+$')]
df_vat = df[df['text'].str.contains(r'\d+\.\d+')]

In [240]:
#Finding the values having closest center points to the keywords
df_1=(df_qty[df_qty['qty_x_cen_diff'].between(-30, 30)])
df_2=(df_vat[df_vat['vat_x_cen_diff'].between(-550, 550)])

In [241]:

# Text of the quantity candidates as a plain Python list, shown as cell output
quantities = df_1['text'].tolist()
quantities

['4', '1']

**Comment:** Tesseract did not capture the quantity values from the 2nd document correctly. If the OCR quality can be improved, the logic above will still hold and the quantity values can be retrieved from the document.

In [242]:
# Text of the VAT candidates as a plain Python list, shown as cell output
vat = df_2['text'].tolist()
vat

['4.60', '1.38', '0.00', '0.00', '5.98']

**Comment:** 
 

*   The 2nd document does not have individual VAT amounts for the line items.
*   The first document has 2 column titles containing the word 'VAT', so additional values close to both columns are captured. This can be reduced further by fine-tuning the table extraction.



