# OCR

### Create a dataframe with bounding boxes for a receipt.

In [1]:
import cv2 
import pytesseract
from pytesseract import Output
import numpy as np
import pandas as pd 

# Tell pytesseract which Tesseract executable to invoke. This is a hard-coded
# Windows install path and must be adjusted on machines where Tesseract lives elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [2]:
# Image path (input & output)
image = r'C:\Users\Tan Jia Xin\Documents\Aimazing\GCN\info_extraction_receipts\data\raw\img\000.jpg'
output = r'C:\Users\Tan Jia Xin\Documents\AImazing\GCN\img\tess_000.jpg'

# cv2.imread() loads an image from the specified file as a NumPy array.
# In the Python bindings it does NOT raise when the file is missing or unreadable;
# it returns None instead, so fail loudly here rather than deep inside the OCR call.
img = cv2.imread(image)
if img is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {image}")

# Displaying the image
# cv2.imshow('image', img)

# Displaying the matrix
img

array([[[247, 252, 253],
        [247, 252, 253],
        [247, 252, 253],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[248, 253, 254],
        [248, 253, 254],
        [248, 253, 254],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[250, 255, 255],
        [250, 255, 255],
        [250, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[192, 202, 209],
        [198, 208, 215],
        [211, 221, 228],
        ...,
        [232, 235, 243],
        [222, 225, 233],
        [214, 217, 225]],

       [[196, 206, 213],
        [202, 212, 219],
        [211, 221, 228],
        ...,
        [215, 218, 226],
        [202, 205, 213],
        [192, 195, 203]],

       [[202, 212, 219],
        [206, 216, 223],
        [212, 222, 229],
        ...,
        [197, 200, 208],
        [185, 188, 196],
        [175, 178, 186]]

`image_to_string`: returns the unmodified output of Tesseract OCR processing as a string.  
`image_to_boxes`: returns the recognized characters and their box boundaries.  
`image_to_data`: returns box boundaries, confidences, and other information. Requires Tesseract 3.05+. For more information, see the Tesseract TSV documentation.

In [3]:
# Additional custom configuration
# page segmentation mode (psm) 6 = Assume a single uniform block of text.
# OCR engine mode (oem) 3 = Default, based on what is available (legacy or LSTM).
# NOTE: custom_config is kept for experimentation only; it is not passed to
# Tesseract below (the config argument is commented out).
custom_config = r'--oem 3 --psm 6'

# Run OCR and collect the result as a dict of parallel lists: for every detected
# element we get its bounding box (left, top, width, height), its text and its
# confidence score (conf).
d = pytesseract.image_to_data(img, output_type=Output.DICT) #, config=custom_config)
print(d.keys())

# Show the same OCR result as a table (all key/value pairs of d, one row per element).
# Only the last expression of a cell is rendered, so the table is the cell's output.
pd.set_option('display.max_rows', 500)
d_table = pytesseract.image_to_data(img, output_type=Output.DATAFRAME)
d_table

dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,1,1,0,0,0,0,0,0,463,1013,-1.0,
1,2,1,1,0,0,0,75,32,244,31,-1.0,
2,3,1,1,1,0,0,75,32,244,31,-1.0,
3,4,1,1,1,1,0,75,32,244,31,-1.0,
4,5,1,1,1,1,1,75,32,51,23,92.892517,tan
5,5,1,1,1,1,2,138,37,91,18,92.174042,woon
6,5,1,1,1,1,3,241,37,78,26,92.543823,yann
7,2,1,2,0,0,0,73,95,346,136,-1.0,
8,3,1,2,1,0,0,73,95,346,136,-1.0,
9,4,1,2,1,1,0,73,95,346,17,-1.0,


Syntax: cv2.rectangle(image, start_point, end_point, color, thickness)  
Parameters:  
image: It is the image on which rectangle is to be drawn.  
start_point: It is the starting coordinates of rectangle. The coordinates are represented as tuples of two values i.e. (X coordinate value, Y coordinate value).  
end_point: It is the ending coordinates of rectangle. The coordinates are represented as tuples of two values i.e. (X coordinate value, Y coordinate value).  
color: It is the color of border line of rectangle to be drawn. For BGR, we pass a tuple. eg: (255, 0, 0) for blue color.  
thickness: It is the thickness of the rectangle border line in px. Thickness of -1 px will fill the rectangle shape by the specified color.  
Return Value: It returns an image.  
  
  
Syntax: cv2.putText(image, text, org, font, fontScale, color[, thickness[, lineType[, bottomLeftOrigin]]])  
Parameters:  
image: It is the image on which text is to be drawn.  
text: Text string to be drawn.  
org: It is the coordinates of the bottom-left corner of the text string in the image. The coordinates are represented as tuples of two values i.e. (X coordinate value, Y coordinate value).  
font: It denotes the font type. Some of the font types are FONT_HERSHEY_SIMPLEX, FONT_HERSHEY_PLAIN, etc.  
fontScale: Font scale factor that is multiplied by the font-specific base size.  
color: It is the color of text string to be drawn. For BGR, we pass a tuple. eg: (0, 0, 255) for red color.  
thickness: It is the thickness of the line in px.  
lineType: This is an optional parameter. It gives the type of the line to be used.  
bottomLeftOrigin: This is an optional parameter. When it is true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner.  
Return Value: It returns an image.  

The shape of an image is accessed by `img.shape`. It returns a tuple of the number of rows, columns, and channels (if the image is color):  
for example, `print(img.shape)` prints `(342, 548, 3)` for a color image that is 342 pixels high and 548 pixels wide.

In [None]:
# For graph modeling: parallel lists of box corners and the recognized text.
xmin,ymin,xmax,ymax,Object = [],[],[],[],[]

# Number of OCR entries in d (every list in d has this length).
n_boxes = len(d['text'])

# Image dimensions do not change while drawing, so compute them once.
img_height, img_width = img.shape[0], img.shape[1]

for i in range(n_boxes):
    # Structural rows (page/block/paragraph/line) carry conf == -1 and are skipped here.
    # NOTE(review): conf appears to be on a 0-100 scale, so after int() truncation this
    # test is effectively "conf >= 1"; if a 74% cut-off was intended the threshold
    # should be 74 - confirm before changing, as it alters which words are kept.
    if int(float(d['conf'][i])) >= 0.74:
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        text = d['text'][i]

        # Draw the bounding box and the recognized text just above it.
        # Note: this draws directly on img, so the array loaded earlier is modified.
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 1)
        img = cv2.putText(img, text, (x, y - 1), cv2.FONT_HERSHEY_DUPLEX, 0.6, (0, 0, 255), 1)

        xmin.append(x)
        ymin.append(y)
        xmax.append(x + w)
        ymax.append(y + h)
        Object.append(text)

# Build the frame in one step from the collected columns.
df = pd.DataFrame({'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax, 'Object': Object})

# Remove empty objects from df. Tesseract can emit texts made only of whitespace
# (of any length), so strip before comparing instead of matching a single " ".
df = df[df['Object'].astype(str).str.strip() != '']

print(df)

df.to_csv('test550_scratchpart2' + '.csv' ,index = False)
cv2.imwrite('test550_scratchpart2' + '.jpg', img)

# Syntax: cv2.imwrite(filename, image) to save the image
cv2.imwrite(output, img)

# Show the annotated image in a native window; blocks until a key is pressed.
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

    xmin  ymin  xmax  ymax         Object
0     75    32   126    55            tan
1    138    37   229    55           woon
2    241    37   319    63           yann
3     73    97   125   110           BOOK
4    132    97   178   110            TAK
5    186    95   261   112         (TAMAN
6    269    95   327   112          DAYA)
7    338    96   370   109            SDN
8    380    96   419   109            BHD
9    112   147   155   161          NO.5?
10   165   147   204   163          55,57
11   211   147   221   159              &
12   229   147   248   162            59,
13   260   147   303   160          JALAN
14   311   146   353   159           SAGU
15   364   146   381   162            18,
16   196   170   247   184          TAMAN
17   255   170   295   184           DAYA
18   164   195   209   208          81100
19   215   194   268   207          JOHOR
20   276   194   329   208         BAHRU,
21   219   217   271   231         JOHOR.
22    93   275   148   322        