# Optical Character Recognition 

## Python-tesseract is a wrapper for Google's Tesseract-OCR Engine

In [None]:
!pip install pytesseract # This module helps convert images to text.

In [None]:
import os

import pytesseract

# Default install location of the Tesseract-OCR binary on Windows.
_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Point pytesseract at that binary only when it really exists. On any other
# machine pytesseract keeps its default command, so a `tesseract` executable
# found on PATH is used instead of a path that is guaranteed to fail.
if os.path.isfile(_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT
# OpenCV is an open-source library for computer vision, machine learning, and image processing applications.
# !pip install opencv-python

In [None]:
import cv2
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
import os

In [None]:
# Path of the sample image that every cell below reads.
# To try a different sample, assign one of these instead:
#   'Images/tax_ex.jpg'
#   'Images/indonesian_passport_example.jpg'
#   'Images/Stop_Sign.jpg'
image_name = 'Images/Yield_Sign.jpg'

In [None]:
# Read the selected sample image from disk.
image = cv2.imread(image_name)
# cv2.imread reports failure by returning None instead of raising, so stop
# here with a clear message rather than deep inside a later OpenCV call.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_name!r}")
# Optional: resize the image before OCR.
# image = cv2.resize(image, (500, 500))

In [None]:
# Preview the image exactly as loaded (imshow expects OpenCV's BGR order).
cv2.imshow("Sample Image", image)
# Extraction of text from image.
# pytesseract treats NumPy arrays as RGB, while OpenCV loads BGR, so swap the
# channel order for the OCR call only.
text = pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
print(text)
# Keep the preview window open until a key is pressed, then clean up.
cv2.waitKey(0)
cv2.destroyAllWindows()

## Attempting grayscale to capture all of the lettering

In [None]:
# Load the sample again and reduce it to a single grayscale channel.
image = cv2.cvtColor(cv2.imread(image_name), cv2.COLOR_BGR2GRAY)

# Preview the grayscale result.
cv2.imshow("Grey Scaled Image", image)

# OCR the grayscale image and print whatever text was recognised.
text = pytesseract.image_to_string(image)
print(text)

# Wait for a key press, then close the preview window.
cv2.waitKey(0)
cv2.destroyAllWindows()

## Gain Division (Normalization)

In [None]:
# Reading an image in default mode:
image = cv2.imread(image_name)
# cv2.imread returns None (it does not raise) when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_name!r}")

# Get local maximum: a morphological closing with a square kernel replaces
# each pixel with an estimate of the brightest value in its neighbourhood.
kernelSize = 5
maxKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernelSize, kernelSize))
# processing ops. based on shapes
localMax = cv2.morphologyEx(image, cv2.MORPH_CLOSE, maxKernel, None, None, 1, cv2.BORDER_REFLECT101)

# Perform gain division (image / local maximum).
# np.where evaluates both branches for every element, so dividing by
# localMax directly would still divide by zero (RuntimeWarning, NaN/inf
# intermediates). Substitute 1 for the zero entries first; those positions
# are overwritten with 0 by the outer np.where anyway.
safeMax = np.where(localMax == 0, 1, localMax)
gainDivision = np.where(localMax == 0, 0, image / safeMax)

# Scale to 8-bit range and clip the values to [0,255]
gainDivision = np.clip((255 * gainDivision), 0, 255)

# Convert the mat type from float to uint8:
gainDivision = gainDivision.astype("uint8")

In [None]:
# Preview the normalised image.
# (Already white background so not much happening here.)
cv2.imshow("White Background", gainDivision)
# gainDivision is still a 3-channel BGR array; pytesseract assumes RGB, so
# swap the channel order for the OCR call only.
text = pytesseract.image_to_string(cv2.cvtColor(gainDivision, cv2.COLOR_BGR2RGB))
print(text)
# Keep the preview window open until a key is pressed, then clean up.
cv2.waitKey(0)
cv2.destroyAllWindows()

## Otsu's Thresholding

In [None]:
# Collapse the normalised BGR image to a single grayscale channel.
grayscaleImage = cv2.cvtColor(gainDivision, cv2.COLOR_BGR2GRAY)

# Binarise with an inverted threshold; the OTSU flag makes OpenCV choose the
# threshold itself, so the 0 passed as `thresh` is ignored.
otsuFlags = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
_, binaryImage = cv2.threshold(grayscaleImage, 0, 255, otsuFlags)

In [None]:
# Preview the binarised image.
cv2.imshow("Otsu Thresholding", binaryImage)

# OCR the binary image and print the recognised text.
text = pytesseract.image_to_string(binaryImage)
print(text)

# Wait for a key press, then close the preview window.
cv2.waitKey(0)
cv2.destroyAllWindows()

## Ensure closing of characters (for background color filling)

In [None]:
# Parameters of the closing step: side length of the square structuring
# element and how many times the operation is applied.
kernelSize = 3
opIterations = 1

# Square structuring element of kernelSize x kernelSize.
morphKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernelSize, kernelSize))

# Morphological closing of the binary image, reflecting pixels at the border.
binaryImage = cv2.morphologyEx(
    binaryImage,
    cv2.MORPH_CLOSE,
    morphKernel,
    iterations=opIterations,
    borderType=cv2.BORDER_REFLECT101,
)

In [None]:
# Preview the image after the closing step.
cv2.imshow("Character filling", binaryImage)

# OCR the closed binary image and print the recognised text.
text = pytesseract.image_to_string(binaryImage)
print(text)

# Wait for a key press, then close the preview window.
cv2.waitKey(0)
cv2.destroyAllWindows()

## Flood filling

In [None]:
# Flood-fill in place with white (255), seeded at the top-left pixel (0, 0).
# Goal: white background with black text, which Tesseract handles best.
cv2.floodFill(binaryImage, None, (0, 0), 255)

In [None]:
# Preview the flood-filled image.
cv2.imshow("Flood filling", binaryImage)

# OCR the flood-filled image. (Didn't work all the way.)
text = pytesseract.image_to_string(binaryImage)
print(text)

# Wait for a key press, then close the preview window.
cv2.waitKey(0)
cv2.destroyAllWindows()

## PDF Images

In [None]:
import fitz  # PyMuPDF

pdffile = 'Images/Berkshire_hathaway_68.pdf'
output_path = "Images/Berkshire_hathaway_68.png"

# Render the first page of the PDF to a PNG file.
# Uses PyMuPDF's snake_case API: the camelCase aliases used previously
# (loadPage / getPixmap / writePNG) are deprecated and no longer available in
# current PyMuPDF releases. The context manager closes the document once the
# page has been rendered.
with fitz.open(pdffile) as doc:
    page = doc.load_page(0)  # zero-based page index
    pix = page.get_pixmap()  # rasterise at the default resolution
    pix.save(output_path)    # output format is taken from the .png extension

In [None]:
# Load the PNG rendered from the PDF page.
image = cv2.imread(output_path)
# cv2.imread returns None (it does not raise) when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {output_path!r}")

In [None]:
# Preview the rendered PDF page (imshow expects OpenCV's BGR order).
cv2.imshow("Sample PDF Image", image)
# Extraction of text from image.
# pytesseract treats NumPy arrays as RGB, while OpenCV loads BGR, so swap the
# channel order for the OCR call only.
text = pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
print(text)
# Keep the preview window open until a key is pressed, then clean up.
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Upscale the rendered page to 1782x2322 pixels and save it as a new file.
# Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter
# under its current name. The context manager releases the source file handle
# once the resized copy has been produced.
with Image.open(output_path) as source:
    image = source.resize((1782, 2322), Image.LANCZOS)
image.save(fp="newimage_1.png")

In [None]:
# Load the upscaled page image.
image = cv2.imread('newimage_1.png')
# cv2.imread returns None (it does not raise) when the file cannot be read.
if image is None:
    raise FileNotFoundError("Could not read image: 'newimage_1.png'")
cv2.imshow("Sample PDF Image Resized", image)
# Extraction of text from image.
# pytesseract treats NumPy arrays as RGB, while OpenCV loads BGR, so swap the
# channel order for the OCR call only.
text = pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
print(text)
# Keep the preview window open until a key is pressed, then clean up.
cv2.waitKey(0)
cv2.destroyAllWindows()