In [1]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.6.tar.gz (13 kB)
Building wheels for collected packages: pytesseract
  Building wheel for pytesseract (setup.py) ... [?25ldone
[?25h  Created wheel for pytesseract: filename=pytesseract-0.3.6-py2.py3-none-any.whl size=13631 sha256=3417507cdafdbd76ad5bdf6e3a160fee769d10871c8ee71c25685626c39059e7
  Stored in directory: /Users/macbookpro/Library/Caches/pip/wheels/f1/2f/a5/574c57fb22cfcf24f315c8feda132fd0463a9b07ef78394d07
Successfully built pytesseract
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
import cv2
import numpy as np
import shutil
import os
import random
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt

try:
    from PIL import Image
except ImportError:
    import Image

In [2]:
receipt_path = '../Sample_images/receipt.jpg'
receipt = cv2.imread(receipt_path)
# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so fail here with a clear message rather than inside cv2.imshow.
if receipt is None:
    raise FileNotFoundError("Could not read image: " + receipt_path)
# Show the image in a native OpenCV window and block until a key is pressed.
cv2.imshow("sample", receipt)
cv2.waitKey(0)
cv2.destroyAllWindows()

## **Preprocessing functions**

In [41]:
# grayscale conversion
def get_grayscale(image):
    """Return `image` converted with cv2.COLOR_BGR2GRAY.

    Assumes `image` is in BGR channel order (as produced by cv2.imread) --
    TODO confirm for other callers.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

# noise removal
def remove_noise(image):
    """Smooth `image` with a 5x5 Gaussian blur.

    cv2.GaussianBlur requires a sigmaX argument; passing 0 lets OpenCV derive
    the standard deviation from the kernel size.

    Returns the blurred image produced by cv2.GaussianBlur.
    """
    kernel_size = (5, 5)
    return cv2.GaussianBlur(image, kernel_size, 0)

# adaptive thresholding
def thresholding(image):
    """Binarise `image` with OpenCV's mean adaptive threshold.

    Uses a maximum value of 255, a block size of 7 and a constant of 4,
    with the THRESH_BINARY threshold type.
    """
    max_value = 255
    block_size = 7
    constant = 4
    return cv2.adaptiveThreshold(
        image,
        max_value,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )
                            
# return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# dilation: enhance the bright area
def dilate(image):
    """Dilate `image` once using a 5x5 structuring element of ones."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated

# erosion: enhance the dark area
def erode(image):
    """Erode `image` once using a 5x5 structuring element of ones.

    Returns the eroded image produced by cv2.erode.
    """
    # The dtype must be np.uint8; the previous spelling `np.unint8` does not
    # exist in NumPy and raised AttributeError on every call.
    kernel = np.ones((5, 5), np.uint8)
    return cv2.erode(image, kernel, iterations=1)

# opening: erosion followed by dilation
def opening(image):
    """Apply a morphological opening to `image` with a 5x5 kernel of ones."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

# closing: dilation followed by erosion; removes black holes inside the object
def closing(image):
    """Open `image` first, then apply a morphological closing to the result.

    Both steps use a 5x5 kernel of ones (the opening via `opening()`).

    NOTE(review): the input is passed through `opening()` before the close,
    so this is an open-then-close filter rather than a plain closing --
    confirm this is intended.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = opening(image)
    return cv2.morphologyEx(opened, cv2.MORPH_CLOSE, structuring_element)

# canny edge detection
def canny(image):
    """Run cv2.Canny on `image` with thresholds 100 and 200."""
    lower_threshold = 100
    upper_threshold = 200
    return cv2.Canny(image, lower_threshold, upper_threshold)

# deskew image
def deskew(image):
    """Rotate `image` about its centre to undo the skew of its non-zero pixels.

    The skew angle is the last element of cv2.minAreaRect fitted to the
    coordinates of every pixel greater than zero. The image is then warped
    with cubic interpolation and replicated borders, keeping its size.

    NOTE(review): the -45 cut-off presumes minAreaRect reports angles in
    [-90, 0); the angle convention may differ between OpenCV releases --
    confirm against the installed version.
    NOTE(review): np.where yields (row, col) pairs, which are handed to
    minAreaRect as-is -- confirm this ordering is intended.
    """
    height, width = image.shape[:2]
    midpoint = (width // 2, height // 2)

    foreground_points = np.column_stack(np.where(image > 0))
    rect_angle = cv2.minAreaRect(foreground_points)[-1]
    rotation = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    rotation_matrix = cv2.getRotationMatrix2D(midpoint, rotation, 1.0)
    return cv2.warpAffine(
        image,
        rotation_matrix,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

# template matching
def match_template(image, template):
    """Return the cv2.matchTemplate result for `template` over `image`.

    The comparison method is cv2.TM_CCOEFF_NORMED.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

## **Compute accuracy function**

In [46]:
import string
import unicodedata
import editdistance

def ocr_metrics(predicts, ground_truth, norm_accentuation=False, norm_punctuation=False):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence Error Rate (SER).

    Each prediction is paired with the ground-truth text at the same position
    (via ``zip``, so surplus items in the longer input are ignored). For every
    pair the edit distance is divided by the length of the longer side, and
    the three rates are averaged over all pairs.

    Args:
        predicts: sequence of predicted strings.
        ground_truth: sequence of reference strings.
        norm_accentuation: when True, both strings are NFKD-normalised and
            reduced to ASCII before comparison.
        norm_punctuation: when True, every character in ``string.punctuation``
            is removed from both strings before comparison.

    Returns:
        Tuple ``(cer, wer, ser)``. CER and WER compare lower-cased characters
        and whitespace-separated words; SER compares the whole strings
        case-sensitively. ``(1, 1, 1)`` is returned when either input is empty.
    """

    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    def _error_rate(pred_tokens, truth_tokens):
        # Two empty token lists are identical: report zero error instead of
        # dividing by zero (e.g. blank lines, or text that was only punctuation).
        longest = max(len(pred_tokens), len(truth_tokens))
        if longest == 0:
            return 0.0
        return editdistance.eval(pred_tokens, truth_tokens) / longest

    # Build the punctuation-stripping table once rather than per pair.
    punctuation_table = str.maketrans("", "", string.punctuation)

    cer, wer, ser = [], [], []

    for (pred, truth) in zip(predicts, ground_truth):

        if norm_accentuation:
            pred = unicodedata.normalize("NFKD", pred).encode("ASCII", "ignore").decode("ASCII")
            truth = unicodedata.normalize("NFKD", truth).encode("ASCII", "ignore").decode("ASCII")

        if norm_punctuation:
            pred = pred.translate(punctuation_table)
            truth = truth.translate(punctuation_table)

        cer.append(_error_rate(list(pred.lower()), list(truth.lower())))
        wer.append(_error_rate(pred.lower().split(), truth.lower().split()))
        ser.append(_error_rate([pred], [truth]))

    cer_f = sum(cer) / len(cer)
    wer_f = sum(wer) / len(wer)
    ser_f = sum(ser) / len(ser)

    return (cer_f, wer_f, ser_f)

In [44]:
# evaluate = ocr_metrics(predicts=all_predictions,
#                                   ground_truth=valid_orig_txt,
#                                   norm_accentuation=False,
#                                   norm_punctuation=False)

# e_corpus = "\n".join([
#     "Metrics:",
#     "Character Error Rate: {}".format(evaluate[0]),
#     "Word Error Rate:      {}".format(evaluate[1]),
#     "Sequence Error Rate:  {}".format(evaluate[2]),
# ])

## **Testing**

### **1. Tesseract only**

In [3]:
# Show the Tesseract version reported by pytesseract.
pytesseract.get_tesseract_version()

LooseVersion ('4.1.1')

In [4]:
# Baseline: run Tesseract with its default settings on the receipt opened via PIL.
# Alternative with custom options (kept for reference, not executed):
# custom_config = r'--oem 3 --psm 6'
# print(pytesseract.image_to_string(receipt, config=custom_config))
pytesseract.image_to_string(Image.open('../Sample_images/receipt.jpg'))

' \n\n \n\nCanna Company, In.\n123 High Street, Canna City, CA\n301-765-4321\nJanuary 1,2018 4:20pm\n\nReceipt #289111\n\nPatient MMIC # 010787693 Expiration: 12/10/2018\n\nProduct Price\n(Overpriced Kush - Quarter Ounce $100.00\nHappy Hippie Haze- Quarter Ounce $75.00\nAffordable Afghan - Quarter Ounce $45.00\nTrendyCo Vape Cartridge Sour Grapes - 1/2 gram $3000\n\n‘The cannabis excise taxes are included in the total amount ofthis invoice.\n\nOriginal Price: $250.00\nSenior Discount 109% $25.00\nPretax Tota: $225.00\nLocal Cannabis Sales Tax 2% $450\nSubtotal: $229.50\nMMIC Card Sales Tax 0% $0.00\nTotal: $229.50\nPaid: $229.50\nChange: $0.00\n\nHappy New Year!\n\n \n\n \n\x0c'

### **2. CTPN + Tesseract**

### **3. CTPN + Preprocessing + Tesseract**

### **4. Dewarp + CTPN + Tesseract**

### **5. Dewarp + CTPN + Preprocessing + Tesseract**