## 文档扫描、处理及OCR识别

In [1]:
import cv2
import numpy as np

In [2]:
def cv_show(img):
    """Display `img` in a window titled 'img' and block until a key is pressed.

    All OpenCV windows are destroyed once the key press is received.
    """
    window_title = 'img'
    cv2.imshow(window_title, img)
    cv2.waitKey(0)  # 0 = wait indefinitely for a key press
    cv2.destroyAllWindows()

### 读取图像及图像预处理

In [3]:
def resize(image,width = None,heieht = None,inter = cv2.INTER_AREA):
    """Resize `image` to a target width or height, preserving the aspect ratio.

    Parameters:
        image:  array whose first two dimensions are (rows, cols).
        width:  target width in pixels, or None.
        heieht: target height in pixels, or None (the misspelled name is kept
                because callers pass it by keyword).
        inter:  OpenCV interpolation flag forwarded to cv2.resize.

    Returns:
        `image` itself when both `width` and `heieht` are None; otherwise the
        resized image. When both are given, `width` takes precedence and
        `heieht` is ignored.
    """
    h,w = image.shape[0:2]
    if width is None and heieht is None:
        return image
    if width is None:
        r = heieht/h
        dsize = (int(w*r),heieht)
    else:
        r = width/w
        dsize = (width,int(h*r))
    # The third positional parameter of cv2.resize is `dst`, not the
    # interpolation flag, so the flag has to be passed by keyword.
    return cv2.resize(image,dsize,interpolation=inter)

In [4]:
# Load the photographed page. cv2.imread does not raise on a missing or
# unreadable file; it returns None, so fail here with a clear message.
img = cv2.imread('./images/page.jpg')
if img is None:
    raise FileNotFoundError("could not read image './images/page.jpg'")

# Height (in pixels) of the working copy used for edge/contour detection.
scan_height = 500
# Scale factor used later to map contour coordinates found on the resized
# image back onto the full-resolution original.
ratio = img.shape[0]/float(scan_height)
dupli = img.copy()  # full-resolution copy kept for the final perspective warp
img = resize(dupli,heieht=scan_height)
cv_show(img)

In [5]:
# Convert the resized image to grayscale, smooth it with a 5x5 Gaussian
# kernel, then run Canny edge detection with thresholds 75 and 200.
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
gray = cv2.GaussianBlur(gray,(5,5),0)
edge = cv2.Canny(gray,75,200)
# Show the blurred grayscale image and the edge map side by side.
show = np.hstack((gray,edge))
cv_show(show)

### 轮廓检测获取目标区域

In [6]:
# Extract contours from the edge map (318 contours were found for the sample image).
# cv2.findContours returns (image, contours, hierarchy) on OpenCV 3.x but
# (contours, hierarchy) on 2.x/4.x; indexing with [-2] selects the contour
# list on every version, whereas [1] is the hierarchy on 4.x.
cnts = cv2.findContours(edge.copy(),cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)[-2]
# Sort by enclosed area, largest first, and keep the five biggest candidates.
cnts = sorted(cnts,key = cv2.contourArea,reverse=True)[:5]

In [7]:
# Approximate each candidate contour by a polygon (tolerance = 2% of its
# perimeter) and stop at the first one that has exactly four vertices.
for i in cnts:
    length = cv2.arcLength(i,True)
    approx = cv2.approxPolyDP(i,0.02*length,True)
    if len(approx) == 4:
        break
else:
    # Without this guard `approx` would silently hold the last (non-quadrilateral)
    # polygon, or be undefined when `cnts` is empty, and fail later in reshape(4,2).
    raise ValueError("no 4-point contour found among the largest contours")

In [8]:
# Draw the detected quadrilateral in green (2 px thick) on a copy of the
# resized image so that `img` itself stays unmodified.
f = img.copy()
cv2.drawContours(f,[approx],-1,(0,255,0),2)
cv_show(f)

### 以下进行目标区域的透视变换

In [9]:
def order_point(pts):
    """Reorder four corner points as top-left, top-right, bottom-right, bottom-left.

    Contour detection does not guarantee any particular vertex order, so the
    corners are identified from their coordinates:
      * top-left has the smallest x + y, bottom-right the largest;
      * top-right has the smallest y - x, bottom-left the largest.

    Parameters:
        pts: array of (x, y) points; the caller in this notebook passes shape (4, 2).

    Returns:
        float32 array shaped like `pts` holding the points in the order above.
    """
    xy_sum = pts.sum(axis=1)
    y_minus_x = np.diff(pts, n=1, axis=-1)

    # Source row for each output slot: TL, TR, BR, BL.
    source_rows = (
        np.argmin(xy_sum),
        np.argmin(y_minus_x),
        np.argmax(xy_sum),
        np.argmax(y_minus_x),
    )

    ordered = np.zeros_like(pts, dtype='float32')
    for slot, row in enumerate(source_rows):
        ordered[slot] = pts[row]
    return ordered

In [10]:
def img_transform(img,pts):
    """Perspective-warp the quadrilateral `pts` of `img` into an upright rectangle.

    Only the warp is performed here; to rectify the full-resolution image the
    caller must pass that image together with the corner coordinates expressed
    in its pixel space.

    Parameters:
        img: image to warp.
        pts: the four corner points, in any order (they are sorted by order_point).

    Returns:
        The warped image. Its width and height are the longer of each pair of
        opposite sides of the quadrilateral, truncated to int.
    """
    def _side(p, q):
        # Euclidean length of the segment from p to q.
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    corners = order_point(pts)
    top_left, top_right, bottom_right, bottom_left = corners

    # Output size: the longer of the two horizontal edges and of the two vertical edges.
    out_w = int(max(_side(bottom_right, bottom_left), _side(top_right, top_left)))
    out_h = int(max(_side(top_right, bottom_right), _side(top_left, bottom_left)))

    # Destination corners in the same TL, TR, BR, BL order as `corners`.
    target = np.array(
        [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
        dtype='float32',
    )

    # Compute the 4-point perspective transform and apply it.
    matrix = cv2.getPerspectiveTransform(corners, target)
    return cv2.warpPerspective(img, matrix, (out_w, out_h))

In [11]:
# Perspective transform: the contour was found on the resized image, so its
# coordinates are multiplied by `ratio` to map them onto the full-resolution copy.
warped = img_transform(dupli,approx.reshape(4,2)*ratio)
cv_show(warped)
# Binarize: convert to grayscale, then apply a fixed threshold of 135.
warped = cv2.cvtColor(warped,cv2.COLOR_BGR2GRAY)
cv_show(warped)
ref = cv2.threshold(warped,135,255,cv2.THRESH_BINARY)[1]
cv_show(ref)
# Save the binarized scan to disk.
cv2.imwrite('scan.jpg', ref)

True

### 以下进行OCR

- 基本上OCR都是基于Tesseract-OCR软件搞的  
- 软件下载地址：https://digi.bib.uni-mannheim.de/tesseract/  
- 安装完成后在环境变量和系统变量中添加软件安装路径，同时新建TESSDATA_PREFIX系统变量，值为软件绝对路径  
- pip install pytesseract 在python中使用该库调用  
- 安装完成后，将 Anaconda 安装目录下 `Lib/site-packages/pytesseract/pytesseract.py` 文件中的 tesseract 可执行文件路径（`tesseract_cmd`）由相对路径改为绝对路径

In [12]:
from PIL import Image
import pytesseract
import cv2
import os

In [13]:
# Selects the optional preprocessing step applied before the temporary PNG is
# written: "thresh" (Otsu threshold) or "blur" (3x3 median blur).
preprocess = 'blur'

In [15]:
# Use the in-memory binarized scan as the OCR input.
image = ref
# Alternative (disabled): reload the saved file and convert it back to grayscale.
# Author's note: even a binary image ends up with 3 channels after being saved and re-read.
#image = cv2.imread('scan.jpg')
#gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)

In [21]:
# Start from the scanned page selected as the OCR input. Without this
# assignment `gray` would be stale state left over from the edge-detection
# cell (the downsized, blurred full photo) rather than the scan.
gray = image

if preprocess == "thresh":
    # Otsu's method picks the threshold automatically (the 0 is ignored).
    gray = cv2.threshold(gray, 0, 255,cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif preprocess == "blur":
    # 3x3 median blur.
    gray = cv2.medianBlur(gray, 3)

# Write the preprocessed image to a temporary PNG named after the current process id.
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)

True

In [17]:
# Run Tesseract OCR directly on the in-memory image and print the recognized text.
# The commented-out lines are the file-based variant: OCR the temporary PNG, then delete it.
#text = pytesseract.image_to_string(Image.open(filename))
text = pytesseract.image_to_string(image)
print(text)
#os.remove(filename)

4.3 ACCESSING AND MANIPULATING PIXELS

On Line 14 we manipulate the top-left pixel in the im-
age, which is located at coordinate (0,0) and set it to have
a value of (0, 0, 255). If we were reading this pixel value
in RGB format, we would have a value of 0 for red, 0 for
green, and 255 for blue, thus making it a pure blue color.

However, as I mentioned above, we need to take special
care when working with OpenCV. Our pixels are actually
stored in BGR format, not RGB format.

We actually read this pixel as 255 for red, 0 for green, and
0 for blue, making it a red color, Hot a blue color.

After setting the top-left pixel to have a red color on Line
14, we then grab the pixel value and print it back to con-
sole on Lines 15 and 16, just to demonstrate that we have
indeed successfully changed the color of the pixel.

Accessing and setting a single pixel value is simple enough,
but what if we wanted to use NumPy's array slicing capa-
bilities to access larger rectangular portions of the i