In [84]:
from PIL import Image, ImageOps
import cv2
import json
import numpy as np

import matplotlib.pyplot as plt


img_path = '../OCR_data/output/'

In [82]:
def PIL_img_show(img):
    Image.fromarray(img).show()

def contours_filter(contours, hierarchy):#汉字的一个特点是文字内部有空洞，由此形成连通域，但是检测句子所在行的话应该去掉这部分
    first_child = hierarchy[0][0][2]
    next_child = first_child
    all_children=[]
    while(next_child!=-1):
        all_children.append(next_child)
        next_child = hierarchy[0][next_child][0]
    contours_filtered = [contours[x] for x in all_children]
    
#     print(len(all_children), all_children)
    return contours_filtered

def get_large_contours(bin_img):#must be dilated
    img_height, img_width = bin_img.shape[:2]
    
    kernel_size = (int(img_height/500), int(img_width/150))#经验值。水平因为文字距离近，所以腐蚀多一点，垂直因为文字离得远，腐蚀少一点也无大碍
    kernel = np.ones(kernel_size,np.uint8)
    dilate_img = cv2.dilate(bin_img,kernel,iterations = 1)
    
    # cv2.RETR_EXTERNAL表示只检测外轮廓
    # cv2.RETR_LIST检测的轮廓不建立等级关系
    # cv2.RETR_CCOMP建立两个等级的轮廓，上面的一层为外边界，里面的一层为内孔的边界信息。如果内孔内还有一个连通物体，这个物体的边界也在顶层。
    # cv2.RETR_TREE建立一个等级树结构的轮廓。
    image, contours, hierarchy = cv2.findContours(dilate_img,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)#CHAIN_APPROX_SIMPLE

    contours_filtered = contours_filter(contours, hierarchy)
    if contours_filtered==[]:
        return [],[]
    
    
    areas=[]
    rects=[]#矩形信息包括：矩形中心，高度，宽度，旋转角
    boxs=[]#矩形信息包括：矩形四个点的坐标，经过rects信息计算而来，经过了计算把浮点数转化为图像坐标，矩形可以不是水平的。

    for cnt in contours_filtered:
        area = cv2.contourArea(cnt)
        areas.append(area)

        rect = cv2.minAreaRect(cnt) # 得到最小外接矩形的（中心(x,y), (宽,高), 旋转角度）
        rects.append(rect)

        box = cv2.boxPoints(rect) # cv2.boxPoints(rect) for OpenCV 3.x 获取最小外接矩形的4个顶点
        box = np.int0(box)
        boxs.append(box)

#     tmp_img = erosion_img-erosion_img
#     cv2.drawContours(tmp_img,boxs,-1,(100,100,100),5)
#     PIL_img_show(tmp_img)
    
    boxs_horizontal=[]#将boxs化为方向水平，但是如果句子是倾斜的则圈出的范围过于宽泛。
    for box in boxs:
        x1=min(box[0][0], box[1][0], box[2][0], box[3][0])
        x2=max(box[0][0], box[1][0], box[2][0], box[3][0])
        y1=min(box[0][1], box[1][1], box[2][1], box[3][1])
        y2=max(box[0][1], box[1][1], box[2][1], box[3][1])
        
        rect_area = max(abs(x2-x1), abs(y2-y1))*min(abs(x2-x1),abs(y2-y1))
        if rect_area/img_height/img_width<0.0005:#太小的区域，只能是噪点和零星的文字，忽略掉
            continue
        boxs_horizontal.append(np.array([[x1,y1],[x2,y1],[x2,y2],[x1,y2]]))
    
    if boxs_horizontal==[]:
        return [],[]
#     print(len(boxs_horizontal))

    areas=[]
    for cnt in boxs_horizontal:
        areas.append( cv2.contourArea(cnt))
    
    zipped = zip(boxs_horizontal, areas)
    zipped_sorted = sorted(zipped, key=lambda x:x[1], reverse=True)
    boxs_hor_sorted, areas_sorted = zip(*zipped_sorted)
    return list(areas_sorted), list(boxs_hor_sorted)
    
#     areas_sorted = areas.copy()
#     areas_sorted.sort()
#     areas_sorted = [x/(img_height*img_width)*100 for x in areas_sorted]

#     return areas_sorted, boxs_horizontal

def is_X_ray(bin_img, areas_sorted, boxs_hor_sorted, topK_chosen_rate=0.8, topK_average_threshold=3.5):#多个大的连通区域
    if areas_sorted==[]:
        return 'no_black_contour'
        
    topK_num_sum = sum(areas_sorted[i] if x>areas_sorted[0]*0.8 else 0 for (i, x) in enumerate(areas_sorted))#yes 0.8 is a magic number
    topK_idxs = [i for (i, x) in enumerate(areas_sorted)  if x>areas_sorted[0]*0.8 ]
    topK_num_cnt = len(topK_idxs)
    #print('topK_num_cnt:', topK_num_cnt)
    
    if topK_num_sum/topK_num_cnt>topK_average_threshold:#超过了阈值（经验值3.5），说明存在很大的连通区域
        for topK_idx in topK_idxs:
            topK_box = boxs_hor_sorted[topK_idx]
            black_area_rate = black_area_in_box(bin_img, topK_box)
            #print(black_area_rate)#黑色连通域的黑色像素比例可能高达0.98
            if black_area_rate>0.999:#敏感信息遮挡的话是纯黑背景，而且矩形方框是平行于图像，故黑色面积和连通区域比例是100%
                return 'sensitive_info'
            elif is_black_stripe(bin_img, topK_box, is_stripe_threshold=7)==1:
                return 'has_black_stripe'
            else:
                return 'has_black_contour'
    else:
        return 'no_black_contour'

def black_area_in_box(bin_img, box_hor):
    x1,y1=box_hor[0]
    x2,y2=box_hor[2]
    
    blk_img = bin_img[y1:y2, x1:x2]
    img_height, img_width = blk_img.shape[:2]
    return sum(sum(1-blk_img/255))/img_height/img_width

def is_black_stripe(bin_img, box_hor, is_stripe_threshold=7):
    x1,y1=box_hor[0]
    x2,y2=box_hor[2]
    
    rect_length=max(abs(y2-y1),abs(x2-x1))
    rect_width=min(abs(y2-y1),abs(x2-x1))
    if rect_length/rect_width>is_stripe_threshold:#长宽差距很大，则是黑色分割条
        #print('长宽差距很大')
        return 1
    else:
        return 0

In [83]:
for img_id in range(42):
    img_name = 'img_'+str(1000000+img_id)+'.png'
    img = cv2.imread(img_path + img_name)
    
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    bin_threshold, bin_img = cv2.threshold(gray_img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    
#     kernel_size = (int(img_height/500), int(img_width/150))
#     kernel = np.ones(kernel_size,np.uint8)
#     dilate_img = cv2.dilate(bin_img,kernel,iterations = 1)

    areas_sorted, boxs_hor_sorted = get_large_contours(bin_img)
    
#     tmp_img = dilate_img - dilate_img
#     cv2.drawContours(tmp_img,boxs_hor_sorted,-1,(100,100,100),5)
#     PIL_img_show(tmp_img)
    
    classify_result = is_X_ray(bin_img, areas_sorted, boxs_hor_sorted)
    print(img_id, classify_result)

0 no_black_contour
1 has_black_stripe
2 has_black_stripe
3 has_black_stripe
4 has_black_stripe
5 has_black_stripe
6 has_black_contour
7 has_black_contour
8 has_black_contour
9 has_black_contour
10 has_black_contour
11 no_black_contour
12 has_black_contour
13 has_black_contour
14 no_black_contour
15 no_black_contour
16 has_black_contour
17 no_black_contour
18 no_black_contour
19 no_black_contour
20 no_black_contour
21 no_black_contour
22 has_black_stripe
23 has_black_stripe
24 has_black_stripe
25 has_black_stripe
26 sensitive_info
27 sensitive_info
28 sensitive_info
29 sensitive_info
30 sensitive_info
31 sensitive_info
32 sensitive_info
33 sensitive_info
34 sensitive_info
35 sensitive_info
36 sensitive_info
37 sensitive_info
38 sensitive_info
39 sensitive_info
40 has_black_contour
41 sensitive_info


In [13]:
a=[[0.9,8.3],[11,32],[2,7]]
b=[2,1,3]
b_sorted=b.copy()
print(b)
b_sorted.sort()
print(b_sorted, b)
c=sorted(zip(a,b),key=lambda x:x[1], reverse=True) 
a, b=zip(*c)
print(a,b)
print(list(a), list(b))

sum(sum(np.array(a)))

print(sum())

[2, 1, 3]
[1, 2, 3] [2, 1, 3]
([2, 7], [0.9, 8.3], [11, 32]) (3, 2, 1)
[[2, 7], [0.9, 8.3], [11, 32]] [3, 2, 1]


61.199999999999996

In [88]:
img = cv2.imread(img_path + 'img_'+str(1000040)+'.png')
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_heigth, img_width = gray_img.shape[:2]
print(img_heigth, img_width)


bin_threshold, bin_img = cv2.threshold(gray_img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
PIL_img_show(gray_img)
PIL_img_show(bin_img)

# for x in range(img_width):
#     print(gray_img.item(550, x), bin_img.item(550, x))

2338 1653


In [89]:
kernel_size = (int(img_heigth/500), int(img_width/150))
kernel = np.ones(kernel_size,np.uint8)
dilate_img = cv2.dilate(bin_img,kernel,iterations = 1)
PIL_img_show(dilate_img)

# for x in range(img_width):
#     print(erosion_img.item(550, x))

In [90]:
#image,contours,hierarchy=cv2.findContours(erosion_img,1,2)

# cv2.RETR_EXTERNAL表示只检测外轮廓
# cv2.RETR_LIST检测的轮廓不建立等级关系
# cv2.RETR_CCOMP建立两个等级的轮廓，上面的一层为外边界，里面的一层为内孔的边界信息。如果内孔内还有一个连通物体，这个物体的边界也在顶层。
# cv2.RETR_TREE建立一个等级树结构的轮廓。
image, contours, hierarchy = cv2.findContours(dilate_img,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)#CHAIN_APPROX_SIMPLE

tmp_img = dilate_img - dilate_img
cv2.drawContours(tmp_img,contours,-1,(100,100,100),5)
PIL_img_show(tmp_img)

print(len(contours))
areas=[]
for cnt in contours:
    areas.append( cv2.contourArea(cnt))




302


In [50]:
#print(contours[areas.index(max(areas))])

areas_sorted = areas.copy()
areas_sorted.sort()
areas_sorted = [x/(img_heigth*img_width)*100 for x in areas_sorted]
if areas_sorted[-1]>99.5:# to remove the largest contours if the largest is the whole image.
    print('remove largest one')
    areas_sorted = areas_sorted[:-1]
print(areas_sorted[-10:])

s=0
for x in range(img_heigth):
    for y in range(img_width):
        s+=erosion_img.item(x,y)
print(sum(areas_sorted), 1-s/img_heigth/img_width)

remove largest one
[0.03921235587624413, 0.042191726053115615, 0.04493519301950168, 0.05244914713125969, 0.05908064849268333, 0.061177565919220446, 0.07446678010989943, 2.4403750266920112, 2.752527397099893, 2.895309999535183]
8.771292012177497 -241.17123218013361


In [22]:
topK_num_sum = sum(areas_sorted[i] if x>areas_sorted[-1]*0.8 else 0 for (i, x) in enumerate(areas_sorted))#yes 0.8 is a magic number
topK_num_cnt = sum(1 if x>areas_sorted[-1]*0.8 else 0 for (i, x) in enumerate(areas_sorted))
print(topK_num_sum, topK_num_cnt, topK_num_sum/topK_num_cnt>3.5)

53.04432617782325 6 True
