In [1]:
from PIL import Image, ImageOps
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import shutil

from image_classification_X_ray import get_large_contours
from image_classification_X_ray import is_X_ray

from image_classification_StraightLine import get_straight_line_cnt

from image_classification_column_hist import get_column_hist_and_fitting

from image_classification_text_using_SWT import text_area_div_likely_text_area

# Labels that CLASSIFY_RULE.classify can return; this list is passed to
# make_folder_for_all_classify_result to create one output folder per label.
results_set=['text', 'form', 'X-ray', 'others']

In [2]:
def make_folder_for_all_classify_result(results_set, base_path=None):
    """Create one output folder per classification label.

    Args:
        results_set: iterable of label names; each one becomes a folder name.
        base_path: directory under which the folders are created. When omitted,
            the notebook-level global ``classify_results_path`` is used, which
            is what the function always did before this parameter existed.

    Prints one status line per label telling whether the folder was created
    or was already present.
    """
    if base_path is None:
        # Looked up at call time, so the global may be assigned in a later cell
        # as long as that cell runs before this function is called.
        base_path = classify_results_path

    for result in results_set:
        # os.path.join works whether or not base_path ends with a separator;
        # plain concatenation silently produced e.g. 'outtext' for 'out'.
        folder = os.path.join(base_path, result)
        try:
            # Create first and react to the failure instead of checking
            # exists() beforehand, which can race with another process.
            os.makedirs(folder)
        except FileExistsError:
            print('folder %s is exist.'%result)
        else:
            print('folder %s has been made.'%result)

class CLASSIFY_RULE:
    """Rule-based mapping from per-image features to a document class label."""

    def __init__(self, LINE_CNT__THRESHOLD, TEXT_RATE_THRESHOLD):
        # Empirical values for the two thresholds are 9 and 1.0 respectively.
        # Black-area labels listed by the original author:
        # ['no_black_contour', 'has_black_contour', 'has_black_stripe', 'sensitive_info', 'unk']
        self.TEXT_RATE_THRESHOLD = TEXT_RATE_THRESHOLD
        self.LINE_CNT__THRESHOLD = LINE_CNT__THRESHOLD

    def classify(self, img_id, black_area_classify_result, straight_lines_cnt, text_rate):
        """Return 'text', 'X-ray', 'form' or 'others'.

        The rules are checked in priority order: text rate first, then the
        black-contour label, then line count / black-stripe label.
        ``img_id`` is accepted but does not influence the result.
        """
        if text_rate >= self.TEXT_RATE_THRESHOLD:
            return 'text'

        if black_area_classify_result == 'has_black_contour':
            return 'X-ray'

        enough_lines = straight_lines_cnt >= self.LINE_CNT__THRESHOLD
        if enough_lines or black_area_classify_result == 'has_black_stripe':
            return 'form'

        return 'others'
    

In [4]:
img_path = '../OCR_data/output/'
classify_results_path = './'

# Extensions accepted as images (matched case-insensitively); add formats here if needed.
IMAGE_EXTENSIONS = ('.bmp', '.png', '.jpg', '.jpeg')

# Label set associated with black_area_classify_result (presumably what is_X_ray can
# return - confirm against image_classification_X_ray). Not read by any code in this
# cell; hoisted out of the loop because it never changes.
black_area_classify_set = ['no_black_contour', 'has_black_contour', 'has_black_stripe', 'sensitive_info', 'unk']

start_time=time.time()  # start the timer
print('running...')

classify_rule = CLASSIFY_RULE(LINE_CNT__THRESHOLD=8, TEXT_RATE_THRESHOLD=1.0)


# File names need not follow the img_10000xx.png pattern and the numbering may have gaps.
# Formats other than png are supported, and non-image files in the folder (e.g. txt)
# are skipped.
make_folder_for_all_classify_result(results_set)
# Sorted so the processing and print order is reproducible; os.listdir order is arbitrary.
likely_img_file = sorted(os.listdir(img_path))
for img_file in likely_img_file:
    # Compare the real extension: a three-character suffix slice can never equal 'jpeg',
    # so '.jpeg' files used to be skipped silently.
    if os.path.splitext(img_file)[1].lower() not in IMAGE_EXTENSIONS:
        continue

    img = cv2.imread(os.path.join(img_path, img_file))
    if img is None:
        # cv2.imread signals an unreadable or corrupt file by returning None, not by
        # raising; skip it so one bad file does not abort the whole run.
        print('skip %s: cannot be read as an image.'%img_file)
        continue

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_height, img_width = gray_img.shape[:2]

    bin_threshold, bin_img = cv2.threshold(gray_img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)

    # Compute connected regions and filter them: small regions enclosed by the strokes
    # of a character are dropped, outer regions are kept, sorted from large to small.
    areas_sorted, boxs_hor_sorted = get_large_contours(bin_img)
    black_area_classify_result = is_X_ray(bin_img, areas_sorted, boxs_hor_sorted)

    # The commented code below draws the regions that were found. It is slow; debugging only.
#     tmp_img = gray_img - gray_img
#     cv2.drawContours(tmp_img,boxs_hor_sorted,-1,(100,100,100),5)
#     PIL_img_show(tmp_img)

    # Count straight lines in the image. The thresholds live in
    # image_classification_StraightLine.py; they are already tuned, change with care.
    straight_lines_cnt = get_straight_line_cnt(bin_img)

    # Text occupancy: text-region area divided by total connected-region area.
    # Empirically, images that are mostly text give a value above 1.
    text_rate = text_area_div_likely_text_area(bin_img, gray_img)

    print(img_file, black_area_classify_result, straight_lines_cnt, text_rate)

    # Classify by the rules defined in the CLASSIFY_RULE class.
    classify_result = classify_rule.classify(img_file, black_area_classify_result, straight_lines_cnt, text_rate)
    #print(img_file, classify_result)

    # Measured: the copy below takes roughly 0.1 s per file, so it does not hurt performance.
    #shutil.copyfile(img_path+img_file, classify_results_path+classify_result+'/'+img_file)      # copy the file

    # TODO: the metric below needs more design work; it is not robust enough yet and
    # cannot fully separate tables from some kinds of text.
    #curve_fitting_res, has_several_wave, _ = get_column_hist_and_fitting(bin_img)
    #print(img_id, black_area_classify_result, straight_lines_cnt, curve_fitting_res, has_several_wave, text_rate_metric)

# Total elapsed time.
# In practice a ~1 MB image takes about 4.5 seconds to process.
print('running time: %5.2f minutes.'%((time.time()-start_time)/60))

running...
folder text is exist.
folder form is exist.
folder X-ray is exist.
folder others is exist.
img_1000000.png no_black_contour 0 0.7981490099693865
img_1000001.png has_black_stripe 0 0.755346116927198
img_1000002.png has_black_stripe 0 0.5728119016876506
img_1000003.png has_black_stripe 0 0.7962160033734819
img_1000004.png has_black_stripe 0 0.7390099323819792
img_1000005.png has_black_stripe 0 0.5613152294403403
img_1000006.png has_black_contour 0 0.724011518724172
img_1000007.png has_black_contour 0 0.7289651528130519
img_1000008.png has_black_contour 0 0.6652909992226428
img_1000009.png has_black_contour 0 0.8283164978076756
img_1000010.png has_black_contour 0 0.9113635670752832
img_1000011.png no_black_contour 0 1.2982051558726946
img_1000012.png has_black_contour 0 0.57302275752682
img_1000013.png has_black_contour 0 0.7648009193130064
img_1000014.png no_black_contour 0 1.3747309461244088
img_1000015.png no_black_contour 0 1.18318869865035
img_1000016.png has_black_contour

In [7]:
# Relies on img_path, which is assigned in the directory-scan cell; run that cell first.
start_time=time.time()
print('running...')

classify_rule = CLASSIFY_RULE(LINE_CNT__THRESHOLD=8, TEXT_RATE_THRESHOLD=1.0)

# Number of consecutively numbered images, starting at img_1000000.png, to classify.
SAMPLE_IMG_COUNT = 42

for img_id in range(SAMPLE_IMG_COUNT):
    img_name = 'img_'+str(1000000+img_id)+'.png'
    img = cv2.imread(os.path.join(img_path, img_name))
    if img is None:
        # cv2.imread returns None for a missing or unreadable file; skip it instead of
        # letting cvtColor fail and abort the remaining images.
        print(img_id, 'skipped: cannot read %s'%img_name)
        continue

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_height, img_width = gray_img.shape[:2]

    bin_threshold, bin_img = cv2.threshold(gray_img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)

    # Feature 1: black-area label derived from the large contours.
    areas_sorted, boxs_hor_sorted = get_large_contours(bin_img)
    black_area_classify_result = is_X_ray(bin_img, areas_sorted, boxs_hor_sorted)

    # Feature 2: number of straight lines found in the binarised image.
    straight_lines_cnt = get_straight_line_cnt(bin_img)

    # Feature 3: text rate, compared against TEXT_RATE_THRESHOLD by the rule.
    text_rate = text_area_div_likely_text_area(bin_img, gray_img)

    #print(img_id, black_area_classify_result, straight_lines_cnt, text_rate)
    print(img_id, classify_rule.classify(img_id, black_area_classify_result, straight_lines_cnt, text_rate))

print('running time: %5.2f minutes.'%((time.time()-start_time)/60))

starting...
0 others
1 form
2 form
3 form
4 form
5 form
6 X-ray
7 X-ray
8 X-ray
9 X-ray
10 X-ray
11 text
12 X-ray
13 X-ray
14 text
15 text
16 X-ray
17 others
18 text
19 text
20 text
21 text
22 form
23 form
24 form
25 form
26 others
27 others
28 others
29 others
30 form
31 form
32 form
33 form
34 form
35 form
36 form
37 form
38 form
39 others
40 X-ray
41 others
running time:  3.18 minutes.


In [18]:
import pillowfight
def PIL_img_show(img):
    """Convert an array to a PIL image and display it with Image.show()."""
    pil_image = Image.fromarray(img)
    pil_image.show()

In [99]:
# Scratch cell: run pillowfight's stroke-width transform on one hard-coded sample image.
# Relies on img_path from an earlier cell. The globals a, bin_img and img_out assigned
# here are read by other cells in this notebook.
img_name = 'img_'+str(1000000+24)+'.png'
img = cv2.imread(img_path + img_name)

gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_height, img_width = gray_img.shape[:2]

# NOTE(review): maxval is 100 here while the other cells binarise with 255 - confirm
# whether this is a deliberate experiment.
bin_threshold, bin_img = cv2.threshold(gray_img,0,100,cv2.THRESH_BINARY+cv2.THRESH_OTSU)

# Alternative input kept from experimentation: a small crop of the grayscale image.
#bin_img = gray_img[190:240, 550:750]

a=Image.fromarray(bin_img)
a.show()
# pillowfight output types that were tried for the call below:
#SWT_OUTPUT_BW_TEXT
#SWT_OUTPUT_GRAYSCALE_TEXT
#SWT_OUTPUT_ORIGINAL_BOXES
img_out = pillowfight.swt(a, output_type=pillowfight.SWT_OUTPUT_ORIGINAL_BOXES)
img_out.show()


In [None]:
# Scratch cell: binarise gray_img (left over from an earlier cell) with maxval=255;
# another cell in this notebook uses maxval=100.
bin_threshold, bin_img = cv2.threshold(gray_img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
print(bin_img.shape)
# Element-wise comparison against a constant-254 array; being the last expression,
# the resulting boolean array is what the cell displays.
# NOTE(review): THRESH_BINARY output should only contain 0 or maxval (255), so a
# comparison with 254 looks like a leftover probe - confirm the intent.
bin_img==np.ones((img_height, img_width))*254

In [100]:
# Sizes of the PIL input image and of the SWT output image.
print(a.size, img_out.size)

# First two dimensions of the binarised array; the names x and y are kept because
# they are notebook-level globals.
x, y = bin_img.shape[:2]
print(x, y)

# Reference array of the same shape filled with 255.
original_img_mask = np.full((x, y), 255, dtype=np.uint8)
print(original_img_mask.shape, original_img_mask)

# Channel 0 of the SWT output.
swt_out_img = np.array(img_out)[:, :, 0]
print(swt_out_img.shape, swt_out_img)

# 1 where the SWT channel equals 255, 0 elsewhere; scaled back to 0/255 for viewing.
b = np.uint8(swt_out_img == original_img_mask)
print(b)
Image.fromarray(255 * b).show()

(2446, 3462) (2446, 3462)
3462 2446
(3462, 2446) [[255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 ...
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]]
(3462, 2446) [[255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 ...
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]]
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]


In [93]:

# Scratch cell: view channel 2 of the SWT output (relies on img_out and PIL_img_show
# defined in other cells).
swt_out_img = np.array(img_out)[:,:,2]
PIL_img_show(swt_out_img)

# Toy check of element-wise equality counting. This rebinds the globals a and b,
# which other cells use for the PIL image and the comparison mask.
a=np.array([0,0,1,2])
b=np.array([0,0,0,0])
# Summing the boolean comparison counts matching positions (2 for these arrays).
print(sum(a==b))

print(np.array(img_out))
print(b)

2
[[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]]
[0 0 0 0]
