# End-to-end document analysis with PaddleOCR

# 1. Document recognition

From images of a document => to structured files

Title and text contents are saved to .csv files, tables to .xlsx files, and cropped figures to .jpg files.

OCR on figures is not supported.


### 1.1. Layout Parser (PaddleDetection model)

Given an input image, the layout parser model detects text, title, list, table and figure regions.


### 1.2. Text Detection & Text Recognition

Given cropped images containing text regions as input, the text detection model detects the bounding boxes of the text lines.

Next, the text recognition model recognizes the text in each line; the results are written to .csv files.


### 1.3. Table Structure Recognition 

Given cropped tables, this model parses each table's structure and recognizes the text in it.

In [361]:
import cv2
import os
import layoutparser as lp

# Layout detector shared by the cells below (read as the global `model`).
# The label map turns the detector's integer class ids into the type names
# that the pipeline later filters on ('Text', 'Title', 'Table', 'Figure').
model = lp.PaddleDetectionLayoutModel(
    config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config",
    threshold=0.5,
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    enforce_cpu=False,
    enable_mkldnn=True,
)

In [372]:
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import pandas as pd


def extract_content(content_blocks, target_folder, image):
    """Crop each content block out of ``image``, save the crops and OCR them.

    NOTE: the output folder is derived from the module-level ``image_path``
    variable (``image_path.split('.')[0] + '/<target_folder>'``), not from an
    argument, so that global must be set before this function is called.

    Args:
        content_blocks: sequence of ``(coordinates, type)`` pairs, where
            ``coordinates`` unpacks to ``(x_1, y_1, x_2, y_2)``.
        target_folder: name of the sub-folder that receives the ``<idx>.jpg``
            crops.
        image: page image as an array accepted by ``cv2.transpose``.

    Returns:
        A list of ``(ocr_result, type)`` tuples, one per block, in the same
        order as ``content_blocks``.
    """
    output_dir = str(image_path.split('.')[0] + f'/{target_folder}')
    # Create the folder once up front instead of checking on every iteration.
    os.makedirs(output_dir, exist_ok=True)

    # After the transpose the first axis is x and the second is y, so a box
    # can be sliced as [x_1:x_2, y_1:y_2]; each crop is transposed back.
    transposed = cv2.transpose(image)
    crops = []
    for idx, block in enumerate(content_blocks):
        x_1, y_1, x_2, y_2 = block[0]
        type_of_text = block[1]
        crop = cv2.transpose(transposed[int(x_1):int(x_2), int(y_1):int(y_2)])
        crops.append((crop, type_of_text))
        cv2.imwrite(f'{output_dir}/{idx}.jpg', crop)

    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    return [(ocr.ocr(crop, cls=True), type_of_text) for crop, type_of_text in crops]

In [373]:
# IPython magic: switch the working directory to the PaddleOCR checkout.
# Relative paths used later in this notebook (e.g. 'doc/fonts/simfang.ttf')
# are resolved against it.
%cd C:/Users/ASUS/PaddleOCR

# detect
# Page image to process. This global is also read directly by
# extract_content() to decide where the cropped regions are written.
image_path = 'C:/Users/ASUS/OneDrive/Desktop/paper/0004.jpg'

def _crop_from_transposed(transposed_img, coordinates):
    """Cut the ``(x_1, y_1, x_2, y_2)`` box out of a transposed image.

    ``transposed_img`` has x on its first axis and y on its second, so the box
    is sliced as ``[x_1:x_2, y_1:y_2]``; the crop is transposed back before it
    is returned.
    """
    x_1, y_1, x_2, y_2 = coordinates
    return cv2.transpose(transposed_img[int(x_1):int(x_2), int(y_1):int(y_2)])


def end2end_process_doc_recognition(image_path):
    """Run layout detection, table recognition and OCR on one page image.

    All outputs are written below ``image_path.split('.')[0]``:

    * ``table_results/<idx>.jpg`` plus whatever ``save_structure_res`` writes,
    * ``text_title_results/<idx>.jpg`` (crops, written by ``extract_content``)
      and ``text_title_results/result_<idx>.jpg`` (OCR visualisations),
    * ``text_data.csv`` with the columns ``line``, ``content`` and ``conf``,
    * ``figure_results/<idx>.jpg`` (only when figures were detected).

    Relies on the module-level ``model`` for layout detection.

    NOTE(review): ``extract_content`` derives its output folder from the
    *global* ``image_path``, while this function reads the crops back using
    its own ``image_path`` argument. The two must refer to the same file.

    Args:
        image_path: path of the page image to process.

    Returns:
        None. Results are only written to disk.

    Raises:
        FileNotFoundError: if ``cv2.imread`` cannot load ``image_path``.
    """
    # Local import kept where the original cell had it.
    from paddleocr import PPStructure, save_structure_res

    output_root = image_path.split('.')[0]

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when the file is unreadable.
        raise FileNotFoundError(f'Could not read image: {image_path}')
    image = image[..., ::-1]  # reverse the channel order of what imread returned

    layout = model.detect(image)

    # --- Text region recognition ------------------------------------------
    text_blocks = lp.Layout([b for b in layout if b.type == 'Text'])
    figure_blocks = lp.Layout([b for b in layout if b.type == 'Figure'])
    title_blocks = lp.Layout([b for b in layout if b.type == 'Title'])
    table_blocks = lp.Layout([b for b in layout if b.type == 'Table'])

    # text areas may be detected within the image area, delete these areas
    text_blocks = lp.Layout([b for b in text_blocks if not any(b.is_in(b_fig) for b_fig in figure_blocks)])

    # sort text areas and assign ID
    # NOTE(review): the final order used for OCR is the (y_1, x_1) sort further
    # down, which only looks at coordinates; the left/right column ordering
    # and the ids assigned here do not affect it.
    w = image.shape[1]

    left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)

    left_blocks = text_blocks.filter_by(left_interval, center=True)
    left_blocks.sort(key=lambda b: b.coordinates[1])

    right_blocks = [b for b in text_blocks if b not in left_blocks]
    right_blocks.sort(key=lambda b: b.coordinates[1])

    # the two lists are merged and the indexes are added in order
    text_blocks = lp.Layout([b.set(id=idx) for idx, b in enumerate(left_blocks + right_blocks)])

    # assemble text and title boxes, ordered top-to-bottom then left-to-right
    text_title_blocks = text_blocks + title_blocks
    sorted_text_title_blocks = sorted(
        ((block.coordinates, block.type) for block in text_title_blocks),
        key=lambda item: (item[0][1], item[0][0]),
    )

    # Transposed page with the channel order swapped back, shared by the table
    # and figure crops that are written with cv2.imwrite.
    transposed_page = cv2.cvtColor(cv2.transpose(image), cv2.COLOR_BGR2RGB)

    # --- Table recognition ------------------------------------------------
    table_dir = output_root + '/table_results'
    # Create the folder BEFORE writing crops; cv2.imwrite does not create
    # missing directories.
    os.makedirs(table_dir, exist_ok=True)

    table_crops = []
    for idx, block in enumerate(table_blocks):
        # Always crop from the full page. Re-using the loop variable as the
        # crop source would make every later table a crop of the previous one.
        table_crop = _crop_from_transposed(transposed_page, block.coordinates)
        table_crops.append(table_crop)
        cv2.imwrite(f'{table_dir}/{idx}.jpg', table_crop)

    # Text recognizer in table
    table_engine = PPStructure(layout=False, show_log=True)
    page_name = os.path.basename(image_path).split('.')[0]
    for table_crop in table_crops:
        save_structure_res(table_engine(table_crop), output_root + '/table_results/', page_name)

    # --- Text & title recognition -----------------------------------------
    text_title_results = extract_content(content_blocks=sorted_text_title_blocks, target_folder='text_title_results', image=image)
    text_dir = output_root + '/text_title_results'

    rows = []
    for idx, (result, type_of_text) in enumerate(text_title_results):
        img_draw = cv2.imread(f'{text_dir}/{idx}.jpg')

        # Each entry is indexed as (box, (text, score)); treat a missing
        # result as "no lines" instead of failing on iteration.
        lines = result or []
        boxes = [line[0] for line in lines]
        txts = [line[1][0] for line in lines]
        scores = [line[1][1] for line in lines]

        im_show = draw_ocr(img_draw, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
        Image.fromarray(im_show).save(f'{text_dir}/result_{idx}.jpg')

        # 'line' encodes "<block idx>_<line idx within block>_<block type>".
        rows.extend(
            {'line': f'{idx}_{row}_{type_of_text}', 'content': txt, 'conf': score}
            for row, (txt, score) in enumerate(zip(txts, scores))
        )

    # Build the frame once instead of concatenating one row at a time.
    result_df = pd.DataFrame(rows, columns=['line', 'content', 'conf'])
    result_df.to_csv(output_root + '/text_data.csv', index=False)

    # --- Figure recognition (cropping only, no OCR) -----------------------
    figure_dir = output_root + '/figure_results'
    if len(figure_blocks) > 0:
        os.makedirs(figure_dir, exist_ok=True)
    for idx, block in enumerate(figure_blocks):
        figure_crop = _crop_from_transposed(transposed_page, block.coordinates)
        cv2.imwrite(f'{figure_dir}/{idx}.jpg', figure_crop)


C:\Users\ASUS\PaddleOCR


In [None]:
# Run the whole recognition pipeline on the page selected by `image_path`.
# The function returns nothing; its results are written to disk under
# image_path.split('.')[0].
end2end_process_doc_recognition(image_path)

# 2. Post-processing recognized content
## 2.1. Concatenate lines of paragraphs and connect content of all pages

In [49]:
import pandas as pd
# Silence pandas' chained-assignment warning for the cells that follow.
pd.options.mode.chained_assignment = None

# Load the per-page OCR output (pages 0001..0030) and stack it into a single
# frame. The frames are collected first and concatenated once, which avoids
# re-copying the growing frame on every page.
page_frames = []
for i in range(1, 31):
    csv_path = 'C:/Users/ASUS/OneDrive/Desktop/paper/{0:04d}/text_data.csv'.format(i)
    csv_file = pd.read_csv(csv_path)
    page_frames.append(csv_file)

# ignore_index=True gives a fresh 0..n-1 index directly, so no helper 'index'
# column has to be created and deleted afterwards.
paper_csv_file = pd.concat(page_frames, ignore_index=True)

In [50]:
# Merge consecutive OCR lines into paragraphs.
#
# Each 'line' key has the form "<block idx>_<line idx>_<type>". A row is glued
# onto the previous paragraph when its block index did not increase compared
# with the previous row AND its type equals the previous paragraph's type.
# Otherwise it starts a new paragraph. A block index that drops (e.g. back to
# 0 on the next page) therefore continues the running paragraph if the type
# matches.
#
# Paragraphs are accumulated in a plain list and turned into a DataFrame once
# at the end: DataFrame.append no longer exists in pandas 2.x, and chained
# writes such as df['content'][i] = ... are not guaranteed to reach the frame.
paragraphs = []
prev_line_idx = 0

for line_key, raw_content in zip(paper_csv_file['line'], paper_csv_file['content']):
    parag_idx, _line_idx, type_ = line_key.split('_')
    parag_idx = int(parag_idx)

    # Content read back from CSV may be missing (NaN) or non-string;
    # normalise it so the concatenation below cannot fail.
    if isinstance(raw_content, str):
        content = raw_content
    elif pd.isna(raw_content):
        content = ''
    else:
        content = str(raw_content)

    if paragraphs and parag_idx <= prev_line_idx and type_ == paragraphs[-1]['type']:
        paragraphs[-1]['content'] = paragraphs[-1]['content'] + " " + content
    else:
        paragraphs.append({'content': content, 'type': type_})

    prev_line_idx = parag_idx

processed_csv_file = pd.DataFrame(paragraphs, columns=['content', 'type'])
# Index of the last paragraph, kept under its original name.
processed_line_idx = len(paragraphs) - 1

## 2.2. Syntax and Grammar Correction (in context of a complete sentence)

In [67]:
from textblob import TextBlob
import re

In [68]:
from textblob import TextBlob

# Spell-correct paragraph text with TextBlob.
# NOTE(review): the trailing `break` means only the FIRST row is corrected.
# It is kept as-is because removing it would run the correction over every
# paragraph; confirm whether that is what was intended.
for idx in range(len(processed_csv_file)):
    gfg = TextBlob(processed_csv_file.content[idx]).correct()

    # Write through .loc so the value lands in the frame itself; a chained
    # `df.content[idx] = ...` assignment may update a temporary copy instead.
    processed_csv_file.loc[idx, 'content'] = str(gfg)
    break

In [69]:
import re
from gingerit.gingerit import GingerIt


def syntax_grammar_correction_paragraph(processed_csv_file):
    """Grammar-correct every paragraph in the ``content`` column, in place.

    For each row the text is reduced to the tokens matched by the pattern
    below (re-joined with single spaces), split into sentences on ``'. '``,
    each sentence is passed through ``GingerIt().parse`` and the ``'result'``
    values are joined back with ``'. '``.

    Rows whose content is not a string (e.g. NaN) are left untouched.

    Args:
        processed_csv_file: DataFrame with a ``content`` column.

    Returns:
        The same DataFrame object, with ``content`` updated.
    """
    # Pattern kept byte-identical to the original.
    # NOTE(review): inside the character class, `*-_` is a RANGE from '*' to
    # '_', so characters such as ':', ';', '?' and '[' also match. Confirm
    # whether a literal '-' was intended before changing it.
    token_pattern = re.compile("[a-zA-Z,.0-9!@#$%^&*-_+=~/]+")

    for idx in processed_csv_file.index:
        doc = processed_csv_file.at[idx, 'content']
        if not isinstance(doc, str):
            # Nothing to tokenise; re.findall would raise on non-string input.
            continue

        updated_doc = " ".join(token_pattern.findall(doc))
        corrected_sentences = [
            GingerIt().parse(sentence)['result']
            for sentence in updated_doc.split('. ')
        ]

        # .at writes into the frame directly; a chained df['content'][idx] = ...
        # assignment is not guaranteed to modify the original frame.
        processed_csv_file.at[idx, 'content'] = ". ".join(corrected_sentences)

    return processed_csv_file

In [72]:
# Apply the grammar correction to every paragraph. The function updates the
# frame it is given and returns that same frame.
processed_csv_file = syntax_grammar_correction_paragraph(processed_csv_file)