## Testing the best Paddle model

In [1]:
import os
import json
import pandas as pd
from paddleocr import PaddleOCR
import time
from tqdm import tqdm
from Levenshtein import distance as levenshtein_distance
import cv2
import numpy as np
import tarfile
import gc


# Root of the project tree. It defaults to the original checkout location, and can be
# redirected with the OCR_COMPARISON_ROOT environment variable so the notebook runs on
# another machine without editing every path below.
# The directory name's spelling ("OCR-comparsion") is part of the existing paths and is
# intentionally kept unchanged.
project_root = os.environ.get('OCR_COMPARISON_ROOT', '/home/maxkhamuliak/projects/OCR-comparsion')

# Dataset locations: JSON annotations and the PNG images they describe.
annotations_folder = os.path.join(project_root, 'annotations')
images_folder = os.path.join(project_root, 'images')

# Folders that hold one sub-directory (or .tar archive) per model.
detection_model_folder = os.path.join(project_root, 'detection_models')
recognition_model_folder = os.path.join(project_root, 'recognition_folder')

# Baseline pair: one side stays fixed while the other side is varied.
default_recognition_model = os.path.join(recognition_model_folder, 'en_PP-OCRv3_rec_infer')
default_detection_model = os.path.join(detection_model_folder, 'en_PP-OCRv3_det_infer')

# Paddle Lite model files.
det_model_file = os.path.join(project_root, 'Paddle_lite_models', 'det_slim_lite.nb')
rec_model_file = os.path.join(project_root, 'Paddle_lite_models', 'rec_lite.nb')

# PP-OCRv4 recognition models evaluated separately from the folder scan.
new_recognition_models = [
    os.path.join(recognition_model_folder, 'en_PP-OCRv4_rec_infer'),
    os.path.join(recognition_model_folder, 'en_PP-OCRv4_rec_train'),
]


# Load data
# Build one record per annotation file: the image it belongs to and the ground-truth
# text obtained by joining the 'text' field of every entry under the 'form' key.
data = []
# sorted() because os.listdir() returns entries in arbitrary order; a fixed order keeps
# the DataFrame (and therefore every per-image result) reproducible between runs.
for filename in sorted(os.listdir(annotations_folder)):
    if not filename.endswith('.json'):
        continue
    # Explicit encoding so decoding does not depend on the platform's locale default.
    with open(os.path.join(annotations_folder, filename), 'r', encoding='utf-8') as file:
        annotation_data = json.load(file)
    # Swap only the extension; str.replace would also rewrite a '.json' that happens to
    # occur in the middle of the file name.
    image_filename = os.path.splitext(filename)[0] + '.png'
    text = " ".join(item['text'] for item in annotation_data['form']).strip()
    data.append({'filename': image_filename, 'text': text})

df = pd.DataFrame(data)
df.head()


Unnamed: 0,filename,text
0,82092117.png,TO: DATE: 3 Fax: NOTE: 82092117 614 -466 -5087...
1,82200067_0069.png,TO: FROM: x SUBJECT: DIVISION: DIVISION: DIV...
2,82250337_0338.png,TO: FROM: DATE: MANUFACTURER: BRAND: Oct. Dec....
3,82251504.png,17 cc: : From: Area: Region: 5 X Chains: Indep...
4,82252956_2958.png,AUG 4 SEP 15 JUN 23 MAY 12 REGION: DIVISION: 7...


### Helper Functions for OCR


In [2]:
def evaluate_models(detection_model, recognition_model):
    """Run PaddleOCR over every image listed in ``df`` and score it against its annotation.

    Reads the module-level ``df`` (columns ``filename`` and ``text``) and
    ``images_folder``.

    Args:
        detection_model: Directory passed to PaddleOCR as ``det_model_dir``.
        recognition_model: Directory passed to PaddleOCR as ``rec_model_dir``.

    Returns:
        A ``(results, avg_time_per_image)`` tuple. ``results`` is a DataFrame with one
        row per scored image and the columns ``filename``, ``accuracy`` (fraction of
        the unique annotation words that also appear in the predicted text),
        ``levenshtein`` (``levenshtein_distance`` between annotation text and
        predicted text) and ``time`` (seconds spent inside ``ocr.ocr``).
        ``avg_time_per_image`` is the mean of ``time`` over the scored images, or 0
        when no image was scored.

    Images that cannot be read, or for which no text is returned, are reported with a
    message and skipped; they contribute to neither the metrics nor the average time.
    """
    ocr = PaddleOCR(det_model_dir=detection_model, rec_model_dir=recognition_model, use_gpu=False, lang="en")

    results = []
    total_time = 0
    for _, row in df.iterrows():
        image_path = os.path.join(images_folder, row['filename'])
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing or unreadable file by returning None rather
            # than raising, so catch it here instead of failing inside the OCR call.
            print(f"Could not read image: {row['filename']}")
            continue

        # perf_counter is monotonic, so the measured interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        ocr_result = ocr.ocr(image, cls=False)
        elapsed = time.perf_counter() - start_time

        # The result is indexed per input image below, and that per-image entry can
        # itself be None/empty when nothing was found (behaviour differs between
        # PaddleOCR versions — TODO confirm for the installed one), so the entry is
        # checked as well as the outer list.
        page = ocr_result[0] if ocr_result else None
        if not page:
            print(f"No text detected in image: {row['filename']}")
            continue

        # Assumes each entry is (box, (text, score)); only the text is used.
        predicted_text = " ".join(item[1][0] for item in page)

        expected_words = set(row['text'].split())
        predicted_words = set(predicted_text.split())
        # An annotation without any words would otherwise divide by zero.
        if expected_words:
            accuracy = len(expected_words & predicted_words) / len(expected_words)
        else:
            accuracy = 0.0
        levenshtein = levenshtein_distance(row['text'], predicted_text)

        results.append({
            'filename': row['filename'],
            'accuracy': accuracy,
            'levenshtein': levenshtein,
            'time': elapsed
        })

        total_time += elapsed

    avg_time_per_image = total_time / len(results) if results else 0
    return pd.DataFrame(results), avg_time_per_image
        

def extract_tar_files(folder):
    """Unpack every ``.tar`` archive found directly in ``folder``, then delete the archive.

    Args:
        folder: Directory that is scanned (non-recursively) for files ending in
            ``.tar`` and that also receives the extracted content.

    Raises:
        tarfile.TarError: If an archive cannot be read or, where extraction filters
            are available, contains a member rejected by the ``'data'`` filter. The
            offending archive is left in place in that case.
    """
    # Extraction filters reject members that would be written outside ``folder``
    # (absolute paths, '..' components, escaping links). Interpreters without the
    # feature do not accept the keyword, so it is only passed when supported.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    # sorted() gives a deterministic processing order; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.tar'):
            continue
        tar_path = os.path.join(folder, filename)
        with tarfile.open(tar_path, 'r') as tar:
            tar.extractall(path=folder, **extract_kwargs)
        # Only reached when extraction succeeded, so a failed archive is never deleted.
        os.remove(tar_path)

# Unpack any model archives first so that every model lives in its own directory.
for archive_folder in (detection_model_folder, recognition_model_folder):
    extract_tar_files(archive_folder)


def _model_directories(parent_folder):
    """Return the full path of every sub-directory located directly in ``parent_folder``."""
    candidates = (os.path.join(parent_folder, entry) for entry in os.listdir(parent_folder))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]


# One entry per model directory; plain files in the folders are ignored.
detection_models = _model_directories(detection_model_folder)
recognition_models = _model_directories(recognition_model_folder)


### Testing All Detection Model Types

In [9]:
# Vary the detection model while the recognition model stays at the default, and tag
# each per-image result frame with the model name and that run's average time.
# NOTE(review): in the recorded comparison table the two '*_train' directories report
# identical accuracy and Levenshtein values — confirm they each hold a distinct,
# loadable inference model.
detection_results = []
for detection_dir in detection_models:
    per_image_scores, mean_seconds = evaluate_models(detection_dir, default_recognition_model)
    detection_results.append(
        per_image_scores.assign(
            detection_model=os.path.basename(detection_dir),
            avg_time_per_image=mean_seconds,
        )
    )

[2024/06/19 16:00:09] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/maxkhamuliak/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/maxkhamuliak/projects/OCR-comparsion/detection_models/en_PP-OCRv3_det_distill_train', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='f

### Testing recognition models

In [10]:
# Vary the recognition model while the detection model stays at the default, and tag
# each per-image result frame with the model name and that run's average time.
recognition_results = []
for rec_model in recognition_models:
    model_name = os.path.basename(rec_model)
    try:
        results, avg_time = evaluate_models(default_detection_model, rec_model)
    except IndexError as error:
        # Continue with the remaining models, but report what actually failed: the
        # model name on its own gives no way to diagnose the problem afterwards.
        print(f"Error evaluating recognition model: {model_name} ({error!r})")
        continue
    results['recognition_model'] = model_name
    results['avg_time_per_image'] = avg_time
    recognition_results.append(results)
    

[2024/06/19 16:13:53] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/maxkhamuliak/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/maxkhamuliak/projects/OCR-comparsion/detection_models/en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2

[2024/06/19 16:13:57] ppocr DEBUG: dt_boxes num : 44, elapse : 0.535841703414917
[2024/06/19 16:14:11] ppocr DEBUG: rec_res num  : 44, elapse : 14.707687616348267
[2024/06/19 16:14:12] ppocr DEBUG: dt_boxes num : 71, elapse : 0.7167508602142334
[2024/06/19 16:14:30] ppocr DEBUG: rec_res num  : 71, elapse : 17.868001699447632
[2024/06/19 16:14:31] ppocr DEBUG: dt_boxes num : 51, elapse : 0.8132715225219727
[2024/06/19 16:14:47] ppocr DEBUG: rec_res num  : 51, elapse : 16.27497148513794
[2024/06/19 16:14:48] ppocr DEBUG: dt_boxes num : 47, elapse : 0.411376953125
[2024/06/19 16:14:57] ppocr DEBUG: rec_res num  : 47, elapse : 9.77440619468689
[2024/06/19 16:14:58] ppocr DEBUG: dt_boxes num : 51, elapse : 0.40482115745544434
[2024/06/19 16:15:06] ppocr DEBUG: rec_res num  : 51, elapse : 8.068122148513794
[2024/06/19 16:15:06] ppocr DEBUG: dt_boxes num : 45, elapse : 0.4685969352722168
[2024/06/19 16:15:17] ppocr DEBUG: rec_res num  : 45, elapse : 10.713351249694824
[2024/06/19 16:15:18] pp

In [11]:
def _summarise_runs(per_model_frames, model_column):
    """Stack the per-model result frames and average the metrics for each model.

    Returns the stacked per-image frame and the per-model averages, with the
    grouping key restored as a regular column.
    """
    stacked = pd.concat(per_model_frames, ignore_index=True)
    averaged = (
        stacked
        .groupby(model_column)
        .agg({'accuracy': 'mean', 'levenshtein': 'mean', 'avg_time_per_image': 'mean'})
        .reset_index()
    )
    return stacked, averaged


# Per-image results and per-model averages for both experiments.
detection_final_results, detection_average_results = _summarise_runs(detection_results, 'detection_model')
recognition_final_results, recognition_average_results = _summarise_runs(recognition_results, 'recognition_model')

# Human-readable headers, in the order: group key, accuracy, distance, time.
readable_columns = ['Model', 'Average Accuracy', 'Average Levenshtein Distance', 'Average Time per Image (s)']
detection_average_results.columns = readable_columns
recognition_average_results.columns = readable_columns

for heading, summary_table in (
    ("Detection Models Comparison:", detection_average_results),
    ("\nRecognition Models Comparison:", recognition_average_results),
):
    print(heading)
    print(summary_table)

Detection Models Comparison:
                                Model  Average Accuracy  \
0       en_PP-OCRv3_det_distill_train          0.641862   
1               en_PP-OCRv3_det_infer          0.635716   
2  en_PP-OCRv3_det_slim_distill_train          0.641862   
3          en_PP-OCRv3_det_slim_infer          0.660067   

   Average Levenshtein Distance  Average Time per Image (s)  
0                        433.12                    4.375742  
1                        437.42                    3.929579  
2                        433.12                    3.313494  
3                        433.68                    4.662250  

Recognition Models Comparison:
                                  Model  Average Accuracy  \
0                 en_PP-OCRv3_rec_infer          0.635716   
1            en_PP-OCRv3_rec_slim_infer          0.613069   
2       en_number_mobile_v2.0_rec_infer          0.017171   
3  en_number_mobile_v2.0_rec_slim_infer          0.002460   

   Average Levenshtein Dist

### Testing PaddleOCR V4


In [3]:
# Evaluate each PP-OCRv4 recognition model together with the default detection model.
# A model that fails is reported and skipped so the remaining ones still run.
# NOTE(review): in the recorded output, en_PP-OCRv4_rec_train shows the same accuracy
# and Levenshtein distance as the default v3 detection + recognition pairing — confirm
# that directory really holds a loadable inference model.
new_recognition_results = []
for rec_dir in new_recognition_models:
    model_name = os.path.basename(rec_dir)
    try:
        print(f"Evaluating {model_name}...")
        scores, mean_seconds = evaluate_models(default_detection_model, rec_dir)
        new_recognition_results.append(
            scores.assign(recognition_model=model_name, avg_time_per_image=mean_seconds)
        )
        print(f"Finished evaluating {model_name}")
    except Exception as e:
        print(f"Error evaluating recognition model {model_name}: {str(e)}")



Evaluating en_PP-OCRv4_rec_infer...
[2024/06/29 19:57:01] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/maxkhamuliak/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/maxkhamuliak/projects/OCR-comparsion/detection_models/en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model

100%|██████████| 9.96M/9.96M [00:06<00:00, 1.59MiB/s]

[2024/06/29 20:02:06] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/maxkhamuliak/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/maxkhamuliak/projects/OCR-comparsion/detection_models/en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2




[2024/06/29 20:02:07] ppocr DEBUG: dt_boxes num : 44, elapse : 0.2948122024536133
[2024/06/29 20:02:10] ppocr DEBUG: rec_res num  : 44, elapse : 2.5726239681243896
[2024/06/29 20:02:10] ppocr DEBUG: dt_boxes num : 71, elapse : 0.2050478458404541
[2024/06/29 20:02:12] ppocr DEBUG: rec_res num  : 71, elapse : 1.9399006366729736
[2024/06/29 20:02:12] ppocr DEBUG: dt_boxes num : 51, elapse : 0.19165325164794922
[2024/06/29 20:02:14] ppocr DEBUG: rec_res num  : 51, elapse : 2.3102102279663086
[2024/06/29 20:02:15] ppocr DEBUG: dt_boxes num : 47, elapse : 0.20935297012329102
[2024/06/29 20:02:17] ppocr DEBUG: rec_res num  : 47, elapse : 2.0198864936828613
[2024/06/29 20:02:17] ppocr DEBUG: dt_boxes num : 51, elapse : 0.20222043991088867
[2024/06/29 20:02:19] ppocr DEBUG: rec_res num  : 51, elapse : 2.422048568725586
[2024/06/29 20:02:20] ppocr DEBUG: dt_boxes num : 45, elapse : 0.31643033027648926
[2024/06/29 20:02:23] ppocr DEBUG: rec_res num  : 45, elapse : 3.1509788036346436
[2024/06/29 2

In [6]:
# Stack the per-model OCRv4 frames, then average every metric per recognition model.
new_recognition_final_results = pd.concat(new_recognition_results, ignore_index=True)

metric_means = {metric: 'mean' for metric in ('accuracy', 'levenshtein', 'avg_time_per_image')}
new_recognition_average_results = (
    new_recognition_final_results
    .groupby('recognition_model')
    .agg(metric_means)
    .reset_index()
)

# Human-readable headers, in the order: group key, accuracy, distance, time.
new_recognition_average_results.columns = [
    'Model',
    'Average Accuracy',
    'Average Levenshtein Distance',
    'Average Time per Image (s)',
]

print("\nNew Recognition Models Comparison (OCRv4):")
print(new_recognition_average_results)



New Recognition Models Comparison (OCRv4):
                   Model  Average Accuracy  Average Levenshtein Distance  \
0  en_PP-OCRv4_rec_infer          0.600452                        435.74   
1  en_PP-OCRv4_rec_train          0.635716                        437.42   

   Average Time per Image (s)  
0                    5.879405  
1                    2.832642  
