In [2]:
import os
import re
import string

import numpy as np
from tqdm import tqdm
from asrtoolkit import wer, cer

import _pickle as pickle

import sys
sys.path.insert(0, '..')

from enums.language import Language

In [3]:
def get_folder_paths():
    """Map each language to the NewsEye folders that hold its files.

    Returns:
        dict: ``Language`` member -> list of folder paths. Paths are relative
        (they start at ``../data/newseye``) and, within a language, the 2017
        folders come before the 2019 ones.
    """
    base_dir = os.path.join('..', 'data', 'newseye')

    def in_2017(*parts):
        # Folder below the 2017 "full" split.
        return os.path.join(base_dir, '2017', 'full', *parts)

    def in_2019(*parts):
        # Folder below the 2019 "full" split.
        return os.path.join(base_dir, '2019', 'full', *parts)

    return {
        Language.English: [
            in_2017('eng_monograph'),
            in_2017('eng_periodical'),
            in_2019('EN'),
        ],
        Language.Dutch: [
            in_2019('NL', 'NL1'),
        ],
        Language.French: [
            in_2017('fr_monograph'),
            in_2017('fr_periodical'),
            *[in_2019('FR', part) for part in ('FR1', 'FR2', 'FR3')],
        ],
        Language.German: [
            in_2019('DE', part)
            for part in ('DE1', 'DE2', 'DE3', 'DE4', 'DE5', 'DE6', 'DE7')
        ],
    }

In [4]:
def save_cache(cache_filepath, cache_obj):
    """Pickle ``cache_obj`` to ``cache_filepath`` without risking a torn file.

    The object is first written to ``<cache_filepath>.tmp`` and then moved
    over the target with ``os.replace``. This function is called repeatedly
    in the middle of a long computation; if the process is interrupted while
    writing, the previous cache file is still intact instead of being left
    half-written and unloadable.

    Args:
        cache_filepath: destination path of the pickle file. Missing parent
            directories are created.
        cache_obj: any picklable object.

    Raises:
        OSError: if the directory or file cannot be created or replaced.
        pickle.PicklingError: if ``cache_obj`` cannot be pickled.
    """
    parent_dir = os.path.dirname(cache_filepath)
    if parent_dir:
        # os.makedirs('') raises, so only create when there is a parent part.
        os.makedirs(parent_dir, exist_ok=True)

    temp_filepath = f'{cache_filepath}.tmp'
    with open(temp_filepath, 'wb') as cache_file:
        pickle.dump(cache_obj, cache_file)

    # Swap the finished file into place in a single step.
    os.replace(temp_filepath, cache_filepath)

def _compute_file_error_rates(file_path, chunk_size=500):
    """Return ``(wer, cer)`` for a single file.

    The third line of the file is used as ground truth and the second line as
    OCR output; the first 14 characters of each are dropped before comparing
    (presumably a fixed-width tag prefix -- TODO confirm against the data).

    Ground-truth lines longer than ``chunk_size`` characters are compared
    chunk by chunk and the per-chunk rates are averaged with equal weight.
    Chunks are paired by position and pairing stops at the shorter line.
    """
    with open(file_path, 'r', encoding='utf-8') as text_file:
        file_lines = text_file.readlines()

    gt_line = file_lines[2][14:]
    ocr_line = file_lines[1][14:]

    if len(gt_line) > chunk_size:
        paired_length = min(len(gt_line), len(ocr_line))
        chunk_pairs = [
            (gt_line[start:start + chunk_size], ocr_line[start:start + chunk_size])
            for start in range(0, paired_length, chunk_size)
        ]
        # With an empty OCR line there is nothing to pair; fall through to the
        # whole-line comparison instead of leaving the entry uncomputed.
        if chunk_pairs:
            chunk_wer = [wer(gt_chunk, ocr_chunk) for gt_chunk, ocr_chunk in chunk_pairs]
            chunk_cer = [cer(gt_chunk, ocr_chunk) for gt_chunk, ocr_chunk in chunk_pairs]
            return np.mean(chunk_wer), np.mean(chunk_cer)

    return wer(gt_line, ocr_line), cer(gt_line, ocr_line)


def calculate_error_rates(specific_language: Language = None):
    """Compute per-file WER and CER for the folders of each language.

    Results are cached per language in
    ``results/errors_cache_<language.value>.pickle`` so an interrupted run can
    resume. Entries equal to ``-1`` mean "not computed yet". A cache whose
    arrays do not match the current number of files is ignored and rebuilt.

    Args:
        specific_language: when not ``None``, only this language is processed.

    Returns:
        dict: ``Language`` -> ``{'wer': ndarray, 'cer': ndarray}`` with one
        entry per file, in the order the files were listed.
    """
    save_every = 10  # persist progress after this many newly computed files
    result = {}

    for language, folder_paths in get_folder_paths().items():
        if specific_language is not None and language != specific_language:
            continue

        # NOTE(review): os.listdir returns entries in arbitrary order and the
        # cache is positional, so both rely on that order being stable between
        # runs. Sorting would fix this but invalidates existing cache files.
        paths = [
            os.path.join(folder_path, filename)
            for folder_path in folder_paths
            for filename in os.listdir(folder_path)
        ]

        rates = {
            'wer': np.full(len(paths), -1.0),
            'cer': np.full(len(paths), -1.0),
        }
        result[language] = rates

        cache_filepath = os.path.join('results', f'errors_cache_{language.value}.pickle')
        os.makedirs(os.path.dirname(cache_filepath), exist_ok=True)

        if os.path.exists(cache_filepath):
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # cache files produced by this notebook.
            with open(cache_filepath, 'rb') as cache_file:
                cache = pickle.load(cache_file)

            # Copy cached values into the live arrays so that later saves
            # contain both the old and the newly computed entries.
            if len(cache['wer']) == len(paths) and len(cache['cer']) == len(paths):
                rates['wer'][:] = cache['wer']
                rates['cer'][:] = cache['cer']

        unsaved = 0
        for i, file_path in enumerate(tqdm(paths, desc=f'Computing \'{language.value}\'', total=len(paths))):
            # Skip cached entries before touching the file at all.
            if rates['wer'][i] != -1:
                continue

            rates['wer'][i], rates['cer'][i] = _compute_file_error_rates(file_path)

            unsaved += 1
            if unsaved >= save_every:
                save_cache(cache_filepath, rates)
                unsaved = 0

        # Flush whatever was computed since the last periodic save.
        if unsaved:
            save_cache(cache_filepath, rates)

    return result

In [5]:
# Run the computation for every configured language (no language filter).
error_rates_per_language = calculate_error_rates(specific_language=None)

Computing 'english': 100%|██████████| 963/963 [00:21<00:00, 44.30it/s]
Computing 'dutch': 100%|██████████| 150/150 [00:02<00:00, 51.79it/s]
Computing 'french': 100%|██████████| 3993/3993 [01:30<00:00, 44.06it/s]
Computing 'german': 100%|██████████| 10032/10032 [02:39<00:00, 62.85it/s]


In [8]:
# Report the mean of the per-file rates for each language.
for language, error_rates in error_rates_per_language.items():
    mean_wer = error_rates["wer"].mean()
    mean_cer = error_rates["cer"].mean()
    print(f'{language.value} WER: {mean_wer}')
    print(f'{language.value} CER: {mean_cer}')

english WER: 38.453125273560005
english CER: 84.58698025710028
dutch WER: 183.04292156823422
dutch CER: 662.2354336199279
french WER: 17.437550110440373
french CER: 39.91430664575433
german WER: 77.19739226912753
german CER: 28.980495066405396


In [12]:
# Spot-check the CER of a single Dutch file. The index is the file's position
# in the listing built by calculate_error_rates.
dutch_cer_rates = error_rates_per_language[Language.Dutch]['cer']
dutch_cer_rates[2]

105.0894524296487