In [1]:
import os
import re
import string
from tqdm import tqdm
from scipy.spatial.distance import cdist

from nltk.tokenize import RegexpTokenizer

import sys
sys.path.insert(0, '..')

from enums.language import Language
from enums.configuration import Configuration
from enums.ocr_output_type import OCROutputType

In [2]:
# Load data

def get_folder_paths(language: Language):
    newseye_path = os.path.join('..', 'data', 'newseye')
    icdar_2017_path = os.path.join(newseye_path, '2017', 'full')
    icdar_2019_path = os.path.join(newseye_path, '2019', 'full')

    result = None
    if language == Language.English:
        result = [
            os.path.join(icdar_2017_path, 'eng_monograph'),
            os.path.join(icdar_2017_path, 'eng_periodical'),
            os.path.join(icdar_2019_path, 'EN')
        ]
    elif language == Language.Dutch:
        result = [
            os.path.join(icdar_2019_path, 'NL', 'NL1')
        ]
    elif language == Language.French:
        result = [
            os.path.join(icdar_2017_path, 'fr_monograph'),
            os.path.join(icdar_2017_path, 'fr_periodical'),
            os.path.join(icdar_2019_path, 'FR', 'FR1'),
            os.path.join(icdar_2019_path, 'FR', 'FR2'),
            os.path.join(icdar_2019_path, 'FR', 'FR3')
        ]
    elif language == Language.German:
        result = [
            os.path.join(icdar_2019_path, 'DE', 'DE1'),
            os.path.join(icdar_2019_path, 'DE', 'DE2'),
            os.path.join(icdar_2019_path, 'DE', 'DE3'),
            os.path.join(icdar_2019_path, 'DE', 'DE4'),
            os.path.join(icdar_2019_path, 'DE', 'DE5'),
            os.path.join(icdar_2019_path, 'DE', 'DE6'),
            os.path.join(icdar_2019_path, 'DE', 'DE7')
        ]

    return result


In [3]:
def read_documents(tokenizer, language: Language, ocr_output_type: OCROutputType):
    documents = []

    folder_paths = get_folder_paths(language)
    for folder_path in folder_paths:
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as text_file:
                file_lines = text_file.readlines()
                gt_line = file_lines[2] if ocr_output_type == OCROutputType.GroundTruth else file_lines[1]
                processed_line = gt_line[14:].replace('#', '').replace('@', '')

                text_nonum = re.sub(r'\d+', '', processed_line)
                text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation])
                text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
                result = tokenizer.tokenize(text_no_doublespace)
                documents.append(result)

    return documents

In [4]:
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    for ocr_output_type in OCROutputType:
        glove_filepath = os.path.join('results', 'glove')
        result_filepath = os.path.join(glove_filepath, f'{language.value}_{ocr_output_type.value}_corpus.txt')
        if os.path.exists(result_filepath):
            continue

        documents = read_documents(tokenizer, language, ocr_output_type)

        if not os.path.exists(glove_filepath):
            os.mkdir(glove_filepath)

        with open(result_filepath, 'w', encoding='utf-8') as result_file:
            for document in documents:
                document_str = ' '.join(document)
                if len(document_str.strip()) == 0: continue

                result_file.write(document_str)
                result_file.write('\n')

In [6]:
import numpy as np
import _pickle as pickle

def save_python_obj(obj: object, path: str, name: str) -> bool:
    try:
        filepath = os.path.join(path, f'{name}.pickle')
        with open(filepath, 'wb') as handle:
            pickle.dump(obj, handle, protocol=-1)

        return True
    except Exception:
        return False

def load_python_obj(path: str, name: str, extension_included: bool = False) -> object:
    obj = None
    try:
        extension = '' if extension_included else '.pickle'
        filepath = os.path.join(path, f'{name}{extension}')
        with (open(filepath, "rb")) as openfile:
            obj = pickle.load(openfile)

    except FileNotFoundError:
        return None

    return obj

In [7]:
vectors_filepath = os.path.join('results', 'glove', 'vectors')

vectors_by_words = load_python_obj(vectors_filepath, 'vectors-by-words')
if vectors_by_words is None:
    vectors_by_words = {}

    for language in [Language.Dutch, Language.English, Language.French, Language.German]:
        vectors_by_words[language] = {}
        for ocr_output_type in OCROutputType:
            vectors_by_words[language][ocr_output_type] = {}
            filepath = os.path.join(vectors_filepath, f'{language.value}_{ocr_output_type.value}_vectors.txt')
            print(filepath)
            with open(filepath, 'rb') as vectors_file:
                for i, line in enumerate(vectors_file.readlines()):
                    split_line = line.split()
                    word = split_line[0]

                    # if i == 1675:
                    #     print(word)
                    #     print(str(word) == 'tte')

                    numbers = np.array([float(x) for x in split_line[1:]])

                    try:
                        vectors_by_words[language][ocr_output_type][word.decode()] = numbers
                    except:
                        print(f'Failed for word {word}')

                print(len(vectors_by_words[language][ocr_output_type].keys()))

    save_python_obj(vectors_by_words, vectors_filepath, 'vectors-by-words')

In [8]:
unique_tokens = {}

for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    unique_tokens[language] = None

    for config in [Configuration.SkipGram, Configuration.CBOW, Configuration.PPMI]:
        cache_path = os.path.join('..', '.cache', 'ocr-evaluation', language.value, config.value)
        raw_vocab_obj = load_python_obj(cache_path, f'vocab-icdar-2017-icdar-2019-{OCROutputType.Raw.value}')
        gt_vocab_obj = load_python_obj(cache_path, f'vocab-icdar-2017-icdar-2019-{OCROutputType.GroundTruth.value}')
        if raw_vocab_obj is None or gt_vocab_obj is None:
            print(cache_path)
            continue

        # extract the tokens from the vocabularies
        raw_tokens = list(raw_vocab_obj[0].keys())[4:]
        gt_tokens = list(gt_vocab_obj[0].keys())[4:]
        intersected_tokens = list(set(raw_tokens) & set(gt_tokens))

        if unique_tokens[language] is None:
            unique_tokens[language] = intersected_tokens
        else:
            unique_tokens[language] = list(set(intersected_tokens) & set(unique_tokens[language]))

..\.cache\ocr-evaluation\dutch\ppmi
..\.cache\ocr-evaluation\french\ppmi


In [9]:
_ = { language: {
    word: idx for (idx, word) in enumerate(unique_tokens[language])
}
    for language in unique_tokens.keys()
}

In [10]:
overlaps = {}
percentages = list(range(1, 101, 1))  # 1..20

for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    print(f'Processing {language}')
    words_amounts = [
        int(len(unique_tokens[language]) * (float(percentage)/ 100))
        for percentage in percentages]

    max_n = max(words_amounts)

    cached_name = f'overlaps_{language.value}_glove'
    cached_value = load_python_obj('results', cached_name)
    if cached_value is not None:
        overlaps[language] = cached_value
        continue

    overlaps[language] = { percentage : { token: [] for token in unique_tokens[language] } for percentage in percentages }

    print(len(vectors_by_words[language][OCROutputType.Raw].keys()))
    raw_vectors = np.array([vectors_by_words[language][OCROutputType.Raw][word] for word in unique_tokens[language]])
    raw_similarity = 1 - cdist(raw_vectors, raw_vectors, metric='cosine')
    gt_vectors = np.array([vectors_by_words[language][OCROutputType.GroundTruth][word] for word in unique_tokens[language]])
    gt_similarity = 1 - cdist(gt_vectors, gt_vectors, metric='cosine')

    for token in tqdm(unique_tokens[language], desc=f'Processing tokens for \'{language.value}\'', total=len(unique_tokens[language])):
        raw_indices = np.argsort(raw_similarity[_[language][token]])[::-1][:max_n]
        gt_indices = np.argsort(gt_similarity[_[language][token]])[::-1][:max_n]

        for n, percentage in zip(words_amounts, percentages):
            current_gt = gt_indices[:n]
            current_raw = raw_indices[:n]

            current_overlaps = len(set(current_gt) & set(current_raw))
            overlaps[language][percentage][token].append(current_overlaps)

    save_python_obj(overlaps[language], 'results', cached_name)

Processing dutch
Processing english
Processing french
53272


Processing tokens for 'french': 100%|██████████| 18391/18391 [39:22<00:00,  7.79it/s]


Processing german
72732


Processing tokens for 'german': 100%|██████████| 8036/8036 [07:28<00:00, 17.91it/s]
