In [1]:
import gensim
from gensim.models import Word2Vec

import os
import re
import string
import numpy as np
from tqdm import tqdm
import _pickle as pickle

from nltk.tokenize import RegexpTokenizer

import sys
sys.path.insert(0, '..')

from enums.language import Language
from enums.configuration import Configuration
from enums.ocr_output_type import OCROutputType

In [2]:
# Load data

def get_folder_paths(language: Language):
    newseye_path = os.path.join('..', 'data', 'newseye')
    icdar_2017_path = os.path.join(newseye_path, '2017', 'full')
    icdar_2019_path = os.path.join(newseye_path, '2019', 'full')

    result = None
    if language == Language.English:
        result = [
            os.path.join(icdar_2017_path, 'eng_monograph'),
            os.path.join(icdar_2017_path, 'eng_periodical'),
            os.path.join(icdar_2019_path, 'EN')
        ]
    elif language == Language.Dutch:
        result = [
            os.path.join(icdar_2019_path, 'NL', 'NL1')
        ]
    elif language == Language.French:
        result = [
            os.path.join(icdar_2017_path, 'fr_monograph'),
            os.path.join(icdar_2017_path, 'fr_periodical'),
            os.path.join(icdar_2019_path, 'FR', 'FR1'),
            os.path.join(icdar_2019_path, 'FR', 'FR2'),
            os.path.join(icdar_2019_path, 'FR', 'FR3')
        ]
    elif language == Language.German:
        result = [
            os.path.join(icdar_2019_path, 'DE', 'DE1'),
            os.path.join(icdar_2019_path, 'DE', 'DE2'),
            os.path.join(icdar_2019_path, 'DE', 'DE3'),
            os.path.join(icdar_2019_path, 'DE', 'DE4'),
            os.path.join(icdar_2019_path, 'DE', 'DE5'),
            os.path.join(icdar_2019_path, 'DE', 'DE6'),
            os.path.join(icdar_2019_path, 'DE', 'DE7')
        ]

    return result


In [3]:
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
def read_documents(tokenizer, language: Language, ocr_output_type: OCROutputType):
    documents = []

    folder_paths = get_folder_paths(language)
    for folder_path in folder_paths:
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as text_file:
                file_lines = text_file.readlines()
                gt_line = file_lines[2] if ocr_output_type == OCROutputType.GroundTruth else file_lines[1]
                processed_line = gt_line[14:].replace('#', '').replace('@', '')

                text_nonum = re.sub(r'\d+', '', processed_line)
                text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation])
                text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
                result = tokenizer.tokenize(text_no_doublespace)
                documents.append(result)

    return documents

In [5]:
def get_model_path(
    language: Language,
    configuration: Configuration,
    randomly_initialized: bool,
    ocr_output_type: OCROutputType,
    learning_rate: float):
    rnd_suffix = 'random' if randomly_initialized else 'pretr'

    model_name = f'gensim_{language.value}_{configuration.value}_{rnd_suffix}_{ocr_output_type.value}_lr{learning_rate}.model'

    results_folder = 'results'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    result = os.path.join(results_folder, model_name)
    return result

In [6]:
def load_model(model_path):
    if not os.path.exists(model_path):
        return None

    model = Word2Vec.load(model_path)
    return model


In [7]:
def get_word2vec_model_info(language: Language):
    if language == Language.English:
        return 'GoogleNews-vectors-negative300.bin', True
    elif language == Language.Dutch:
        return 'combined-320.txt', False
    elif language == Language.French:
        return 'frwiki_20180420_300d.txt', False
    elif language == Language.German:
        return 'dewiki_20180420_300d.txt', False

    error_message = 'Unsupported word2vec language'
    raise Exception(error_message)

def get_pretrained_matrix(language: Language):
    data_path = os.path.join('..', 'data', 'ocr-evaluation', 'word2vec', language.value)
    word2vec_model_name, word2vec_binary = get_word2vec_model_info(language)
    word2vec_model_path = os.path.join(data_path, word2vec_model_name)
    word2vec_model  = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=word2vec_binary)
    return word2vec_model, word2vec_model_path, word2vec_binary

In [8]:
# TRAIN

def create_model(
    corpus,
    model_path: str,
    configuration: Configuration,
    randomly_initialized: bool,
    language: Language,
    learning_rate: float):
    sg = 1 if configuration == Configuration.SkipGram else 0
    vector_size = 320 if language == Language.Dutch else 300

    # initialize the model
    model = Word2Vec(vector_size=vector_size, window=5, min_count=5, workers=2, sg=sg, alpha=learning_rate)

    # build the vocabulary
    model.build_vocab(corpus, progress_per=1000)

    if not randomly_initialized:
        word2vec_weights, word2vec_model_path, word2vec_binary = get_pretrained_matrix(language)
        model.build_vocab(list(word2vec_weights.key_to_index.keys()), update=True)
        model.wv.vectors_lockf = np.ones((len(model.wv.key_to_index), 1)) # fix for word2vec issue
        model.wv.intersect_word2vec_format(word2vec_model_path, binary=word2vec_binary, lockf=1.0)

    # train the model
    model.train(corpus, total_examples=model.corpus_count, epochs=300, report_delay=1)

    # save the model
    model.save(model_path)

    return model

In [9]:
unique_tokens = {}
models = {}

for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    models[language] = {}
    unique_tokens[language] = None
    for configuration in [Configuration.CBOW, Configuration.SkipGram]:
        models[language][configuration] = {}
        for learning_rate in [0.001, 0.0001]:
            models[language][configuration][learning_rate] = {}
            for randomly_initialized in [True]:
                models[language][configuration][learning_rate][randomly_initialized] = {}
                for ocr_output_type in [OCROutputType.GroundTruth, OCROutputType.Raw]:
                    print(f'Training: [\'{language.value}\', {configuration.value}, lr: {learning_rate}, {randomly_initialized}, {ocr_output_type.value}]')
                    documents = read_documents(tokenizer, language, ocr_output_type)
                    model_path = get_model_path(language, configuration, randomly_initialized, ocr_output_type, learning_rate)
                    model = load_model(model_path)
                    if model is None:
                        print('Model is not loaded. Creating and training now...')
                        model = create_model(documents, model_path, configuration, randomly_initialized, language, learning_rate)

                    models[language][configuration][learning_rate][randomly_initialized][ocr_output_type] = model
                    tokens = list(model.wv.key_to_index.keys())
                    if unique_tokens[language] is None:
                        unique_tokens[language] = tokens
                    else:
                        unique_tokens[language] = list(set(tokens) & set(unique_tokens[language]))


Training: ['dutch', cbow, lr: 0.001, True, ground-truth]
Training: ['dutch', cbow, lr: 0.001, True, raw]
Training: ['dutch', cbow, lr: 0.0001, True, ground-truth]
Training: ['dutch', cbow, lr: 0.0001, True, raw]
Training: ['dutch', skip-gram, lr: 0.001, True, ground-truth]
Training: ['dutch', skip-gram, lr: 0.001, True, raw]
Training: ['dutch', skip-gram, lr: 0.0001, True, ground-truth]
Training: ['dutch', skip-gram, lr: 0.0001, True, raw]
Training: ['english', cbow, lr: 0.001, True, ground-truth]
Training: ['english', cbow, lr: 0.001, True, raw]
Training: ['english', cbow, lr: 0.0001, True, ground-truth]
Training: ['english', cbow, lr: 0.0001, True, raw]
Training: ['english', skip-gram, lr: 0.001, True, ground-truth]
Training: ['english', skip-gram, lr: 0.001, True, raw]
Training: ['english', skip-gram, lr: 0.0001, True, ground-truth]
Training: ['english', skip-gram, lr: 0.0001, True, raw]
Training: ['french', cbow, lr: 0.001, True, ground-truth]
Model is not loaded. Creating and trai

In [10]:
target_words = {
    Language.English: ['man', 'new', 'time', 'day', 'good', 'old', 'little', 'one', 'two', 'three'],
    Language.Dutch: ['man', 'jaar', 'tijd', 'dag', 'huis', 'dier', 'werk', 'naam', 'groot', 'kleine', 'twee', 'drie', 'vier', 'vijf']
}




# from gensim import similarities

# similarities.MatrixSimilarity(vectors)

from scipy.spatial.distance import cdist

# for word in target_words[language]:
# #     print(f'-- \'{word}\':')
# #     print(model.wv.most_similar(positive=[word]))
    
#     break

In [11]:
def save_python_obj(obj: object, path: str, name: str) -> bool:
    try:
        filepath = os.path.join(path, f'{name}.pickle')
        with open(filepath, 'wb') as handle:
            pickle.dump(obj, handle, protocol=-1)

        return True
    except Exception:
        return False

def load_python_obj(path: str, name: str, extension_included: bool = False) -> object:
    obj = None
    try:
        extension = '' if extension_included else '.pickle'
        filepath = os.path.join(path, f'{name}{extension}')
        with (open(filepath, "rb")) as openfile:
            obj = pickle.load(openfile)

    except FileNotFoundError:
        return None

    return obj

In [12]:
for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    for config in [Configuration.SkipGram, Configuration.CBOW, Configuration.PPMI]:
        cache_path = os.path.join('..', '.cache', 'ocr-evaluation', language.value, config.value)
        raw_vocab_obj = load_python_obj(cache_path, f'vocab-icdar-2017-icdar-2019-{OCROutputType.Raw.value}')
        gt_vocab_obj = load_python_obj(cache_path, f'vocab-icdar-2017-icdar-2019-{OCROutputType.GroundTruth.value}')
        if raw_vocab_obj is None or gt_vocab_obj is None:
            print(cache_path)
            continue

        # extract the tokens from the vocabularies
        raw_tokens = list(raw_vocab_obj[0].keys())[4:]
        gt_tokens = list(gt_vocab_obj[0].keys())[4:]
        intersected_tokens = list(set(raw_tokens) & set(gt_tokens))
        unique_tokens[language] = list(set(intersected_tokens) & set(unique_tokens[language]))

..\.cache\ocr-evaluation\french\ppmi
..\.cache\ocr-evaluation\german\ppmi


In [13]:
_ = { language: {
    word: idx for (idx, word) in enumerate(unique_tokens[language])
}
    for language in unique_tokens.keys()
}

In [14]:
overlaps = {}
percentages = list(range(1, 101, 1))  # 1..20

for language in [Language.Dutch, Language.English, Language.French, Language.German]:
    overlaps[language] = {}
    words_amounts = [ 
        int(len(unique_tokens[language]) * (float(percentage)/ 100))
        for percentage in percentages]

    max_n = max(words_amounts)

    for configuration in [Configuration.CBOW, Configuration.SkipGram]:
        overlaps[language][configuration] = {}
        for learning_rate in [0.001, 0.0001]:
            overlaps[language][configuration][learning_rate] = {}
            for randomly_initialized in [True]:
                rnd_suffix = 'random' if randomly_initialized else 'pretr'
                cached_name = f'overlaps_{language.value}_{configuration.value}_lr{learning_rate}_{rnd_suffix}'
                cached_value = load_python_obj('results', cached_name)
                if cached_value is not None:
                    overlaps[language][configuration][learning_rate][randomly_initialized] = cached_value
                    continue

                overlaps[language][configuration][learning_rate][randomly_initialized] = { percentage : { token: [] for token in unique_tokens[language] } for percentage in percentages }

                raw_vectors = np.array([models[language][configuration][learning_rate][True][OCROutputType.Raw].wv[word] for word in unique_tokens[language]])
                raw_similarity = 1 - cdist(raw_vectors, raw_vectors, metric='cosine')
                gt_vectors = np.array([models[language][configuration][learning_rate][True][OCROutputType.GroundTruth].wv[word] for word in unique_tokens[language]])
                gt_similarity = 1 - cdist(gt_vectors, gt_vectors, metric='cosine')

                for token in tqdm(unique_tokens[language], desc=f'Processing tokens for \'{language.value}, {configuration.value}, lr: {learning_rate}, {rnd_suffix}\'', total=len(unique_tokens[language])):
                    raw_indices = np.argsort(raw_similarity[_[language][token]])[::-1][:max_n]
                    gt_indices = np.argsort(gt_similarity[_[language][token]])[::-1][:max_n]

                    # gt_most_similar = models[language][configuration][learning_rate][randomly_initialized][OCROutputType.GroundTruth].wv.most_similar(token, topn=max_n)
                    # gt_most_similar = [x[0] for x in gt_most_similar]
                    # raw_most_similar = models[language][configuration][learning_rate][randomly_initialized][OCROutputType.Raw].wv.most_similar(token, topn=max_n)
                    # raw_most_similar = [x[0] for x in raw_most_similar]

                    for n, percentage in zip(words_amounts, percentages):
                        # current_gt = gt_most_similar[:n]
                        # current_raw = raw_most_similar[:n]
                        current_gt = gt_indices[:n]
                        current_raw = raw_indices[:n]

                        current_overlaps = len(set(current_gt) & set(current_raw))
                        overlaps[language][configuration][learning_rate][randomly_initialized][percentage][token].append(current_overlaps)

                save_python_obj(overlaps[language][configuration][learning_rate][randomly_initialized], 'results', cached_name)

Processing tokens for 'french, cbow, lr: 0.001, random': 100%|██████████| 18391/18391 [55:53<00:00,  5.48it/s]
Processing tokens for 'french, cbow, lr: 0.0001, random': 100%|██████████| 18391/18391 [56:02<00:00,  5.47it/s]
Processing tokens for 'french, skip-gram, lr: 0.001, random': 100%|██████████| 18391/18391 [56:16<00:00,  5.45it/s]
Processing tokens for 'french, skip-gram, lr: 0.0001, random': 100%|██████████| 18391/18391 [59:11<00:00,  5.18it/s]
Processing tokens for 'german, cbow, lr: 0.001, random': 100%|██████████| 8036/8036 [11:03<00:00, 12.12it/s]
Processing tokens for 'german, cbow, lr: 0.0001, random': 100%|██████████| 8036/8036 [11:10<00:00, 11.98it/s]
Processing tokens for 'german, skip-gram, lr: 0.001, random': 100%|██████████| 8036/8036 [11:29<00:00, 11.66it/s]
Processing tokens for 'german, skip-gram, lr: 0.0001, random': 100%|██████████| 8036/8036 [11:20<00:00, 11.81it/s]


In [15]:
# overlaps[Language.Dutch][Configuration.CBOW][0.001][True][10]
# overlaps[Language.Dutch][Configuration.CBOW][0.0001][True][10]

In [16]:
print(models[Language.Dutch][Configuration.SkipGram][0.001][True][OCROutputType.GroundTruth].layer1_size)
print(models[Language.Dutch][Configuration.CBOW][0.001][True][OCROutputType.GroundTruth].layer1_size)
print(models[Language.English][Configuration.SkipGram][0.001][True][OCROutputType.GroundTruth].layer1_size)
print(models[Language.English][Configuration.CBOW][0.001][True][OCROutputType.GroundTruth].layer1_size)

320
320
300
300
