In [8]:
import os
import re
import string

from nltk.tokenize import RegexpTokenizer

import sys
sys.path.insert(0, '..')

from enums.language import Language
from enums.ocr_output_type import OCROutputType

In [6]:
# Load data

def get_folder_paths(language: Language):
    """Return the corpus folders to read for ``language``.

    All folders live under ``../data/newseye``. English maps to two 2017
    folders and one 2019 folder; Dutch maps to a single 2019 folder.

    Returns:
        A list of folder paths, or ``None`` when ``language`` is neither
        ``Language.English`` nor ``Language.Dutch``.
    """
    data_root = os.path.join('..', 'data', 'newseye')

    # Each tuple holds the path components below the data root.
    if language == Language.English:
        subfolders = [
            ('2017', 'full', 'eng_monograph'),
            ('2017', 'full', 'eng_periodical'),
            ('2019', 'full', 'EN'),
        ]
    elif language == Language.Dutch:
        subfolders = [('2019', 'full', 'NL', 'NL1')]
    else:
        # No folders are configured for any other language.
        return None

    return [os.path.join(data_root, *parts) for parts in subfolders]


In [7]:
def read_documents(tokenizer, language: Language, ocr_output_type: OCROutputType):
    """Read one line from every corpus file and return it normalised and tokenised.

    For each file in the folders returned by ``get_folder_paths(language)``,
    the third line is used when ``ocr_output_type`` is
    ``OCROutputType.GroundTruth`` and the second line otherwise. The selected
    line is stripped of its first 14 characters, of ``#`` and ``@``, of digits
    and of ASCII punctuation, then lower-cased, whitespace-collapsed and
    tokenised.

    Args:
        tokenizer: Object exposing ``tokenize(str)``; its return value is
            stored as the document.
        language: Language whose folders are read.
        ocr_output_type: Selects which line of each file is used.

    Returns:
        A list with one tokenised document per file. Files are visited in
        sorted filename order within each folder so the result is reproducible.

    Raises:
        IndexError: If a file has fewer lines than the selected line index.
    """
    # Loop-invariant: index of the line to extract from every file.
    line_index = 2 if ocr_output_type == OCROutputType.GroundTruth else 1

    documents = []

    folder_paths = get_folder_paths(language)
    for folder_path in folder_paths:
        # os.listdir yields entries in arbitrary order; sort for a stable corpus.
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as text_file:
                file_lines = text_file.readlines()

            selected_line = file_lines[line_index]
            # Drop the 14-character line prefix (presumably the fixed-width
            # line tag of the dataset format - TODO confirm) and the '#'/'@'
            # symbols.
            processed_line = selected_line[14:].replace('#', '').replace('@', '')

            text_nonum = re.sub(r'\d+', '', processed_line)
            text_nopunct = "".join(
                [char.lower() for char in text_nonum if char not in string.punctuation])
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            text_no_doublespace = re.sub(r'\s+', ' ', text_nopunct).strip()
            documents.append(tokenizer.tokenize(text_no_doublespace))

    return documents

In [9]:
# Tokenizer whose tokens are the matches of the regex `\w+` (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

In [28]:
# Run configuration: which language to process and which line of each file
# (ground truth here) to export.
language = Language.Dutch
ocr_output_type = OCROutputType.GroundTruth


In [29]:
# Load and tokenise every document for the configured language and output type.
documents = read_documents(tokenizer, language, ocr_output_type)

In [30]:
# Write the tokenised corpus to results/glove/<language>_<output type>_corpus.txt:
# one document per line, tokens separated by single spaces.
glove_filepath = os.path.join('results', 'glove')
# makedirs also creates the 'results' parent when it is missing and does not
# fail if the directory already exists.
os.makedirs(glove_filepath, exist_ok=True)

result_filepath = os.path.join(glove_filepath, f'{language.value}_{ocr_output_type.value}_corpus.txt')
with open(result_filepath, 'w', encoding='utf-8') as result_file:
    for document in documents:
        document_str = ' '.join(document)
        # Skip documents without any text so the corpus contains no blank lines.
        if not document_str.strip():
            continue

        result_file.write(document_str)
        result_file.write('\n')