In [1]:
from gensim.models import Word2Vec

import os
import re
import string

from nltk.tokenize import RegexpTokenizer


In [2]:
# Corpus locations (relative to this notebook): the NewsEye data root and the
# three ICDAR sub-folders (2017 monograph, 2017 periodical, 2019 EN) read below.

newseye_path = os.path.join('..', 'data', 'newseye')

icdar_2017_1_path, icdar_2017_2_path, icdar_2019_path = (
    os.path.join(newseye_path, year, 'full', subset)
    for year, subset in (
        ('2017', 'eng_monograph'),
        ('2017', 'eng_periodical'),
        ('2019', 'EN'),
    )
)


In [3]:
# Shared tokenizer: every maximal run of `\w` characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
# Build the training corpus: one token list per ICDAR file, taken from the
# file's third line with the '[ GS_aligned]' tag stripped.
documents = []

for icdar_path in [icdar_2017_1_path, icdar_2017_2_path, icdar_2019_path]:
    # os.listdir() returns entries in arbitrary order; sort them so the corpus
    # order is the same on every run and every machine.
    for filename in sorted(os.listdir(icdar_path)):
        file_path = os.path.join(icdar_path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as text.
            continue

        with open(file_path, 'r', encoding='utf-8') as text_file:
            file_lines = text_file.readlines()

        if len(file_lines) < 3:
            # Without a third line there is nothing to read; report instead of
            # aborting the whole load with an IndexError.
            print(f'Skipping {file_path}: expected at least 3 lines, found {len(file_lines)}')
            continue

        gt_line = file_lines[2]
        # Drop the line tag and the '#' / '@' symbols
        # (presumably alignment/padding markers of the ICDAR format - TODO confirm).
        processed_line = gt_line.replace('[ GS_aligned]', '').replace('#', '').replace('@', '')

        # Normalise: remove digits, remove ASCII punctuation, lowercase,
        # collapse whitespace runs to a single space.
        text_nonum = re.sub(r'\d+', '', processed_line)
        text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation])
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        text_no_doublespace = re.sub(r'\s+', ' ', text_nopunct).strip()
        result = tokenizer.tokenize(text_no_doublespace)
        documents.append(result)

In [5]:
# File that load_model() checks for and loads the trained Word2Vec model from.
model_path = 'gensim_default_eng.model'

In [6]:
def load_model():
    """Return the Word2Vec model stored at ``model_path``.

    Returns:
        The loaded model, or ``None`` when no file exists at ``model_path``.
    """
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)

    return None


In [7]:
# TRAIN

def create_model(corpus, save_path=None):
    """Train a Word2Vec model on ``corpus`` and save it to disk.

    Args:
        corpus: Iterable of token lists (one list per document). It is
            iterated by both ``build_vocab`` and ``train``, so it must be
            re-iterable (e.g. a list), not a one-shot generator.
        save_path: Destination file for the trained model. Defaults to the
            notebook-level ``model_path`` so that ``load_model()`` finds the
            model on the next run.

    Returns:
        The trained ``Word2Vec`` model.
    """
    if save_path is None:
        save_path = model_path

    model = Word2Vec(vector_size=300, window=5, min_count=5, workers=2)
    model.build_vocab(corpus, progress_per=10000)
    model.train(corpus, total_examples=model.corpus_count, epochs=300, report_delay=1)
    # save() requires a destination; calling it without one raises TypeError
    # after training has finished and leaves nothing on disk.
    model.save(save_path)
    return model

In [8]:
# Reuse the model returned by load_model(); when it returns None,
# train a new model on `documents` with create_model().
model = load_model()
if model is None:
    print('Model is not loaded. Creating and training now...')
    model = create_model(documents)

In [11]:
# Sanity check: print the nearest neighbours of a few probe words.
words = [
    'man', 'new', 'time', 'day', 'good',
    'old', 'little', 'one', 'two', 'three',
]
for word in words:
    print(f'-- \'{word}\':')
    nearest = model.wv.most_similar(positive=[word])
    print(nearest)

-- 'man':
[('he', 0.430123507976532), ('i', 0.37541136145591736), ('it', 0.36927860975265503), ('woman', 0.3595450818538666), ('gentleman', 0.35697460174560547), ('him', 0.35161837935447693), ('mans', 0.3401086628437042), ('and', 0.32658615708351135), ('that', 0.3250766396522522), ('so', 0.3246637284755707)]
-- 'new':
[('annuity', 0.21009431779384613), ('unpublished', 0.2047516405582428), ('teaching', 0.19760270416736603), ('middlesbrough', 0.1955123245716095), ('supplied', 0.1879047453403473), ('quince', 0.18704593181610107), ('british', 0.18611449003219604), ('cards', 0.18508203327655792), ('homer', 0.17994344234466553), ('appointed', 0.17965558171272278)]
-- 'time':
[('he', 0.36121225357055664), ('it', 0.3312700092792511), ('that', 0.2923469841480255), ('them', 0.29134300351142883), ('period', 0.2901051938533783), ('and', 0.28276896476745605), ('but', 0.2818032205104828), ('i', 0.27945730090141296), ('was', 0.2750898003578186), ('the', 0.2749282717704773)]
-- 'day':
[('morning', 0.3