In [1]:
import os

output_dir = 'model/'
if (not os.path.exists(output_dir)):
    os.mkdir(output_dir)

### Wikipedia

Download file: elwiki-latest-pages-articles.xml.bz2 from https://dumps.wikimedia.org/elwiki/latest/

In [2]:
import logging
from gensim.corpora import WikiCorpus

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

wiki = WikiCorpus("./Data/elwiki-latest-pages-articles.xml.bz2",
                # lemmatize=False, 
                  dictionary={})

In [3]:
# Get tokens
tokens = list(wiki.get_texts())

print('[INFO] Total number of token sets: ', len(tokens))

2023-05-10 16:07:30,184 : INFO : finished iterating over Wikipedia corpus of 199568 documents with 103575944 positions (total 446720 articles, 104577897 positions before pruning articles shorter than 50 words)


[INFO] Total number of token sets:  199568


### Greek National Printing House

Download data from: [Google drive](https://drive.google.com/drive/folders/1tg8nexlZ8hbCexPtsNFJHM_uPJWVyx0l)

In [4]:
from pathlib import Path
from tqdm import tqdm
from utils.utils import preprocess


paths = [str(x) for x in Path('Data').glob('**/*.txt')]
print('[INFO] Number of files: ', len(paths))

# Load text and files and split tokens
for filename in tqdm(paths):
    with open(filename, encoding='utf-8') as f:
        text = f.read()
        tokens.append( preprocess(text) )

print('[INFO] Total number of token sets: ', len(tokens))

[INFO] Number of files:  31125


100%|██████████| 31125/31125 [04:27<00:00, 116.28it/s]

[INFO] Total number of token sets:  230693





### Train Word2Vec model


The parameters:
- min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)
- window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the left of our target - (2, 10)
- vector_size = int - Dimensionality of the feature vectors. - (50, 300)
- sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial. - (0, 1e-5)
- alpha = float - The initial learning rate - (0.01, 0.05)
- min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
- negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)
- workers = int - Use these many worker threads to train the model (=faster training with multicore machines)

#### Preprocess

In [5]:
from utils.utils import has_numbers

# ΤΟDO
# 1. Remove accessions
# 2. Capitalized or not?

# Exclude words with less than 3 letters - Remove words containing numbers
tokens = [[word for word in token_set if len(word) > 2 and has_numbers(word) == False] for token_set in tqdm(tokens)]

100%|██████████| 230693/230693 [11:43<00:00, 327.93it/s] 


#### Stemming

In [6]:
import snowballstemmer

stemmer = snowballstemmer.stemmer('greek');

tokens = [[stemmer.stemWords(x.split())[0] for x in token_set] for token_set in tqdm(tokens)]

100%|██████████| 230693/230693 [7:14:51<00:00,  8.84it/s]    


#### Word2Vec train

In [7]:
import multiprocessing
from gensim.models.word2vec import Word2Vec

params = {'min_count': 2,
          'window': 2, 
          'vector_size': 96,
          'workers': max(1, multiprocessing.cpu_count()-1), 
          'sample': 1e-3,}

word2vec = Word2Vec(tokens, **params)

2023-05-10 23:41:28,273 : INFO : collecting all words and their counts
2023-05-10 23:41:28,278 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-05-10 23:41:31,250 : INFO : PROGRESS: at sentence #10000, processed 8813452 words, keeping 233746 word types
2023-05-10 23:41:36,880 : INFO : PROGRESS: at sentence #20000, processed 14835147 words, keeping 331164 word types
2023-05-10 23:41:41,651 : INFO : PROGRESS: at sentence #30000, processed 19975898 words, keeping 400304 word types
2023-05-10 23:41:46,504 : INFO : PROGRESS: at sentence #40000, processed 24807681 words, keeping 465785 word types
2023-05-10 23:41:51,821 : INFO : PROGRESS: at sentence #50000, processed 29214003 words, keeping 515812 word types
2023-05-10 23:41:58,095 : INFO : PROGRESS: at sentence #60000, processed 33987875 words, keeping 570074 word types
2023-05-10 23:42:02,927 : INFO : PROGRESS: at sentence #70000, processed 38131730 words, keeping 618052 word types
2023-05-10 23:42:08,462 : 

In [10]:
word2vec.wv['ΚΥΒΕΡΝΗΣΕΩΣ']

#### Save model

In [9]:
# Save model
word2vec.save("model/Word2Vec.model")

2023-05-11 00:31:02,493 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'model/Word2Vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-05-11T00:31:02.493382', 'gensim': '4.3.1', 'python': '3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-05-11 00:31:02,509 : INFO : storing np array 'vectors' to model/Word2Vec.model.wv.vectors.npy
2023-05-11 00:31:03,328 : INFO : storing np array 'syn1neg' to model/Word2Vec.model.syn1neg.npy
2023-05-11 00:31:03,771 : INFO : not storing attribute cum_table
2023-05-11 00:31:04,825 : INFO : saved model/Word2Vec.model
