In [1]:
# Cell 1: Imports & Logging Settings
import logging

# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() is a silent no-op whenever the root logger has
# already been configured (for example when this cell is re-run with a
# different level/format, or an imported library set up logging first).
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

In [2]:
# Cell 2: Build the run configuration, overriding selected fields
from config import Settings

# Only the fields listed here are passed explicitly; every other field keeps
# whatever config.Settings provides (presumably its defaults -- confirm in config.py).
overrides = {
    "MODEL_TYPE": "fasttext",  # alternative: "word2vec"
    "MODEL_NAME": "fasttext_rowiki-latest-pages-articles",
    "MODEL_RESUME": True,  # resume from a checkpoint if one is available
    "EPOCHS": 2,  # fewer epochs are fine for a quick test
}
settings = Settings(**overrides)

In [3]:
# Download the dataset dump if it is not already on disk
from pathlib import Path
from utils.download import download

# NOTE(review): this only checks that the path exists, so a partially
# downloaded file would be treated as complete -- confirm whether
# utils.download.download() writes atomically.
if not Path(settings.DATASET_PATH).exists():
    logging.info("Dataset / dump not found. Downloading...")
    download(settings.DATASET_URL, str(settings.DATASET_PATH))
    # Pass the path as a lazy %-style argument so the logging module does the
    # formatting itself, and only when the record is actually emitted.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [4]:
# Load or generate the tokenized corpus
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# The corpus checkpoint file is named after the model being trained.
checkpoint_file = f"./checkpoints/{settings.MODEL_NAME}.pkl"
corpus_checkpoint = Path(checkpoint_file)

sentences = load_or_tokenize_wiki(
    checkpoint_path=corpus_checkpoint,
    dataset_path=settings.DATASET_PATH,
)

In [5]:
# Train the embedding model (resumes from a saved model when one exists)
from scripts.train import train_embedding_model

# NOTE(review): the recorded run prints "Final training loss: 0.0" and labels
# both passes "EPOCH 0"; presumably scripts.train trains one epoch per call
# without loss tracking -- confirm in scripts/train.py.

# Every hyper-parameter and output location is read from `settings`.
training_kwargs = dict(
    model_type=settings.MODEL_TYPE,
    save_dir=settings.MODEL_DIR,
    model_name=settings.MODEL_NAME,
    vector_size=settings.VECTOR_SIZE,
    window=settings.WINDOW,
    min_count=settings.MIN_COUNT,
    epochs=settings.EPOCHS,
    resume=settings.MODEL_RESUME,
)

model = train_embedding_model(sentences=sentences, **training_kwargs)

2025-04-09 18:04:59,039 : INFO : loading FastText object from models/fasttext_rowiki-latest-pages-articles.model
2025-04-09 18:04:59,142 : INFO : loading wv recursively from models/fasttext_rowiki-latest-pages-articles.model.wv.* with mmap=None
2025-04-09 18:04:59,143 : INFO : loading vectors_vocab from models/fasttext_rowiki-latest-pages-articles.model.wv.vectors_vocab.npy with mmap=None
2025-04-09 18:04:59,171 : INFO : loading vectors_ngrams from models/fasttext_rowiki-latest-pages-articles.model.wv.vectors_ngrams.npy with mmap=None


Resuming training from existing model at models/fasttext_rowiki-latest-pages-articles.model


2025-04-09 18:04:59,243 : INFO : setting ignored attribute vectors to None
2025-04-09 18:04:59,243 : INFO : setting ignored attribute buckets_word to None
2025-04-09 18:05:14,146 : INFO : loading syn1neg from models/fasttext_rowiki-latest-pages-articles.model.syn1neg.npy with mmap=None
2025-04-09 18:05:14,168 : INFO : setting ignored attribute cum_table to None
2025-04-09 18:05:16,298 : INFO : FastText lifecycle event {'fname': 'models/fasttext_rowiki-latest-pages-articles.model', 'datetime': '2025-04-09T18:05:16.298285', 'gensim': '4.3.3', 'python': '3.12.9 (main, Mar 31 2025, 00:00:00) [GCC 14.2.1 20250110 (Red Hat 14.2.1-7)]', 'platform': 'Linux-6.13.9-200.fc41.x86_64-x86_64-with-glibc2.40', 'event': 'loaded'}
2025-04-09 18:05:16,298 : INFO : collecting all words and their counts
2025-04-09 18:05:16,298 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-09 18:05:16,781 : INFO : PROGRESS: at sentence #10000, processed 6789820 words, keeping 278118 word

Training fasttext model for 2 epochs...
Epoch 1/2


2025-04-09 18:05:49,687 : INFO : EPOCH 0 - PROGRESS: at 0.18% examples, 665778 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:05:50,710 : INFO : EPOCH 0 - PROGRESS: at 0.68% examples, 670962 words/s, in_qsize 15, out_qsize 2
2025-04-09 18:05:51,716 : INFO : EPOCH 0 - PROGRESS: at 1.05% examples, 678010 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:05:52,718 : INFO : EPOCH 0 - PROGRESS: at 1.52% examples, 676350 words/s, in_qsize 16, out_qsize 1
2025-04-09 18:05:53,719 : INFO : EPOCH 0 - PROGRESS: at 2.11% examples, 675671 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:05:54,733 : INFO : EPOCH 0 - PROGRESS: at 2.40% examples, 674011 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:05:55,741 : INFO : EPOCH 0 - PROGRESS: at 2.74% examples, 675739 words/s, in_qsize 16, out_qsize 2
2025-04-09 18:05:56,748 : INFO : EPOCH 0 - PROGRESS: at 3.21% examples, 678066 words/s, in_qsize 15, out_qsize 0
2025-04-09 18:05:57,754 : INFO : EPOCH 0 - PROGRESS: at 4.39% examples, 679795 words/s, in_qsize

Checkpoint saved after epoch 1
Epoch 2/2


2025-04-09 18:07:50,163 : INFO : EPOCH 0 - PROGRESS: at 0.17% examples, 655416 words/s, in_qsize 15, out_qsize 1
2025-04-09 18:07:51,176 : INFO : EPOCH 0 - PROGRESS: at 0.68% examples, 667625 words/s, in_qsize 16, out_qsize 4
2025-04-09 18:07:52,209 : INFO : EPOCH 0 - PROGRESS: at 1.07% examples, 676583 words/s, in_qsize 16, out_qsize 1
2025-04-09 18:07:53,215 : INFO : EPOCH 0 - PROGRESS: at 1.58% examples, 680472 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:07:54,216 : INFO : EPOCH 0 - PROGRESS: at 2.13% examples, 678083 words/s, in_qsize 16, out_qsize 0
2025-04-09 18:07:55,224 : INFO : EPOCH 0 - PROGRESS: at 2.40% examples, 674222 words/s, in_qsize 15, out_qsize 0
2025-04-09 18:07:56,229 : INFO : EPOCH 0 - PROGRESS: at 2.72% examples, 671518 words/s, in_qsize 16, out_qsize 1
2025-04-09 18:07:57,244 : INFO : EPOCH 0 - PROGRESS: at 3.18% examples, 670894 words/s, in_qsize 15, out_qsize 0
2025-04-09 18:07:58,248 : INFO : EPOCH 0 - PROGRESS: at 4.33% examples, 672279 words/s, in_qsize

Checkpoint saved after epoch 2
Final training loss: 0.0
Fasttext model and vectors saved to models/fasttext_rowiki-latest-pages-articles.model and models/fasttext_rowiki-latest-pages-articles.vec


In [6]:
# Sanity-check the trained embeddings with a few hand-picked queries.
# Per the recorded output of this cell, the helper prints nearest neighbours
# for a word, a pairwise similarity score, an odd-one-out pick and a raw
# word vector.
from scripts.evaluate import run_simple_queries

# `model` is the object returned by train_embedding_model in the training cell.
run_simple_queries(model)

Most similar words for 'muzica': [('muzicală', 0.9023458361625671), ('muzică', 0.8947693705558777), ('muzicii', 0.8743366003036499), ('muzicienii', 0.857370138168335), ('muzicale', 0.856338620185852), ('improvizația', 0.8535023331642151), ('muzicala', 0.8522033095359802), ('muzical', 0.851935088634491), ('melodiile', 0.851550281047821), ('versurile', 0.846199631690979), ('sonoritățile', 0.8425189852714539), ('compozitorii', 0.8417927026748657), ('improvizație', 0.8413375616073608), ('baladă', 0.8407309055328369), ('orchestrală', 0.8403103351593018)]
Similarity between 'mancare' and 'nutritie': 0.63

Word that doesn't match in ['vulpe', 'iepure', 'motocicleta', 'pisica']: motocicleta

Vector for 'tehnologie':
[-0.86006415 -0.05672178 -0.01114123  0.38843784  0.16710384  0.2455586
  0.52848756  0.37640527 -0.34850553  0.35148403 -0.07862923 -1.2645364
 -0.56967294  0.38616452  0.06586246 -0.12181038 -0.02517048  0.06443859
  0.25537202 -0.10526106  0.73516434  0.24557745 -0.4547587  -0.2