In [14]:
# Cell 1: Imports & Logging Settings
import logging

# force=True replaces any handlers already attached to the root logger, so
# re-running this cell (or running it after something else has configured
# logging) still applies this format and level instead of being a silent no-op.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

In [15]:
# Cell 2: Load and override config
from config import Settings

# MODEL_TYPE has to agree with the model stored under MODEL_NAME. With
# MODEL_RESUME=True the recorded run reloads
# models/fasttext_rowiki-latest-pages-articles.model, which gensim reports as a
# FastText model, so the type is set to "fasttext" to match that checkpoint.
settings = Settings(
    MODEL_TYPE="fasttext",   # or "word2vec" (then point MODEL_NAME at a word2vec checkpoint)
    MODEL_NAME="fasttext_rowiki-latest-pages-articles",
    MODEL_RESUME=True,       # Resume from checkpoint if available
    EPOCHS=2                 # You can test with fewer epochs
)

In [16]:
# Fetch the dataset dump once; nothing happens when the file is already on disk.
from pathlib import Path

from utils.download import download

dataset_on_disk = Path(settings.DATASET_PATH).exists()
if not dataset_on_disk:
    logging.info("Dataset / dump not found. Downloading...")
    # download() is handed the target path as a plain string
    download(settings.DATASET_URL, str(settings.DATASET_PATH))
    logging.info(f"Download complete: {settings.DATASET_PATH}")

In [17]:
# Cell 3: Load or generate tokenized corpus
from pathlib import Path

from utils.corpus_loader import load_or_tokenize_wiki

# The corpus checkpoint lives under ./checkpoints and is named after the model.
# NOTE(review): presumably the loader reuses this pickle when it exists and
# tokenizes the dump otherwise — confirm in utils/corpus_loader.py.
corpus_checkpoint = Path(f"./checkpoints/{settings.MODEL_NAME}.pkl")
sentences = load_or_tokenize_wiki(dataset_path=settings.DATASET_PATH, checkpoint_path=corpus_checkpoint)

In [None]:
# Cell 4: Train the model (resumes if checkpoint exists)
from scripts.train import train_embedding_model

# Gather every training argument from `settings` (plus the tokenized corpus)
# first, then pass them to the trainer as keywords in a single call.
training_args = dict(
    model_type=settings.MODEL_TYPE,
    sentences=sentences,
    save_dir=settings.MODEL_DIR,
    model_name=settings.MODEL_NAME,
    vector_size=settings.VECTOR_SIZE,
    window=settings.WINDOW,
    min_count=settings.MIN_COUNT,
    epochs=settings.EPOCHS,
    resume=settings.MODEL_RESUME,
)
model = train_embedding_model(**training_args)

2025-04-09 17:50:49,574 : INFO : loading Word2Vec object from models/fasttext_rowiki-latest-pages-articles.model
2025-04-09 17:50:49,682 : INFO : loading wv recursively from models/fasttext_rowiki-latest-pages-articles.model.wv.* with mmap=None
2025-04-09 17:50:49,682 : INFO : loading vectors_vocab from models/fasttext_rowiki-latest-pages-articles.model.wv.vectors_vocab.npy with mmap=None
2025-04-09 17:50:49,716 : INFO : loading vectors_ngrams from models/fasttext_rowiki-latest-pages-articles.model.wv.vectors_ngrams.npy with mmap=None


Resuming training from existing model at models/fasttext_rowiki-latest-pages-articles.model


2025-04-09 17:50:49,796 : INFO : setting ignored attribute vectors to None
2025-04-09 17:50:49,797 : INFO : setting ignored attribute buckets_word to None
2025-04-09 17:51:04,664 : INFO : loading syn1neg from models/fasttext_rowiki-latest-pages-articles.model.syn1neg.npy with mmap=None
2025-04-09 17:51:04,686 : INFO : setting ignored attribute cum_table to None
2025-04-09 17:51:06,815 : INFO : FastText lifecycle event {'fname': 'models/fasttext_rowiki-latest-pages-articles.model', 'datetime': '2025-04-09T17:51:06.815610', 'gensim': '4.3.3', 'python': '3.12.9 (main, Mar 31 2025, 00:00:00) [GCC 14.2.1 20250110 (Red Hat 14.2.1-7)]', 'platform': 'Linux-6.13.9-200.fc41.x86_64-x86_64-with-glibc2.40', 'event': 'loaded'}
2025-04-09 17:51:06,815 : INFO : collecting all words and their counts
2025-04-09 17:51:06,816 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-09 17:51:07,299 : INFO : PROGRESS: at sentence #10000, processed 6789820 words, keeping 278118 word

In [None]:
# Cell 5: Some basic queries
from scripts.evaluate import run_simple_queries

# Needs `model` from the training cell, so that cell must have finished first.
# NOTE(review): presumably runs a few sample lookups against the embeddings —
# confirm in scripts/evaluate.py.
run_simple_queries(model)