In [None]:
# Cell 1: Imports & logging settings
import logging

# Record layout: timestamp, severity, then the message text.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'

# Configure the root logger once for the whole notebook with an INFO threshold.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Cell 2: Load the project settings and override selected fields for this run
from config import Settings

run_overrides = {
    # Embedding backend handed to the training cell as model_type.
    # NOTE(review): the other accepted values are not visible in this notebook
    # (possibly "word2vec") - confirm against config / scripts.train.
    "MODEL_TYPE": "fasttext",
    # Also used to build the corpus checkpoint file name in the corpus cell.
    "MODEL_NAME": "fasttext_rowiki-latest-pages-articles",
    # Forwarded to training as resume=; asks it to continue from a checkpoint if available.
    "MODEL_RESUME": True,
    # Deliberately small so a trial run finishes quickly; raise for a real training run.
    "EPOCHS": 2,
}

settings = Settings(**run_overrides)

In [None]:
# Download the dataset / dump once; the step is skipped when the file is already on disk.
from pathlib import Path

from utils.download import download

# NOTE(review): only existence is checked, so a partially downloaded file would be
# treated as complete - confirm whether download() protects against that.
if not Path(settings.DATASET_PATH).exists():
    logging.info("Dataset / dump not found. Downloading...")
    download(settings.DATASET_URL, str(settings.DATASET_PATH))
    # Pass the path as a logging argument (lazy %-formatting) instead of
    # pre-building the message; the emitted text is unchanged.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [None]:
# Cell 3: Load or generate the tokenized corpus
from pathlib import Path

from utils.corpus_loader import load_or_tokenize_wiki

# The corpus checkpoint lives under ./checkpoints and is named after the model.
corpus_checkpoint = Path(f"./checkpoints/{settings.MODEL_NAME}.pkl")

# The loader receives both the raw dataset path and the checkpoint path.
# NOTE(review): presumably it reuses the checkpoint when present and tokenizes
# the dump otherwise - confirm in utils.corpus_loader.
sentences = load_or_tokenize_wiki(checkpoint_path=corpus_checkpoint, dataset_path=settings.DATASET_PATH)

In [None]:
# Cell 4: Train the embedding model (the resume flag comes from settings.MODEL_RESUME)
from scripts.train import train_embedding_model

# Numeric hyperparameters, forwarded unchanged from the settings object.
hyperparameters = {
    "vector_size": settings.VECTOR_SIZE,
    "window": settings.WINDOW,
    "min_count": settings.MIN_COUNT,
    "epochs": settings.EPOCHS,
}

# `sentences` is the tokenized corpus produced by the corpus-loading cell.
model = train_embedding_model(
    model_type=settings.MODEL_TYPE,
    sentences=sentences,
    save_dir=settings.MODEL_DIR,
    model_name=settings.MODEL_NAME,
    resume=settings.MODEL_RESUME,
    **hyperparameters,
)

In [25]:
# Cell 5: Some basic queries
# Sanity-check the trained model with a few fixed queries; judging by the saved
# output of this cell they cover nearest neighbours, a pairwise similarity, an
# odd-one-out check and a raw word vector.
# NOTE(review): this is the only cell with a saved execution count (25); the
# others have none. Restart the kernel and run all cells to confirm the output
# is reproducible from a fresh state.
from scripts.evaluate import run_simple_queries

# `model` is the object returned by the training cell.
run_simple_queries(model)

Most similar words for 'muzica': [('muzicală', 0.9007065296173096), ('muzică', 0.9002702832221985), ('muzical', 0.865666925907135), ('improvizația', 0.8611472845077515), ('muzicii', 0.8593776822090149), ('improvizație', 0.8559248447418213), ('melodiile', 0.8533461093902588), ('muzicala', 0.8532575368881226), ('muzicalâ', 0.8532226085662842), ('melodie', 0.8506162762641907), ('muzicale', 0.8485515713691711), ('melodia', 0.8481923341751099), ('melodii', 0.847549557685852), ('îndrăgita', 0.8470050096511841), ('muzicile', 0.8464491367340088)]
Similarity between 'mancare' and 'nutritie': 0.64

Word that doesn't match in ['vulpe', 'iepure', 'motocicleta', 'pisica']: motocicleta

Vector for 'tehnologie':
[-0.922466   -0.08408512  0.01337457  0.4200269   0.25501388  0.3013266
  0.6211818   0.391625   -0.28068843  0.39844242 -0.14745809 -1.1583319
 -0.47723126  0.34541005  0.11916659 -0.07403175 -0.09654294  0.11558955
  0.20718303  0.08619243  0.84084207  0.3038405  -0.5606956  -0.15525958
  0