### Set up logging and configuration

In [None]:
# Cell 1: Configure root logging — timestamp, level and message at INFO verbosity
import logging

LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Cell 2: Build the run configuration, overriding selected settings
from config import Settings

# Values passed explicitly to Settings; anything not listed keeps whatever
# Settings itself provides.
config_overrides = {
    "MODEL_TYPE": "word2vec",  # or "fasttext"
    "MODEL_NAME": "word2vec_enwiki-latest-pages-articles",
    "MODEL_RESUME": True,  # Resume the existing model from a checkpoint if available
    "DATASET_URL": "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    "EPOCHS": 5,  # Lower this for a quick test run
    # "streaming" (saved to disk) or "serialized" (loaded into RAM)
    "CORPUS_CHECKPOINT_STRATEGY": "streaming",
}
settings = Settings(**config_overrides)

# Echo the effective configuration so the run is self-describing.
settings_json = settings.model_dump_json(indent=2)
logging.info("Current configuration settings:\n%s", settings_json)

### Load the dataset, run tokenization, and train the model

In [None]:
# Cell 3: Download the corpus dump unless a file already exists at DATASET_PATH
from utils.download import download
from pathlib import Path

# NOTE(review): a partially downloaded file would also satisfy this existence
# check — confirm that `download` cleans up or writes atomically on failure.
if not Path(settings.DATASET_PATH).exists():
    logging.info("Dataset / dump not found. Downloading...")
    download(
        url=settings.DATASET_URL,
        destination=str(settings.DATASET_PATH),
    )
    # Lazy %-style arguments, consistent with the logging call in Cell 2.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [None]:
# Cell 4: Load (or build) the tokenized corpus according to the checkpoint strategy
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# Pick the checkpoint file name and the streaming flag for the configured strategy.
strategy = settings.CORPUS_CHECKPOINT_STRATEGY
if strategy == "serialized":
    checkpoint_name = f"{settings.MODEL_NAME}.pkl"
    use_streaming = False
elif strategy == "streaming":
    checkpoint_name = f"{settings.MODEL_NAME}.txt"
    use_streaming = True
else:
    raise ValueError(f"Invalid checkpoint strategy: {settings.CORPUS_CHECKPOINT_STRATEGY}")

# The checkpoint lives in CHECKPOINT_DIR and is named after the model.
corpus_checkpoint = Path(settings.CHECKPOINT_DIR / checkpoint_name)

# Hand the dataset, checkpoint location and strategy flag to the loader.
sentences = load_or_tokenize_wiki(
    dataset_path=settings.DATASET_PATH,
    checkpoint_path=corpus_checkpoint,
    use_streaming=use_streaming,
)

In [None]:
# Cell 5: Train the embedding model (resumes if checkpoint exists)
from scripts.train import train_embedding_model

# Every training argument comes from `settings`, except the corpus
# (`sentences`) produced by the previous cell.
training_kwargs = {
    "model_type": settings.MODEL_TYPE,
    "sentences": sentences,
    "save_dir": settings.MODEL_DIR,
    "model_name": settings.MODEL_NAME,
    "vector_size": settings.VECTOR_SIZE,
    "window": settings.WINDOW,
    "min_count": settings.MIN_COUNT,
    "epochs": settings.EPOCHS,
    "resume": settings.MODEL_RESUME,
}
model = train_embedding_model(**training_kwargs)

### Load the trained model and run example queries

In [82]:
# Cell 6: Load the saved model file for the configured model type
from gensim.models import FastText, Word2Vec

model_path = settings.MODEL_DIR / f"{settings.MODEL_NAME}.model"
model_type = settings.MODEL_TYPE.lower()

# Map each supported (lower-cased) model type to the gensim class that loads it.
model_classes = {
    "fasttext": FastText,
    "word2vec": Word2Vec,
}

model_cls = model_classes.get(model_type)
if model_cls is None:
    raise ValueError(f"Unsupported model_type '{model_type}'.")

loaded_model = model_cls.load(str(model_path))
print("Model loaded successfully")

2025-04-11 07:12:09,404 : INFO : loading Word2Vec object from models/word2vec_enwiki-latest-pages-articles.model
2025-04-11 07:12:10,242 : INFO : loading wv recursively from models/word2vec_enwiki-latest-pages-articles.model.wv.* with mmap=None
2025-04-11 07:12:10,257 : INFO : loading vectors from models/word2vec_enwiki-latest-pages-articles.model.wv.vectors.npy with mmap=None
2025-04-11 07:12:10,422 : INFO : loading syn1neg from models/word2vec_enwiki-latest-pages-articles.model.syn1neg.npy with mmap=None
2025-04-11 07:12:10,562 : INFO : setting ignored attribute cum_table to None
2025-04-11 07:12:22,997 : INFO : Word2Vec lifecycle event {'fname': 'models/word2vec_enwiki-latest-pages-articles.model', 'datetime': '2025-04-11T07:12:22.997860', 'gensim': '4.3.3', 'python': '3.12.9 (main, Mar 31 2025, 00:00:00) [GCC 14.2.1 20250110 (Red Hat 14.2.1-7)]', 'platform': 'Linux-6.13.9-200.fc41.x86_64-x86_64-with-glibc2.40', 'event': 'loaded'}


Model loaded successfully


In [83]:
# Cell 7: Run some basic queries against the loaded model
# (the output below shows nearest neighbours, a pairwise similarity,
# an odd-one-out check and a raw word vector).
from scripts.evaluate import run_simple_queries

# `loaded_model` is the gensim model loaded from disk in the previous cell.
run_simple_queries(loaded_model)

Most similar words for 'music': [('songs', 0.8584167957305908), ('musical', 0.8547187447547913), ('song', 0.8307401537895203), ('soundtracks', 0.8159443736076355), ('duo', 0.8120606541633606), ('soundtrack', 0.7995540499687195), ('lyrics', 0.7965595126152039), ('composer', 0.7955746054649353), ('album', 0.7939282059669495), ('orchestral', 0.7905789017677307), ('pop', 0.7880132794380188), ('dance', 0.7796504497528076), ('singing', 0.7778360247612), ('singers', 0.7772938013076782), ('musically', 0.7719873785972595)]
Similarity between 'food' and 'nutrition': 0.77

Word that doesn't match in ['fox', 'rabbit', 'motorcycle', 'cat']: motorcycle

Vector for 'technology':
[-0.4158041  -0.06729489 -0.5843771   0.20349322 -0.546658   -0.21965845
  0.24115673 -0.02055575  0.60455126 -0.5004687   0.44455427 -0.04475039
  0.50166476 -0.07543188  0.05580118  0.268717   -0.4210576  -0.21334498
 -0.23160847  0.24778453  0.28523096 -0.088859   -0.42142525  0.32468542
 -0.02259633  0.4494542   0.1975472