In [None]:
# Cell 1: Logging settings
import logging

# basicConfig() does nothing when the root logger already has handlers
# (for example if logging was configured earlier in this kernel session, or
# this cell is re-run with a changed format). force=True removes any existing
# root handlers first so the format and level below always take effect.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,  # available since Python 3.8
)

In [None]:
# Cell 2: Build the run configuration, overriding selected settings
from config import Settings

settings = Settings(
    # Model selection
    MODEL_TYPE="word2vec",  # alternative: "fasttext"
    MODEL_NAME="word2vec_enwiki-latest-pages-articles",
    # Corpus source
    DATASET_URL="https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    # Training run
    EPOCHS=2,  # a small value is enough for a quick test
    MODEL_RESUME=True,  # resume from a checkpoint if one is available
    # "streaming" is meant for large corpora; "serialized" (pickle) for smaller datasets
    CHECKPOINT_STRATEGY="streaming",
)

In [None]:
# Cell 3: Download the corpus dump unless it is already on disk
from utils.download import download
from pathlib import Path

dataset_path = Path(settings.DATASET_PATH)

# NOTE(review): existence is the only check performed here. If download() does
# not write atomically, an interrupted download would leave a partial file that
# is then treated as complete on the next run - confirm against utils.download.
if not dataset_path.exists():
    logging.info("Dataset / dump not found. Downloading...")
    # Make sure the destination directory exists so the download cannot fail
    # on a fresh checkout just because the folder has not been created yet.
    dataset_path.parent.mkdir(parents=True, exist_ok=True)
    download(settings.DATASET_URL, str(settings.DATASET_PATH))
    # Pass the path as a logging argument so formatting is deferred to the logger.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [None]:
# Cell 4: Load (or build) the tokenized corpus according to the checkpoint strategy
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# Only the exact value "serialized" selects the pickle checkpoint; every other
# value selects the streaming text checkpoint.
use_streaming = settings.CHECKPOINT_STRATEGY != "serialized"
corpus_checkpoint = (
    Path(f"./checkpoints/{settings.MODEL_NAME}.txt")
    if use_streaming
    else Path(f"./checkpoints/{settings.MODEL_NAME}.pkl")
)

# Hand the dump location, checkpoint file and strategy flag to the loader.
sentences = load_or_tokenize_wiki(
    use_streaming=use_streaming,
    checkpoint_path=corpus_checkpoint,
    dataset_path=settings.DATASET_PATH,
)

In [None]:
# Cell 5: Train the embedding model; the `resume` flag is forwarded from settings
from scripts.train import train_embedding_model

model = train_embedding_model(
    # Input corpus produced by the corpus-loading cell
    sentences=sentences,
    # Model identity and output location
    model_type=settings.MODEL_TYPE,
    model_name=settings.MODEL_NAME,
    save_dir=settings.MODEL_DIR,
    # Training parameters, all taken from settings
    vector_size=settings.VECTOR_SIZE,
    window=settings.WINDOW,
    min_count=settings.MIN_COUNT,
    epochs=settings.EPOCHS,
    resume=settings.MODEL_RESUME,
)

In [None]:
# Cell 6: Some basic queries
# Passes the model returned by the training cell to the project's query helper.
from scripts.evaluate import run_simple_queries

run_simple_queries(model)