### Set up logging and configuration

In [None]:
# Cell 1: Logging Settings
import logging

# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() is a silent no-op whenever the root logger is
# already configured (e.g. by the kernel, or by a previous run of this cell),
# so changes to the format or level below would never take effect.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

In [28]:
# Cell 2: Load and override config
from config import Settings

# Values that differ from the defaults declared in config.Settings.
config_overrides = {
    # Embedding family to train: "word2vec" or "fasttext".
    "MODEL_TYPE": "word2vec",
    "MODEL_NAME": "word2vec_enwiki-latest-pages-articles",
    # Continue from an existing model checkpoint if one is available.
    "MODEL_RESUME": True,
    "DATASET_URL": "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    # Lower this for a quick test run.
    "EPOCHS": 5,
    # "streaming" (saved to disk) or "serialized" (loaded into RAM).
    "CORPUS_CHECKPOINT_STRATEGY": "streaming",
}
settings = Settings(**config_overrides)

# Echo the fully resolved configuration so the run is self-documenting.
resolved_config_json = settings.model_dump_json(indent=2)
logging.info("Current configuration settings:\n%s", resolved_config_json)

2025-04-10 16:09:24,911 : INFO : Current configuration settings:
{
  "MODEL_DIR": "models",
  "DATASET_DIR": "datasets",
  "CHECKPOINT_DIR": "checkpoints",
  "DATASET_URL": "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
  "DATASET_FILE": "enwiki-latest-pages-articles.xml.bz2",
  "DATASET_PATH": "datasets/enwiki-latest-pages-articles.xml.bz2",
  "MODEL_TYPE": "word2vec",
  "MODEL_TRAIN": true,
  "MODEL_NAME": "word2vec_enwiki-latest-pages-articles",
  "MODEL_RESUME": true,
  "VECTOR_SIZE": 100,
  "WINDOW": 5,
  "MIN_COUNT": 3,
  "EPOCHS": 5,
  "CORPUS_CHECKPOINT_STRATEGY": "streaming",
  "UPLOAD_TO_VECTORDB": false,
  "VECTORDB_HOST": "127.0.0.1",
  "VECTORDB_PORT": 6333,
  "VECTORDB_COLLECTION": "word2vec_enwiki-latest-pages-articles"
}


### Load the dataset, run tokenization, and train the model

In [27]:
# Cell 3: If not present, download corpus
from utils.download import download
from pathlib import Path

# Only fetch the dump when it is not already on disk, so re-running this
# cell does not trigger another download.
dataset_missing = not Path(settings.DATASET_PATH).exists()

if dataset_missing:
    logging.info("Dataset / dump not found. Downloading...")
    # The destination is passed as a plain string, as in the original call.
    download(url=settings.DATASET_URL, destination=str(settings.DATASET_PATH))
    logging.info(f"Download complete: {settings.DATASET_PATH}")

In [None]:
# Cell 4: Load Corpus based on Checkpoint Strategy
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# Choose the checkpoint file and strategy based on the configuration.
match settings.CORPUS_CHECKPOINT_STRATEGY:
    case "serialized":
        corpus_checkpoint = Path(settings.CHECKPOINT_DIR / f"{settings.MODEL_NAME}.pkl")
        use_streaming = False
    case "streaming":
        corpus_checkpoint = Path(settings.CHECKPOINT_DIR / f"{settings.MODEL_NAME}.txt")
        use_streaming = True
    case _:
        raise ValueError(f"Invalid checkpoint strategy: {settings.CORPUS_CHECKPOINT_STRATEGY}")

# Load or create the corpus based on the chosen strategy.
sentences = load_or_tokenize_wiki(
    dataset_path=settings.DATASET_PATH,
    checkpoint_path=corpus_checkpoint,
    use_streaming=use_streaming
)

In [None]:
# Cell 5: Train the model (resumes if checkpoint exists)
from scripts.train import train_embedding_model

# Collect every training argument in one place; all hyperparameters come from
# `settings`, and `sentences` is the corpus produced by the previous cell.
training_kwargs = dict(
    model_type=settings.MODEL_TYPE,
    sentences=sentences,
    save_dir=settings.MODEL_DIR,
    model_name=settings.MODEL_NAME,
    vector_size=settings.VECTOR_SIZE,
    window=settings.WINDOW,
    min_count=settings.MIN_COUNT,
    epochs=settings.EPOCHS,
    resume=settings.MODEL_RESUME,
)

model = train_embedding_model(**training_kwargs)

### Once trained, the saved model can be loaded and queried

In [None]:
from gensim.models import FastText, Word2Vec

model_path = settings.MODEL_DIR / f"{settings.MODEL_NAME}.model"
model_type = settings.MODEL_TYPE.lower()

match model_type:
    case "fasttext":
        loaded_model = FastText.load(str(model_path))
        print("Model loaded successfully")
    case "word2vec":
        loaded_model = Word2Vec.load(str(model_path))
        print("Model loaded successfully")
    case _:
        raise ValueError(f"Unsupported model_type '{model_type}'.")

In [None]:
# Cell 7: Some basic queries
# Passes the model loaded from disk in the previous cell (`loaded_model`, not
# the in-memory `model` returned by training) to the project's query helper.
from scripts.evaluate import run_simple_queries

run_simple_queries(loaded_model)