### Setting up configurations / settings

In [None]:
# Cell 1: Logging Settings
import logging

# Every record is rendered as "<timestamp> : <level> : <message>";
# records below INFO are dropped.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Cell 2: Load and override config
from config import Settings

# Values overridden for this run. Fields not listed here are left to `Settings`.
config_overrides = {
    "MODEL_TYPE": "word2vec",  # or "fasttext"
    "MODEL_NAME": "word2vec_enwiki-latest-pages-articles",
    "MODEL_RESUME": True,  # Existing model from checkpoint if available
    "DATASET_URL": "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    "EPOCHS": 5,  # You can test with fewer epochs
    # Use "streaming" (saved to disk) or "serialized" (loaded into RAM)
    "CORPUS_CHECKPOINT_STRATEGY": "streaming",
}
settings = Settings(**config_overrides)

# Log the resolved configuration as pretty-printed JSON.
settings_json = settings.model_dump_json(indent=2)
logging.info("Current configuration settings:\n%s", settings_json)

### Load the dataset, run tokenization, and train the model

In [None]:
# Cell 3: If not present, download corpus
from utils.download import download
from pathlib import Path

# The corpus checkpoint path is deliberately NOT computed here: Cell 4 derives
# it from the configured checkpoint strategy, so a value assigned in this cell
# would be overwritten before anything reads it.

# Only fetch the dump when it is not already on disk, so re-running the
# notebook does not trigger another download.
if not Path(settings.DATASET_PATH).exists():
    logging.info("Dataset / dump not found. Downloading...")
    download(
        url=settings.DATASET_URL,
        destination=str(settings.DATASET_PATH),
    )
    # Lazy %-style arguments, matching the logging call in Cell 2.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [None]:
# Cell 4: Load Corpus based on Checkpoint Strategy
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# Translate the configured strategy into a checkpoint file extension and the
# streaming flag handed to the loader. Anything else is rejected.
strategy = settings.CORPUS_CHECKPOINT_STRATEGY
if strategy == "serialized":
    checkpoint_suffix, use_streaming = ".pkl", False
elif strategy == "streaming":
    checkpoint_suffix, use_streaming = ".txt", True
else:
    raise ValueError(f"Invalid checkpoint strategy: {settings.CORPUS_CHECKPOINT_STRATEGY}")

# The checkpoint lives in the checkpoint directory and is named after the model.
corpus_checkpoint = Path(settings.CHECKPOINT_DIR / f"{settings.MODEL_NAME}{checkpoint_suffix}")

# Load or create the corpus based on the chosen strategy.
loader_args = {
    "dataset_path": settings.DATASET_PATH,
    "checkpoint_path": corpus_checkpoint,
    "use_streaming": use_streaming,
}
sentences = load_or_tokenize_wiki(**loader_args)

In [None]:
# Cell 5: Train the model (resumes if checkpoint exists)
from scripts.train import train_embedding_model

# Everything the trainer receives comes from `settings`, except the corpus
# produced by the previous cell.
training_options = dict(
    model_type=settings.MODEL_TYPE,
    sentences=sentences,
    save_dir=settings.MODEL_DIR,
    model_name=settings.MODEL_NAME,
    vector_size=settings.VECTOR_SIZE,
    window=settings.WINDOW,
    min_count=settings.MIN_COUNT,
    epochs=settings.EPOCHS,
    resume=settings.MODEL_RESUME,
)

model = train_embedding_model(**training_options)

### Once loaded, the model can be used / queried, etc

In [None]:
from gensim.models import FastText, Word2Vec

# Load the saved model from disk with the gensim class that matches the
# configured model type.
model_path = settings.MODEL_DIR / f"{settings.MODEL_NAME}.model"
model_type = settings.MODEL_TYPE.lower()

# Supported model types and the class that knows how to load each of them.
model_classes = {
    "fasttext": FastText,
    "word2vec": Word2Vec,
}

model_class = model_classes.get(model_type)
if model_class is None:
    raise ValueError(f"Unsupported model_type '{model_type}'.")

loaded_model = model_class.load(str(model_path))
print("Model loaded successfully")

In [None]:
# Some basic queries against the model loaded in the previous cell.
# (Left unnumbered: the "Cell 5" label already belongs to the training cell.)
from scripts.evaluate import run_simple_queries

# The queries themselves are defined by the project helper in `scripts.evaluate`.
run_simple_queries(loaded_model)

### If you set up the Qdrant container, you can directly query the vectors in the db - no model load overhead.

In [171]:
# Cell 6: Setting up Qdrant client
import json
from qdrant_client import QdrantClient

# Where the Qdrant instance is reachable.
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

# Collection read by the query cells below.
# NOTE(review): this name differs from settings.MODEL_NAME configured in
# Cell 2 — confirm that querying this collection is intentional.
collection = "fasttext_rowiki-latest-pages-articles"

2025-04-11 18:37:30,426 : INFO : HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


In [178]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, SearchParams

# The similarity query takes a vector, not text, so we start from a
# human-readable word and look up its stored vector first.
target_word = "stranie"

# Step 1: Fetch the single point whose payload "word" equals `target_word`,
# including its vector.
word_filter = Filter(
    must=[FieldCondition(key="word", match=MatchValue(value=target_word))]
)
hits, _ = client.scroll(
    collection_name=collection,
    scroll_filter=word_filter,
    with_vectors=True,
    limit=1
)

if hits:
    vector = hits[0].vector

    # Step 2: Query with that vector, keeping only points whose payload
    # "pos" is "ADJ".
    adjective_filter = Filter(
        must=[
            FieldCondition(key="pos", match=MatchValue(value="ADJ")),
        ]
    )
    response = client.query_points(
        collection_name=collection,
        query=vector,
        query_filter=adjective_filter,
        limit=10,
        with_payload=True,
        search_params=SearchParams(hnsw_ef=128)
    )
    results = response.points

    # Step 3: Print each hit's id and score, followed by its payload as JSON
    # (ensure_ascii=False keeps non-ASCII characters readable).
    for rank, hit in enumerate(results, start=1):
        header = f"\nResult #{rank} (id: {hit.id}, score: {hit.score:.3f}):"
        print(header)
        print(json.dumps(hit.payload, indent=2, ensure_ascii=False))
else:
    print(f"⚠️ Word '{target_word}' not found.")


2025-04-11 19:58:14,577 : INFO : HTTP Request: POST http://localhost:6333/collections/fasttext_rowiki-latest-pages-articles/points/scroll "HTTP/1.1 200 OK"
2025-04-11 19:58:14,702 : INFO : HTTP Request: POST http://localhost:6333/collections/fasttext_rowiki-latest-pages-articles/points/query "HTTP/1.1 200 OK"



Result #1 (id: 37690, score: 1.000):
{
  "word": "stranie",
  "pos": "ADJ",
  "tag": "Afpfsrn",
  "dep": "amod",
  "lemma": "stranie",
  "morph": "Case=Acc,Nom|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing",
  "ent_type": "",
  "frequency": 246
}

Result #2 (id: 33061, score: 0.881):
{
  "word": "stranii",
  "pos": "ADJ",
  "tag": "Afp-p-n",
  "dep": "amod",
  "lemma": "straniu",
  "morph": "Definite=Ind|Degree=Pos|Number=Plur",
  "ent_type": "",
  "frequency": 297
}

Result #3 (id: 252233, score: 0.849):
{
  "word": "straniile",
  "pos": "ADJ",
  "tag": "Afpfpry",
  "dep": "amod",
  "lemma": "straniile",
  "morph": "Case=Acc,Nom|Definite=Def|Degree=Pos|Gender=Fem|Number=Plur",
  "ent_type": "",
  "frequency": 13
}

Result #4 (id: 17719, score: 0.848):
{
  "word": "interesantă",
  "pos": "ADJ",
  "tag": "Afpfsrn",
  "dep": "amod",
  "lemma": "interesant",
  "morph": "Case=Acc,Nom|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing",
  "ent_type": "",
  "frequency": 692
}

Result #5 (id: 3

In [177]:
import ipywidgets as widgets
from IPython.display import display
from qdrant_client.models import FieldCondition, Filter, MatchValue


def search_similar_words(word, limit=10):
    """Return the points most similar to `word` in the Qdrant collection.

    NOTE(review): no definition of this helper is visible in the notebook, so
    the explorer failed with NameError on a fresh kernel. This version mirrors
    the two-step lookup of the previous query cell (scroll for the word's
    vector, then query by vector) but applies no part-of-speech filter —
    confirm that this matches the intended behaviour.

    Args:
        word: Value matched against the "word" payload field.
        limit: Maximum number of points to return.

    Returns:
        The `.points` list of the query response, or an empty list when no
        point with that word exists in the collection.
    """
    hits, _ = client.scroll(
        collection_name=collection,
        scroll_filter=Filter(
            must=[FieldCondition(key="word", match=MatchValue(value=word))]
        ),
        with_vectors=True,
        limit=1,
    )
    if not hits:
        return []

    return client.query_points(
        collection_name=collection,
        query=hits[0].vector,
        limit=limit,
        with_payload=True,
    ).points


def explore_word(word):
    """Print the words most similar to `word`, one per line with its score."""
    results = search_similar_words(word)
    if not results:
        print(f"⚠️ Word '{word}' not found.")
        return
    for r in results:
        print(f"🔹 {r.payload.get('word')} (score: {r.score:.3f})")


text_input = widgets.Text(
    value='ciudata',
    placeholder='Enter a word',
    description='Word:',
    disabled=False
)

button = widgets.Button(description="Search")

# Output produced inside a widget callback is not attached to the cell, so the
# results are captured in a dedicated Output widget shown below the button.
output = widgets.Output()


def on_search_clicked(_button):
    """Replace the previous results with those for the current text input."""
    output.clear_output(wait=True)
    with output:
        explore_word(text_input.value)


button.on_click(on_search_clicked)
display(text_input, button, output)

Text(value='ciudata', description='Word:', placeholder='Enter a word')

Button(description='Search', style=ButtonStyle())