### Set up logging and configuration

In [None]:
# Cell 1: Logging Settings
import logging

from qdrant_client.http.models import ScoredPoint

# Configure root logging for the notebook: "<time> : <level> : <message>" at INFO.
# `force=True` removes any handlers already attached to the root logger first.
# Without it, `basicConfig` is a silent no-op whenever a handler already exists
# (e.g. this cell is re-run, or a library configured logging earlier), so the
# format and level requested here would not take effect.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

In [None]:
# Cell 2: Load and override config
from config import Settings
# Keyword overrides handed to `Settings` for this run.
settings_overrides = {
    "MODEL_TYPE": "word2vec",  # or "fasttext"
    "MODEL_NAME": "word2vec_enwiki-latest-pages-articles",
    "MODEL_RESUME": True,  # resume from an existing model checkpoint if available
    "DATASET_URL": "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    "EPOCHS": 5,  # lower this for a quicker test run
    "CORPUS_CHECKPOINT_STRATEGY": "streaming",  # "streaming" (saved to disk) or "serialized" (loaded into RAM)
}
settings = Settings(**settings_overrides)

# Log the resulting configuration as indented JSON so the run is self-describing.
effective_config_json = settings.model_dump_json(indent=2)
logging.info("Current configuration settings:\n%s", effective_config_json)

### Load the dataset, run tokenization, and train the model

In [None]:
# Cell 3: If not present, download corpus
from utils.download import download
from pathlib import Path

# Fetch the raw dump only when it is not already present on disk.
# The corpus checkpoint path is deliberately NOT computed here: the
# corpus-loading cell derives it from the configured checkpoint strategy.
if not Path(settings.DATASET_PATH).exists():
    logging.info("Dataset / dump not found. Downloading...")
    download(
        url=settings.DATASET_URL,
        destination=str(settings.DATASET_PATH),
    )
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info("Download complete: %s", settings.DATASET_PATH)

In [None]:
# Cell 4: Load Corpus based on Checkpoint Strategy
from utils.corpus_loader import load_or_tokenize_wiki
from pathlib import Path

# Translate the configured checkpoint strategy into a file suffix and a
# streaming flag; any other value is rejected.
checkpoint_strategy = settings.CORPUS_CHECKPOINT_STRATEGY
if checkpoint_strategy == "serialized":
    checkpoint_suffix, use_streaming = "pkl", False
elif checkpoint_strategy == "streaming":
    checkpoint_suffix, use_streaming = "txt", True
else:
    raise ValueError(f"Invalid checkpoint strategy: {settings.CORPUS_CHECKPOINT_STRATEGY}")

# The checkpoint lives in CHECKPOINT_DIR and is named after the model.
corpus_checkpoint = Path(settings.CHECKPOINT_DIR / f"{settings.MODEL_NAME}.{checkpoint_suffix}")

# Hand the dataset path, checkpoint path and strategy flag to the project loader.
sentences = load_or_tokenize_wiki(
    dataset_path=settings.DATASET_PATH,
    checkpoint_path=corpus_checkpoint,
    use_streaming=use_streaming,
)

In [None]:
# Cell 5: Train the model (resumes if checkpoint exists)
from scripts.train import train_embedding_model

# Gather every training argument from the settings object in one place, then
# pass them to the project training entry point.
training_options = {
    "model_type": settings.MODEL_TYPE,
    "sentences": sentences,
    "save_dir": settings.MODEL_DIR,
    "model_name": settings.MODEL_NAME,
    "vector_size": settings.VECTOR_SIZE,
    "window": settings.WINDOW,
    "min_count": settings.MIN_COUNT,
    "epochs": settings.EPOCHS,
    "resume": settings.MODEL_RESUME,
}
model = train_embedding_model(**training_options)

### Load the trained model from disk and query it

In [None]:
from gensim.models import FastText, Word2Vec

# Location of the saved model file and the (case-insensitive) model family.
model_path = settings.MODEL_DIR / f"{settings.MODEL_NAME}.model"
model_type = settings.MODEL_TYPE.lower()

# Each supported model type maps to the gensim class that can load it.
loader_by_type = {
    "fasttext": FastText,
    "word2vec": Word2Vec,
}
loader_cls = loader_by_type.get(model_type)
if loader_cls is None:
    raise ValueError(f"Unsupported model_type '{model_type}'.")

loaded_model = loader_cls.load(str(model_path))
print("Model loaded successfully")

In [None]:
# Basic sanity queries against the model reloaded from disk (`loaded_model`).
from scripts.evaluate import run_simple_queries

# `run_simple_queries` is a project helper; what it prints or returns is not
# visible from this notebook.
run_simple_queries(loaded_model)

### Query the vectors directly in Qdrant (requires the Qdrant container to be running) — no model-loading overhead

In [335]:
# Cell 6: Setting up Qdrant client
import json
from qdrant_client import QdrantClient

# Endpoint of the Qdrant instance this notebook talks to.
QDRANT_HOST, QDRANT_PORT = "localhost", 6333
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

# Collection used by the query cells below.
# NOTE(review): this name is hard-coded and differs from `settings.MODEL_NAME`
# configured earlier in the notebook — confirm that is intentional.
collection = "fasttext_rowiki-latest-pages-articles"

In [337]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, SearchParams, ScoredPoint

# Queries are expressed as vectors, so start from a plain word and look up its
# stored vector first.
target_word = "barca"

# Step 1: find the point whose "word" payload equals `target_word` and fetch
# it together with its vector.
word_filter = Filter(
    must=[FieldCondition(key="word", match=MatchValue(value=target_word))]
)
hits, _ = client.scroll(
    collection_name=collection,
    scroll_filter=word_filter,
    with_vectors=True,
    limit=1,
)

if hits:
    vector = hits[0].vector

    # Step 2: nearest-neighbour query seeded with that vector, restricted to
    # points whose "pos" payload is "ADJ".
    adjective_filter = Filter(
        must=[
            FieldCondition(key="pos", match=MatchValue(value="ADJ")),
        ]
    )
    response = client.query_points(
        collection_name=collection,
        query=vector,
        query_filter=adjective_filter,
        limit=10,
        with_payload=True,
        search_params=SearchParams(hnsw_ef=128),
    )
    results: list[ScoredPoint] = response.points

    # Step 3: show each hit with its rank, id, score and full payload.
    for rank, point in enumerate(results, 1):
        print(f"\nResult #{rank} (id: {point.id}, score: {point.score:.3f}):")
        print(json.dumps(point.payload, indent=2, ensure_ascii=False))
else:
    print(f"⚠️ Word '{target_word}' not found.")


ResponseHandlingException: Server disconnected without sending a response.

In [353]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, ScoredPoint

# Word whose stored vector seeds the similarity search.
target_phrase = "lege"

# Step 1: find the point whose "word" payload equals `target_phrase` and fetch
# it together with its vector.
hits, _ = client.scroll(
    collection_name=collection,
    scroll_filter=Filter(
        must=[FieldCondition(key="word", match=MatchValue(value=target_phrase))]
    ),
    with_vectors=True,
    limit=1,
)

if not hits:
    print(f"⚠️ Phrase '{target_phrase}' not found.")
else:
    vector = hits[0].vector

    # Step 2: nearest neighbours of that vector, restricted to points whose
    # "morph" payload is exactly the feminine-singular-participle tag below.
    results: list[ScoredPoint] = client.query_points(
        collection_name=collection,
        query=vector,  # vector of `target_phrase`
        query_filter=Filter(
            must=[
                FieldCondition(key="morph", match=MatchValue(value="Gender=Fem|Number=Sing|VerbForm=Part")),
            ]
        ),
        limit=10,  # top-k
        with_payload=True,  # explicit: the loop below reads `hit.payload`
    ).points

    print(f"\nSuggested adjectives for: '{target_phrase} {{...}}'")
    for i, hit in enumerate(results, 1):
        # `payload` is optional on a scored point; fall back to an empty dict
        # so a payload-less hit prints "[unknown]" instead of raising.
        adjective = (hit.payload or {}).get("word", "[unknown]")
        print(f"{i}. {adjective} (score: {hit.score:.3f})")

2025-04-13 10:47:36,276 : INFO : HTTP Request: POST http://localhost:6333/collections/fasttext_rowiki-latest-pages-articles/points/scroll "HTTP/1.1 200 OK"
2025-04-13 10:47:36,383 : INFO : HTTP Request: POST http://localhost:6333/collections/fasttext_rowiki-latest-pages-articles/points/query "HTTP/1.1 200 OK"



Suggested adjectives for: 'lege {...}'
1. legiferată (score: 0.833)
2. adoptată (score: 0.811)
3. stipulată (score: 0.780)
4. reglementată (score: 0.776)
5. garantată (score: 0.771)
6. promulgată (score: 0.762)
7. normată (score: 0.754)
8. legalizată (score: 0.751)
9. instituționalizată (score: 0.748)
10. aprobată (score: 0.748)
