In [1]:
# KNN for one word and also word combinations

import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent))

from src.models import ModelManager


def explore_name_neighbors(model, names, k=10):
    # I did this originally to test names, but it can be whatever
    print("\n" + "="*70)
    print(f"K-NEAREST NEIGHBORS (k={k})")
    print("="*70)

    for name in names:
        if name in model:
            print(f"\nNearest neighbors for '{name}':")
            neighbors = model.most_similar(name, topn=k)
            for i, (neighbor, similarity) in enumerate(neighbors, 1):
                print(f"  {i:2d}. {neighbor:<20} (similarity: {similarity:.4f})")
        else:
            print(f"\n'{name}' not found in model vocabulary")

    print("\n" + "="*70)


def explore_word_combination(model, positive_words, negative_words=None, k=10):
    """
    Find k-nearest neighbors for a combination of words using vector arithmetic.

    Args:
        model: Word embedding model
        positive_words: List of words to add together
        negative_words: List of words to subtract (optional)
        k: Number of nearest neighbors to show
    """
    print("\n" + "="*70)
    print("K-NEAREST NEIGHBORS FOR WORD COMBINATION")
    print("="*70)

    print(f"\nPositive words: {positive_words}")
    if negative_words:
        print(f"Negative words: {negative_words}")

    try:
        neighbors = model.most_similar(
            positive=positive_words,
            negative=negative_words if negative_words else [],
            topn=k
        )

        print(f"\nTop {k} nearest neighbors:")
        for i, (neighbor, similarity) in enumerate(neighbors, 1):
            print(f"  {i:2d}. {neighbor:<20} (similarity: {similarity:.4f})")

    except KeyError as e:
        print(f"\nError: Word not found in vocabulary: {e}")

    print("\n" + "="*70)


def main():
    """Explore Word2Vec embeddings for specific names"""
    print("="*70)
    print("Word2Vec K-Nearest Neighbors Explorer")
    print("="*70)

    # Load model
    print("\nLoading Word2Vec model...")
    manager = ModelManager()
    model = manager.load_word2vec_google_news()

    print("\n" + "="*70)

    # Explore k-nearest neighbors for sample names
    # Change here
    sample_names = ["MLK"]
    explore_name_neighbors(model, sample_names, k=10)

    # Explore word combinations
    # Change here
    explore_word_combination(model, ["White", "girl"], k=10)
    explore_word_combination(model, ["Black", "girl"], k=10)
    explore_word_combination(model, ["Asian", "girl"], k=10)
    explore_word_combination(model, ["White", "boy"], k=10)
    explore_word_combination(model, ["Black", "boy"], k=10)
    explore_word_combination(model, ["Asian", "boy"], k=10)

if __name__ == "__main__":
    main()

NameError: name '__file__' is not defined