In [1]:
import gensim
import gensim.downloader as api
import numpy as np

import random
import json

import calc as c

In [2]:
# Location of the tagged vocabulary file, relative to the notebook's working directory.
VOCAB_PATH = "../data/tagged.json"

# Slurp the UTF-8 file and decode it as JSON; the file is closed when the
# `with` block exits.
with open(VOCAB_PATH, mode="r", encoding="utf-8") as infile:
    base_vocab = json.loads(infile.read())

In [3]:
# Initialize the SemCalculator on top of the vocabulary loaded above.
# Per the recorded output of this cell, construction loads the
# 'word2vec-google-news-300' model.
sem_calc = c.SemCalculator(base_vocab=base_vocab)

Loading the 'word2vec-google-news-300' model...
Model loaded successfully.



In [4]:
# Compute difference vector: king - queen
difference_vec = sem_calc.subtract_vectors('king', 'queen')

# Subtract (not add) that difference from 'man'. Assuming subtract_vectors(a, b)
# computes a - b, this is man - (king - queen), i.e. man - king + queen,
# which matches the label printed below.
new_vec = sem_calc.subtract_vectors('man', difference_vec)

# Find the most similar words to the new vector (topn=10)
similar_words = sem_calc.get_most_similar_words(new_vec, topn=10)

# Print each (word, similarity) pair with the similarity rounded to 4 decimals
print("Most similar words to 'man - (king - queen)':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")


Most similar words to 'man - (king - queen)':
woman: 0.7187
man: 0.6558
girl: 0.5883
lady: 0.5754
teenager: 0.5378
schoolgirl: 0.4978
policewoman: 0.4907
blonde: 0.4871
redhead: 0.4778
brunette: 0.4762


In [5]:
# Words most similar to "recurse", filtered to the "NN" POS tag. topn=20 is
# requested; the recorded run returned 7 entries.
# NOTE(review): pos_filter is a bare string here but a list (['NN']) in other
# cells -- confirm SemCalculator treats both forms the same way.
print(sem_calc.get_most_similar_words("recurse", topn=20, pos_filter="NN"))

[('recurse', 1.0000001192092896), ('delimiter', 0.5342383980751038), ('cardinality', 0.5059214234352112), ('tuple', 0.49939844012260437), ('obj', 0.4907994270324707), ('pathname', 0.488739013671875), ('preprocessor', 0.48395562171936035)]


In [6]:
# Query the words most unrelated to the pair "test" / "starcraft", filtered to
# the "NN" POS tag.
# NOTE(review): pos_filter is a bare string here but a list (['NN']) in other
# cells -- confirm SemCalculator treats both forms the same way.
unrelated_words = sem_calc.get_most_unrelated_words(["test", "starcraft"], pos_filter="NN")

In [7]:
# Show the (word, score) pairs stored in `unrelated_words`.
print(unrelated_words)

[('economist', 1.1239797621965408), ('poignancy', 1.1229719780385494), ('stridency', 1.1031844671815634), ('preoccupation', 1.0996383093297482), ('gratitude', 1.099144585430622), ('shopkeeper', 1.098434541374445), ('ebullience', 1.0983146633952856), ('dais', 1.0931188948452473), ('shrillness', 1.0919341165572405), ('parishioner', 1.0871334224939346)]


In [8]:
def find_maximally_unrelated_words(sem_calc, k, pos_filter=None, candidate_pool=100, seed=None):
    """
    Greedily build a list of up to k words that are maximally semantically unrelated.

    The first word is drawn at random from the eligible vocabulary. Each further
    word is the best-ranked candidate returned by
    ``sem_calc.get_most_unrelated_words`` (queried with all words selected so
    far) that is eligible and not yet selected.

    Parameters:
    - sem_calc: An instance of SemCalculator. This function reads its ``vocab``,
      ``word_pos_tags`` and ``model`` attributes and calls its
      ``get_most_unrelated_words`` method.
    - k: The number of words to find. Must be >= 0; ``k == 0`` returns ``[]``.
    - pos_filter: POS tags to include (e.g., ['NN'] for nouns). A single tag may
      also be given as a plain string (e.g., 'NN'); it is matched exactly rather
      than as a substring. ``None`` or an empty value disables POS filtering.
      The value is forwarded unchanged to ``get_most_unrelated_words``.
    - candidate_pool: How many candidates to request per greedy step (passed as
      ``topn``). Larger values make it less likely that every candidate is
      already selected or ineligible.
    - seed: Optional seed for choosing the starting word reproducibly. When
      ``None`` the module-level ``random`` state is used.

    Returns:
    - A list of selected words, in selection order. It may contain fewer than k
      words if a greedy step yields no eligible new candidate.

    Raises:
    - ValueError: if k is negative, or fewer than k eligible words exist.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}.")
    if k == 0:
        # Nothing requested: do not pick (or print) a starting word.
        return []

    if pos_filter:
        # A bare string would otherwise be matched by substring, which lets
        # untagged words (default tag "") through; wrap it so tags match exactly.
        allowed_tags = (pos_filter,) if isinstance(pos_filter, str) else pos_filter
        available_words = [
            word for word in sem_calc.vocab
            if sem_calc.word_pos_tags.get(word, "") in allowed_tags and word in sem_calc.model
        ]
    else:
        # Ensure words are in the model's vocabulary
        available_words = [word for word in sem_calc.vocab if word in sem_calc.model]

    if len(available_words) < k:
        raise ValueError(f"Not enough words available to find {k} unrelated words.")

    # Randomly select the first word (from the ordered list, so a given random
    # state always maps to the same word).
    chooser = random if seed is None else random.Random(seed)
    first_word = chooser.choice(available_words)
    selected_words = [first_word]

    # Set of still-selectable words: constant-time membership and removal.
    remaining = set(available_words)
    remaining.discard(first_word)

    print(f"Starting word: {first_word}")

    # Iteratively select the most unrelated word
    while len(selected_words) < k:
        unrelated_words = sem_calc.get_most_unrelated_words(
            input_words_or_vectors=selected_words,
            topn=candidate_pool,  # Get more candidates to avoid already selected words
            pos_filter=pos_filter
        )

        # Take the first candidate that is eligible and not yet selected
        for word, _ in unrelated_words:
            if word in remaining:
                selected_words.append(word)
                remaining.discard(word)
                print(f"Selected word {len(selected_words)}: {word}")
                break
        else:
            # No usable candidate in this batch: stop with fewer than k words
            print("No more unrelated words can be found.")
            break

    return selected_words

In [9]:
# Find a set of k maximally semantically unrelated words among nouns.
# With k = 1 only the randomly chosen starting word is returned (the greedy
# loop never runs); raise k to grow the set.
k = 1
pos_filter = ['NN']  # Only consider nouns
unrelated_words_set = find_maximally_unrelated_words(sem_calc, k, pos_filter=pos_filter)




Starting word: hydrogenation


In [10]:
# Draw 20 distinct entries at random from the vocabulary words tagged "NN".
nouns = random.sample(
    [word for word, tag in base_vocab.items() if tag == "NN"],
    20,
)

nouns

['bighead',
 'sonneratia',
 'scaffie',
 'plerophory',
 'dotage',
 'barometer',
 'atloidoaxoid',
 'campanile',
 'trabuco',
 'trophaea',
 'lapboard',
 'archbishop',
 'flashbulbs',
 'trustier',
 'astarboard',
 'shadowbox',
 'splenius',
 'duettino',
 'odophone',
 'beggarhood']

In [11]:
# Number of extra words to append greedily (0 leaves the seed list untouched).
k_more = 0
words = ["quality", "initialism", "havocs", "dragster", "pedometer", "month"]

for _round in range(k_more):
    # Request the single most unrelated noun for the current list. The call
    # returns [(word, distance)], so unpack the first pair and keep the word.
    word, _distance = sem_calc.get_most_unrelated_words(words, 1, pos_filter=['NN'])[0]
    words.append(word)

# Numbered listing of the final word list
print("Maximally unrelated words starting from 'quality':")
for idx, word in enumerate(words, start=1):
    print(f"{idx}: {word}")


Maximally unrelated words starting from 'quality':
1: quality
2: initialism
3: havocs
4: dragster
5: pedometer
6: month


In [14]:
# Clustering configuration
pos_tags = ['NN']   # POS tags to include
num_clusters = 9    # Number of clusters / words to select

# Cluster-based selection of diverse words (author's note: works worse than greedy)
diverse_words = sem_calc.find_top_words_by_clustering(pos_tags, num_clusters)

# Numbered listing of the selected words
print("\nSelected words:")
for rank, chosen in enumerate(diverse_words, start=1):
    print(f"{rank}: {chosen}")

Clustering 30071 words into 9 clusters...
Cluster 1: Top 10 words: tchr, bibliog, quate, fringilla, chiasm, latticelike, hyperpyrexia, plexiform, centage, gametophyte
Cluster 2: Top 10 words: potoroo, yellowhead, sengi, bladderwort, clethra, arrowwood, stringybark, cistus, jerboa, elodea
Cluster 3: Top 10 words: meshuga, puddy, caramba, moper, gagster, ringy, kewpie, snaggle, hoppity, booky
Cluster 4: Top 10 words: derange, anthropomorphise, preordain, unfetter, pussyfoot, unhorse, dissatisfy, importune, feminise, wisen
Cluster 5: Top 10 words: almondy, vacherin, farfel, verjuice, paprikas, demerara, orgeat, bavarois, confiture, souffl
Cluster 6: Top 10 words: mortice, dacron, turtleback, snowlike, vigas, batiste, headpin, karabiner, reflexed, watchcase
Cluster 7: Top 10 words: tortor, dawk, risus, viverra, traduction, dilo, naf, saul, irfan, donnie
Cluster 8: Top 10 words: eccrine, leiomyoma, degranulation, cytidine, isoenzyme, sitosterol, apocrine, hyperkeratosis, hydroperoxide, leio

In [13]:
# Pairwise similarity of 'apple' and 'orange', shown to 4 decimals
similarity = sem_calc.compute_similarity('apple', 'orange')
print(f"Similarity between 'apple' and 'orange': {similarity:.4f}")

# Shared connection between 'cat', 'dog', 'rabbit', printed as word/score lines
shared_connection = sem_calc.find_shared_connection(['cat', 'dog', 'rabbit'])
print("\nShared connection between 'cat', 'dog', 'rabbit':")
for term, strength in shared_connection:
    print(f"{term}: {strength:.4f}")


Similarity between 'apple' and 'orange': 0.3920

Shared connection between 'cat', 'dog', 'rabbit':
cat: 0.9105
dog: 0.8947
rabbit: 0.8304
puppy: 0.7766
beagle: 0.7664
