In [50]:
import gensim
import gensim.downloader as api
import numpy as np

import random
import json

import calc as c

In [51]:
# Path of the vocabulary JSON file, relative to the notebook's working directory
VOCAB_PATH = "../data/words_dat.json"

# Read words from the input JSON file.
# A later cell iterates base_vocab.items() and compares the values against "NN",
# so the file is expected to hold a JSON object of word -> POS-tag pairs.
with open(VOCAB_PATH, "r", encoding="utf-8") as infile:
    base_vocab = json.load(infile)

In [52]:
# Initialize the SemCalculator on top of the loaded vocabulary.
# Per the recorded cell output, construction loads the
# 'word2vec-google-news-300' model.
sem_calc = c.SemCalculator(base_vocab=base_vocab)

Loading the 'word2vec-google-news-300' model...
Model loaded successfully.



In [53]:
# Compute difference vector: king - queen
difference_vec = sem_calc.subtract_vectors('king', 'queen')

# Subtract the difference vector from 'man': man - (king - queen),
# which is algebraically man - king + queen
new_vec = sem_calc.subtract_vectors('man', difference_vec)

# Find the most similar words to the new vector
similar_words = sem_calc.get_most_similar_words(new_vec, topn=10)

# Print the results as "word: similarity", one pair per line
print("Most similar words to 'man - (king - queen)':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")


Most similar words to 'man - (king - queen)':
woman: 0.7187
man: 0.6558
girl: 0.5883
lady: 0.5754
teenager: 0.5378
schoolgirl: 0.4978
policewoman: 0.4907
blonde: 0.4871
redhead: 0.4778
brunette: 0.4762


In [54]:
# Nearest neighbours of "car", restricted to the "NN" POS tag.
# The recorded output lists 14 entries although topn=20 -- presumably the POS
# filter is applied after the top-20 lookup (TODO confirm in calc.py).
# NOTE(review): pos_filter is a bare string here but a list (['NN']) in later
# cells -- confirm that SemCalculator accepts both forms.
print(sem_calc.get_most_similar_words("car", topn=20, pos_filter="NN"))

[('car', 1.0000001192092896), ('vehicle', 0.7821096181869507), ('cars', 0.7423831224441528), ('minivan', 0.6907036900520325), ('truck', 0.6735789775848389), ('scooter', 0.6381530165672302), ('sedan', 0.6336701512336731), ('motorcycle', 0.6256055235862732), ('van', 0.6115673780441284), ('vehicles', 0.5998871326446533), ('motorbike', 0.5921168923377991), ('bike', 0.5854154229164124), ('automobile', 0.5838367342948914), ('driver', 0.577939510345459)]


In [55]:
# Words least related to the pair ["test", "starcraft"], restricted to the "NN" tag.
# topn is not passed, so the method's default applies.
unrelated_words = sem_calc.get_most_unrelated_words(["test", "starcraft"], pos_filter="NN")

In [56]:
# Show the (word, score) pairs computed in the previous cell
print(unrelated_words)

[('economist', 1.1239797621965408), ('poignancy', 1.1229719780385494), ('staples', 1.114982157945633), ('segued', 1.1101001650094986), ('tinware', 1.1093841940164566), ('directness', 1.1070785825140774), ('pained', 1.1067325808107853), ('somberness', 1.1062901727855206), ('macroeconomist', 1.1035850867629051), ('stridency', 1.1031844671815634)]


In [57]:
def find_maximally_unrelated_words(sem_calc, k, pos_filter=None,
                                   candidates_per_step=100, seed=None):
    """
    Greedily build a list of k words that are maximally semantically unrelated.

    The first word is drawn at random from the eligible vocabulary. Each
    following word is the highest-ranked result of
    ``sem_calc.get_most_unrelated_words`` (queried against all words picked so
    far) that is still eligible and not yet selected. Progress is printed as
    words are picked.

    Parameters:
    - sem_calc: An instance of SemCalculator. This function reads its
      ``vocab``, ``word_pos_tags`` and ``model`` attributes and calls
      ``get_most_unrelated_words``.
    - k: The number of words to find. ``k <= 0`` yields an empty list.
    - pos_filter: POS tag(s) to include, e.g. ['NN'] for nouns. A bare string
      such as "NN" is treated as a single tag. A falsy value disables POS
      filtering.
    - candidates_per_step: How many ranked candidates to request per step
      (passed as ``topn``). More candidates make it less likely that every
      candidate is already selected or ineligible.
    - seed: Optional seed for the random choice of the starting word. When
      None, the global ``random`` state is used. Reproducibility also requires
      that ``sem_calc.vocab`` iterates in a stable order.

    Returns:
    - A list of selected words in selection order. It may hold fewer than k
      words if a step yields no eligible candidate.

    Raises:
    - ValueError: if fewer than k distinct eligible words exist.
    """
    # Nothing requested: do not pick (and print) a starting word at all.
    if k <= 0:
        return []

    if pos_filter:
        # A plain string must be wrapped: `tag in "NN"` is a substring test, so
        # the "" default used for untagged words would match every filter.
        allowed_tags = (pos_filter,) if isinstance(pos_filter, str) else pos_filter
        eligible = [
            word for word in sem_calc.vocab
            if sem_calc.word_pos_tags.get(word, "") in allowed_tags
            and word in sem_calc.model
        ]
    else:
        # Only keep words the embedding model actually knows.
        eligible = [word for word in sem_calc.vocab if word in sem_calc.model]

    # A set gives O(1) membership tests and removals during the greedy loop.
    remaining = set(eligible)
    if len(remaining) < k:
        raise ValueError(f"Not enough words available to find {k} unrelated words.")

    # Draw from the ordered list (not the set) so a seed gives a stable pick.
    chooser = random.Random(seed) if seed is not None else random
    first_word = chooser.choice(eligible)
    selected_words = [first_word]
    remaining.discard(first_word)

    print(f"Starting word: {first_word}")

    # Iteratively add the most unrelated word that is still eligible.
    while len(selected_words) < k:
        ranked = sem_calc.get_most_unrelated_words(
            input_words_or_vectors=selected_words,
            topn=candidates_per_step,
            pos_filter=pos_filter
        )

        # Selected words are removed from `remaining`, so a single membership
        # test covers both "eligible" and "not already selected".
        next_word = next((word for word, _ in ranked if word in remaining), None)
        if next_word is None:
            print("No more unrelated words can be found.")
            break

        selected_words.append(next_word)
        remaining.discard(next_word)
        print(f"Selected word {len(selected_words)}: {next_word}")

    return selected_words

In [73]:
# Find a set of k (= 7) maximally semantically unrelated words among nouns.
# The starting word is chosen at random inside the function, so the
# selection can differ between runs.
k = 7
pos_filter = ['NN']  # Only consider nouns
unrelated_words_set = find_maximally_unrelated_words(sem_calc, k, pos_filter=pos_filter)



Starting word: mavericks
Selected word 2: curability
Selected word 3: finalizing
Selected word 4: conner
Selected word 5: inventoried
Selected word 6: gained
Selected word 7: charades


In [70]:
# Baseline for comparison: 7 vocabulary entries tagged "NN", drawn at random
# without replacement.
nouns = random.sample(
    [word for word, tag in base_vocab.items() if tag == "NN"],
    7,
)

nouns

['crotonic',
 'refractometriably',
 'donkey',
 'diagnostics',
 'winkles',
 'xerographic',
 'mispromethazinisability']

In [68]:
# Greedy variant with a fixed starting word: repeatedly append the single most
# unrelated noun to everything collected so far.
k_more = 9  # number of words to add after the starting word
words = ["havoc"]

for i in range(k_more):
    # Get the most unrelated word to the current list of words.
    # The call returns a list of (word, distance) tuples; only one result is
    # requested (second positional argument), so take the first entry.
    word, distance = sem_calc.get_most_unrelated_words(words, 1, pos_filter=['NN'])[0]
    words.append(word)

# Print the resulting words. The header is built from the actual starting word
# so it cannot drift out of sync with the `words` list defined above.
print(f"Maximally unrelated words starting from '{words[0]}':")
for idx, word in enumerate(words, 1):
    print(f"{idx}: {word}")


Maximally unrelated words starting from 'quality':
1: havoc
2: thermography
3: pledges
4: sibling
5: quarrier
6: subheading
7: timeliest
8: acquit
9: commissary
10: attempting


In [63]:
# Parameters
pos_tags = ['NN']          # POS tags to include
num_clusters = 7          # Number of clusters / words to select

# Find diverse words (works worse than greedy)
# NOTE(review): the recorded output under "Selected words:" shows only a
# single empty entry ("1:") rather than num_clusters words -- confirm what
# find_top_words_by_clustering returns before relying on diverse_words.
diverse_words = sem_calc.find_top_words_by_clustering(pos_tags, num_clusters)

# Print the results, numbered from 1
print("\nSelected words:")
for idx, word in enumerate(diverse_words, 1):
    print(f"{idx}: {word}")

Clustering 87874 words into 7 clusters...
Cluster 1: Top 10 words: pyrimidines, trophoblasts, cytidine, isoprenoid, metaplasia, degranulation, hexose, hydroperoxide, oxidases, leiomyoma
Cluster 2: Top 10 words: jubbly, caramba, hipsterish, wakey, swart, dillies, shitbag, pouters, staidly, plonky
Cluster 3: Top 10 words: habitude, ambiguousness, unconfessed, denotative, referentiality, ascriptions, incommensurability, indifferentism, abjectness, attitudinizing
Cluster 4: Top 10 words: dumdum, sallying, revilers, pussyfooted, roached, frogged, rived, vitalised, raying, ladino
Cluster 5: Top 10 words: irrebuttable, spinous, unpaged, paribus, polyphenylene, gravis, rheumatica, corpuscular, succinic, neurofibrillary
Cluster 6: Top 10 words: unhinges, overextends, atomizes, flusters, comes, expunges, verbalizes, goes, engrosses, emasculates
Cluster 7: Top 10 words: yellowthroat, glacé, gorgonians, ripply, stockinette, darner, turnstone, jardinieres, buttercreams, lavatera

Selected words:
1:

In [62]:
# Compute similarity between 'apple' and 'orange'
similarity = sem_calc.compute_similarity('apple', 'orange')
print(f"Similarity between 'apple' and 'orange': {similarity:.4f}")

# Find shared connection between 'cat', 'dog', 'rabbit'.
# Per the recorded output, the result is a sequence of (word, score) pairs and
# the three input words themselves appear at the top of the ranking.
shared_connection = sem_calc.find_shared_connection(['cat', 'dog', 'rabbit'])
print("\nShared connection between 'cat', 'dog', 'rabbit':")
for word, score in shared_connection:
    print(f"{word}: {score:.4f}")


Similarity between 'apple' and 'orange': 0.3920

Shared connection between 'cat', 'dog', 'rabbit':
cat: 0.9105
dog: 0.8947
rabbit: 0.8304
puppy: 0.7766
beagle: 0.7664
