In [1]:
import numpy as np
import random
import re
import time
import itertools
import scipy
import multiprocessing
from functools import partial
from scipy.spatial.distance import cdist

def load_model_vectors(model_path, words_set):
    vectors = {}
    with open(model_path, "r", encoding="utf8") as f:
        for line in f:
            tokens = line.rstrip().split(" ")
            word = tokens[0].lower()
            if word in words_set:
                vector = np.asarray(tokens[1:], dtype="float32")
                vectors[word] = vector
    return vectors


# Update Model class to store words and vectors as NumPy arrays
class Model:
    def __init__(self, model="glove.840B.300d.txt", dictionary="words.txt", pattern="^[a-z][a-z-]*[a-z]$"):
        words_set = set()
        with open(dictionary, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if re.match(pattern, line):
                    words_set.add(line.lower())

        self.vectors_dict = load_model_vectors(model, words_set)
        self.words_array = np.array(list(self.vectors_dict.keys()))
        self.word_vectors = np.array(list(self.vectors_dict.values()))
        self.vectors = self.vectors_dict  # For compatibility

    def validate(self, word):
        """Clean up word and find best candidate to use"""

        # Strip unwanted characters
        clean = re.sub(r"[^a-zA-Z- ]+", "", word).strip().lower()
        if len(clean) <= 1:
            return None  # Word too short

        # Generate candidates for possible compound words
        candidates = []
        if " " in clean:
            candidates.append(re.sub(r" +", "-", clean))
            candidates.append(re.sub(r" +", "", clean))
        else:
            candidates.append(clean)
            if "-" in clean:
                candidates.append(re.sub(r"-+", "", clean))
        for cand in candidates:
            if cand in self.vectors:
                return cand  # Return first word that is in model
        return None  # Could not find valid word

    def distance(self, word1, word2):
        """Compute cosine distance (0 to 2) between two words"""
        return scipy.spatial.distance.cosine(self.vectors.get(word1), self.vectors.get(word2))

    def dat(self, words, minimum=7):
        """Compute DAT score"""
        # Keep only valid unique words
        uniques = []
        for word in words:
            valid = self.validate(word)
            if valid and valid not in uniques:
                uniques.append(valid)

        # Keep subset of words
        if len(uniques) >= minimum:
            subset = uniques[:minimum]
        else:
            return None  # Not enough valid words

        # Compute distances between each pair of words
        distances = []
        for word1, word2 in itertools.combinations(subset, 2):
            dist = self.distance(word1, word2)
            distances.append(dist)

        # Compute the DAT score (average semantic distance multiplied by 100)
        return (sum(distances) / len(distances)) * 100

In [2]:
def greedy_search(args):
    seed, words, word_vectors, k = args
    np.random.seed(seed)
    selected_indices = []
    remaining_indices = np.arange(len(words))

    # Randomly select the first word
    first_idx = np.random.choice(remaining_indices)
    selected_indices.append(first_idx)
    remaining_indices = np.delete(remaining_indices, np.where(remaining_indices == first_idx))

    selected_vecs = [word_vectors[first_idx]]

    for _ in range(k - 1):
        # Compute distances from selected words to all remaining words
        dists = cdist(word_vectors[remaining_indices], np.vstack(selected_vecs), 'cosine')
        min_dists = np.min(dists, axis=1)

        # Select the word with the maximum of these minimum distances
        max_min_dist_idx = np.argmax(min_dists)
        next_idx = remaining_indices[max_min_dist_idx]
        selected_indices.append(next_idx)
        selected_vecs.append(word_vectors[next_idx])

        # Remove selected index from remaining
        remaining_indices = np.delete(remaining_indices, max_min_dist_idx)

    selected_words = [words[idx] for idx in selected_indices]

    # Compute DAT score directly
    selected_vectors = np.vstack(selected_vecs)
    pairwise_distances = cdist(selected_vectors, selected_vectors, 'cosine')

    # Extract upper triangle without the diagonal
    triu_indices = np.triu_indices(k, k=1)
    distances = pairwise_distances[triu_indices]

    # Compute the DAT score
    score = np.mean(distances) * 100
    return score, selected_words

def find_max_dissimilar_words(model, k=7, num_restarts=10):
    words = model.words_array
    word_vectors = model.word_vectors

    best_score = -1
    best_words = None

    seeds = [random.randint(0, int(1e8)) for _ in range(num_restarts)]
    args_list = [(seed, words, word_vectors, k) for seed in seeds]

    pool = multiprocessing.Pool()
    results = pool.map(greedy_search, args_list)
    pool.close()
    pool.join()

    for score, selected_words in results:
        if score > best_score:
            best_score = score
            best_words = selected_words

    return best_words, best_score

In [3]:
model = Model(model="../../data/glove.840B.300d.txt", dictionary="../../data/words.txt")

In [5]:
start_time = time.time()
selected_words, score = find_max_dissimilar_words(model, k=10, num_restarts=10)
end_time = time.time()

print("Selected Words:", selected_words)
print("DAT Score:", score)
print("Time Taken:", end_time - start_time, "seconds")


Selected Words: ['ocelli', 'brand', 'geo', 'culminated', 'sec', 'herbalism', 'self-respecting']
DAT Score: 106.73878455664895
Time Taken: 292.5000169277191 seconds
