In [None]:
import fasttext
import fasttext.util

# NOTE(review): machine-specific absolute path -- point this at your local copy
# of the pretrained fastText vectors before running.
MODEL_PATH = '/Users/davidhunt/Documents/fastText/cc.en.300.bin'
# Embedding size requested from fasttext.util.reduce_model below.
REDUCED_DIM = 100

ft = fasttext.load_model(MODEL_PATH)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the pre-reduction size would otherwise be lost.
print(f"Original embedding dimension: {ft.get_dimension()}")

# The model object is updated by this call (its return value is not needed);
# the dimension is queried again afterwards to confirm the new size.
fasttext.util.reduce_model(ft, REDUCED_DIM)
ft.get_dimension()


In [None]:
import pandas as pd
import numpy as np

# NOTE(review): machine-specific absolute path -- adjust for your checkout.
DATA_PATH = "/Users/davidhunt/Documents/github/tamudatathon2024/data/formatted_data.csv"
# Number of words that make up one group.
GROUP_SIZE = 4
# Leave as None for a different row on every run; set to an int to draw the
# same row each time (useful when comparing clustering approaches).
SAMPLE_SEED = None

# Load the CSV file
data = pd.read_csv(DATA_PATH)

# Pick a random row
random_row = data.sample(n=1, random_state=SAMPLE_SEED)

# Extract columns 2-17 (index 1-16 in zero-based indexing)
selected_columns = random_row.iloc[0, 1:17].values

# Store in a 1D array
words = np.array(selected_columns)

# Fail early with a clear message instead of an obscure error further down
# (embedding lookup or fixed-size clustering) when the sampled row is malformed.
if pd.isna(words).any():
    raise ValueError(f"Sampled row contains missing words: {words}")
if len(words) == 0 or len(words) % GROUP_SIZE != 0:
    raise ValueError(
        f"Expected a non-empty multiple of {GROUP_SIZE} words, got {len(words)}"
    )

print(words)

# NOTE(review): words are passed to fastText exactly as stored in the CSV;
# confirm whether normalising case first would give better vectors.
embeds = [ft.get_word_vector(word) for word in words]

# Number of groups the words should be split into.
groups = len(words) // GROUP_SIZE



['FIERCE' 'DEEP' 'INTENSE' 'EXTREME' 'RAM' 'BUMP' 'BUTT' 'KNOCK' 'SNOW'
 'NOISE' 'FUZZ' 'STATIC' 'PROUD' 'VIRGIN' 'HAIL' 'BLOODY']


In [None]:
from sklearn.cluster import KMeans

# Imported in this cell so it runs on a fresh kernel without relying on an
# import that only happens in a later cell.
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of word embeddings.
similarities = cosine_similarity(embeds)

# Cluster on the rows of the similarity matrix (each word is described by how
# similar it is to every other word) rather than on the raw embeddings.
kmeans = KMeans(n_clusters=groups, random_state=0)
kmeans.fit(similarities)

# Iterate over the actual number of clusters instead of a hard-coded count so
# the report stays in sync with n_clusters.
for i in range(groups):
    cluster_words = [words[j] for j in range(len(words)) if kmeans.labels_[j] == i]
    print(f"Cluster {i+1}: {cluster_words}")


Cluster 1: ['BUMP', 'SNOW', 'PROUD', 'HAIL']
Cluster 2: ['KNOCK', 'NOISE', 'FUZZ', 'STATIC']
Cluster 3: ['RAM']
Cluster 4: ['FIERCE', 'DEEP', 'INTENSE', 'EXTREME', 'BUTT', 'VIRGIN', 'BLOODY']


In [None]:
from k_means_constrained import KMeansConstrained
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Size-constrained k-means on the similarity matrix: every cluster is forced
# to contain exactly 4 words (size_min == size_max == 4).
clf = KMeansConstrained(n_clusters=groups, size_min=4, size_max=4, random_state=42)
clusters = clf.fit_predict(similarities)

cluster_cohesion = {}
most_cohesive_cluster = None
mymax = -1

for i in range(groups):
    # Positions of the words that were assigned to cluster i.
    cluster_indices = [pos for pos, label in enumerate(clf.labels_) if label == i]

    # Square block of the similarity matrix restricted to this cluster.
    cluster_similarities = similarities[np.ix_(cluster_indices, cluster_indices)]

    # Cohesion = mean over the strict upper triangle, i.e. every unordered
    # pair once and no self-similarity entries from the diagonal.
    upper_triangle = np.triu_indices(len(cluster_indices), k=1)
    avg_similarity = np.mean(cluster_similarities[upper_triangle])
    cluster_cohesion[i] = avg_similarity

    # Report the members and cohesion of this cluster.
    cluster_words = [words[pos] for pos in cluster_indices]
    print(f"Cluster {i+1}: {cluster_words}")
    print(f"Average similarity (cohesion): {avg_similarity:.4f}\n")

    # Keep a running record of the words of the best cluster seen so far.
    if not (avg_similarity > mymax):
        continue
    mymax = avg_similarity
    most_cohesive_words = cluster_words

# Cluster id with the highest cohesion score.
most_cohesive_cluster = max(cluster_cohesion, key=cluster_cohesion.get)
print(f"The most cohesive cluster is Cluster {most_cohesive_cluster + 1} with cohesion {cluster_cohesion[most_cohesive_cluster]:.4f}")

print(most_cohesive_words)

Cluster 1: ['FIERCE', 'INTENSE', 'EXTREME', 'BLOODY']
Average similarity (cohesion): 0.6368

Cluster 2: ['RAM', 'PROUD', 'VIRGIN', 'HAIL']
Average similarity (cohesion): 0.3428

Cluster 3: ['BUMP', 'BUTT', 'KNOCK', 'SNOW']
Average similarity (cohesion): 0.6092

Cluster 4: ['DEEP', 'NOISE', 'FUZZ', 'STATIC']
Average similarity (cohesion): 0.6577

The most cohesive cluster is Cluster 4 with cohesion 0.6577
['DEEP', 'NOISE', 'FUZZ', 'STATIC']


In [None]:
import pandas as pd

# Map every unordered word pair (first word appears earlier in `words`) to its
# cosine similarity, visiting only the upper triangle of the matrix.
similarity_pairs = {
    (words[row], words[col]): similarities[row, col]
    for row in range(len(words))
    for col in range(row + 1, len(words))
}

# Most similar pairs first.
similarity_pairs_list = list(similarity_pairs.items())
similarity_pairs_list.sort(key=lambda entry: entry[1], reverse=True)

# Tabular view of the same ranking.
similarity_df = pd.DataFrame(similarity_pairs_list, columns=["Word Pair", "Similarity"])

print(similarity_pairs_list)
print(similarity_df)

[(('INTENSE', 'EXTREME'), 0.7851559), (('NOISE', 'FUZZ'), 0.72250944), (('KNOCK', 'BLOODY'), 0.69096476), (('BUMP', 'KNOCK'), 0.69085044), (('DEEP', 'STATIC'), 0.6837579), (('FUZZ', 'STATIC'), 0.67562854), (('FIERCE', 'INTENSE'), 0.6744777), (('EXTREME', 'NOISE'), 0.67093134), (('NOISE', 'STATIC'), 0.6577175), (('BUTT', 'KNOCK'), 0.6548339), (('KNOCK', 'FUZZ'), 0.6495125), (('BUTT', 'BLOODY'), 0.644129), (('KNOCK', 'NOISE'), 0.6438557), (('FIERCE', 'PROUD'), 0.64218885), (('VIRGIN', 'BLOODY'), 0.6386467), (('EXTREME', 'BLOODY'), 0.63531), (('DEEP', 'EXTREME'), 0.6295576), (('DEEP', 'INTENSE'), 0.6286102), (('EXTREME', 'STATIC'), 0.62820345), (('KNOCK', 'STATIC'), 0.62675816), (('STATIC', 'BLOODY'), 0.6257698), (('EXTREME', 'BUTT'), 0.6216041), (('KNOCK', 'HAIL'), 0.6211042), (('BUMP', 'BUTT'), 0.6186168), (('DEEP', 'NOISE'), 0.61550343), (('BUTT', 'PROUD'), 0.6125905), (('BUMP', 'SNOW'), 0.60809624), (('PROUD', 'HAIL'), 0.6051511), (('DEEP', 'KNOCK'), 0.60381556), (('HAIL', 'BLOODY'), 

In [None]:
def get_scores(list1, list2, pairs=None):
    """Score each word in ``list1`` by its total similarity to ``list2``.

    Args:
        list1: Words to score.
        list2: Words to compare against. A word is never compared with itself,
            so members of ``list2`` are summed over one term fewer.
        pairs: Optional dict mapping ``(word_a, word_b)`` tuples to a
            similarity value. Each pair only needs to be stored in one order.
            Defaults to the notebook-level ``similarity_pairs`` dict.

    Returns:
        A list of scores, one per word of ``list1``, in the same order.

    Raises:
        KeyError: If a required pair is stored in neither order.
    """
    if pairs is None:
        pairs = similarity_pairs

    scores = []
    for word in list1:
        score = 0
        for word2 in list2:
            if word == word2:
                continue
            # The table stores each pair once, so try both orderings.
            similarity = pairs.get((word, word2))
            if similarity is None:
                similarity = pairs.get((word2, word))
            if similarity is None:
                raise KeyError(f"no similarity recorded for pair ({word!r}, {word2!r})")
            score += similarity
        scores.append(score)
    return scores

def find_most_dissimilar(list, pairs=None):
    """Return the index of the word that fits worst with the rest of ``list``.

    Args:
        list: Words forming one candidate group.
        pairs: Optional similarity table, forwarded to ``get_scores``.

    Returns:
        Index (into ``list``) of the word with the lowest total similarity to
        the other words; the first such index on ties. The index is also
        printed.

    Raises:
        ValueError: If ``list`` is empty.
    """
    scores = get_scores(list, list, pairs)
    weakest_index = scores.index(min(scores))
    print(weakest_index)
    return weakest_index

def find_next_similar(list1, list2, priority = 2, pairs=None):
    """Return the word from ``list1`` that best matches the group ``list2``.

    Words already present in ``list2`` are not eligible, so the result can
    always be added to the group without creating a duplicate.

    Args:
        list1: Candidate words.
        list2: Words of the current group.
        priority: Currently unused; kept so existing calls keep working.
        pairs: Optional similarity table, forwarded to ``get_scores``.

    Returns:
        The eligible word of ``list1`` with the highest total similarity to
        ``list2`` (first one on ties). The word and its index in ``list1`` are
        also printed.

    Raises:
        ValueError: If every word of ``list1`` is already in ``list2`` (or
            ``list1`` is empty).
    """
    scores = get_scores(list1, list2, pairs)
    # Only words that are not already part of the group may be suggested.
    candidates = [idx for idx, word in enumerate(list1) if word not in list2]
    if not candidates:
        raise ValueError("no candidate word outside of list2")
    best_index = max(candidates, key=scores.__getitem__)
    # Index into list1 (the sequence that was scored), not a global word list.
    print(list1[best_index])
    print(best_index)
    return list1[best_index]

In [None]:
# Retry step for a guess that was reported as "one away": swap the word that
# fits the group worst for the best-matching candidate word.
isOneAway = True
last = ['BUMP', 'BUTT', 'KNOCK', 'SNOW']
prev = [['BUMP', 'BUTT', 'KNOCK', 'SNOW']]

if isOneAway:
    # Work on a copy so that `last` keeps the guess that was actually made and
    # the entry stored in `prev` is not shared with it.
    guess = list(last)
    print(guess)
    # The replacement is chosen against the full current guess, then the
    # weakest member's slot is overwritten with it.
    replacement = find_next_similar(words, guess)
    weakest_index = find_most_dissimilar(guess)
    guess[weakest_index] = replacement
    # Remember the new guess unless it has been tried before.
    if guess not in prev:
        prev.append(guess)
    print(guess)

['BUMP', 'BUTT', 'KNOCK', 'SNOW']
BLOODY
15
3
['BUMP', 'BUTT', 'KNOCK', 'BLOODY']


In [None]:
# Debugging check: show which container type `words` is.
print(type(words))

<class 'numpy.ndarray'>
