In [None]:
from wordle_solver import WordleSolver
from wordle_game import WordleGame

import string 
import sys
import math
import random

In [None]:
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Input: the base word list that is read into `all_words`.
word_list_filename = "english_words_alpha_dwyl.txt"
# Output: receives the selected opener words (one per line).
target_opener_list_filename = "english_words_opener.txt"
# Output: receives the full merged word list (one per line).
target_full_list_filename = "english_words_full.txt"

In [None]:
original_wordle_list_filename = "english_words_original_wordle.txt"
# Read inside a context manager so the file handle is closed as soon as the
# words are loaded; every "\r" / "\n" character is stripped from each line.
with open(original_wordle_list_filename, "r") as wordle_file:
    all_original_wordle_words = [line.replace("\r", "").replace("\n", "") for line in wordle_file]

In [None]:
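# Uncomment the next line to run without the original Wordle list (an empty
# list turns off the five-letter Wordle preference in get_opener_list):
# all_original_wordle_words = []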

In [None]:
# Read inside a context manager so the file handle is closed once the raw
# lines (line endings still attached) have been loaded.
with open(word_list_filename, "r") as word_list_file:
    all_words = word_list_file.readlines()

In [None]:
# Delete every carriage-return and line-feed character from each entry.
line_ending_table = str.maketrans("", "", "\r\n")
all_words = [raw_word.translate(line_ending_table) for raw_word in all_words]

In [None]:
# Original Wordle words missing from the base vocabulary, in their original order.
# The lookup set is built once: testing membership against the list form of
# `all_words` would scan the whole vocabulary for every Wordle word.
known_words = set(all_words)
not_on_list_words = [wordle_word for wordle_word in all_original_wordle_words if wordle_word not in known_words]

In [None]:
# Append the missing Wordle words to the vocabulary in place.
all_words.extend(not_on_list_words)

In [None]:
# Persist the merged vocabulary, one word per line with no trailing newline.
with open(target_full_list_filename, "w") as full_list_file:
    full_list_file.write("\n".join(all_words))

In [None]:
# get_opener_list returns lists shorter than this unchanged, and also uses it
# as the lower bound on the number of KMeans clusters.
min_clusters = 500
# Fraction of a word list's size that get_opener_list uses as its cluster count.
cluster_groupping_rate = 0.12

In [None]:
# Bucket the vocabulary by word length: {length: [words of that length]}.
words_by_length_dict = {}

for word in all_words:
    # `word_length` stays a module-level name; code elsewhere in this file
    # may read it after the loop has finished.
    word_length = len(word)
    words_by_length_dict.setdefault(word_length, []).append(word)

In [None]:
# The 26 lowercase ASCII letters in a random order; vectorise_word encodes a
# letter by its position in this list. No seed is set in this cell, so the
# ordering is whatever the current `random` state produces.
letter_vectorising_array = list(string.ascii_lowercase)
random.shuffle(letter_vectorising_array)

In [None]:
def vectorise_word(word):
    """Encode ``word`` as a list of 1-based letter codes.

    Each character is replaced by its index in the module-level
    ``letter_vectorising_array`` plus one. ``list.index`` raises
    ``ValueError`` for a character that is not in that array.
    """
    return [letter_vectorising_array.index(character) + 1 for character in word]

In [None]:
def get_letter_prob_dict(word_list):
    """Return the relative frequency of each lowercase ASCII letter.

    Args:
        word_list: Sequence of strings (whole words or single letters).

    Returns:
        A dict with one key per letter of ``string.ascii_lowercase`` (in
        alphabetical order) whose value is that letter's count divided by the
        total number of characters in ``word_list``. For an empty
        ``word_list`` an empty *list* is returned instead of a dict.

    Raises:
        KeyError: If an entry contains a character outside
            ``string.ascii_lowercase``.
        ZeroDivisionError: If ``word_list`` holds only empty strings.
    """
    if len(word_list) == 0:
        return []
    letter_counts = dict.fromkeys(string.ascii_lowercase, 0)
    for entry in word_list:
        for character in entry:
            letter_counts[character] += 1
    character_total = sum(len(entry) for entry in word_list)
    return {
        letter: occurrences / character_total
        for letter, occurrences in letter_counts.items()
    }

In [None]:
def get_letter_positional_prob_dict(words):
    """Return per-position letter frequencies for a list of words.

    Args:
        words: Sequence of words. The number of positions is taken from the
            first word, so every word must be at least that long.

    Returns:
        A list with one entry per letter position; entry ``i`` is the result
        of ``get_letter_prob_dict`` applied to the ``i``-th letters of all
        words. An empty ``words`` yields an empty list (previously this
        raised ``IndexError`` on ``words[0]``).
    """
    if not words:
        return []
    return [
        get_letter_prob_dict([word[position] for word in words])
        for position in range(len(words[0]))
    ]

In [None]:
def sort_words_with_letter_positional_prob(words):
    """Sort words by the product of their per-position letter frequencies.

    Args:
        words: Sequence of words; positions are taken from the first word.

    Returns:
        A new list containing the same words, highest score first. Words with
        equal scores keep their input order. An empty ``words`` yields an
        empty list (previously this raised ``IndexError``).
    """
    if not words:
        return []
    positional_probs = get_letter_positional_prob_dict(words)

    def positional_score(word):
        # Multiply the frequency of each of the word's letters at its position.
        score = 1
        for position, letter_probs in enumerate(positional_probs):
            # Skip positions for which no frequency table is available.
            if letter_probs:
                score *= letter_probs[word[position]]
        return score

    # sorted() is stable even with reverse=True, so ties keep input order.
    return sorted(words, key=positional_score, reverse=True)

In [None]:
def get_word_entropy_dict(words):
    """Score every word by how many candidates remain after guessing it.

    Each word is guessed against every word in ``words`` acting as the hidden
    word. After each guess the solver is asked how many words are still
    possible. With ``p = remaining / len(words)`` (plus
    ``sys.float_info.min`` so the logarithm never sees zero), the stored
    score for the guessed word is ``-sum(p * log2(p))`` over all hidden words.

    Args:
        words: List of words; the first word's length is used as the game's
            word length.

    Returns:
        A dict mapping each word to its score. An empty ``words`` yields an
        empty dict (previously this failed on ``words[0]``).
    """
    if not words:
        return {}
    # Use the length of the words actually passed in. The constructors used to
    # receive a module-level ``word_length`` left over from an unrelated loop.
    word_length = len(words[0])
    game = WordleGame(None, word_length)
    game.word_length = word_length
    game.word_list = words
    solver = WordleSolver(None, word_length)
    solver.word_length = word_length
    solver.word_list = words
    word_count = len(words)
    # Shuffle a copy so the caller's list order is left untouched.
    hidden_word_list = words.copy()
    random.shuffle(hidden_word_list)
    word_entropy_dict = {}
    for word in words:
        possible_word_count = []
        for hidden_word in hidden_word_list:
            # Start from a clean solver state for every (guess, hidden) pair.
            solver.reset()
            game.hidden_word = hidden_word
            response_symbols = game.guess(word)
            solver.input_guess_result(word, response_symbols)
            solver.update_pattern_paramters()
            possible_word_count.append(len(solver.get_possible_words()))
        # The tiny offset keeps math.log defined when no word remains possible.
        probs = [(sys.float_info.min + (count / word_count)) for count in possible_word_count]
        word_entropy_dict[word] = 0 - sum([prob * math.log(prob, 2) for prob in probs])
    return word_entropy_dict

In [None]:
def sort_words_with_entropy(words):
    """Return ``words`` ordered by ascending ``get_word_entropy_dict`` score.

    Words with equal scores keep their input order; the input list itself is
    not modified.
    """
    entropy_by_word = get_word_entropy_dict(words)
    return sorted(words, key=lambda candidate: entropy_by_word[candidate])

In [None]:
def get_opener_list(word_list):
    """Pick one representative opener word per KMeans cluster of ``word_list``.

    Lists shorter than ``min_clusters`` are returned unchanged. Otherwise the
    words are vectorised with ``vectorise_word``, clustered, and the first
    word of each cluster's ``sort_words_with_entropy`` ordering is selected.
    For five-letter clusters, when ``all_original_wordle_words`` is non-empty,
    the first word in that ordering that also appears in the original Wordle
    list is selected instead; a cluster with no such word contributes nothing.

    Args:
        word_list: List of words, all of the same length.

    Returns:
        The list of selected opener words (or ``word_list`` itself when it is
        shorter than ``min_clusters``).
    """
    if len(word_list) < min_clusters:
        return word_list
    X = np.array([vectorise_word(word) for word in word_list])
    # Cluster count scales with the list size but never drops below the minimum.
    cluster_n = max(int(len(word_list) * cluster_groupping_rate), min_clusters)
    print(f"clusters: {cluster_n}")
    km = KMeans(
            n_clusters=cluster_n, init='random',
            n_init=10, max_iter=600,
            tol=1e-04, random_state=0,
            # NOTE(review): scikit-learn 1.1 renamed 'full' to 'lloyd' and later
            # releases reject 'full' - confirm against the installed version.
            algorithm='full'
        )
    y_km = km.fit_predict(X)
    clusters = [[] for _ in range(cluster_n)]
    for word, cluster_index in zip(word_list, y_km):
        clusters[cluster_index].append(word)
    # Build the lookup set once instead of scanning the list for every candidate.
    wordle_words = set(all_original_wordle_words)
    new_word_list = []
    for cluster_words in clusters:
        # Defensive: an empty cluster would otherwise fail on best_words[0].
        if not cluster_words:
            continue
        best_words = sort_words_with_entropy(cluster_words)
        # Alternative ranking:
        # best_words = sort_words_with_letter_positional_prob(cluster_words)
        if len(best_words[0]) == 5 and wordle_words:
            wordle_pick = next(
                (best_word for best_word in best_words if best_word in wordle_words),
                None,
            )
            if wordle_pick is not None:
                new_word_list.append(wordle_pick)
        else:
            new_word_list.append(best_words[0])
    return new_word_list

In [None]:
# Report how many words were collected for each word length.
for length, same_length_words in words_by_length_dict.items():
    print(f"word length: {length}\tnumber of words: {len(same_length_words)}")

In [None]:
# Build the opener list for every word length: {length: [opener words]}.
opener_lists_by_length = {}

for length, same_length_words in words_by_length_dict.items():
    print(f"processing word length {length}")
    opener_lists_by_length[length] = get_opener_list(same_length_words)

In [None]:
# Concatenate the per-length opener lists, keeping the dict's key order.
full_opener_list = [
    opener
    for openers in opener_lists_by_length.values()
    for opener in openers
]

In [None]:
# Persist the opener words, one per line with no trailing newline.
with open(target_opener_list_filename, "w") as opener_list_file:
    opener_list_file.write("\n".join(full_opener_list))