In [67]:
import numpy as np
from collections import Counter, defaultdict
from tqdm.notebook import tqdm_notebook
from scipy.stats import entropy
import threading

In [68]:
# Word-list files that load_vocab reads line by line and merges into one vocabulary.
VOCAB_FILES = ["dictionaries/wordle-guesses.txt", "dictionaries/wordle-answers.txt"]

In [69]:
def load_vocab(filepaths):
    """Load and merge the word lists stored in ``filepaths``.

    Each file is read line by line; surrounding whitespace is stripped and
    blank lines are ignored. Words that occur in several files are kept once.

    Args:
        filepaths: iterable of paths to text files containing one word per line.

    Returns:
        A ``(vocab_list, vocab_to_idx)`` tuple: ``vocab_list`` is the
        alphabetically sorted list of unique words and ``vocab_to_idx`` maps
        each word to its position in that list.
    """
    vocab = set()
    for filepath in filepaths:
        with open(filepath) as f:
            for line in f:
                word = line.strip()
                if word:  # a blank (or trailing empty) line is not a word
                    vocab.add(word)

    # Set iteration order for strings changes between interpreter runs, so sort
    # to keep word indices (and anything built from them) reproducible.
    vocab_list = sorted(vocab)
    vocab_to_idx = {w: i for i, w in enumerate(vocab_list)}
    return vocab_list, vocab_to_idx

In [70]:
# Build the vocabulary once; vocab_to_idx maps each word back to its position in vocab_list.
vocab_list, vocab_to_idx = load_vocab(VOCAB_FILES)
print(len(vocab_list))

12972


## Hint Representation

We need a function that produces the hint string given a guess word and a goal word.

Hints take the form of an n-length array where n is the length of the guess word. Hints are created with the following rules (Let $c_i$ be the $i$th character of the guess word):

1. The $i$th position in the hint contains a 0 if $c_i$ does not appear in the goal word, or if the goal word's $k$ instances of $c_i$ are all already claimed by other positions of the guess word: exact matches (rule 3) claim an instance first, and the remaining instances go to the non-matching occurrences of $c_i$ in the guess word from left to right.

2. The $i$th position in the hint contains a 1 if $c_i$ appears in the goal word, but not in position $i$, and rule 1 does not apply.

3. The $i$th position in the hint contains a 2 if the $i$th position of the goal word is $c_i$

In [71]:
def get_hint(guess_word, goal_word):
    """Compute the Wordle hint for ``guess_word`` when the answer is ``goal_word``.

    Exact matches are marked first so that they always take priority; the
    goal letters that are still unclaimed are then handed out, left to right,
    to the remaining guess letters. (A single left-to-right pass would let an
    early misplaced letter use up the quota of a later exact match.)

    Args:
        guess_word: the guessed word.
        goal_word: the word to be found.

    Returns:
        Integer numpy array with one entry per guess letter:
        2 = right letter in the right position, 1 = letter occurs elsewhere
        in the goal word (and is not already accounted for), 0 = otherwise.
    """
    marks = [0] * len(guess_word)
    unclaimed = Counter(goal_word)  # goal letters not yet matched to a guess letter
    goal_len = len(goal_word)

    # Pass 1: exact positional matches claim their letter first.
    for i, c in enumerate(guess_word):
        if i < goal_len and goal_word[i] == c:
            marks[i] = 2
            unclaimed[c] -= 1

    # Pass 2: misplaced letters, as long as the goal still has that letter left.
    for i, c in enumerate(guess_word):
        if marks[i] == 0 and unclaimed[c] > 0:
            marks[i] = 1
            unclaimed[c] -= 1

    return np.array(marks, dtype=int)

In [72]:
# Worked example: hint for guessing "aroma" when the goal word is "alter".
example_hint = get_hint("aroma", "alter")
example_hint

array([2, 1, 0, 0, 0])

In [73]:
def hint_to_str(hint):
    """Render a hint as text, one symbol per position.

    The hint values 0, 1 and 2 are drawn as "_", "o" and "x" respectively.
    """
    symbols = "_ox"
    return "".join(symbols[code] for code in hint)

In [74]:
def str_to_hint(hint_str):
    """Parse the text form of a hint back into an integer array.

    "_" becomes 0, "o" becomes 1 and "x" becomes 2; any other character
    raises ``KeyError``.
    """
    codes = {'_': 0, 'o': 1, 'x': 2}
    return np.array([codes[symbol] for symbol in hint_str], dtype=int)

In [75]:
# Text rendering of the example hint.
hint_str = hint_to_str(example_hint)
hint_str

'xo___'

In [76]:
# Round trip: parsing the text form should give back the values of example_hint.
str_to_hint(hint_str)

array([2, 1, 0, 0, 0])

In [77]:
def hint_to_num(hint, base=3):
    """Pack a hint into a single number.

    The hint is read as the digits of a base-``base`` number with the
    least-significant digit first, i.e. the result is
    ``sum(hint[p] * base**p)``.
    """
    total = 0
    weight = 1  # base ** (position of the current digit)
    for digit in hint:
        total = total + digit * weight
        weight = weight * base
    return total

In [78]:
def num_to_hint(hint_num, base=3, sz=5):
    """Unpack a hint number into its ``sz`` base-``base`` digits.

    This is the inverse of ``hint_to_num``: digit ``p`` of the result is the
    coefficient of ``base**p`` (least-significant digit first).

    Args:
        hint_num: non-negative integer code, smaller than ``base**sz``.
        base: number of distinct values a hint position can take.
        sz: number of positions in the hint.

    Returns:
        Integer numpy array of length ``sz``.

    Raises:
        ValueError: if ``hint_num`` cannot be represented with ``sz`` digits.
    """
    remaining = int(hint_num)
    if not 0 <= remaining < base ** sz:
        raise ValueError(
            f"hint_num must be in [0, {base}**{sz}), got {hint_num}"
        )

    hint = np.zeros(sz, dtype=int)
    # Peel digits off with exact integer arithmetic, lowest digit first.
    for i in range(sz):
        remaining, digit = divmod(remaining, base)
        hint[i] = digit
    return hint

In [79]:
# Numeric encoding of the example hint.
hint_num = hint_to_num(example_hint)
hint_num

5

In [80]:
# Round trip: decoding the number should give back the values of example_hint.
num_to_hint(hint_num)

array([2, 1, 0, 0, 0])

Since constructing the hint for a particular word combination becomes costly when done repeatedly, it is faster to precompute a "hint matrix" that stores the result of evaluating every possible combination of guess word and goal word.

In [81]:
def build_hint_matrix(vocab_list):
    """Precompute the numeric hint for every (guess, goal) pair of words.

    Args:
        vocab_list: list of words; each word is used both as a guess and as
            a goal.

    Returns:
        Integer array of shape ``(len(vocab_list), len(vocab_list))`` whose
        entry ``[i, j]`` is ``hint_to_num(get_hint(vocab_list[i], vocab_list[j]))``.
    """
    vocab_size = len(vocab_list)
    matrix = np.zeros((vocab_size, vocab_size), dtype=int)

    # Each distinct hint pattern (keyed by its raw bytes) is encoded only once.
    code_cache = {}
    guess_rows = tqdm_notebook(enumerate(vocab_list), total=vocab_size)
    for row, guess in guess_rows:
        for col, goal in enumerate(vocab_list):
            pattern = get_hint(guess, goal)
            key = pattern.tobytes()
            try:
                code = code_cache[key]
            except KeyError:
                code = code_cache[key] = hint_to_num(pattern)
            matrix[row, col] = code

    return matrix

In [82]:
def build_hint_matrix_fast(vocab_list, n_threads=2):
    """Build the hint matrix with the rows split across worker threads.

    Produces the same matrix as ``build_hint_matrix``: entry ``[i, j]`` is
    ``hint_to_num(get_hint(vocab_list[i], vocab_list[j]))``.

    Args:
        vocab_list: list of words, used both as guesses and as goals.
        n_threads: number of row sections to process concurrently (>= 1).

    Returns:
        Integer array of shape ``(len(vocab_list), len(vocab_list))``; it is
        returned only after every worker thread has finished.

    Raises:
        ValueError: if ``n_threads`` is smaller than 1.
    """
    # NOTE(review): the per-pair work is pure Python, so on a GIL-enabled
    # interpreter the threads may give little or no speed-up — confirm by timing.
    if n_threads < 1:
        raise ValueError(f"n_threads must be at least 1, got {n_threads}")

    n_words = len(vocab_list)
    hint_matrix = np.zeros((n_words, n_words), dtype=int)

    # Shared cache from hint bytes to hint number; a lost race only means the
    # same value is computed and stored twice.
    hint_dict = {}

    def fill_rows(start, end):
        # Each worker writes a disjoint band of rows [start, end).
        for i, guess_word in tqdm_notebook(enumerate(vocab_list[start:end]), total=end-start):
            for j, goal_word in enumerate(vocab_list):
                hint = get_hint(guess_word, goal_word)
                hint_bytes = hint.tobytes()
                if hint_bytes not in hint_dict:
                    hint_dict[hint_bytes] = hint_to_num(hint)
                hint_matrix[start + i, j] = hint_dict[hint_bytes]

    section_sz = int(n_words / n_threads) + 1
    threads = []
    for start in range(0, n_words, section_sz):
        end = min(start + section_sz, n_words)
        t = threading.Thread(target=fill_rows, args=(start, end))
        t.start()
        # Keep a handle so the join loop below really waits for this worker.
        threads.append(t)

    for t in threads:
        t.join()

    return hint_matrix

In [83]:
# Precompute the hint for every (guess, goal) pair: len(vocab_list) ** 2 calls to get_hint.
hint_matrix = build_hint_matrix(vocab_list)

  0%|          | 0/12972 [00:00<?, ?it/s]

## Calculating Word Entropy

Each potential guess word is associated with a distribution of possible hints (assuming that we select a goal word uniformly at random). The simplest approach is to assign an entropy value to each guess word according to its hint distribution. A more sophisticated approach takes into account the distribution of entropies conditioned on each possible hint that could be received after guessing. We can perform this evaluation recursively in order to determine the expected entropy of a sequence of word choices rather than a single choice.

In [87]:
def get_entropies(hint_matrix, idxs=None, progress_bar=False):
    """Score guess words by the entropy (in bits) of their hint distribution.

    Args:
        hint_matrix: 2-D array; row ``r`` holds the hint number obtained by
            guess ``r`` against each goal word (one column per goal).
        idxs: row indices of the guesses to score; all rows when ``None``.
        progress_bar: show a tqdm notebook progress bar while scoring.

    Returns:
        ``(idxs, entropies)`` sorted by decreasing entropy, where
        ``entropies[k]`` is the score of guess ``idxs[k]``.
    """
    if idxs is None:
        idxs = np.arange(hint_matrix.shape[0])
    else:
        idxs = np.array(idxs)

    n_guesses = idxs.shape[0]
    entropies = np.zeros_like(idxs, dtype=float)

    positions = range(n_guesses)
    if progress_bar:
        positions = tqdm_notebook(positions)

    for pos in positions:
        # How often each distinct hint occurs across the goal words.
        _, counts = np.unique(hint_matrix[idxs[pos]], return_counts=True)
        entropies[pos] = entropy(counts, base=2)

    order = np.argsort(entropies)[::-1]
    return idxs[order], entropies[order]

In [88]:
# Score every word in the vocabulary as a guess; results come back sorted by decreasing entropy.
idxs, entropies = get_entropies(hint_matrix, progress_bar=True)
entropies

  0%|          | 0/12972 [00:00<?, ?it/s]

array([6.19405254, 6.14991874, 6.1143431 , ..., 1.97771464, 1.88653329,
       1.88105052])

In [89]:
# Text table of the (at most) 50 highest-entropy guess words.
print(f" Word | Entropy (Base 2)")
print(f"------+-------------------")
for word_idx, word_entropy in zip(idxs[:50], entropies[:50]):
    print(f"{vocab_list[word_idx]} | {word_entropy}")

 Word | Entropy (Base 2)
------+-------------------
tares | 6.194052544375467
lares | 6.14991874245314
rales | 6.114343099454239
rates | 6.096242642514618
teras | 6.076619177276194
nares | 6.066830765753908
soare | 6.061395399096273
tales | 6.05498776140121
reais | 6.049777632888339
tears | 6.032338670239826
arles | 6.029656532378534
tores | 6.0182943721829965
salet | 6.016842875398278
aeros | 6.013480318472095
dares | 6.010334729949025
saner | 5.999263329266256
reals | 5.999162055397679
lears | 5.988782099885778
lores | 5.976968564571484
serai | 5.973617168039969
lanes | 5.971303767248158
laers | 5.968985947815154
pares | 5.967345992114739
cares | 5.9664061818171925
tires | 5.963365144184288
saine | 5.962649288084535
seral | 5.953595576828316
mares | 5.951777451541286
reans | 5.950810301312482
aloes | 5.944708752487158
sared | 5.942603015777064
roles | 5.941958662971253
teals | 5.938324842377551
aures | 5.9301547505576355
earls | 5.926625009398865
taels | 5.920944159469225
raise | 5.9

In [69]:
def get_recursive_max_entropies(hint_matrix, idxs=None, max_depth=None, curr_depth=0, max_paths=None, progress_bar=False):
    """Score guess words by look-ahead entropy.

    The score of a guess is the entropy of its own hint distribution plus,
    for every hint it can produce, the best score achievable by a follow-up
    guess on the goal words that remain, weighted by the probability of that
    hint.

    Args:
        hint_matrix: 2-D array; row ``r`` holds the hint number obtained by
            guess ``r`` against each still-possible goal word (one per column).
        idxs: row indices of the guesses to score; all rows when ``None``.
        max_depth: number of look-ahead levels. Once ``curr_depth`` reaches
            it, guesses are scored with plain ``get_entropies``. ``None``
            recurses until a guess no longer splits the remaining goals.
        curr_depth: current recursion level (callers normally leave this at 0).
        max_paths: if given, only the ``max_paths`` best follow-up guesses
            (ranked by ``get_entropies``) are explored in each branch.
        progress_bar: show a tqdm notebook progress bar over the guesses.

    Returns:
        ``(idxs, entropies)`` sorted by decreasing score.
    """
    idxs = np.arange(hint_matrix.shape[0]) if idxs is None else np.array(idxs)

    # Depth budget used up: fall back to the single-guess entropy.
    if (max_depth is not None) and (curr_depth >= max_depth):
        return get_entropies(hint_matrix, idxs, progress_bar)

    entropies = np.zeros_like(idxs, dtype=float)
    n_guesses = idxs.shape[0]
    n_solutions = hint_matrix.shape[1]

    for i in tqdm_notebook(range(n_guesses)) if progress_bar else range(n_guesses):
        curr_idx = idxs[i]
        # Hints of THIS guess word: the row is selected by the word's index,
        # not by its position ``i`` inside ``idxs``.
        guess_hints = hint_matrix[curr_idx]

        unique_hints, hint_counts = np.unique(guess_hints, return_counts=True)
        curr_entropy = entropy(hint_counts, base=2) # this is the unconditional entropy of the current guess word

        if curr_entropy == 0.: # this is the base-case if max_depth=None
            entropies[i] = 0
            continue

        max_entropies = np.zeros_like(unique_hints, dtype=float)
        n_hints = unique_hints.shape[0]
        for k in range(n_hints):
            print(f"\r~~~~~~~~~~ Evaluating branch {k + 1}/{n_hints} at a depth of {curr_depth + 1}/{max_depth} ~~~~~~~~~~", end="")
            hint = unique_hints[k] # assume that this is the hint received in response to our current guess word
            next_matrix = hint_matrix[:, guess_hints == hint] # takes only the columns of the hint matrix with viable goal words

            # None lets the recursive call consider every guess word.
            next_idxs = None
            if max_paths is not None:
                next_idxs, _ = get_entropies(next_matrix)
                next_idxs = next_idxs[:max_paths] # this limits the number of guess words that are explored

            _, rec_entropies = get_recursive_max_entropies(next_matrix, idxs=next_idxs, max_depth=max_depth, curr_depth=curr_depth+1, max_paths=max_paths)
            max_entropies[k] = np.max(rec_entropies)

        # Expected best follow-up score, weighting each hint by its probability.
        entropies[i] = curr_entropy + np.sum(max_entropies * hint_counts / n_solutions)

    sorted_args = np.argsort(entropies)[::-1]
    return idxs[sorted_args], entropies[sorted_args]

In [None]:
# Re-score the first 10 entries of idxs. With max_depth=0 the depth check in
# get_recursive_max_entropies fails at once, so this call returns the plain
# get_entropies scores for those words; raise max_depth to actually look ahead.
rec_idxs, rec_entropies = get_recursive_max_entropies(hint_matrix, idxs[:10], max_depth=0, max_paths=10, progress_bar=True)

  0%|          | 0/10 [00:00<?, ?it/s]

~~~~~~~~~~ Evaluating branch 1/188 at a depth of 1/1 ~~~~~~~~~~~~

In [None]:
# Text table of the (at most) 50 best words according to the recursive score.
print(f" Word | Entropy (Base 2)")
print(f"------+-------------------")
for word_idx, word_entropy in zip(rec_idxs[:50], rec_entropies[:50]):
    print(f"{vocab_list[word_idx]} | {word_entropy}")

## WORDLE REPL

To make interacting with this code a little easier, we can create a REPL to run while solving the Wordle.

In [None]:
# Interactive solver loop: suggest guesses -> read a guess -> obtain its hint ->
# narrow the candidate goal words -> suggest again, until one candidate is left.

# The boolean-mask indexing used below to narrow the matrix always returns a
# new array, so the full matrix is never modified and needs no up-front copy.
hint_matrix_cpy = hint_matrix

GOAL_WORD = "alter"  # set to None to type in the hints shown by a real game
mode = "suggesting"
remaining_words = np.arange(hint_matrix_cpy.shape[1])  # vocab indices of viable goal words
while True:
    if mode == "guessing":
        guess = input("Enter a guess word: ")
        while guess not in vocab_to_idx:  # dict lookup instead of scanning the list
            print(f"\"{guess}\" is not in the vocab list, try another word")
            guess = input("Enter a guess word: ")
        guess_idx = vocab_to_idx[guess]
        mode = "hinting"

    if mode == "hinting":
        if GOAL_WORD is None:
            hint_str = input("Enter the resulting hint: ")
            # Re-prompt instead of crashing on a wrong length or unknown symbol.
            while len(hint_str) != len(guess) or any(c not in "_ox" for c in hint_str):
                print(f"\"{hint_str}\" is not a valid hint, use one of \"_\", \"o\", \"x\" per letter")
                hint_str = input("Enter the resulting hint: ")
            hint = str_to_hint(hint_str)
        else:
            hint = get_hint(guess, GOAL_WORD)
            print(f"The hint is \"{hint_to_str(hint)}\"")
        hint_num = hint_to_num(hint)
        # Keep only the goal words (columns) that would have produced this hint.
        viable_idxs = hint_matrix_cpy[guess_idx] == hint_num
        hint_matrix_cpy = hint_matrix_cpy[:, viable_idxs]
        remaining_words = remaining_words[viable_idxs]
        mode = "suggesting"
        print()

    if mode == "suggesting":
        n_remaining = remaining_words.shape[0]
        if n_remaining == 0:
            # A mistyped hint can rule out every word; stop instead of
            # looping forever over an empty matrix.
            print("No word in the vocab list is consistent with the hints entered so far")
            break
        if n_remaining == 1:
            remaining_word = vocab_list[remaining_words[0]]
            print(f"The goal word is \"{remaining_word}\"")
            break

        # Only rank the guesses when there is still something to decide.
        idxs, entropies = get_entropies(hint_matrix_cpy)
        print("Here are best guess words")
        print()
        print(f" Word | Entropy (Base 2)")
        print(f"------+-------------------")
        for i in range(min(30, idxs.shape[0])):
            curr_idx = idxs[i]
            w = vocab_list[curr_idx]
            e = entropies[i]
            print(f"{w} | {e:.4f}")
        print()
        mode = "guessing"