In [3]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

In [220]:
%run get_words

In [221]:
with open('five_letter_words.pkl', 'rb') as handle:
    all_words = pickle.load(handle)

In [222]:
class WordleBrain():
    def __init__(self, words):
        self._assert_words(words)

        self.words = words
        self._let2num, self._num2let = self._get_letter_map()
        self.X = self._words2mat(self.words)
        
    def _get_letter_map(self):
        letters = sorted(list(set("".join(self.words))))
        let2num = {l:n for n,l in enumerate(letters)}
        num2let = {n:l for l,n in let2num.items()}
        return let2num, num2let

    def _assert_words(self, words):
        # Assert length = 5
        assert all([len(w) == 5 for w in words]), "All words must have length 5"

        # Assert case
        assert all([w.lower() == w for w in words]), "All words must be lower case"

        # Assert used letters
        only_allowed = "abcdefghijklmnopqrstuvxyzæøå"
        used = set("".join(words))
        assert all([u in only_allowed for u in used]), "Only allowed letters are {only_allowed}"

    def _word2vec(self, word):
            return np.array([self._let2num[l] for l in word]).astype('int8')
    
    def _vec2word(self, vec):
        return "".join([self._num2let[n] for n in vec])
    
    def _words2mat(self, words):
        return np.array([self._word2vec(w) for w in words]).astype('int8')
    
    def _mat2words(self,mat):
        return [self._vec2word(vec) for vec in mat]
    
    

In [223]:
wb = WordleBrain(all_words)
X = wb.X

In [244]:
def reduce_vecs(X, guess, response):
    assert len(guess) == 5, "Guess must have length 5"
    assert len(response) == 5, "Response must hav length 5"
    assert all([r in [0,1,2] for r in response]), "Response must be either 0 (gray), 1 (orange), 2 (green)"

    double_letters = list(set([g for g in guess if guess.count(g) > 1]))
    double_idx = [idx for idx,g in enumerate(guess) if g in double_letters]
    
    mask = np.array([True]*len(X))
    if not double_letters:
        green_idx  = [idx for idx,r in enumerate(response) if r == 2]
        orange_idx = [idx for idx,r in enumerate(response) if r == 1]
        gray_idx   = [idx for idx,r in enumerate(response) if r == 0]
        green_guesses = np.array(guess)[green_idx]
        orange_guesses = np.array(guess)[orange_idx]
        gray_guesses = np.array(guess)[gray_idx]
        
        # Green: all letters must be in correct position
        mask &= np.all(X[:,green_idx] == green_guesses, axis = 1)
        
        # Orange: Word must contain all orange letters, but not in the guessed positions
        contains_all_orange = np.sum(np.isin(X,orange_guesses), axis = 1) == len(orange_guesses)
        letters_not_at_orange = np.all(X[:,orange_idx] != orange_guesses, axis = 1)
        mask &= contains_all_orange & letters_not_at_orange
        
        # Gray: Word must not contain letters
        contains_any_gray = np.any(np.isin(X, gray_guesses), axis = 1)
        mask &= ~contains_any_gray
        
    else: # Be aware that some letters may occure twice. 
        # The below code would also apply for no double letters
        # Iterate over each guess letter g and response r, from left to right
        colored_letters = []
        for idx, (g,r) in enumerate(zip(guess,response)):
            if r == 2: # Green: Correct letter and position
                colored_letters.append(g)
                mask &= X[:,idx] == g
            else: # If orange or gray
                # Letter should not be in this position
                mask &= X[:,idx] != g 
                if r == 1: # Orange: Correct letter but wrong position
                    colored_letters.append(g)
                    if g in colored_letters:
                        # The letter must occure at least as many times as it is collered
                        mask &= np.sum(X == g, axis = 1) >= colored_letters.count(g)
                    else:
                        # Letter must be in word at least once
                        mask &= np.any(X == g, axis = 1)
                elif r == 0: # Gray: Wrong letter
                    if g in colored_letters:
                        # The letter must occure exactly as many times as it is collered
                        mask &= np.sum(X == g, axis = 1) == colored_letters.count(g)
                    else:
                        # The letter cannot be in the word
                        mask &= ~np.any(X == g, axis = 1)
                        
    return X[mask]

_X = reduce_vecs(X, list(wb._word2vec('sagde')) ,[0,0,2,2,2])
wb._mat2words(_X)

['bygde', 'hugde', 'kogde', 'lægde']

In [238]:
wb._mat2words(X[np.all(X[:,[2,3]] == np.array(wb._word2vec('sagde'))[[2,3]], axis = 1)])

['bygde',
 'bygds',
 'hugde',
 'kogde',
 'lagde',
 'lægde',
 'lægds',
 'sagde',
 'sigde',
 'sigds']

In [228]:
[x for x in wb._mat2words(X) if 'sagde' in x]

['sagde']

In [204]:
data = pd.read_csv("ods_fullforms_2020-08-26.csv",sep='\t',header = None)

In [206]:
words = list(data[0])
map_letters = {
        'aa':'å',
        'ã':'a',
        'ê':'e',
        'é':'e'
        }
for old, new in map_letters.items():
    words = [w.replace(old, new) for w in words]
    
    
# 5 letters
words = [w for w in words if len(w) == 5]

# Remove words with strange symbols
only_allowed = "abcdefghijklmnopqrstuvxyzæøå"
word_allowed = lambda word: all([letter in only_allowed for letter in word])
words = [w for w in words if word_allowed(w)]

# Drop duplicates
words = sorted(list(set(words)))

[w for w in words if 'agde' in w]

['lagde', 'sagde']

In [180]:
~(X[:,3] == 4)

array([False, False,  True, ...,  True,  True,  True])

In [149]:
def f1(X):
    for idx in range(5):
        for n in range(20):
            mask = X[:,idx] > n
            X = X[mask]
    return X
        
def f2(X):
    mask = np.array([True]*len(X))
    for idx in range(5):
        for n in range(20):
            mask &= X[:,idx] > n
    return X[mask]

In [144]:
import timeit

803 µs ± 134 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [197]:
def update_words(all_words, guess, response):
    words = all_words
    color_letters = []
    for idx, (r,l) in enumerate(zip(response, guess)):
        if r == 2: # Green: Correct letter and position
            words = [w for w in words if w[idx]==l]
            color_letters.append(l)
            if not words:
                return []
        if r == 1: # Orange: Correct letter wrong position
            if l in color_letters: # If a previous green or gray
                nb_occ_in_word = color_letters.count(l) + 1
                words = [w for w in words if w.count(l) >= nb_occ_in_word and (w[idx] != l)]
            else:
                words = [w for w in words if (l in w) and (w[idx] != l)]
            if not words:
                return []
            color_letters.append(l)
        if r == 0: # Gray: Wrong letter
            if l in color_letters: # If this letter is collored before, we know that the exact number of occurences is reached
                nb_occ_in_word = color_letters.count(l) + 1
                words = [w for w in words if w.count(l) == nb_occ_in_word and (w[idx] != l)]
                if not words:
                    return []
            else: # Only exclude if this letter has not been given a color before
                words = [w for w in words if l not in w]
                if not words:
                    return []
    return words

In [198]:
def get_information(all_words, guess):
    nb_all_words = len(all_words)
    probs = []
    information = 0
    # For all patterns
    for idx, response in enumerate(list(itertools.product(*[[0,1,2] for _ in range(5)]))):
        nb_reduced = len(update_words(all_words, guess, response))
        prob = nb_reduced / nb_all_words
        information += - prob * np.log2(prob) if prob != 0 else 0

    return information

In [201]:
%prun get_information(all_words,'hbupæ')

 

         2015 function calls in 0.492 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      314    0.276    0.001    0.276    0.001 <ipython-input-197-2719664d8c6d>:26(<listcomp>)
      314    0.115    0.000    0.115    0.000 <ipython-input-197-2719664d8c6d>:6(<listcomp>)
      314    0.089    0.000    0.089    0.000 <ipython-input-197-2719664d8c6d>:15(<listcomp>)
      243    0.010    0.000    0.491    0.002 <ipython-input-197-2719664d8c6d>:1(update_words)
        1    0.002    0.002    0.492    0.492 <ipython-input-198-b3171ce83484>:1(get_information)
      581    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
      244    0.000    0.000    0.000    0.000 {built-in method builtins.len}
        1    0.000    0.000    0.492    0.492 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 <ipython-input-198-b3171ce83484>:6(<listcomp>)
        1    0.000    0.000    0.492    0.492 <

In [195]:
informations = [get_information(all_words,guess) for guess in list(all_words)[:10]]


KeyboardInterrupt: 

In [188]:
df = pd.DataFrame()
df['ord'] = list(all_words)[:100]
df['information'] = informations
df.sort_values(by = 'information', ascending = False)

Unnamed: 0,ord,information
96,seret,7.011466
56,skeet,6.113415
50,beter,6.021064
92,tilse,5.696584
24,samer,5.684311
...,...,...
14,opgiv,3.577537
75,opråb,3.529832
0,påbid,3.350558
79,omhug,3.269102


In [182]:
get_information(all_words,'skeet')

6.113414650810543

In [137]:
print(len(all_words))
words = update_words(all_words, 'bores',[0,0,0,1,1])
print(len(words))
words = update_words(words, 'liste',[0,0,1,0,2])
print(len(words))
words = update_words(words, 'snude',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'smage',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'svæve',[2,0,0,0,2])
print(len(words))
words

11729
645
173
34
14
6


['sykke', 'skøje', 'sekse', 'skeje', 'søkke', 'sejse']

In [132]:
[w for w in all_words if 'skeje' in w]

['skeje']