In [297]:
import numpy as np
import pandas as pd
import itertools
import pickle
import datetime

%matplotlib inline

In [220]:
%run get_words

In [221]:
with open('five_letter_words.pkl', 'rb') as handle:
    all_words = pickle.load(handle)

In [354]:
class WordleBrain():
    def __init__(self, words):
        self._assert_words(words)

        self.words = words
        self._let2num, self._num2let = self._get_letter_map()
        self.X = self._words2mat(self.words)
        self.nb_guesses = 0
        
    def _get_letter_map(self):
        letters = sorted(list(set("".join(self.words))))
        let2num = {l:n for n,l in enumerate(letters)}
        num2let = {n:l for l,n in let2num.items()}
        return let2num, num2let

    def _assert_words(self, words):
        # Assert length = 5
        assert all([len(w) == 5 for w in words]), "All words must have length 5"

        # Assert case
        assert all([w.lower() == w for w in words]), "All words must be lower case"

        # Assert used letters
        only_allowed = "abcdefghijklmnopqrstuvxyzæøå"
        used = set("".join(words))
        assert all([u in only_allowed for u in used]), "Only allowed letters are {only_allowed}"

    def _word2vec(self, word):
            return np.array([self._let2num[l] for l in word]).astype('int8')
    
    def _vec2word(self, vec):
        return "".join([self._num2let[n] for n in vec])
    
    def _words2mat(self, words):
        return np.array([self._word2vec(w) for w in words]).astype('int8')
    
    def _mat2words(self,mat):
        return [self._vec2word(vec) for vec in mat]
    
    def reduce_vecs(self, X, guess, response):
        assert len(guess) == 5, "Guess must have length 5"
        assert len(response) == 5, "Response must hav length 5"
        assert all([r in [0,1,2] for r in response]), "Response must be either 0 (gray), 1 (orange), 2 (green)"
        guess = list(guess)

        double_response = [r for g,r in zip(guess, response) if guess.count(g) > 1]
        doubles_all_gray = all([r == 0 for r in double_response])
        doubles_all_green = all([r == 2 for r in double_response])
        safely_ignore_doubles = doubles_all_gray or doubles_all_green
        
        
        mask = np.array([True]*len(X))
        if safely_ignore_doubles:
            green_idx  = [idx for idx,r in enumerate(response) if r == 2]
            orange_idx = [idx for idx,r in enumerate(response) if r == 1]
            gray_idx   = [idx for idx,r in enumerate(response) if r == 0]
            green_guesses = np.array(guess)[green_idx]
            orange_guesses = np.array(guess)[orange_idx]
            gray_guesses = np.array(guess)[gray_idx]
            
            # Green: all letters must be in correct position
            mask &= np.all(X[:,green_idx] == green_guesses, axis = 1)
            
            # Orange: Word must contain all orange letters, but not in the guessed positions
            contains_all_orange = np.sum(np.isin(X,orange_guesses), axis = 1) == len(orange_guesses)
            letters_not_at_orange = np.all(X[:,orange_idx] != orange_guesses, axis = 1)
            mask &= contains_all_orange & letters_not_at_orange
            
            # Gray: Word must not contain letters
            contains_any_gray = np.any(np.isin(X, gray_guesses), axis = 1)
            mask &= ~contains_any_gray
            
        else: # Be aware that some letters may occure twice. 
            # The below code would also apply for no double letters
            # Iterate over each guess letter g and response r, from left to right
            colored_letters = []
            for idx, (g,r) in enumerate(zip(guess,response)):
                if r == 2: # Green: Correct letter and position
                    colored_letters.append(g)
                    mask &= X[:,idx] == g
                else: # If orange or gray
                    # Letter should not be in this position
                    mask &= X[:,idx] != g 
                    if r == 1: # Orange: Correct letter but wrong position
                        colored_letters.append(g)
                        if g in colored_letters:
                            # The letter must occure at least as many times as it is collered
                            mask &= np.sum(X == g, axis = 1) >= colored_letters.count(g)
                        else:
                            # Letter must be in word at least once
                            mask &= np.any(X == g, axis = 1)
                    elif r == 0: # Gray: Wrong letter
                        if g in colored_letters:
                            # The letter must occure exactly as many times as it is collered
                            mask &= np.sum(X == g, axis = 1) == colored_letters.count(g)
                        else:
                            # The letter cannot be in the word
                            mask &= ~np.any(X == g, axis = 1)
                            
        return X[mask]
    
    def get_information(self, X, guess):
        nb_all_words = len(X)
        information = 0
        # For all patterns
        for response in itertools.product(*[[0,1,2] for _ in range(5)]):
            nb_reduced = len(self.reduce_vecs(X, guess, response))
            prob = nb_reduced / nb_all_words
            information += - prob * np.log2(prob) if prob != 0 else 0

        return information
    
    def get_all_informations(self, X, save_name = None):
        nb_words = len(X)
        informations = np.zeros(nb_words).astype('float')
        for idx, guess in enumerate(X):
            informations[idx] = self.get_information(X,guess)
            if save_name:
                print(f"Processed {self._vec2word(guess)} ({idx+1:>5} of {nb_words}): Expected {informations[idx]:.3f} bits of information")
        
        df = pd.DataFrame()
        df['word'] = self._mat2words(X)
        df['expected_information'] = informations
        df['expected_nb_words'] = [nb_words*(1/2)**info for info in informations]
        
        df = df.sort_values(by = 'expected_information', ascending = False)
        
        if save_name:
            timestamp = str(datetime.datetime.today()).split('.')[0].replace('-','_').replace(':','_').replace(' ','_')
            df.to_csv(f"{save_name}_{timestamp}.csv")
        else:
            return df
        
    def register_guess(self, guess, response, verbose = False):
        self.nb_guesses += 1
        guess_word = guess
        guess = list(self._word2vec(guess))
        response = list(response)
        expected_information = self.get_information(self.X, guess)
        old_nb_words = len(self.X)
        self.X = self.reduce_vecs(self.X, guess, response)
        new_nb_words = len(self.X)
        prob = new_nb_words / old_nb_words
        realized_information = -np.log2(prob)
        expected_nb_words = old_nb_words*(1/2)**expected_information
    
        if verbose:
            print(f"You guessed '{guess_word}'.")
            print(f"Expected information: {expected_information:.3f} bits (leading to {expected_nb_words:.1f} words)")
            print(f"Realized information: {realized_information:.3f} bits (leading to {new_nb_words} words)")
            print('\n')
    
    def test_guess(self, guess):
        expected_information = self.get_information(self.X, self._word2vec(guess))
        expected_nb_words = len(self.X)*(1/2)**expected_information
        print(f"Guessing on {guess} is expected to give {expected_information:.3f} bits of information (leading to {expected_nb_words:.1f} words)")
        
    def status(self, show_best_nb = 0):
        wait_message = f"Waiting for guess {self.nb_guesses + 1}..."
        nb_words_message = f"There are {len(self.X)} words to choose from."
        best_words_message = f"Here are the top {show_best_nb} best words to choose next:"
        
        if self.nb_guesses == 0:
            print(wait_message)
            print(nb_words_message)
            if show_best_nb > 0:
                print("The best words among all words takes long to calculate. Please see stored file.")
        elif self.nb_guesses < 5:
            print(wait_message)
            print(nb_words_message)
            if show_best_nb > 0:
                df = self.get_all_informations(self.X)
                print(best_words_message)
                print(df.head(show_best_nb))
        else:
            print("No more guesses")
                
        print('\n')
    

In [355]:
wb = WordleBrain(all_words)
wb.get_all_informations(wb.X, save_name = 'word_informations')

Processed abbed (    1 of 11757): Expected 4.198 bits of information
Processed abcen (    2 of 11757): Expected 4.871 bits of information
Processed abcte (    3 of 11757): Expected 5.030 bits of information
Processed abcts (    4 of 11757): Expected 4.740 bits of information
Processed abede (    5 of 11757): Expected 4.422 bits of information
Processed abels (    6 of 11757): Expected 5.976 bits of information
Processed abens (    7 of 11757): Expected 5.972 bits of information
Processed aberi (    8 of 11757): Expected 5.443 bits of information
Processed abers (    9 of 11757): Expected 6.197 bits of information
Processed abild (   10 of 11757): Expected 4.664 bits of information
Processed abort (   11 of 11757): Expected 5.377 bits of information
Processed abrod (   12 of 11757): Expected 4.910 bits of information
Processed achts (   13 of 11757): Expected 4.524 bits of information
Processed adder (   14 of 11757): Expected 4.860 bits of information
Processed adels (   15 of 11757): 

KeyboardInterrupt: 

In [339]:
wb.register_guess('sarte', [0,0,1,1,2], verbose = True)
wb.status(5)

You guessed 'sarte'.
Expected information: 7.096 bits (leading to 85.9 words)
Realized information: 7.590 bits (leading to 61 words)


Waiting for guess 2...
There are 61 words to choose from.
61
Here are the top 5 best words to choose next:
     word  expected_information  expected_nb_words
47  træne              3.653921           4.846073
34  trine              3.642473           4.884680
26  tiøre              3.609474           4.997697
36  trone              3.598703           5.035149
50  trøge              3.598612           5.035466




In [340]:
wb.register_guess('træne', [1,1,0,0,2], verbose = True)
wb.status(5)

You guessed 'træne'.
Expected information: 3.654 bits (leading to 4.8 words)
Realized information: 1.761 bits (leading to 18 words)


Waiting for guess 3...
There are 18 words to choose from.
18
Here are the top 5 best words to choose next:
     word  expected_information  expected_nb_words
7   litre              3.058814           2.160119
4   iltre              3.058814           2.160119
8   lutre              2.855954           2.486250
14  rulre              2.702904           2.764504
6   letre              2.702904           2.764504




In [341]:
wb.status(20)

Waiting for guess 3...
There are 18 words to choose from.
18
Here are the top 20 best words to choose next:
     word  expected_information  expected_nb_words
7   litre              3.058814           2.160119
4   iltre              3.058814           2.160119
8   lutre              2.855954           2.486250
14  rulre              2.702904           2.764504
6   letre              2.702904           2.764504
0   bitre              2.599666           2.969580
1   citre              2.599666           2.969580
17  vitre              2.599666           2.969580
3   gitre              2.599666           2.969580
13  rudre              2.362740           3.499587
11  riere              2.322085           3.599609
10  putre              2.102187           4.192288
2   døtre              2.060249           4.315944
16  uture              1.991076           4.527921
5   køtre              1.879965           4.890426
9   metre              1.879965           4.890426
15  rytme              1.

In [None]:
wb.register_guess('bores', [1,0,0,0,0], verbose = True)
wb.status(5)

['ammen',
 'ammer',
 'ammes',
 'ammet',
 'emmen',
 'emmer',
 'emmes',
 'emmet',
 'mamas',
 'mamen',
 'mamer',
 'ommen',
 'ommer',
 'ommes',
 'ømmer',
 'ømmes']

False

In [254]:
all([])

True

In [238]:
wb._mat2words(X[np.all(X[:,[2,3]] == np.array(wb._word2vec('sagde'))[[2,3]], axis = 1)])

['bygde',
 'bygds',
 'hugde',
 'kogde',
 'lagde',
 'lægde',
 'lægds',
 'sagde',
 'sigde',
 'sigds']

In [228]:
[x for x in wb._mat2words(X) if 'sagde' in x]

['sagde']

In [204]:
data = pd.read_csv("ods_fullforms_2020-08-26.csv",sep='\t',header = None)

In [206]:
words = list(data[0])
map_letters = {
        'aa':'å',
        'ã':'a',
        'ê':'e',
        'é':'e'
        }
for old, new in map_letters.items():
    words = [w.replace(old, new) for w in words]
    
    
# 5 letters
words = [w for w in words if len(w) == 5]

# Remove words with strange symbols
only_allowed = "abcdefghijklmnopqrstuvxyzæøå"
word_allowed = lambda word: all([letter in only_allowed for letter in word])
words = [w for w in words if word_allowed(w)]

# Drop duplicates
words = sorted(list(set(words)))

[w for w in words if 'agde' in w]

['lagde', 'sagde']

In [180]:
~(X[:,3] == 4)

array([False, False,  True, ...,  True,  True,  True])

In [198]:
def get_information(all_words, guess):
    nb_all_words = len(all_words)
    probs = []
    information = 0
    # For all patterns
    for idx, response in enumerate(list(itertools.product(*[[0,1,2] for _ in range(5)]))):
        nb_reduced = len(update_words(all_words, guess, response))
        prob = nb_reduced / nb_all_words
        information += - prob * np.log2(prob) if prob != 0 else 0

    return information

In [201]:
%prun get_information(all_words,'hbupæ')

 

         2015 function calls in 0.492 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      314    0.276    0.001    0.276    0.001 <ipython-input-197-2719664d8c6d>:26(<listcomp>)
      314    0.115    0.000    0.115    0.000 <ipython-input-197-2719664d8c6d>:6(<listcomp>)
      314    0.089    0.000    0.089    0.000 <ipython-input-197-2719664d8c6d>:15(<listcomp>)
      243    0.010    0.000    0.491    0.002 <ipython-input-197-2719664d8c6d>:1(update_words)
        1    0.002    0.002    0.492    0.492 <ipython-input-198-b3171ce83484>:1(get_information)
      581    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
      244    0.000    0.000    0.000    0.000 {built-in method builtins.len}
        1    0.000    0.000    0.492    0.492 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 <ipython-input-198-b3171ce83484>:6(<listcomp>)
        1    0.000    0.000    0.492    0.492 <

In [195]:
informations = [get_information(all_words,guess) for guess in list(all_words)[:10]]


KeyboardInterrupt: 

In [188]:
df = pd.DataFrame()
df['ord'] = list(all_words)[:100]
df['information'] = informations
df.sort_values(by = 'information', ascending = False)

Unnamed: 0,ord,information
96,seret,7.011466
56,skeet,6.113415
50,beter,6.021064
92,tilse,5.696584
24,samer,5.684311
...,...,...
14,opgiv,3.577537
75,opråb,3.529832
0,påbid,3.350558
79,omhug,3.269102


In [182]:
get_information(all_words,'skeet')

6.113414650810543

In [137]:
print(len(all_words))
words = update_words(all_words, 'bores',[0,0,0,1,1])
print(len(words))
words = update_words(words, 'liste',[0,0,1,0,2])
print(len(words))
words = update_words(words, 'snude',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'smage',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'svæve',[2,0,0,0,2])
print(len(words))
words

11729
645
173
34
14
6


['sykke', 'skøje', 'sekse', 'skeje', 'søkke', 'sejse']

In [132]:
[w for w in all_words if 'skeje' in w]

['skeje']