In [156]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

%matplotlib inline

def get_all_words():
    data = pd.read_csv("ods_fullforms_2020-08-26.csv",sep='\t',header = None)
    words = list(data[0])
    #head = list(data[1])
    #pos = list(data[3])

    # Remove accents and aa
    map_letters = {
        'aa':'å',
        'ã':'a',
        'ê':'e',
        'é':'e'}
    for old,new in map_letters.items():
        words = [w.replace(old,new) for w in words]

    # 5 letters
    words = [w.lower() for w in words if len(w) == 5]

    # Remove words with strange symbols
    only_allowed = "abcdefghijklmnopqrstuvxyzæøå"
    word_allowed = lambda word: all([letter in only_allowed for letter in word])
    words = [w for w in words if word_allowed(w)]

    # Drop duplicates
    words = set(words)

    print("Letters in words:")
    letters = set("".join(words))
    print(", ".join(sorted(letters)))
    print(f"Number of words: {len(words):,}")

    return words

all_words = get_all_words()

Letters in words:
a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, x, y, z, å, æ, ø
Number of words: 11,729


In [197]:
def update_words(all_words, guess, response):
    words = all_words
    color_letters = []
    for idx, (r,l) in enumerate(zip(response, guess)):
        if r == 2: # Green: Correct letter and position
            words = [w for w in words if w[idx]==l]
            color_letters.append(l)
            if not words:
                return []
        if r == 1: # Orange: Correct letter wrong position
            if l in color_letters: # If a previous green or gray
                nb_occ_in_word = color_letters.count(l) + 1
                words = [w for w in words if w.count(l) >= nb_occ_in_word and (w[idx] != l)]
            else:
                words = [w for w in words if (l in w) and (w[idx] != l)]
            if not words:
                return []
            color_letters.append(l)
        if r == 0: # Gray: Wrong letter
            if l in color_letters: # If this letter is collored before, we know that the exact number of occurences is reached
                nb_occ_in_word = color_letters.count(l) + 1
                words = [w for w in words if w.count(l) == nb_occ_in_word and (w[idx] != l)]
                if not words:
                    return []
            else: # Only exclude if this letter has not been given a color before
                words = [w for w in words if l not in w]
                if not words:
                    return []
    return words

In [198]:
def get_information(all_words, guess):
    nb_all_words = len(all_words)
    probs = []
    information = 0
    # For all patterns
    for idx, response in enumerate(list(itertools.product(*[[0,1,2] for _ in range(5)]))):
        nb_reduced = len(update_words(all_words, guess, response))
        prob = nb_reduced / nb_all_words
        information += - prob * np.log2(prob) if prob != 0 else 0

    return information

In [201]:
%prun get_information(all_words,'hbupæ')

 

         2015 function calls in 0.492 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      314    0.276    0.001    0.276    0.001 <ipython-input-197-2719664d8c6d>:26(<listcomp>)
      314    0.115    0.000    0.115    0.000 <ipython-input-197-2719664d8c6d>:6(<listcomp>)
      314    0.089    0.000    0.089    0.000 <ipython-input-197-2719664d8c6d>:15(<listcomp>)
      243    0.010    0.000    0.491    0.002 <ipython-input-197-2719664d8c6d>:1(update_words)
        1    0.002    0.002    0.492    0.492 <ipython-input-198-b3171ce83484>:1(get_information)
      581    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
      244    0.000    0.000    0.000    0.000 {built-in method builtins.len}
        1    0.000    0.000    0.492    0.492 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 <ipython-input-198-b3171ce83484>:6(<listcomp>)
        1    0.000    0.000    0.492    0.492 <

In [195]:
informations = [get_information(all_words,guess) for guess in list(all_words)[:10]]


KeyboardInterrupt: 

In [188]:
df = pd.DataFrame()
df['ord'] = list(all_words)[:100]
df['information'] = informations
df.sort_values(by = 'information', ascending = False)

Unnamed: 0,ord,information
96,seret,7.011466
56,skeet,6.113415
50,beter,6.021064
92,tilse,5.696584
24,samer,5.684311
...,...,...
14,opgiv,3.577537
75,opråb,3.529832
0,påbid,3.350558
79,omhug,3.269102


In [182]:
get_information(all_words,'skeet')

6.113414650810543

In [137]:
print(len(all_words))
words = update_words(all_words, 'bores',[0,0,0,1,1])
print(len(words))
words = update_words(words, 'liste',[0,0,1,0,2])
print(len(words))
words = update_words(words, 'snude',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'smage',[2,0,0,0,2])
print(len(words))
words = update_words(words, 'svæve',[2,0,0,0,2])
print(len(words))
words

11729
645
173
34
14
6


['sykke', 'skøje', 'sekse', 'skeje', 'søkke', 'sejse']

In [132]:
[w for w in all_words if 'skeje' in w]

['skeje']