In [1]:
import pandas as pd
import numpy as np

In [2]:
# One less than the code point of 'a', so ord(letter) - NORM is 1 for 'a' ... 26 for 'z'.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
NORM = ord("a") - 1
# Template for the evaluation report; the placeholder is rendered with two decimals.
OUTPUT_STRING = (
    "The strategy correctly guesses {:.2f}% "
    "of the dictionary words"
)

In [3]:
# Load the dictionary: one word per line.
# The encoding is given explicitly so the read does not depend on the platform's
# default locale encoding (presumably the file is plain ASCII -- confirm).
with open('../data/words_alpha.txt', 'r', encoding='utf-8') as f:
    data = f.read()
# Drop surrounding whitespace and skip blank lines: the counting cells further down
# index word[-1] and look every character up in letter_map, so an empty or padded
# entry would raise there.
words = [line.strip() for line in data.splitlines() if line.strip()]

In [4]:
# Two-way lookup between the lowercase letters 'a'..'z' and the indices 0..25.
letter_map = {chr(ord('a') + offset): offset for offset in range(26)}
position_map = {offset: letter for letter, offset in letter_map.items()}

In [5]:
# 26x26 table of zeros: one row and one column per letter.
# NOTE: the next cell allocates this same array again before filling it.
cooccurrence_probabilities = np.zeros(shape=(26, 26))

In [6]:
# Dictionary-wide letter statistics.
#   prior_probabilities[a]           -- share of all letter occurrences that are letter a
#   cooccurrence_probabilities[a][b] -- share of (earlier, later) letter pairs whose
#                                       earlier letter is a and whose later letter is b
cooccurrence_probabilities = np.zeros((26,26))
prior_probabilities = np.zeros((26,))
for word in words:
    last = len(word) - 1
    for first_pos, first_char in enumerate(word[:last]):
        # Row of pair counts whose earlier letter is first_char (a view, updated in place).
        pair_row = cooccurrence_probabilities[letter_map[first_char]]
        for later_char in word[first_pos + 1:]:
            pair_row[letter_map[later_char]] += 1
        prior_probabilities[letter_map[first_char]] += 1
    # The final letter never starts a pair, so only its occurrence is counted.
    prior_probabilities[letter_map[word[-1]]] += 1
# Turn counts into probabilities: the priors sum to 1 overall, each pair row sums to 1.
prior_probabilities /= prior_probabilities.sum()
cooccurrence_probabilities /= cooccurrence_probabilities.sum(axis=1, keepdims=True)
prior_probabilities, cooccurrence_probabilities

(array([0.08464029, 0.01829659, 0.04377464, 0.03238966, 0.10772156,
        0.01122788, 0.02364355, 0.02643092, 0.08956661, 0.00156122,
        0.00767277, 0.05577454, 0.03010506, 0.07194762, 0.07199369,
        0.03252444, 0.00168341, 0.07043304, 0.07161769, 0.06607014,
        0.03762703, 0.00946435, 0.00641172, 0.00300255, 0.02019633,
        0.00422269]),
 array([[6.34680963e-02, 1.78970933e-02, 4.26776288e-02, 3.38250249e-02,
         1.04469063e-01, 5.94198159e-03, 2.57924259e-02, 2.24175414e-02,
         1.03569285e-01, 6.70521145e-04, 8.33803465e-03, 8.17008096e-02,
         2.67554466e-02, 8.43491166e-02, 5.59047903e-02, 2.01242584e-02,
         8.69593339e-04, 6.42902573e-02, 7.87348495e-02, 8.75960775e-02,
         2.32785106e-02, 7.03005128e-03, 3.68391360e-03, 2.14523646e-03,
         2.83401188e-02, 6.13027371e-03],
        [8.88753380e-02, 1.45608521e-02, 3.10479539e-02, 3.38717904e-02,
         1.36171037e-01, 5.79014706e-03, 2.20207955e-02, 1.84646422e-02,
         1.0

In [7]:
def game_cp(word, is_print=True, max_mistakes=8):
    """Play one letter-guessing game against ``word`` and report whether it was won.

    The first guess is the letter with the highest ``prior_probabilities`` value.
    After a correct guess the candidates are re-ranked by that letter's row of
    ``cooccurrence_probabilities``; after a wrong guess the next-best letter of the
    current ranking is tried. No letter is guessed twice.

    Args:
        word: The string to guess.
        is_print: When true, print the word, every guess and ``WIN!`` on success.
        max_mistakes: Number of wrong guesses that ends the game. Defaults to 8,
            the limit that used to be hard-coded.

    Returns:
        True if every distinct character of ``word`` was guessed before the
        mistake limit was reached, otherwise False.
    """
    if is_print:
        print(f'word: {word}')
    correct_letters_needed = len(set(word))
    guesses = set()
    n_mistakes = 0
    n_correct = 0
    # argsort is ascending, so the best candidate sits at index -1, the next at -2, ...
    transition_order = np.argsort(prior_probabilities)
    guess_position = 1
    while n_mistakes < max_mistakes and n_correct < correct_letters_needed:
        # Walk down the ranking to the best letter that has not been tried yet.
        while (guess_position <= len(transition_order)
               and position_map[transition_order[-guess_position]] in guesses):
            guess_position += 1
        if guess_position > len(transition_order):
            # Every letter has been tried (possible when ``word`` holds characters
            # outside the ranked alphabet); stop instead of indexing past the ranking.
            break
        guess = position_map[transition_order[-guess_position]]
        guesses.add(guess)
        if guess in word:
            n_correct += 1
            if is_print:
                print(f'correct guess: {guess}')
            # Re-rank by the letters that co-occur with the one just confirmed.
            transition_order = np.argsort(cooccurrence_probabilities[letter_map[guess]])
            guess_position = 1
        else:
            if is_print:
                print(f'wrong guess: {guess}')
            n_mistakes += 1
            guess_position += 1
    won = n_correct == correct_letters_needed
    if won and is_print:
        print('WIN!')
    return won

In [8]:
# Play game_cp once per dictionary word and report the percentage of wins.
is_print = False
results = []
for i, word in enumerate(words):
    if is_print:
        print(f'GAME: {i}')
        print('-'*20)
    results.append(game_cp(word, is_print=is_print))
    if is_print:
        print('-'*20)
# Each result is a bool, so the sum is the number of games won.
print(OUTPUT_STRING.format(sum(results) / len(words) * 100))

The strategy correctly guesses 29.22% of the dictionary words


In [9]:
def game_cp_cum(word, is_print=True, max_mistakes=8):
    """Play one letter-guessing game, ranking candidates by accumulated co-occurrence.

    The first guess is the letter with the highest ``prior_probabilities`` value.
    Every correct guess adds that letter's row of ``cooccurrence_probabilities`` to
    a running score vector, and candidates are re-ranked by the running total.
    After a wrong guess the next-best letter of the current ranking is tried.
    No letter is guessed twice.

    Args:
        word: The string to guess.
        is_print: When true, print the word, every guess and ``WIN!`` on success.
        max_mistakes: Number of wrong guesses that ends the game. Defaults to 8,
            the limit that used to be hard-coded.

    Returns:
        True if every distinct character of ``word`` was guessed before the
        mistake limit was reached, otherwise False.
    """
    if is_print:
        print(f'word: {word}')
    correct_letters_needed = len(set(word))
    guesses = set()
    n_mistakes = 0
    n_correct = 0
    # Running sum of the co-occurrence rows of all letters confirmed so far.
    transition_probs = np.zeros((26,))
    # argsort is ascending, so the best candidate sits at index -1, the next at -2, ...
    transition_order = np.argsort(prior_probabilities)
    guess_position = 1
    while n_mistakes < max_mistakes and n_correct < correct_letters_needed:
        # Walk down the ranking to the best letter that has not been tried yet.
        while (guess_position <= len(transition_order)
               and position_map[transition_order[-guess_position]] in guesses):
            guess_position += 1
        if guess_position > len(transition_order):
            # Every letter has been tried (possible when ``word`` holds characters
            # outside the ranked alphabet); stop instead of indexing past the ranking.
            break
        guess = position_map[transition_order[-guess_position]]
        guesses.add(guess)
        if guess in word:
            n_correct += 1
            if is_print:
                print(f'correct guess: {guess}')
            transition_probs += cooccurrence_probabilities[letter_map[guess]]
            transition_order = np.argsort(transition_probs)
            guess_position = 1
        else:
            if is_print:
                print(f'wrong guess: {guess}')
            n_mistakes += 1
            guess_position += 1
    won = n_correct == correct_letters_needed
    if won and is_print:
        print('WIN!')
    return won

In [10]:
# Play game_cp_cum once per dictionary word and report the percentage of wins.
is_print = False
results = []
for i, word in enumerate(words):
    if is_print:
        print(f'GAME: {i}')
        print('-'*20)
    results.append(game_cp_cum(word, is_print=is_print))
    if is_print:
        print('-'*20)
# Each result is a bool, so the sum is the number of games won.
print(OUTPUT_STRING.format(sum(results) / len(words) * 100))

The strategy correctly guesses 28.24% of the dictionary words


# LENGTH-DEPENDENT TRANSITION PROBABILITIES

In [11]:
# Per-word-length letter statistics; index n of each list holds words of length n + 1.
#   pp[n][a]    -- share of letter occurrences that are letter a
#   cp[n][a][b] -- share of (earlier, later) letter pairs starting with a that end with b
#   ip[n][a]    -- 1 / pp[n][a], rescaled so each vector sums to 1 (0 where pp[n][a] is 0)
# 31 was the original hard-coded table count; it is kept as a floor, and the list
# grows if the dictionary holds a longer word so that indexing by length cannot overflow.
MAX_N_LETTERS = max(31, max((len(word) for word in words), default=0))
cp = [np.zeros((26,26)) for _ in range(MAX_N_LETTERS)]
pp = [np.zeros((26,)) for _ in range(MAX_N_LETTERS)]
for word in words:
    if not word:
        # An empty entry has no letters to count (and word[-1] would raise).
        continue
    n = len(word)-1
    codes = [letter_map[ch] for ch in word]
    for pos, first in enumerate(codes):
        pp[n][first] += 1
        for later in codes[pos + 1:]:
            cp[n][first][later] += 1
for i in range(MAX_N_LETTERS):
    # Guarded normalization: a length with no words, or a letter that never precedes
    # another one at this length, has a zero total. Dividing by it would fill the
    # table with NaN, which np.argsort ranks as the largest value and which poisons
    # any cumulative sum built from these rows. Such entries stay at 0 instead.
    total = pp[i].sum()
    if total > 0:
        pp[i] /= total
    row_totals = cp[i].sum(axis=1, keepdims=True)
    np.divide(cp[i], row_totals, out=cp[i], where=row_totals != 0)
ip = []
for prior in pp:
    inverse = np.divide(1, prior, out=np.zeros_like(prior), where=prior != 0)
    inverse_total = inverse.sum()
    # An all-zero prior (no words of this length) keeps an all-zero inverse vector.
    ip.append(inverse / inverse_total if inverse_total > 0 else inverse)

  cp[i] /= cp[i].sum(axis=1).reshape((26,1))
  pp[i] /= pp[i].sum()


In [12]:
def game_cp_ld(word, is_print=True, max_mistakes=8):
    """Play one letter-guessing game using the tables for the word's own length.

    Same strategy as ranking by prior first and by co-occurrence after each hit,
    but the statistics are read from ``pp[len(word) - 1]`` and ``cp[len(word) - 1]``.
    After a wrong guess the next-best letter of the current ranking is tried.
    No letter is guessed twice.

    Args:
        word: The string to guess.
        is_print: When true, print the word, every guess and ``WIN!`` on success.
        max_mistakes: Number of wrong guesses that ends the game. Defaults to 8,
            the limit that used to be hard-coded.

    Returns:
        True if every distinct character of ``word`` was guessed before the
        mistake limit was reached, otherwise False.
    """
    if is_print:
        print(f'word: {word}')
    length_index = len(word)-1
    correct_letters_needed = len(set(word))
    guesses = set()
    n_mistakes = 0
    n_correct = 0
    # argsort is ascending, so the best candidate sits at index -1, the next at -2, ...
    transition_order = np.argsort(pp[length_index])
    guess_position = 1
    while n_mistakes < max_mistakes and n_correct < correct_letters_needed:
        # Walk down the ranking to the best letter that has not been tried yet.
        while (guess_position <= len(transition_order)
               and position_map[transition_order[-guess_position]] in guesses):
            guess_position += 1
        if guess_position > len(transition_order):
            # Every letter has been tried (possible when ``word`` holds characters
            # outside the ranked alphabet); stop instead of indexing past the ranking.
            break
        guess = position_map[transition_order[-guess_position]]
        guesses.add(guess)
        if guess in word:
            n_correct += 1
            if is_print:
                print(f'correct guess: {guess}')
            # Re-rank by the letters that co-occur with the one just confirmed.
            transition_order = np.argsort(cp[length_index][letter_map[guess]])
            guess_position = 1
        else:
            if is_print:
                print(f'wrong guess: {guess}')
            n_mistakes += 1
            guess_position += 1
    won = n_correct == correct_letters_needed
    if won and is_print:
        print('WIN!')
    return won

In [13]:
# Play game_cp_ld once per dictionary word and report the percentage of wins.
is_print = False
results = []
for i, word in enumerate(words):
    if is_print:
        print(f'GAME: {i}')
        print('-'*20)
    results.append(game_cp_ld(word, is_print=is_print))
    if is_print:
        print('-'*20)
# Each result is a bool, so the sum is the number of games won.
print(OUTPUT_STRING.format(sum(results) / len(words) * 100))

The strategy correctly guesses 29.60% of the dictionary words


## CUMULATIVE WEIGHTS OF CORRECT LETTERS

In [14]:
def game_cp_ld_cum(word, is_print=True, cum_strategy='uniform', max_mistakes=8):
    """Play one letter-guessing game with length-specific, accumulated co-occurrence scores.

    The first guess is the letter with the highest value in ``pp[len(word) - 1]``.
    Every correct guess adds that letter's row of ``cp[len(word) - 1]`` to a running
    score vector, and candidates are re-ranked by the running total. After a wrong
    guess the next-best letter of the current ranking is tried. No letter is
    guessed twice.

    Args:
        word: The string to guess.
        is_print: When true, print the word, every guess and ``WIN!`` on success.
        cum_strategy: ``'uniform'`` adds each row unchanged; ``'inverse_weights'``
            first multiplies the row by ``ip[len(word) - 1]`` of the confirmed letter.
        max_mistakes: Number of wrong guesses that ends the game. Defaults to 8,
            the limit that used to be hard-coded.

    Returns:
        True if every distinct character of ``word`` was guessed before the
        mistake limit was reached, otherwise False.

    Raises:
        ValueError: If ``cum_strategy`` is not one of the two supported names.
            Previously an unknown name was ignored and the ranking never updated.
    """
    if cum_strategy not in ('uniform', 'inverse_weights'):
        raise ValueError(
            f"unknown cum_strategy {cum_strategy!r}; expected 'uniform' or 'inverse_weights'"
        )
    if is_print:
        print(f'word: {word}')
    length_index = len(word)-1
    correct_letters_needed = len(set(word))
    guesses = set()
    n_mistakes = 0
    n_correct = 0
    # Running sum of the (optionally weighted) rows of all letters confirmed so far.
    transition_probs = np.zeros((26,))
    # argsort is ascending, so the best candidate sits at index -1, the next at -2, ...
    transition_order = np.argsort(pp[length_index])
    guess_position = 1
    while n_mistakes < max_mistakes and n_correct < correct_letters_needed:
        # Walk down the ranking to the best letter that has not been tried yet.
        while (guess_position <= len(transition_order)
               and position_map[transition_order[-guess_position]] in guesses):
            guess_position += 1
        if guess_position > len(transition_order):
            # Every letter has been tried (possible when ``word`` holds characters
            # outside the ranked alphabet); stop instead of indexing past the ranking.
            break
        guess = position_map[transition_order[-guess_position]]
        guesses.add(guess)
        if guess in word:
            n_correct += 1
            if is_print:
                print(f'correct guess: {guess}')
            increment = cp[length_index][letter_map[guess]]
            if cum_strategy == 'inverse_weights':
                increment = increment * ip[length_index][letter_map[guess]]
            # A row normalized by a zero total is NaN; adding it would turn the whole
            # running total into NaN for the rest of the game, so count it as 0.
            transition_probs += np.where(np.isnan(increment), 0.0, increment)
            transition_order = np.argsort(transition_probs)
            guess_position = 1
        else:
            if is_print:
                print(f'wrong guess: {guess}')
            n_mistakes += 1
            guess_position += 1
    won = n_correct == correct_letters_needed
    if won and is_print:
        print('WIN!')
    return won

In [15]:
# Play game_cp_ld_cum (default strategy) once per dictionary word and report the
# percentage of wins.
is_print = False
results = []
for i, word in enumerate(words):
    if is_print:
        print(f'GAME: {i}')
        print('-'*20)
    results.append(game_cp_ld_cum(word, is_print=is_print))
    if is_print:
        print('-'*20)
# Each result is a bool, so the sum is the number of games won.
print(OUTPUT_STRING.format(sum(results) / len(words) * 100))

The strategy correctly guesses 29.36% of the dictionary words


In [16]:
# Play game_cp_ld_cum with the 'inverse_weights' strategy once per dictionary word
# and report the percentage of wins.
is_print = False
results = []
for i, word in enumerate(words):
    if is_print:
        print(f'GAME: {i}')
        print('-'*20)
    results.append(
        game_cp_ld_cum(word, is_print=is_print, cum_strategy='inverse_weights')
    )
    if is_print:
        print('-'*20)
# Each result is a bool, so the sum is the number of games won.
print(OUTPUT_STRING.format(sum(results) / len(words) * 100))

The strategy correctly guesses 29.62% of the dictionary words
