## Imports

In [1]:
import numpy as np
import random
import operator
import time
import pandas as pd
from wordle_functions import *

## Importing datasets

### official words
- official wordle word list

In [2]:
### Official list
official_words = []

with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        official_words.append(word)

f.close() # closes connection to file

print(len(official_words))
official_words[:5]

2310


['wince', 'thyme', 'mower', 'horde', 'heard']

### alternative list 1
- an alternate list of 5-letter words found on the web

In [3]:
### Official list
alt_words_1 = []

with open("data/alt_words_1.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        alt_words_1.append(word)

f.close() # closes connection to file

print(len(alt_words_1))
alt_words_1[:5]

14856


['rossa', 'jetty', 'wizzo', 'cuppa', 'cohoe']

### nltk grand corpus
- Amalgamation of all words in various NLTK corpora to have as big a dataset as possible
- Developed manually

In [4]:
### grand corpus tokens
nltk_tokens = []

with open("data/nltk_grand_corpus_tokens_5.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        nltk_tokens.append(word)

f.close() # closes connection to file

print(len(nltk_tokens))
nltk_tokens[:5]

535189


['years', 'board', 'dutch', 'group', 'agnew']

### nltk grand corpus types and counts

In [5]:
### grand corpus types and counts
nltk_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_5.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        if len(line.split("\t")) == 2:
            word = line.split("\t")[0]
            count = line.split("\t")[1]
            nltk_counts[word] = count
        else:
            continue

f.close() # closes connection to file

print(len(nltk_counts))
nltk_counts['which']

8043


'15760'

In [6]:
### Official list
official_words = []

with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        if len(word) > 0: # there's one blank entry at the start
            official_words.append(word)

f.close() # closes connection to file

print(len(official_words))
official_words[:10]

2309


['wince',
 'thyme',
 'mower',
 'horde',
 'heard',
 'tenor',
 'zonal',
 'parry',
 'shied',
 'fizzy']

## Wordle functions + Testing

### Testing `wordle_wizard()`

In [7]:
test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "paint", target = "force",
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = True)
# get_letter_counts(word_list = word_list, letters = english_alphabet, sort = "descending")

-----------------------------

Guess 1: 'paint'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[]

Letters to guess again:
	[]

Letters to not guess again:
	['a', 'i', 'n', 'p', 't']

At this point:
	1919, 83.11% of total words have been eliminated, and
	390, 16.89% of total words remain possible.

Next guess:
	'loser'

-----------------------------

Guess 2: 'loser'
Letters in correct positions:
	[('o', 1)]

Letters in incorrect positions:
	[('e', 3), ('r', 4)]

Letters to guess again:
	['e', 'o', 'r']

Letters to not guess again:
	['a', 'i', 'l', 'n', 'p', 's', 't']

At this point:
	2303, 99.74% of total words have been eliminated, and
	6, 0.26% of total words remain possible.

Next guess:
	'horde'

-----------------------------

Guess 3: 'horde'
Letters in correct positions:
	[('o', 1), ('r', 2), ('e', 4)]

Letters in incorrect positions:
	[('e', 3), ('r', 4)]

Letters to guess again:
	['e', 'o', 'r']

Letters to not guess again:
	['a', 'd', 'h', 'i', 'l', 'n', 

In [8]:
for val in [False, True]:
        wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "arose", target = "syrup",
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'arose'
Guess 2: 'shirt'
Guess 3: 'surly'
Guess 4: 'syrup'

Congratulations! The Wordle has been solved in 4 guesses!
There were still 2 guesses remaining.

The target word was 'syrup'.

-----------------------------
-----------------------------

Guess 1: 'arose'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('r', 1), ('s', 3)]

Letters to guess again:
	['r', 's']

Letters to not guess again:
	['a', 'e', 'o']

At this point:
	2288, 99.09% of total words have been eliminated, and
	21, 0.91% of total words remain possible.

Next guess:
	'shirt'

-----------------------------

Guess 2: 'shirt'
Letters in correct positions:
	[('s', 0)]

Letters in incorrect positions:
	[('r', 1), ('s', 3), ('r', 3)]

Letters to guess again:
	['r', 's']

Letters to not guess again:
	['a', 'e', 'h', 'i', 'o', 't']

At this point:
	2305, 99.83% of total words have been eliminated, and
	4, 0.17% of total words remain possible.

Next guess:
	'surly

### Testing on 3-letter words

In [9]:
### 3 letters
words_3_letters = []
words_3_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_3.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_3_letters.append(word)
                words_3_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_3_letters))
print(words_3_letters[:5])
words_3_types_counts['the']

1531
['the', 'and', 'for', 'his', 'was']


'286732'

In [10]:
wordle_wizard(word_list = words_3_letters, max_guesses = 6, 
                guess = "the", target = "his",
                random_guess = False, random_target = False, 
                verbose = False, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'the'
Guess 2: 'hoa'
Guess 3: 'his'

Congratulations! The Wordle has been solved in 3 guesses!
There were still 3 guesses remaining.

The target word was 'his'.

-----------------------------


### Testing on 4-letter words

In [11]:
### 3 letters
words_4_letters = []
words_4_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_4.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_4_letters.append(word)
                words_4_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_4_letters))
print(words_4_letters[:5])
words_4_types_counts['that']

4266
['that', 'with', 'this', 'they', 'have']


'57994'

### Testing on 6-letter words

In [12]:
### 6 letters
words_6_letters = []
words_6_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_6.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_6_letters.append(word)
                words_6_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_6_letters))
print(words_6_letters[:5])
words_6_types_counts[words_6_letters[0]]

11290
['little', 'before', 'people', 'should', 'things']


'5543'

### Testing on 7-letter words

In [13]:
### 7 letters
words_7_letters = []
words_7_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_7.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_7_letters.append(word)
                words_7_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_7_letters))
print(words_7_letters[:5])
words_7_types_counts[words_7_letters[0]]

12566
['because', 'through', 'against', 'another', 'himself']


'4809'

### Testing on 8-letter words

In [14]:
### 8 letters
words_8_letters = []
words_8_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_8.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_8_letters.append(word)
                words_8_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_8_letters))
print(words_8_letters[:5])
words_8_types_counts[words_8_letters[0]]

11650
['children', 'together', 'director', 'anything', 'american']


'3012'

### Testing on 9-letter words

In [15]:
### 9 letters
words_9_letters = []
words_9_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_9.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_9_letters.append(word)
                words_9_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_9_letters))
print(words_9_letters[:5])
words_9_types_counts[words_9_letters[0]]

9716
['something', 'character', 'therefore', 'according', 'different']


'2621'

### Testing on 10-letter words

In [16]:
### 10 letters
words_10_letters = []
words_10_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_10.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_10_letters.append(word)
                words_10_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_10_letters))
print(words_10_letters[:5])
words_10_types_counts[words_10_letters[0]]

7200
['characters', 'themselves', 'everything', 'especially', 'understand']


'1929'

In [17]:
wordle_wizard(word_list = official_words, max_guesses = 5, 
                guess = "quote", target = "silly",
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = False)

-----------------------------

Guess 1: 'quote'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[]

Letters to guess again:
	[]

Letters to not guess again:
	['e', 'o', 'q', 't', 'u']

At this point:
	1931, 83.63% of total words have been eliminated, and
	378, 16.37% of total words remain possible.

Next guess:
	'snarl'

-----------------------------

Guess 2: 'snarl'
Letters in correct positions:
	[('s', 0)]

Letters in incorrect positions:
	[('l', 4)]

Letters to guess again:
	['l', 's']

Letters to not guess again:
	['a', 'e', 'n', 'o', 'q', 'r', 't', 'u']

At this point:
	2303, 99.74% of total words have been eliminated, and
	6, 0.26% of total words remain possible.

Next guess:
	'slimy'

-----------------------------

Guess 3: 'slimy'
Letters in correct positions:
	[('s', 0), ('y', 4)]

Letters in incorrect positions:
	[('l', 1), ('i', 2), ('l', 4)]

Letters to guess again:
	['i', 'l', 's', 'y']

Letters to not guess again:
	['a', 'e', 'm', 'n', 'o', 'q', 'r', 

### `compare_wordle()` testing

In [18]:
df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")
print(df.shape)
df

(39, 8)


Unnamed: 0,player,target,first_guess,second_guess,third_guess,fourth_guess,fifth_guess,sixth_guess
0,diane,vague,arose,plate,cache,mauve,vague,none
1,aidan,apple,douce,lairy,slave,algae,apple,none
2,aidan,ninth,douce,lairy,gimps,ninth,none,none
3,aidan,flail,douce,lairy,snail,flail,none,none
4,aidan,stage,douce,lairy,phase,stage,none,none
5,aidan,heady,douce,lairy,ready,beady,heady,none
6,aidan,none,douce,dairy,dunes,debug,none,none
7,aidan,usage,douce,lairy,aunts,swamp,usage,none
8,aidan,sound,douce,pound,found,bound,mound,sound
9,aidan,salsa,douce,lairy,palms,salts,ghost,salsa


In [19]:
df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")
df
convert_row(df, 37)

('dad', 'toxic', ['audio', 'choir', 'toxic'])

In [20]:
### TESTING DF INTERPRETATION

df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")

row = 37

print(convert_row(df, row))
player = convert_row(df, row)[0]
target_word = convert_row(df, row)[1]
guess_list = convert_row(df, row)[2]

compare_wordle(word_list = official_words, max_guesses = 6, 
                    guess_list = guess_list, player = player, target = target_word,
                    verbose = False, return_stats = True, record = False)

('dad', 'toxic', ['audio', 'choir', 'toxic'])


{'first_guess': ['audio', 'audio'],
 'target_word': ['toxic', 'toxic'],
 'first_guess_vowels': [4.0, 4.0],
 'first_guess_consonants': [1.0, 1.0],
 'target_vowels': [2.0, 2.0],
 'target_consonants': [3.0, 3.0],
 'first_guess_meaning': [61.91, 61.91],
 'target_meaning': [44.4, 44.4],
 'target_guessed': [True, True],
 'mid_guesses_avg_vows': [2.67, 2.33],
 'mid_guesses_avg_cons': [2.33, 2.67],
 'avg_perf_letters': [2.0, 12.0],
 'avg_wrong_pos_letters': [4.0, 19.0],
 'avg_wrong_letters': [8.0, 29.0],
 'avg_remaining': [14.5, 6.8],
 'avg_intermediate_guess_meaning': [87.3, 93.65],
 'valid_success': [True, True],
 'player': ['dad', 'wizard'],
 'num_guesses': [3.0, 6.0],
 'expected_guesses': [6.0, 6.0],
 'luck': [0.56, 0]}

## Comparing player solutions against wizard solutions

In [21]:
# def create_compared_df(player_df, to_csv: bool = False, show_shapes: bool = False):
#     """
#     Creates master df of player wordle scores compared to how wordle_wizard would perform on the same puzzles

#     Parameters:
#     -----
#     `player_df`: Pandas dataFrame object
#         df of player scores of wordle puzzles
#     `to_csv`: bool
#         If True, writes returned df to csv
#     `show_shapes`: bool
#         If True, prints shape of new df before and after deleting duplicate rows (created by wordle_wizard running the same puzzles multiple times)
    
#     Returns:
#     -----
#     `df_master`: Pandas dataFrame object
#         df of player scores and wordle_wizard scores of wordle puzzles
#     """

#     stats_master = {}
#     excepts = []
#     for row in player_df.index:
#         player = convert_row(player_df, row)[0]
#         target_word = convert_row(player_df, row)[1]
#         guess_list = convert_row(player_df, row)[2]
#         try:
#             complete = compare_wordle(word_list = official_words, max_guesses = 6, 
#                         guess_list = guess_list, player = player, target = target_word,
#                         verbose = True, return_stats = True, record = False)
#             for metric, results in complete.items():
#                 if metric in stats_master:
#                     for result in results:
#                         stats_master[metric].append(result)
#                 else:
#                     stats_master[metric] = []
#                     for result in results:
#                         stats_master[metric].append(result)
#         except:
#             AttributeError
#             excepts.append(guess_list)

#     df_master = pd.DataFrame(stats_master)
#     print(df_master.columns.tolist())

#     # Re-organizing columns to a more logical order (for viewing)
#     df_master = df_master[['first_guess', 'target_word', 'player', 'num_guesses', 'expected_guesses', 'luck', 'first_guess_vowels', 'first_guess_consonants',
#                         'target_vowels', 'target_consonants', 'first_guess_entropy', 'target_entropy',
#                         'target_guessed', 'mid_guesses_avg_vows', 'mid_guesses_avg_cons', 'avg_perf_letters',
#                         'avg_wrong_pos_letters', 'avg_wrong_letters', 'avg_remaining', 'avg_intermediate_guess_entropy',
#                         'valid_success']]

#     # print(excepts)
#     if show_shapes == True:
#         print(df_master.shape) # check shape before deleting dups

#     # Delete duplicate rows (some created by process)
#     df_master.drop_duplicates(inplace = True)
    
#     if to_csv == True:
#         df_master.to_csv('compared_data/players_compared.csv') # write new data to csv
    
#     if show_shapes == True:
#         print(df_master.shape) # check shape after deleting dups
    
#     return df_master.reset_index(drop = True)

In [22]:
# test_word = "test 1 "
# test_word.strip().lower()

In [23]:
# df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")

# df_master = create_compared_df(df, to_csv = True, show_shapes = True)
# df_master

In [24]:
# print(df_master.query("player == 'aidan'")['num_guesses'].mean())
# print(df_master.query("player == 'aidan'").shape)
# df_master.query("player == 'aidan'").head()

In [25]:
# print(df_master.query("player == 'dad'")['num_guesses'].mean())
# print(df_master.query("player == 'dad'").shape)
# df_master.query("player == 'dad'").head()

In [26]:
# print(df_master.query("player == 'diane'")['num_guesses'].mean())
# print(df_master.query("player == 'diane'").shape)
# df_master.query("player == 'diane'").head()

In [27]:
# print(df_master.query("player == 'wizard'")['num_guesses'].mean())
# print(df_master.query("player == 'wizard'").shape)
# df_master.query("player == 'wizard'").head(40)

## Prefix/Suffix bias

In [28]:
test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "later", target = "jolly",
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = False)

-----------------------------

Guess 1: 'later'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('l', 0)]

Letters to guess again:
	['l']

Letters to not guess again:
	['a', 'e', 'r', 't']

At this point:
	2192, 94.93% of total words have been eliminated, and
	117, 5.07% of total words remain possible.

Next guess:
	'solid'

-----------------------------

Guess 2: 'solid'
Letters in correct positions:
	[('o', 1), ('l', 2)]

Letters in incorrect positions:
	[('l', 0)]

Letters to guess again:
	['l', 'o']

Letters to not guess again:
	['a', 'd', 'e', 'i', 'r', 's', 't']

At this point:
	2303, 99.74% of total words have been eliminated, and
	6, 0.26% of total words remain possible.

Next guess:
	'colon'

-----------------------------

Guess 3: 'colon'
Letters in correct positions:
	[('o', 1), ('l', 2)]

Letters in incorrect positions:
	[('l', 0), ('o', 3)]

Letters to guess again:
	['l', 'o']

Letters to not guess again:
	['a', 'c', 'd', 'e', 'i', 'n', 'r', 's', 't']


In [29]:
suffix_freq_dist = {}
prefix_freq_dist = {}

for word in official_words:
    prefix = word[:2] # first 2 letters
    suffix = word[-2:] # last 2 letters
    if prefix not in prefix_freq_dist:
        prefix_freq_dist[prefix] = 1
    else:
        prefix_freq_dist[prefix] += 1

    if suffix not in suffix_freq_dist:
        suffix_freq_dist[suffix] = 1
    else:
        suffix_freq_dist[suffix] += 1

suffix_types = [key for key in suffix_freq_dist.keys()]
prefix_types = [key for key in prefix_freq_dist.keys()]

sorted_prefix_dist = sorted(prefix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_suffix_dist = sorted(suffix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)

print("Prefixes:")
print(len(sorted_prefix_dist))
print(sorted_prefix_dist[:10])
print("-----")
print("Suffixes:")
print(len(sorted_suffix_dist))
print(sorted_suffix_dist[:10])

Prefixes:
214
[('st', 65), ('sh', 52), ('cr', 45), ('sp', 45), ('ch', 40), ('gr', 38), ('fl', 36), ('re', 36), ('tr', 36), ('br', 35)]
-----
Suffixes:
202
[('er', 141), ('ly', 56), ('ch', 56), ('se', 52), ('al', 49), ('ck', 47), ('ty', 46), ('te', 39), ('el', 38), ('dy', 38)]


In [30]:
grams_freq_dist = {}
gram_len = 3

for word in official_words:
    for i in range(0, len(word) - (gram_len - 1)): # so it doesn't index out of range
        gram = word[i:i + gram_len]

        if gram not in grams_freq_dist:
            grams_freq_dist[gram] = 1
        else:
            grams_freq_dist[gram] += 1

print(len(grams_freq_dist))
sorted_gram_dist = sorted(grams_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_gram_dist[:15]

2197


[('ing', 31),
 ('lly', 22),
 ('ove', 21),
 ('ver', 21),
 ('sta', 21),
 ('ast', 20),
 ('lea', 19),
 ('ter', 19),
 ('tch', 19),
 ('sha', 18),
 ('ine', 18),
 ('ate', 18),
 ('sto', 18),
 ('ide', 18),
 ('out', 18)]

## Evaluating Every Permutation of Guesses and Target Words

In [31]:
# ### keeping track of completed start words
# start_words_done = set()
# with open("all_permutations/start_words_done.txt", "r", encoding = "utf-8") as f:
#     for word in f.read().split("\n"):
#         if len (word) > 0:
#             start_words_done.add(word)
# f.close() # closes connection to file

# ### keeping track of completed target words
# targ_words_done = set()
# with open("all_permutations/targ_words_done.txt", "r", encoding = "utf-8") as f:
#     for word in f.read().split("\n"):
#         if len (word) > 0:
#             targ_words_done.add(word)
# f.close() # closes connection to file

# print("-----")
# print(len(start_words_done))
# print(list(targ_words_done)[:10])
# print("-----")
# print(len(targ_words_done))
# print(list(targ_words_done)[:10])
# print("-----")

In [32]:
official_words[0:2]

['wince', 'thyme']

In [33]:
# import glob

# last_complete_words = []

# for filepath in (glob.glob("all_permutations/*")):
#     print (filepath)
#     filename = (filepath.split("/")[1])
#     if ".csv" in filename:
#         last_word_done = filename.split("_")[3][:-4]
#         if last_word_done in official_words:
#             last_complete_words.append(last_word_done)

# last_complete_words

# most_recent_word_indices = []

# for word in last_complete_words:
#     most_recent_word_indices.append(official_words.index(word))

# new_first_start_word = max(most_recent_word_indices) + 1 # adding 1 as to not do the same word as the previous last word
# new_first_start_word

# prev_df = pd.read_csv(f"all_permutations/sims_df_until_{official_words[new_first_start_word - 1]}.csv", index_col = [0])

# prev_df.head()

In [34]:
official_words[:10]

['wince',
 'thyme',
 'mower',
 'horde',
 'heard',
 'tenor',
 'zonal',
 'parry',
 'shied',
 'fizzy']

In [35]:
# excepts = [] # keeping track of word combinations that don't work for some reason

# stats_master = {}

# batch = 10

# new_first_start_word = 0 # TEMPORARY

# for start_word in official_words[new_first_start_word : new_first_start_word + batch]:
#     # if start_word not in start_words_done:

#     for target_word in official_words[:2]:

#         ## only run the combination if it hasn't been done already
#         try:
#             complete = wordle_wizard(word_list = official_words, max_guesses = 15, 
#                 guess = start_word, target = target_word,
#                 random_guess = False, random_target = False, 
#                     verbose = False, drama = 0, return_stats = True)
        
#             # start_words_done.add(start_word) # add start word to completed starts list
#             # targ_words_done.add(target_word) # add target word to completed targets list
            
#         except:
#             IndexError
#             excepts.append((complete["first_guess"], complete["target_word"]))
            
#         for metric, result in complete.items():
#             if metric in stats_master.keys():
#                 stats_master[metric].append(result)
#             else:
#                 stats_master[metric] = []

# ### df creation and csv writing 
# sims_df = pd.DataFrame(stats_master)
# sims_df.to_csv(f'all_permutations/sims_df_until_{completed_words[-1]}.csv')

# print(f"{len(sims_df)} iterations run. {len(excepts)} combinations excepted.")
# print(excepts[:10])

# print(sims_df.shape)
# # sims_df.head()
# sims_df.head(60)

## wordle_wizard() 2.0

In [1]:
def get_possible_guesses(word_list: list, perf_letters: list, incorr_pos: list, wrong_letters: list):
    """
    This function takes a single Wordle guess and feedback returned by the game about that guess, and returns a list of possible remaining guesses.

    Parameters:
    -----
    `word_list`: list
        list of all possible words immediately prior to this stage's guess
    `perf_letters_letters`: list
        list of tuples, where the structure of each tuple is: ("correct_letter", int of letter position). Example listed below
    `incorr_pos`: list
        list of tuples, same structure as `perf_letters_letters`, but with letters and their incorrect positions
    `wrong_letters`: list
        list of individual letters that are not in the target word whatsoever

    Returns:
    -----
    `potentials_list`: list
        list of words that remain after eliminating all words based on the information provided

    Examples of inputs:
    -----
    word_list = official_words # something like ['wince', 'thyme', 'mower', 'horde', 'heard', 'tenor', 'zonal', 'parry', 'shied', 'fizzy']
    
    perf_letters = [("r", 2)] # could be any number of tuples
    
    incorr_pos = [("t", 2), ("r", 4)] # could be any number of tuples
    
    wrong_letters_letters = ["l", "a", "e"] # could be any number of items
    """
    
    incorr_words = set() # set of all words that the target could not possibly be the target
    for word in word_list:
        if len(incorr_pos) > 0: # sometimes there are none
            for incorr_letter, pos in incorr_pos: # adding words that have words of incorr letter positions (but not words that have these letters altogether - they could just be in a different spot)
                if word[pos] == incorr_letter:
                    incorr_words.add(word)
        if len(wrong_letters) > 0: # sometimes there are none
            for bad_letter in wrong_letters: # adding words that have completely wrong letters in them
                if bad_letter in word:
                    incorr_words.add(word)

    intermediate_list = set(word_list).difference(incorr_words) # difference between all impossible words and entire original passed word_list
    # print(intermediate_list)
    potentials_list = set()

    for word in intermediate_list:
        if len(perf_letters) > 0: # sometimes there are none
            good_letters = []
            for letter, pos in perf_letters:
                if word[pos] == letter:
                    good_letters.append(letter)
                if len(good_letters) == len(perf_letters):
                    potentials_list.add(word)
                else:
                    pass # skip to the next word
    
    if len(potentials_list) > 0:
        potentials_list = list(potentials_list)
    else:
        potentials_list = list(intermediate_list)

    return list(potentials_list)

Wordle

In [2]:
word_list = official_words

guess = "later"
if guess not in word_list and len(guess) == 5: word_list.append(guess)
target = "tests"
if target not in word_list and len(target) == 5: word_list.append(target)
wordlen = len(guess)
letter_positions = set(i for i in range(0, wordlen))
guess_num = 0
guessed_words = []
perf_letters = []
incorr_pos = []
wrong_letters = []

while guess:
    
    # this is redefined for each guess - makes everything have to iterate through zero redundant data in each of these lists at each guess stage
    guessed_words.append(guess)

    print(guess, target)
    if guess == target:
        break

    #### Step 2 -- ALL PERFECT
    for i in letter_positions: # number of letters in each word (current word and target word)

        if guess[i] == target[i]:
            if (guess[i], i) not in perf_letters:
                perf_letters.append((guess[i], i))
        elif guess[i] != target[i] and guess[i] in target:
            if (guess[i], i) not in incorr_pos:
                incorr_pos.append((guess[i], i))
        elif guess[i] not in target:
            if (guess[i]) not in wrong_letters:
                wrong_letters.append(guess[i])

    perf_letters = sorted(perf_letters, key = operator.itemgetter(1), reverse = False)
    incorr_pos = sorted(incorr_pos, key = operator.itemgetter(1), reverse = False)
    wrong_letters = sorted(wrong_letters, key = operator.itemgetter(0), reverse = False)
    print("------------------")
    potential_next_words = get_possible_guesses(word_list = word_list, perf_letters = perf_letters, incorr_pos = incorr_pos, wrong_letters = wrong_letters)
    # print("------------------")
    # potential_next_words = [word for word in potential_next_words if word not in guessed_words] # excludes already guessed words, or else function runs infinitely
    print(len(potential_next_words))
    guess_ratings = get_word_meaning(words_to_rate = potential_next_words, word_list = word_list, normalized = False, ascending = False)
    if len(guess_ratings) < 20:
        print(guess_ratings)
    guess = guess_ratings[0][0]

    # set(potential_next_words).difference(set(official_words))
    if guess == target:
        print (guess, target)
        break

NameError: name 'official_words' is not defined

In [None]:
asdf

## Presentation Prep

## Plots

In [None]:
import plotly.express as px
import altair as alt

### Plotting Letter Count Distribution

In [None]:
letter_counts = get_letter_counts(official_words)
letter_counts_dict = {} # {letter : count}
letter_counts_dict["Letters"] = []
letter_counts_dict["Counts"] = []
letter_counts_dict["Vowel"] = []
for tup in letter_counts:
    letter_counts_dict["Letters"].append(tup[0].upper())
    letter_counts_dict["Counts"].append(tup[1])
    if tup[0] in "aeiouy":
        letter_counts_dict["Vowel"].append("Yes")
    else:
        letter_counts_dict["Vowel"].append("No")

letters_dist_df = pd.DataFrame(letter_counts_dict)
letters_dist_df

counts_plot = px.bar(letters_dist_df, x = "Letters", y = "Counts", title = "Distribution of Letters in Official Wordle List",
                    color = "Vowel", color_discrete_map = {"Yes": "#6ca965", "No": "#c8b653"})
counts_plot.update_layout(xaxis = {'categoryorder' : 'total descending'}, title_font_size = 22)

counts_plot.show()

### Sum of Words Unique Letter Frequencies (best 15 and worst 15)

In [None]:
total_letters_sum = sum(count for letter, count in letter_counts) 
assert total_letters_sum == len(official_words) * 5 # same as 2310 * 5 -- makes sense
print(total_letters_sum)

word_counts = []

for word in official_words:
    
    # get set of all letters in the word (this intentionally doesn't count duplicate letters)
    word_letters = set()
    for letter in word:
        word_letters.add(letter)
    
    # get the sum of all counts of each letter in the word
    word_sum = 0
    for letter in word_letters:
        word_sum += dict(letter_counts)[letter]

    # finally, add the word and its letter count sum to the list    
    word_counts.append((word, word_sum))

### Best and worst x words
words_counts_top_10 = sorted(word_counts, key = operator.itemgetter(1), reverse = True)[:10] # top 10 words
words_counts_middle_10 = sorted(word_counts, key = operator.itemgetter(1), reverse = True)[(len(word_counts) // 2) - 5 : (len(word_counts) // 2) + 5] # top 10 words
words_counts_bottom_10 = sorted(word_counts, key = operator.itemgetter(1), reverse = False)[:10] # bottom 10 words
words_counts_x_dict = {}
words_counts_x_dict["Words"] = []
words_counts_x_dict["Sum of Unique Letter Frequencies"] = []
for word, rating in words_counts_top_10:
    words_counts_x_dict["Words"].append(word)
    words_counts_x_dict["Sum of Unique Letter Frequencies"].append(rating)
for word, rating in words_counts_middle_10:
    words_counts_x_dict["Words"].append(word)
    words_counts_x_dict["Sum of Unique Letter Frequencies"].append(rating)
for word, rating in words_counts_bottom_10:
    words_counts_x_dict["Words"].append(word)
    words_counts_x_dict["Sum of Unique Letter Frequencies"].append(rating)

words_counts_x_df = pd.DataFrame(words_counts_x_dict)
words_counts_x_plot = px.bar(words_counts_x_df, x = "Words", y = "Sum of Unique Letter Frequencies", title = "Sum of Words' Unique Letter Frequencies")                            #   color_discrete_map = {"Yes": "#6ca965", "No": "#c8b653"})
                            
words_counts_x_plot.update_layout(xaxis = {'categoryorder' : 'total descending'}, title_font_size = 22)
words_counts_x_plot.update_traces(marker_color = "#6ca965")

words_counts_x_plot.show()

11550


## Testing Best and Worst Words Against all Wordle Words

In [None]:
excepts = [] # keeping track of word combinations that don't work for some reason

stats_master = {}

for start_word in ["later", "fuzzy"]:

    for target_word in official_words:

        ## only run the combination if it hasn't been done already
        try:
            complete = wordle_wizard(word_list = official_words, max_guesses = 15, 
                guess = start_word, target = target_word,
                random_guess = False, random_target = False, 
                    verbose = False, drama = 0, return_stats = True)

        except:
            IndexError
            excepts.append((complete["first_guess"], complete["target_word"]))
            
        for metric, result in complete.items():
            if metric in stats_master.keys():
                stats_master[metric].append(result)
            else:
                stats_master[metric] = []

### df creation and csv writing 
sims_df = pd.DataFrame(stats_master)

print(f"{len(sims_df)} iterations run. {len(excepts)} combinations excepted.")
print(excepts[:10])

print(sims_df.shape)
print(sims_df['first_guess'].unique().tolist())
sims_df.head()


Mean of empty slice.


invalid value encountered in scalar divide



4619 iterations run. 0 combinations excepted.
[]
(4619, 19)
['later', 'fuzzy']


Unnamed: 0,first_guess,target_word,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,first_guess_meaning,target_meaning,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_meaning,luck,valid_success,num_guesses
0,later,thyme,2.0,3.0,2.0,3.0,100.0,56.91,True,2.0,3.0,3.0,8.0,18.0,21.33,100.0,0.65,True,4.0
1,later,mower,2.0,3.0,2.0,3.0,100.0,65.84,True,2.0,3.0,11.0,1.0,23.0,25.75,100.0,0.57,True,5.0
2,later,horde,2.0,3.0,2.0,3.0,100.0,73.81,True,2.0,3.0,1.0,6.0,8.0,52.5,100.0,0.74,True,3.0
3,later,heard,2.0,3.0,2.0,3.0,100.0,80.35,True,2.5,2.5,3.0,15.0,12.0,20.33,100.0,0.65,True,4.0
4,later,tenor,2.0,3.0,2.0,3.0,100.0,89.24,True,2.0,3.0,1.0,2.0,2.0,2.0,100.0,0.83,True,2.0


In [None]:
later_guesses = sims_df.query("first_guess == 'later'")[['first_guess', 'num_guesses']]
sims_df.query("first_guess == 'later'")['num_guesses'].mean() # later : ~3.81 avg guesses

for word in sims_df["first_guess"].unique().tolist():
    mean_guesses = round(sims_df.query(f"first_guess == '{word}'")['num_guesses'].mean(), 2) # later : ~3.81 avg guesses, fuzzy : ~4.36 avg guesses
    word_df = sims_df.query(f"first_guess == '{word}'")['num_guesses']
    
    word_df_dist_plot = px.histogram(word_df, x = "num_guesses", title = f"Distribution of Guesses with '{word}' as Starting Word",
                                        labels = {"num_guesses": "Number of Guesses"})
    word_df_dist_plot.add_vline(x = mean_guesses, line_width = 4, line_dash = "dot", line_color = "black",
                                    annotation_text = f"Mean = {mean_guesses}", annotation_font_size = 20,
                                    annotation_font_color = "black", annotation_position = "right")
    word_df_dist_plot.update_layout(title_font_size = 22)
    word_df_dist_plot.update_traces(marker_color = "#6ca965")

    word_df_dist_plot.show()