## Imports

In [1]:
import numpy as np
import random
import operator
import time
import pandas as pd
from wordle_functions import *

## Importing datasets

### official words
- official wordle word list

In [2]:
### Official list
official_words = []

with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        official_words.append(word)

f.close() # closes connection to file

print(len(official_words))
official_words[:5]

2310


['wince', 'thyme', 'mower', 'horde', 'heard']

### alternative list 1
- an alternate list of 5-letter words found on the web

In [3]:
### Official list
alt_words_1 = []

with open("data/alt_words_1.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        alt_words_1.append(word)

f.close() # closes connection to file

print(len(alt_words_1))
alt_words_1[:5]

14856


['rossa', 'jetty', 'wizzo', 'cuppa', 'cohoe']

### nltk grand corpus
- Amalgamation of all words in various NLTK corpora to have as big a dataset as possible
- Developed manually

In [4]:
### grand corpus tokens
nltk_tokens = []

with open("data/nltk_grand_corpus_tokens_5.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        nltk_tokens.append(word)

f.close() # closes connection to file

print(len(nltk_tokens))
nltk_tokens[:5]

535189


['years', 'board', 'dutch', 'group', 'agnew']

### nltk grand corpus types and counts

In [5]:
### grand corpus types and counts
nltk_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_5.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        if len(line.split("\t")) == 2:
            word = line.split("\t")[0]
            count = line.split("\t")[1]
            nltk_counts[word] = count
        else:
            continue

f.close() # closes connection to file

print(len(nltk_counts))
nltk_counts['which']

8043


'15760'

In [6]:
### Official list
official_words = []

with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        if len(word) > 0: # there's one blank entry at the start
            official_words.append(word)

f.close() # closes connection to file

print(len(official_words))
official_words[:10]

2309


['wince',
 'thyme',
 'mower',
 'horde',
 'heard',
 'tenor',
 'zonal',
 'parry',
 'shied',
 'fizzy']

## Wordle functions + Testing

### Testing `wordle_wizard()`

In [145]:
test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "paint", target = "force",
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'paint'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[]

Letters to guess again:
	[]

Letters to not guess again:
	['a', 'i', 'n', 'p', 't']

At this point:
	1933, 83.07% of total words have been eliminated, and
	394, 16.93% of total words remain possible.

The top 40 potential next guesses are:
	[('loser', 100.0), ('score', 92.28), ('rouse', 92.02), ('older', 90.94), ('shore', 89.42), ('horse', 89.42), ('close', 86.49), ('louse', 86.23), ('sober', 85.97), ('lower', 84.56), ('ulcer', 84.41), ('cruel', 84.41), ('lover', 83.22), ('swore', 83.22), ('sower', 83.22), ('worse', 83.22), ('decor', 83.18), ('credo', 83.18), ('curse', 83.07), ('chore', 83.0), ('verso', 81.85), ('horde', 80.33), ('rogue', 80.25), ('rouge', 80.25), ('usher', 80.25), ('lemur', 79.32), ('gruel', 79.06), ('bluer', 78.14), ('serum', 77.99), ('homer', 77.91), ('force', 77.91), ('surge', 77.73), ('cower', 76.8), ('rebus', 76.8), ('foyer', 76.21), ('clerk', 7

In [8]:
for val in [False, True]:
        wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "arose", target = "syrup", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'arose'
Guess 2: 'shirt'
Guess 3: 'surly'
Guess 4: 'syrup'

Congratulations! The Wordle has been solved in 4 guesses!
There were still 2 guesses remaining.

The target word was 'syrup'.

-----------------------------
-----------------------------

Guess 1: 'arose'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('r', 1), ('s', 3)]

Letters to guess again:
	['r', 's']

Letters to not guess again:
	['a', 'e', 'o']

At this point:
	2288, 99.09% of total words have been eliminated, and
	21, 0.91% of total words remain possible.

All potential next guesses:
	[('shirt', 100.0), ('strip', 97.58), ('rusty', 82.49), ('skirt', 81.4), ('surly', 81.16), ('swirl', 78.38), ('spurt', 76.33), ('slurp', 75.0), ('spurn', 60.02), ('sprig', 53.86), ('risky', 49.52), ('virus', 47.83), ('shirk', 45.65), ('scrum', 44.57), ('syrup', 44.44), ('scrub', 40.82), ('smirk', 38.29), ('strut', 38.16), ('shrug', 34.78), ('shrub', 31.64), ('usurp', 0.0)]

Wor

### Testing on 3-letter words

In [9]:
### 3 letters
words_3_letters = []
words_3_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_3.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_3_letters.append(word)
                words_3_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_3_letters))
print(words_3_letters[:5])
words_3_types_counts['the']

1531
['the', 'and', 'for', 'his', 'was']


'286732'

In [10]:
for val in [False, True]:
        wordle_wizard(word_list = words_3_letters, max_guesses = 6, 
                guess = "the", target = "his", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'the'
Guess 2: 'hoa'
Guess 3: 'his'

Congratulations! The Wordle has been solved in 3 guesses!
There were still 3 guesses remaining.

The target word was 'his'.

-----------------------------
-----------------------------

Guess 1: 'the'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('h', 1)]

Letters to guess again:
	['h']

Letters to not guess again:
	['e', 't']

At this point:
	1447, 94.51% of total words have been eliminated, and
	84, 5.49% of total words remain possible.

The top 40 potential next guesses are:
	[('hoa', 100.0), ('hai', 93.36), ('ash', 80.38), ('has', 80.38), ('han', 77.56), ('nah', 77.56), ('anh', 77.56), ('hua', 77.56), ('har', 76.41), ('mah', 73.89), ('ham', 73.89), ('hal', 72.06), ('pah', 71.6), ('hap', 71.6), ('dah', 70.92), ('had', 70.92), ('bah', 68.24), ('hab', 68.24), ('agh', 64.66), ('hag', 64.66), ('hos', 64.43), ('wah', 63.13), ('haw', 63.13), ('hay', 62.98), ('hon', 61.6), ('hou', 61.6), ('

### Testing on 4-letter words

In [11]:
### 3 letters
words_4_letters = []
words_4_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_4.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_4_letters.append(word)
                words_4_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_4_letters))
print(words_4_letters[:5])
words_4_types_counts['that']

4266
['that', 'with', 'this', 'they', 'have']


'57994'

In [12]:
for val in [False, True]:
        wordle_wizard(word_list = words_4_letters, max_guesses = 6, 
                guess = "have", target = "this", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'have'
Guess 2: 'rosh'
Guess 3: 'shin'
Guess 4: 'this'

Congratulations! The Wordle has been solved in 4 guesses!
There were still 2 guesses remaining.

The target word was 'this'.

-----------------------------
-----------------------------

Guess 1: 'have'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('h', 0)]

Letters to guess again:
	['h']

Letters to not guess again:
	['a', 'e', 'v']

At this point:
	4101, 96.13% of total words have been eliminated, and
	165, 3.87% of total words remain possible.

The top 40 potential next guesses are:
	[('shor', 100.0), ('rosh', 100.0), ('shon', 97.4), ('sohn', 97.4), ('soth', 95.92), ('shot', 95.92), ('iohn', 93.08), ('itoh', 91.54), ('shin', 88.41), ('shou', 88.29), ('orth', 88.11), ('thor', 88.11), ('roth', 88.11), ('thro', 88.11), ('shod', 87.94), ('sith', 86.87), ('this', 86.87), ('shit', 86.87), ('tish', 86.87), ('loth', 86.58), ('mosh', 86.34), ('thon', 85.51), ('posh', 83.38)

### Testing on 6-letter words

In [13]:
### 6 letters
words_6_letters = []
words_6_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_6.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_6_letters.append(word)
                words_6_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_6_letters))
print(words_6_letters[:5])
words_6_types_counts[words_6_letters[0]]

11290
['little', 'before', 'people', 'should', 'things']


'5543'

In [14]:
for val in [False, True]:
        wordle_wizard(word_list = words_6_letters, max_guesses = 6, 
                guess = "little", target = "before", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'little'
Guess 2: 'sarone'
Guess 3: 'upmore'
Guess 4: 'decore'
Guess 5: 'before'

Congratulations! The Wordle has been solved in 5 guesses!
There were still 1 guesses remaining.

The target word was 'before'.

-----------------------------
-----------------------------

Guess 1: 'little'
Letters in correct positions:
	[('e', 5)]

Letters in incorrect positions:
	[]

Letters to guess again:
	['e']

Letters to not guess again:
	['i', 'l', 't']

At this point:
	10880, 96.37% of total words have been eliminated, and
	410, 3.63% of total words remain possible.

The top 40 potential next guesses are:
	[('sarone', 100.0), ('arouse', 92.3), ('hoarse', 91.81), ('ashore', 91.81), ('coarse', 91.12), ('romane', 86.05), ('orange', 84.29), ('groane', 84.29), ('organe', 84.29), ('scorne', 82.78), ('sundae', 82.69), ('sharpe', 81.14), ('phrase', 81.14), ('scrape', 80.44), ('dosage', 79.56), ('grande', 79.02), ('coahse', 78.65), ('drouse', 78.56), ('graspe', 78.5

### Testing on 7-letter words

In [15]:
### 7 letters
words_7_letters = []
words_7_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_7.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_7_letters.append(word)
                words_7_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_7_letters))
print(words_7_letters[:5])
words_7_types_counts[words_7_letters[0]]

12566
['because', 'through', 'against', 'another', 'himself']


'4809'

In [16]:
for val in [False, True]:
        wordle_wizard(word_list = words_7_letters, max_guesses = 6, 
                guess = "because", target = "through", bias = 'entropy', 
                random_guess = True, random_target = True, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'hassled'
Guess 2: 'cretins'
Guess 3: 'bootsie'
Guess 4: 'smitest'

Congratulations! The Wordle has been solved in 4 guesses!
There were still 2 guesses remaining.

The target word was 'smitest'.

-----------------------------
-----------------------------

Guess 1: 'buggers'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('e', 4), ('s', 6)]

Letters to guess again:
	['e', 's']

Letters to not guess again:
	['b', 'g', 'r', 'u']

At this point:
	12044, 95.85% of total words have been eliminated, and
	522, 4.15% of total words remain possible.

The top 40 potential next guesses are:
	[('isolate', 100.0), ('stained', 98.33), ('sainted', 98.33), ('sodaine', 97.07), ('seminal', 92.42), ('elastic', 92.23), ('slanted', 91.66), ('spaniel', 91.36), ('incased', 89.34), ('elysian', 87.36), ('ainsley', 87.36), ('despina', 85.73), ('salmone', 85.34), ('anselmo', 85.34), ('section', 84.93), ('evasion', 84.66), ('espanol', 84.28), ('aimeds

### Testing on 8-letter words

In [17]:
### 8 letters
words_8_letters = []
words_8_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_8.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_8_letters.append(word)
                words_8_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_8_letters))
print(words_8_letters[:5])
words_8_types_counts[words_8_letters[0]]

11650
['children', 'together', 'director', 'anything', 'american']


'3012'

In [18]:
for val in [False, True]:
        wordle_wizard(word_list = words_8_letters, max_guesses = 6, 
                guess = "trinidad", target = "together", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'trinidad'
Guess 2: 'tumblers'
Guess 3: 'together'

Congratulations! The Wordle has been solved in 3 guesses!
There were still 3 guesses remaining.

The target word was 'together'.

-----------------------------
-----------------------------

Guess 1: 'trinidad'
Letters in correct positions:
	[('t', 0)]

Letters in incorrect positions:
	[('r', 1)]

Letters to guess again:
	['r', 't']

Letters to not guess again:
	['a', 'd', 'i', 'n']

At this point:
	11624, 99.78% of total words have been eliminated, and
	26, 0.22% of total words remain possible.

All potential next guesses:
	[('tumbrels', 100.0), ('tumblers', 100.0), ('theorems', 91.84), ('throuble', 91.84), ('torquers', 82.1), ('tortures', 81.36), ('terrours', 81.36), ('throttle', 70.07), ('teleport', 67.5), ('threwest', 61.5), ('thereout', 61.5), ('together', 60.07), ('textures', 58.97), ('therfore', 53.47), ('therefor', 53.47), ('tweezers', 49.92), ('torturer', 48.69), ('tourette', 48.69), ('

### Testing on 9-letter words

In [19]:
### 9 letters
words_9_letters = []
words_9_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_9.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_9_letters.append(word)
                words_9_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_9_letters))
print(words_9_letters[:5])
words_9_types_counts[words_9_letters[0]]

9716
['something', 'character', 'therefore', 'according', 'different']


'2621'

In [20]:
for val in [False, True]:
        wordle_wizard(word_list = words_9_letters, max_guesses = 6, 
                guess = "something", target = "character", bias = 'entropy', 
                random_guess = True, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'shiphmite'
Guess 2: 'charleton'
Guess 3: 'characted'
Guess 4: 'character'

Congratulations! The Wordle has been solved in 4 guesses!
There were still 2 guesses remaining.

The target word was 'character'.

-----------------------------
-----------------------------

Guess 1: 'crevillen'
Letters in correct positions:
	[('c', 0), ('e', 7)]

Letters in incorrect positions:
	[('r', 1), ('e', 2)]

Letters to guess again:
	['c', 'e', 'r']

Letters to not guess again:
	['i', 'l', 'n', 'v']

At this point:
	9694, 99.77% of total words have been eliminated, and
	22, 0.23% of total words remain possible.

All potential next guesses:
	[('castrated', 100.0), ('curtseyed', 78.61), ('corrupted', 71.73), ('chuckster', 68.98), ('comported', 67.61), ('characted', 64.92), ('chartered', 64.92), ('chattered', 64.92), ('comforted', 59.35), ('camcorder', 57.63), ('chambered', 50.89), ('corrupter', 44.43), ('chatterer', 37.62), ('character', 37.62), ('charecter', 37.6

### Testing on 10-letter words

In [21]:
### 10 letters
words_10_letters = []
words_10_types_counts = {}

with open("data/nltk_grand_corpus_types_and_counts_10.txt", "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        word_freq = line.split("\t")
        if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line
            word = word_freq[0]
            freq = word_freq[1]
            if word.isascii() == True:
                words_10_letters.append(word)
                words_10_types_counts[word] = freq

f.close() # closes connection to file

print(len(words_10_letters))
print(words_10_letters[:5])
words_10_types_counts[words_10_letters[0]]

7200
['characters', 'themselves', 'everything', 'especially', 'understand']


'1929'

In [22]:
for val in [False, True]:
        wordle_wizard(word_list = words_10_letters, max_guesses = 6, 
                guess = "characters", target = "theologies", bias = 'entropy', 
                random_guess = True, random_target = False, 
                verbose = val, drama = 0, return_stats = False, record = True)

-----------------------------

Guess 1: 'woodhouses'
Guess 2: 'theologies'

Congratulations! The Wordle has been solved in 2 guesses!
There were still 4 guesses remaining.

The target word was 'theologies'.

-----------------------------
-----------------------------

Guess 1: 'projecting'
Letters in correct positions:
	[('i', 7)]

Letters in incorrect positions:
	[('o', 2), ('e', 4), ('t', 6), ('g', 9)]

Letters to guess again:
	['e', 'g', 'i', 'o', 't']

Letters to not guess again:
	['c', 'j', 'n', 'p', 'r']

At this point:
	7197, 99.96% of total words have been eliminated, and
	3, 0.04% of total words remain possible.

All potential next guesses:
	[('ideologist', 100.0), ('theologies', 49.73), ('volksgeist', 0.0)]

Words guessed so far:
	['projecting'].

Next guess:
	'ideologist'

-----------------------------

Guess 2: 'ideologist'
Letters in correct positions:
	[('e', 2), ('o', 3), ('l', 4), ('o', 5), ('g', 6), ('i', 7)]

Letters in incorrect positions:
	[('i', 0), ('o', 2), ('e',

In [23]:
wordle_wizard(word_list = official_words, max_guesses = 5, 
                guess = "quote", target = "silly", bias = 'entropy', 
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = False)

-----------------------------

Guess 1: 'quote'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[]

Letters to guess again:
	[]

Letters to not guess again:
	['e', 'o', 'q', 't', 'u']

At this point:
	1931, 83.63% of total words have been eliminated, and
	378, 16.37% of total words remain possible.

The top 40 potential next guesses are:
	[('snarl', 100.0), ('slain', 91.3), ('snail', 91.3), ('cairn', 90.86), ('grail', 90.02), ('rainy', 88.87), ('nadir', 87.73), ('drain', 87.73), ('frail', 86.89), ('flair', 86.89), ('scary', 85.08), ('grain', 84.55), ('acrid', 83.97), ('rival', 83.97), ('viral', 83.97), ('chair', 83.71), ('crash', 83.66), ('brain', 83.4), ('scrap', 82.83), ('dairy', 81.99), ('inlay', 81.99), ('diary', 81.99), ('hairy', 81.77), ('ralph', 81.28), ('scram', 80.93), ('raspy', 80.84), ('spray', 80.84), ('cigar', 80.79), ('shard', 80.53), ('basil', 80.09), ('ranch', 80.0), ('rapid', 79.74), ('plain', 79.74), ('sharp', 79.43), ('randy', 78.28), ('scaly', 78

### `compare_wordle()` testing

In [24]:
df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")
print(df.shape)
df

(39, 8)


Unnamed: 0,player,target,first_guess,second_guess,third_guess,fourth_guess,fifth_guess,sixth_guess
0,diane,vague,arose,plate,cache,mauve,vague,none
1,aidan,apple,douce,lairy,slave,algae,apple,none
2,aidan,ninth,douce,lairy,gimps,ninth,none,none
3,aidan,flail,douce,lairy,snail,flail,none,none
4,aidan,stage,douce,lairy,phase,stage,none,none
5,aidan,heady,douce,lairy,ready,beady,heady,none
6,aidan,none,douce,dairy,dunes,debug,none,none
7,aidan,usage,douce,lairy,aunts,swamp,usage,none
8,aidan,sound,douce,pound,found,bound,mound,sound
9,aidan,salsa,douce,lairy,palms,salts,ghost,salsa


In [25]:
df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")
df
convert_row(df, 37)

('dad', 'toxic', ['audio', 'choir', 'toxic'])

In [26]:
### TESTING DF INTERPRETATION

df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")

row = 37

print(convert_row(df, row))
player = convert_row(df, row)[0]
target_word = convert_row(df, row)[1]
guess_list = convert_row(df, row)[2]

compare_wordle(word_list = official_words, max_guesses = 6, 
                    guess_list = guess_list, player = player, target = target_word,
                    verbose = False, return_stats = True, record = False)

('dad', 'toxic', ['audio', 'choir', 'toxic'])


{'first_guess': ['audio', 'audio'],
 'target_word': ['toxic', 'toxic'],
 'first_guess_vowels': [4.0, 4.0],
 'first_guess_consonants': [1.0, 1.0],
 'target_vowels': [2.0, 2.0],
 'target_consonants': [3.0, 3.0],
 'first_guess_entropy': [61.91, 61.91],
 'target_entropy': [44.4, 44.4],
 'target_guessed': [True, True],
 'mid_guesses_avg_vows': [2.67, 2.33],
 'mid_guesses_avg_cons': [2.33, 2.67],
 'avg_perf_letters': [2.0, 12.0],
 'avg_wrong_pos_letters': [4.0, 19.0],
 'avg_wrong_letters': [8.0, 29.0],
 'avg_remaining': [14.5, 6.8],
 'avg_intermediate_guess_entropy': [87.3, 93.65],
 'valid_success': [True, True],
 'player': ['dad', 'wizard'],
 'num_guesses': [3.0, 6.0],
 'expected_guesses': [6.0, 6.0],
 'luck': [0.56, 0]}

## Comparing player solutions against wizard solutions

In [27]:
def create_compared_df(player_df, to_csv: bool = False, show_shapes: bool = False):
    """
    Creates master df of player wordle scores compared to how wordle_wizard would perform on the same puzzles

    Parameters:
    -----
    `player_df`: Pandas dataFrame object
        df of player scores of wordle puzzles
    `to_csv`: bool
        If True, writes returned df to csv
    `show_shapes`: bool
        If True, prints shape of new df before and after deleting duplicate rows (created by wordle_wizard running the same puzzles multiple times)
    
    Returns:
    -----
    `df_master`: Pandas dataFrame object
        df of player scores and wordle_wizard scores of wordle puzzles
    """

    stats_master = {}
    excepts = []
    for row in player_df.index:
        player = convert_row(player_df, row)[0]
        target_word = convert_row(player_df, row)[1]
        guess_list = convert_row(player_df, row)[2]
        try:
            complete = compare_wordle(word_list = official_words, max_guesses = 6, 
                        guess_list = guess_list, player = player, target = target_word,
                        verbose = True, return_stats = True, record = False)
            for metric, results in complete.items():
                if metric in stats_master:
                    for result in results:
                        stats_master[metric].append(result)
                else:
                    stats_master[metric] = []
                    for result in results:
                        stats_master[metric].append(result)
        except:
            AttributeError
            excepts.append(guess_list)

    df_master = pd.DataFrame(stats_master)
    print(df_master.columns.tolist())

    # Re-organizing columns to a more logical order (for viewing)
    df_master = df_master[['first_guess', 'target_word', 'player', 'num_guesses', 'expected_guesses', 'luck', 'first_guess_vowels', 'first_guess_consonants',
                        'target_vowels', 'target_consonants', 'first_guess_entropy', 'target_entropy',
                        'target_guessed', 'mid_guesses_avg_vows', 'mid_guesses_avg_cons', 'avg_perf_letters',
                        'avg_wrong_pos_letters', 'avg_wrong_letters', 'avg_remaining', 'avg_intermediate_guess_entropy',
                        'valid_success']]

    # print(excepts)
    if show_shapes == True:
        print(df_master.shape) # check shape before deleting dups

    # Delete duplicate rows (some created by process)
    df_master.drop_duplicates(inplace = True)
    
    if to_csv == True:
        df_master.to_csv('compared_data/players_compared.csv') # write new data to csv
    
    if show_shapes == True:
        print(df_master.shape) # check shape after deleting dups
    
    return df_master.reset_index().drop(columns = "index")

In [28]:
test_word = "test 1 "
test_word.strip().lower()

'test 1'

In [29]:
df = pd.read_csv("compared_data/wordle_humans - Sheet1.csv")

df_master = create_compared_df(df, to_csv = True, show_shapes = True)
df_master

['first_guess', 'target_word', 'first_guess_vowels', 'first_guess_consonants', 'target_vowels', 'target_consonants', 'first_guess_entropy', 'target_entropy', 'target_guessed', 'mid_guesses_avg_vows', 'mid_guesses_avg_cons', 'avg_perf_letters', 'avg_wrong_pos_letters', 'avg_wrong_letters', 'avg_remaining', 'avg_intermediate_guess_entropy', 'valid_success', 'player', 'num_guesses', 'expected_guesses', 'luck']
(72, 21)
(69, 21)


Unnamed: 0,first_guess,target_word,player,num_guesses,expected_guesses,luck,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,...,target_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,valid_success
0,arose,vague,diane,5.0,5.0,0.00,3.0,2.0,3.0,2.0,...,58.26,True,2.60,2.40,6.0,9.0,26.0,18.75,99.86,True
1,arose,vague,wizard,5.0,5.0,0.00,3.0,2.0,3.0,2.0,...,58.26,True,2.60,2.40,7.0,4.0,30.0,19.00,99.86,True
2,douce,apple,aidan,5.0,5.0,0.07,3.0,2.0,2.0,3.0,...,62.83,True,2.60,2.40,5.0,11.0,30.0,49.50,92.75,True
3,douce,apple,wizard,5.0,5.0,0.00,3.0,2.0,2.0,3.0,...,62.83,True,2.40,2.60,8.0,7.0,28.0,48.00,92.75,True
4,douce,ninth,aidan,4.0,5.0,0.27,3.0,2.0,1.0,4.0,...,35.38,True,2.00,3.00,1.0,2.0,27.0,123.00,90.92,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,later,toxic,wizard,5.0,5.0,0.00,2.0,3.0,2.0,3.0,...,44.39,True,2.00,3.00,10.0,10.0,22.0,28.25,100.00,True
65,audio,toxic,dad,3.0,6.0,0.56,4.0,1.0,2.0,3.0,...,44.39,True,2.67,2.33,2.0,4.0,8.0,14.50,87.33,True
66,audio,toxic,wizard,6.0,6.0,0.00,4.0,1.0,2.0,3.0,...,44.39,True,2.33,2.67,12.0,19.0,29.0,6.80,93.66,True
67,audio,toxic,diane,5.0,6.0,0.23,4.0,1.0,2.0,3.0,...,44.49,True,2.60,2.40,9.0,6.0,26.0,10.50,92.40,True


In [30]:
print(df_master.query("player == 'aidan'")['num_guesses'].mean())
print(df_master.query("player == 'aidan'").shape)
df_master.query("player == 'aidan'").head()

4.5
(18, 21)


Unnamed: 0,first_guess,target_word,player,num_guesses,expected_guesses,luck,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,...,target_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,valid_success
2,douce,apple,aidan,5.0,5.0,0.07,3.0,2.0,2.0,3.0,...,62.83,True,2.6,2.4,5.0,11.0,30.0,49.5,92.75,True
4,douce,ninth,aidan,4.0,5.0,0.27,3.0,2.0,1.0,4.0,...,35.38,True,2.0,3.0,1.0,2.0,27.0,123.0,90.92,True
6,douce,flail,aidan,4.0,4.0,0.09,3.0,2.0,2.0,3.0,...,42.3,True,2.5,2.5,3.0,6.0,21.0,122.33,90.92,True
8,douce,stage,aidan,4.0,4.0,0.09,3.0,2.0,2.0,3.0,...,81.29,True,2.5,2.5,4.0,3.0,22.0,65.67,90.92,True
10,douce,heady,aidan,5.0,4.0,-0.16,3.0,2.0,3.0,2.0,...,66.43,True,3.0,2.0,9.0,11.0,22.0,16.0,92.74,True


In [31]:
print(df_master.query("player == 'dad'")['num_guesses'].mean())
print(df_master.query("player == 'dad'").shape)
df_master.query("player == 'dad'").head()

3.857142857142857
(7, 21)


Unnamed: 0,first_guess,target_word,player,num_guesses,expected_guesses,luck,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,...,target_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,valid_success
38,audio,syrup,dad,4.0,3.0,-0.21,4.0,1.0,2.0,3.0,...,49.32,True,2.5,2.5,1.0,10.0,18.0,37.0,90.5,True
40,audio,worse,dad,4.0,4.0,0.09,4.0,1.0,2.0,3.0,...,76.4,True,2.5,2.5,5.0,9.0,15.0,95.33,90.5,True
45,audio,polka,dad,4.0,3.0,-0.21,4.0,1.0,2.0,3.0,...,55.0,True,2.5,2.5,2.0,11.0,17.0,32.0,90.51,True
47,audio,moose,dad,6.0,6.0,0.06,4.0,1.0,3.0,2.0,...,53.53,True,2.83,2.17,13.0,9.0,37.0,59.8,93.67,True
56,audio,squat,dad,3.0,3.0,0.13,4.0,1.0,2.0,3.0,...,50.67,True,3.0,2.0,1.0,6.0,8.0,17.0,87.34,True


In [32]:
print(df_master.query("player == 'diane'")['num_guesses'].mean())
print(df_master.query("player == 'diane'").shape)
df_master.query("player == 'diane'").head()

3.75
(8, 21)


Unnamed: 0,first_guess,target_word,player,num_guesses,expected_guesses,luck,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,...,target_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,valid_success
0,arose,vague,diane,5.0,5.0,0.0,3.0,2.0,3.0,2.0,...,58.26,True,2.6,2.4,6.0,9.0,26.0,18.75,99.86,True
42,audio,worse,diane,4.0,4.0,0.09,4.0,1.0,2.0,3.0,...,76.4,True,2.5,2.5,4.0,8.0,17.0,96.67,90.5,True
43,audio,polka,diane,3.0,3.0,0.13,4.0,1.0,2.0,3.0,...,54.91,True,2.67,2.33,3.0,5.0,7.0,38.5,87.34,True
49,audio,moose,diane,4.0,6.0,0.4,4.0,1.0,3.0,2.0,...,53.53,True,2.75,2.25,5.0,5.0,18.0,98.0,90.51,True
52,audio,above,diane,3.0,4.0,0.34,4.0,1.0,3.0,2.0,...,65.89,True,3.33,1.67,4.0,2.0,8.0,16.5,87.35,True


In [33]:
print(df_master.query("player == 'wizard'")['num_guesses'].mean())
print(df_master.query("player == 'wizard'").shape)
df_master.query("player == 'wizard'").head(40)

3.9696969696969697
(33, 21)


Unnamed: 0,first_guess,target_word,player,num_guesses,expected_guesses,luck,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,...,target_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,valid_success
1,arose,vague,wizard,5.0,5.0,0.0,3.0,2.0,3.0,2.0,...,58.26,True,2.6,2.4,7.0,4.0,30.0,19.0,99.86,True
3,douce,apple,wizard,5.0,5.0,0.0,3.0,2.0,2.0,3.0,...,62.83,True,2.4,2.6,8.0,7.0,28.0,48.0,92.75,True
5,douce,ninth,wizard,5.0,5.0,0.0,3.0,2.0,1.0,4.0,...,35.38,True,1.8,3.2,3.0,12.0,35.0,95.0,92.74,True
7,douce,flail,wizard,4.0,4.0,0.0,3.0,2.0,2.0,3.0,...,42.3,True,2.25,2.75,6.0,0.0,21.0,120.0,90.92,True
9,douce,stage,wizard,4.0,4.0,0.0,3.0,2.0,2.0,3.0,...,81.29,True,2.5,2.5,7.0,2.0,17.0,59.33,90.92,True
11,douce,heady,wizard,4.0,4.0,0.0,3.0,2.0,3.0,2.0,...,66.43,True,2.5,2.5,0.0,15.0,15.0,21.67,90.92,True
13,douce,usage,wizard,3.0,3.0,0.0,3.0,2.0,3.0,2.0,...,73.67,True,3.0,2.0,2.0,5.0,7.0,18.0,87.91,True
15,douce,sound,wizard,3.0,3.0,0.0,3.0,2.0,2.0,3.0,...,50.14,True,2.33,2.67,6.0,2.0,5.0,8.5,87.91,True
17,douce,salsa,wizard,4.0,4.0,0.0,3.0,2.0,2.0,3.0,...,35.65,True,2.25,2.75,3.0,4.0,23.0,123.67,90.92,True
19,douce,magic,wizard,4.0,4.0,0.0,3.0,2.0,2.0,3.0,...,46.9,True,2.25,2.75,4.0,7.0,18.0,27.0,90.92,True


## Prefix/Suffix bias

In [148]:
def wordle_wizard(word_list: list, max_guesses: int = None, 
                  guess: str = None, target: str = None,
                  random_guess: bool = False, random_target: bool = False, 
                  verbose: bool = False, drama: float = None, 
                  return_stats: bool = False, record: bool = False):
    """
    Mimicking the popular web game, this function matches a current word to a target word automatically, in the most statistically optimal way possible.

    ------
    Parameters:
    ------
    `word_list`: list
        list of valid words to be considered
    `guess`: str
        a string -- must be the same length as `target_word`
    `target`: str
        a string -- must be the same length as `opening_word`
    `bias`: str ['entropy', 'common', 'rare', None]
        'entropy' biases next word guesses to be the ones with the highest impact on the range of next possible guesses. Entropy values associated with each word are normalized across the list.

        'common' biases next word guesses to be words that are more commonly used

        'rare' biases next word guesses to be words that are more rarely used

        'no_bias' chooses a next guess at random of all available guesses

    `max_guesses`: int
        the maximum number of attempts allowed to solve the Wordle
    `random_guess`: bool
        if True, randomly chooses a starting word from all words within `word_list`. If False, passed starting word must be used instead
    `random_target`: bool
        if True, randomly chooses a target word from all words within `word_list`. If False, passed target word must be used instead
    `verbose`: bool
        if True, prints progress and explanation of how function solves the puzzle. If False, prints only the guessed word at each guess.
    `drama`: float or int
        if int provided, each guess' output is delayed by that number of seconds, else each output is shown as quickly as possible. For ~dRaMaTiC eFfEcT~
    `return_stats`: bool
        if True, prints nothing and returns a dictionary of various statistics about the function's performance trying to solve the puzzle
    `record`: bool
        if True, creates a .txt file with the same information printed according to the indicated verbosity

    ------
    Returns:
    ------
    `stats_dict`: dict
        dictionary containing various statistics about the function's performance trying to solve the puzzle
    """

    sugg_words = []

    for i in range(0, 20):
        ran_int = random.randint(0, len(word_list) - 1)
        word = word_list[ran_int]
        sugg_words.append(word)

    if guess not in word_list:
        print ("Guess word not in passed word list.\nOnly words within the given word list are valid.")
        print (f"Here are some examples of valid words from the passed word list.\n\t{sugg_words[:10]}")
        return None
    
    if target not in word_list:
        print ("Target word not in passed word list.\nOnly words within the given word list are valid.")
        print (f"Here are some examples of valid words from the passed word list.\n\t{sugg_words[-10:]}")
        return None

    if random_guess == True:
        randomint_guess = random.randint(0, len(word_list) - 1)
        guess = word_list[randomint_guess]

    if random_target == True:
        randomint_target = random.randint(0, len(word_list) - 1)
        target = word_list[randomint_target]


    suffix_freq_dist = {}
    prefix_freq_dist = {}

    for word in official_words:
        prefix = word[:2] # first 2 letters
        suffix = word[-2:] # last 2 letters
        if prefix not in prefix_freq_dist:
            prefix_freq_dist[prefix] = 1
        else:
            prefix_freq_dist[prefix] += 1

        if suffix not in suffix_freq_dist:
            suffix_freq_dist[suffix] = 1
        else:
            suffix_freq_dist[suffix] += 1

    sorted_prefix_dist = sorted(prefix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
    sorted_suffix_dist = sorted(suffix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)

    stats_dict = {}
    stats_dict['first_guess'] = guess
    stats_dict['target_word'] = target
    stats_dict['first_guess_vowels'] = float(count_vows_cons(guess, y_vow = True)['vows'])
    stats_dict['first_guess_consonants'] = float(count_vows_cons(guess, y_vow = True)['cons'])
    stats_dict['target_vowels'] = float(count_vows_cons(target, y_vow = True)['vows'])
    stats_dict['target_consonants'] = float(count_vows_cons(target, y_vow = True)['cons'])
    
    # get entropy of the first guess word and target word in the entire word_list
    for tup in get_word_entropy(word_list, word_list, normalized = True):
        if tup[0] == guess:
            stats_dict['first_guess_entropy'] = tup[1]
        if tup[0] == target:
            stats_dict['target_entropy'] = tup[1]

    guess_entropies = []
    guess_entropies.append(stats_dict['first_guess_entropy'])

    # luck_guess_1 = round(1 - ((1 / len(word_list)) * guess_entropies[0] / 100), 2) * 100

    english_alphabet = "abcdefghijklmnopqrstuvwxyz"

    word_list_sorted_counts = get_letter_counts(english_alphabet, word_list, sort = "descending")
    
    wordlen = len(guess)
    letter_positions = set(i for i in range(0, wordlen))

    guess_set = set()
    perfect_dict = {}
    wrong_pos_dict = {}
    wrong_pos_set = set()
    dont_guess_again = set()

    guessed_words = [] # running set of guessed words
    guess_num = 0 # baseline for variable
    dont_guess_words = set()
    incorrect_positions = []
    reduction_per_guess = []

    if max_guesses == None: # if no value is passed, default is len(guess)
        max_guesses = wordlen
    else: # else it is the value passed
        max_guesses = max_guesses

    perfect_letts_per_guess = []
    wrong_pos_per_guess = []
    wrong_letts_per_guess = []

    record_list = []

    while guess: # while there is any guess -- there are conditions to break it at the bottom

        guess_num += 1

        guessed_words.append(guess)

        if drama:
            time.sleep(drama)

        # guess_num += 1 # each time the guess is processed
        if return_stats == False:
            if guess_num == 1:
                print("-----------------------------\n")
                record_list.append("-----------------------------\n")
    
        if return_stats == False:
            print(f"Guess {guess_num}: '{guess}'")
            record_list.append(f"Guess {guess_num}: '{guess}'")

        if guess == target:
            stats_dict['target_guessed'] = True
            if return_stats == False:
                if guess_num == 1:
                    print(f"Congratulations! The Wordle has been solved in {guess_num} guess, that's amazingly lucky!")
                    print(f"The target word was {target}")
                    record_list.append(f"Congratulations! The Wordle has been solved in {guess_num} guess, that's amazingly lucky!")
                    record_list.append(f"The target word was {target}")
                    perfect_letts_per_guess.append(5)
                    wrong_pos_per_guess.append(0)
                    wrong_letts_per_guess.append(0)
            break

        guess_set = set()
        wrong_pos_set = set()

        #### Step 2 -- ALL PERFECT
        for i in letter_positions: # number of letters in each word (current word and target word)
            guess_set.add(guess[i])

            if guess[i] not in perfect_dict:
                perfect_dict[guess[i]] = set()
            if guess[i] not in wrong_pos_dict:
                wrong_pos_dict[guess[i]] = set()

            ### EVALUATE CURRENT GUESS
            if guess[i] == target[i]: # letter == correct and position == correct
                perfect_dict[guess[i]].add(i)

            if (guess[i] != target[i] and  guess[i] in target): # letter == correct and position != correct
                wrong_pos_dict[guess[i]].add(i)
                wrong_pos_set.add(guess[i])

            if guess[i] not in target: # if letter is not relevant at all
                dont_guess_again.add(guess[i])

        #### Step 3 -- ALL PERFECT
        next_letters = set()
        for letter, positions in perfect_dict.items():
            if len(positions) > 0:
                next_letters.add(letter)

        for letter, positions in wrong_pos_dict.items():
            if len(positions) > 0:
                next_letters.add(letter)

        #### List of tuples of correct letter positions in new valid words. Eg: [('e', 2), ('a', 3)]
        perfect_letters = []
        for letter, positions in perfect_dict.items():
            for pos in positions:
                if len(positions) > 0:
                    perfect_letters.append((letter, pos))

        #### all words that have correct letters in same spots
        words_matching_correct_all = []
        for word in word_list:
            word_set = set()
            for letter, pos in perfect_letters:
                if word[pos] == letter:
                    words_matching_correct_all.append(word)

        #### excluding words with letters in known incorrect positions
        for letter, positions in wrong_pos_dict.items():
            for pos in positions:
                if len(positions) > 0:
                    if (letter, pos) not in incorrect_positions:
                        incorrect_positions.append((letter, pos))

        # sorting lists of tuples just to make them look nice in the printout
        incorrect_positions = sorted(incorrect_positions, key = operator.itemgetter(1), reverse = False)
        perfect_letters = sorted(perfect_letters, key = operator.itemgetter(1), reverse = False)

        #### all words that have correct letters in incorrect spots -- so they can be excluded efficiently
        
        # print(incorrect_positions)
        
        for word in word_list:
            word_set = set()
            for letter, pos in incorrect_positions:
                if word[pos] == letter:
                    dont_guess_words.add(word)
        for word in word_list:
            word_set = set()
            for letter, pos in incorrect_positions:
                if word[pos] == letter:
                    dont_guess_words.add(word)

        for bad_letter in dont_guess_again:
            for word in word_list:
                if (bad_letter in word and word not in dont_guess_words):
                    dont_guess_words.add(word)

        if return_stats == False:
            if verbose == True:
                print(f"Letters in correct positions:\n\t{perfect_letters}\n")
                print(f"Letters in incorrect positions:\n\t{incorrect_positions}\n")
                print (f"Letters to guess again:\n\t{sorted(list(next_letters), reverse = False)}\n")
                print(f"Letters to not guess again:\n\t{sorted(list(dont_guess_again), reverse = False)}\n") # works
                record_list.append(f"Letters in correct positions:\n\t{perfect_letters}\n")
                record_list.append(f"Letters in incorrect positions:\n\t{incorrect_positions}\n")
                record_list.append(f"Letters to guess again:\n\t{sorted(list(next_letters), reverse = False)}\n")
                record_list.append(f"Letters to not guess again:\n\t{sorted(list(dont_guess_again), reverse = False)}\n") # works

        # Returns True
        # print(A.issubset(B)) # "if everything in A is in B", returns Bool

        perfect_letts_per_guess.append(len(perfect_letters))
        wrong_pos_per_guess.append(len(incorrect_positions))
        wrong_letts_per_guess.append(len(dont_guess_again))

        potential_next_guesses = set()
        middle_set = set()

        if len(perfect_letters) == 0 and len(incorrect_positions) == 0: # if there are NEITHER perfect letters, NOR incorrect positions, ....
            for word in word_list:
                if word not in dont_guess_words:
                    if word not in guessed_words:
                        potential_next_guesses.add(word)
                                        
            # print(f"GUESS {guess_num} : TEST 1-1")

        if len(perfect_letters) == 0 and len(incorrect_positions) != 0: # if there are no perfect letters whatsoever, but there ARE incorrect positions ....
            for word in word_list:
                for incor_letter, incor_pos in incorrect_positions:
                    if word[incor_pos] != incor_letter:
                        if word not in dont_guess_words: # just in case
                            word_set = set()
                            for letter in word:
                                word_set.add(letter)

                                if next_letters.issubset(word_set):
                                    if word not in guessed_words:
                                        if len(dont_guess_again) > 0:
                                            for bad_letter in dont_guess_again:
                                                if bad_letter not in word:
                                                    # potential_next_guesses.append(word)
                                                    potential_next_guesses.add(word)
                                        else:
                                            potential_next_guesses.add(word)
            
            # print(f"GUESS {guess_num} : TEST 2-1")

        else:
            for word in word_list:
                if word not in dont_guess_words: # just in case
                    word_set = set()
                    for letter in word:
                        word_set.add(letter)
                        if next_letters.issubset(word_set):
                            if word not in guessed_words:
                                # print ("TEST 3-2")

                                if len(dont_guess_again) > 0:
                                    for bad_letter in dont_guess_again:
                                        if bad_letter not in word:
                                            middle_set.add(word)
                                else:
                                    middle_set.add(word)
            for word in middle_set:
                dummy_list = []
                for good_lett, good_pos in perfect_letters:
                    if word[good_pos] == good_lett:
                        dummy_list.append(1)
                        if len(dummy_list) == len(perfect_letters):
                            potential_next_guesses.add(word)
            for word in middle_set:
                dummy_list = []
                for bad_lett, bad_pos in incorrect_positions:
                    if word[bad_pos] == bad_lett:
                        dummy_list.append(1)
                        if len(dummy_list) > 0:
                            potential_next_guesses.remove(word)
                                        
            # print(f"GUESS {guess_num} : TEST 3-1")

        if return_stats == False:
            if verbose == True:
                print(f"At this point:")
                print(f"\t{len(word_list) - len(potential_next_guesses)}, {round((len(word_list) - len(potential_next_guesses)) / len(word_list) * 100, 2)}% of total words have been eliminated, and")
                print(f"\t{len(potential_next_guesses)}, {round(len(potential_next_guesses) / len(word_list) * 100, 2)}% of total words remain possible.\n")
                record_list.append(f"At this point:")
                record_list.append(f"\t{len(word_list) - len(potential_next_guesses)}, {round((len(word_list) - len(potential_next_guesses)) / len(word_list) * 100, 2)}% of total words have been eliminated, and")
                record_list.append(f"\t{len(potential_next_guesses)}, {round(len(potential_next_guesses) / len(word_list) * 100, 2)}% of total words remain possible.\n")
        
        reduction_per_guess.append(len(potential_next_guesses))
                
        #### Guessing next word
        if len(potential_next_guesses) == 1:

            if return_stats == False:
                if verbose == True:
                    print(f"The only remaining possible word is:\n\t'{list(potential_next_guesses)[0]}'\n")
                    record_list.append(f"The only remaining possible word is:\n\t'{list(potential_next_guesses)[0]}'\n")
                
            guess = list(potential_next_guesses)[0]
            guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])

        else:

            # if bias == "entropy":
                
            best_next_guesses = list(potential_next_guesses)                
            # print (best_next_guesses)
            word_ratings = get_word_entropy(best_next_guesses, word_list, normalized = True, ascending = False) # "internal" ratings

            # Get max rated words
            max_rating = -np.inf
            for word, rating in word_ratings:
                if rating > max_rating:
                    max_rating = rating

            best_guess_words = []
            for word, rating in word_ratings:
                if rating == max_rating:
                    best_guess_words.append(word)

            best_suffix_words
            if len(best_guess_words) > 1: # bias towards words of equal max entropy with the most common suffix
                for suffix, count in sorted_suffix_dist:
                    for word in best_guess_words:
                        if word[-2:] == suffix:
                            guess = word
                        break
                    break
            else:
                guess = best_guess_words[0]

            # guess = something
            
            guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])

            if return_stats == False:
                if verbose == True:
                    if len(word_ratings) <= 40:
                        print(f"All potential next guesses:\n\t{word_ratings}\n")
                        print(f"Words guessed so far:\n\t{guessed_words}.\n")
                        record_list.append(f"Potential next guesses:\n\t{word_ratings}\n")
                        record_list.append(f"Words guessed so far:\n\t{guessed_words}.\n")
                    else:
                        print(f"The top 40 potential next guesses are:\n\t{word_ratings[:40]}\n")
                        print(f"Words guessed so far:\n\t{guessed_words}.\n")
                        record_list.append(f"The top 40 potential next guesses are::\n\t{word_ratings[:40]}\n")
                        record_list.append(f"Words guessed so far:\n\t{guessed_words}.\n")

            # if bias == "no_bias":
            #     best_next_guesses = set()
            #     for word in potential_next_guesses:
            #         for letter, freq in word_list_sorted_counts:
            #             if letter not in dont_guess_again:
            #                 if len(next_letters) > 0:
            #                     if letter in next_letters:
            #                         if letter in word:
            #                             best_next_guesses.add(word)
            #                             break
            #                 else:
            #                     if letter in word:
            #                         best_next_guesses.add(word)
            #                         break
                                
            #     if return_stats == False:
            #         if verbose == True:
            #             if len(best_next_guesses) <= 40:
            #                 print(f"Potential next guesses:\n\t{best_next_guesses}\n")
            #                 print(f"Words guessed so far:\n\t{guessed_words}.\n") 
            #                 record_list.append(f"Potential next guesses:\n\t{best_next_guesses}\n")
            #                 record_list.append(f"Words guessed so far:\n\t{guessed_words}.\n") 

            # if bias == ("common" or "rare"):
            #     found_words = []
            #     for word in word_list:
            #         if word in nltk_counts.keys():
            #             found_words.append(word)

            #     found_words_sorted = sorted(found_words, key = operator.itemgetter(1), reverse = True) # sorted descending

            #     rated_words = []
            #     for word in potential_next_guesses:
            #         for tup in found_words_sorted:
            #             if tup[0] == word:
            #                 rated_words.append(tup)

            #     rated_words = sorted(rated_words, key = operator.itemgetter(1), reverse = True) # sorted descending
                
            #     if bias == "common":
            #         guess = rated_words[0][0] # word in first position // most frequent word
                    
            #         if return_stats == False:
            #             if verbose == True:
            #                 if len(potential_next_guesses) <= 40:
            #                     print(f"Potential next guesses:\n\t{rated_words}\n")
            #                     print(f"Words guessed so far:\n\t{guessed_words}.\n") 
            #                     record_list.append(f"Potential next guesses:\n\t{potential_next_guesses}\n")
            #                     record_list.append(f"Words guessed so far:\n\t{guessed_words}.\n") 
                
            #     if bias == "rare":
            #         guess = rated_words[-1][0] # word in last position // least frequent word
                
            #         if return_stats == False:
            #             if verbose == True:
            #                 if len(potential_next_guesses) <= 40:
            #                     print(f"Potential next guesses:\n\t{rated_words}\n")
            #                     print(f"Words guessed so far:\n\t{guessed_words}.\n") 
            #                     record_list.append(f"Potential next guesses:\n\t{potential_next_guesses}\n")
            #                     record_list.append(f"Words guessed so far:\n\t{guessed_words}.\n") 
                    
            #     # guess = list(best_next_guesses)[0]
            #     guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])

        #### Guess has now been made -- what to do next
        if guess_num == max_guesses: # if at max guesses allowed
            guessed_words.append(guess)
            stats_dict['target_guessed'] = False
            if return_stats == False:
                if verbose == True:
                    # print("-----------------------------\n")
                    print(f"Unfortunately, the Wordle could not be solved in {max_guesses} guesses.\n")
                    print(f"The target word was '{target}'. Better luck next time!\n")
                    print("-----------------------------\n")
                    record_list.append(f"Unfortunately, the Wordle could not be solved in {max_guesses} guesses.\n")
                    record_list.append(f"The target word was '{target}'. Better luck next time!\n")
                    record_list.append("-----------------------------\n")
                else:
                    print(f"\nUnfortunately, the Wordle could not be solved in {max_guesses} guesses.")
                    print(f"The target word was '{target}'. Better luck next time!\n")
                    record_list.append(f"\nUnfortunately, the Wordle could not be solved in {max_guesses} guesses.")
                    record_list.append(f"The target word was '{target}'. Better luck next time!\n")
            break
        else: # if not at max guesses yet allowed
            # stats_dict['target_guessed'] = False
            if return_stats == False:
                if verbose == True:
                    print(f"Next guess:\n\t'{guess}'")
                    print("\n-----------------------------\n")
                    record_list.append(f"Next guess:\n\t'{guess}'")
                    record_list.append("\n-----------------------------\n")

        if guess == target:
            guess_num += 1
            guessed_words.append(guess)
            stats_dict['target_guessed'] = True

            if return_stats == False:
                print(f"Guess {guess_num}: '{guess}'\n")
                print(f"Congratulations! The Wordle has been solved in {guess_num} guesses!")
                record_list.append(f"Guess {guess_num}: '{guess}'\n")
                record_list.append(f"Congratulations! The Wordle has been solved in {guess_num} guesses!")

                if max_guesses - guess_num == 0:
                    print(f"Lucky! It was the last guess.")
                    record_list.append(f"Lucky! It was the last guess.")
                else:
                    print(f"There were still {max_guesses - guess_num} guesses remaining.")
                    record_list.append(f"There were still {max_guesses - guess_num} guesses remaining.")

            if return_stats == False:   
                # stats_dict['target_guessed'] = True                 
                print(f"\nThe target word was '{target}'.")
                print("\n-----------------------------")
                record_list.append(f"\nThe target word was '{target}'.")
                record_list.append("\n-----------------------------")
            break

    #### STATS STUFF    
    mid_guesses_vows = 0
    mid_guesses_cons = 0
    avg_perf_letters = 0
    avg_wrong_pos_letters = 0
    avg_wrong_letters = 0

    for i, word in enumerate(guessed_words):
        mid_guesses_vows += count_vows_cons(word, y_vow = True)['vows']
        mid_guesses_cons += count_vows_cons(word, y_vow = True)['cons']
        
    for i in range(0, len(guessed_words) - 1):
        avg_perf_letters += perfect_letts_per_guess[i]
        avg_wrong_pos_letters += wrong_pos_per_guess[i]
        avg_wrong_letters += wrong_letts_per_guess[i]

    stats_dict['mid_guesses_avg_vows'] = float(round(mid_guesses_vows / len(guessed_words), 2))
    stats_dict['mid_guesses_avg_cons'] = float(round(mid_guesses_cons / len(guessed_words), 2))

    stats_dict['avg_perf_letters'] = float(round(np.mean(avg_perf_letters), 2))
    stats_dict['avg_wrong_pos_letters'] = float(round(np.mean(avg_wrong_pos_letters), 2))
    stats_dict['avg_wrong_letters'] = float(round(np.mean(avg_wrong_letters), 2))
    
    # average number of words remaining after each guess -- the higher this is, the luckier the person got (the lower, the more guesses it took)
    stats_dict['avg_remaining'] = float(round(np.mean(reduction_per_guess), 2))

    # avg entropy of each guessed word relative to all other words possible at that moment -- this should consistently be 100 for the algorithm, but will be different for user
    if len(guess_entropies) > 1: # in case of guessing it correctly on the first try
        sum_entropies = 0
        for entropy in guess_entropies:
            sum_entropies += entropy

        average_entropy = float(round(sum_entropies / len(guess_entropies), 2))
        stats_dict['avg_intermediate_guess_entropy'] = average_entropy
    else:
        stats_dict['avg_intermediate_guess_entropy'] = float(100)

    expected_guesses = 3.85

    # guess_num = 3
    # average_entropy = 95
    luck = round(1 - ((((guess_num / expected_guesses) * (stats_dict['avg_intermediate_guess_entropy'] / 100)) / max_guesses) * 5), 2)
    stats_dict['luck'] = luck

    if record == True:
        if verbose == True:
            with open(f"solutions/{guessed_words[0]}_{target}_wizard_detailed.txt", "w") as fout:
                for line in record_list:
                    fout.write(line + "\n") # write each line of list of printed text to .txt file
        else:
            with open(f"solutions/{guessed_words[0]}_{target}_wizard_summary.txt", "w") as fout:
                for line in record_list:
                    fout.write(line + "\n") # write


    # if guess_num <= len(guess):
    if guess_num <= 6:
        stats_dict['valid_success'] = True
    else:
        stats_dict['valid_success'] = False

    stats_dict['num_guesses'] = float(guess_num)

    if return_stats == True:
        return stats_dict

In [155]:
test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, 
                guess = "later", target = "jolly",
                random_guess = False, random_target = False, 
                verbose = True, drama = 0, return_stats = False, record = False)

-----------------------------

Guess 1: 'later'
Letters in correct positions:
	[]

Letters in incorrect positions:
	[('l', 0)]

Letters to guess again:
	['l']

Letters to not guess again:
	['a', 'e', 'r', 't']

At this point:
	2209, 94.93% of total words have been eliminated, and
	118, 5.07% of total words remain possible.

The top 40 potential next guesses are:
	[('solid', 100.0), ('spoil', 98.59), ('scold', 89.13), ('sling', 85.27), ('slink', 79.68), ('scowl', 78.07), ('cloud', 77.56), ('could', 77.56), ('slimy', 77.36), ('slick', 74.28), ('nobly', 74.21), ('cling', 74.15), ('slung', 73.95), ('blond', 72.54), ('clown', 72.35), ('silky', 71.38), ('clink', 68.62), ('child', 68.49), ('slunk', 68.36), ('ghoul', 67.97), ('blind', 67.91), ('colic', 67.07), ('plush', 66.88), ('moldy', 66.24), ('godly', 65.79), ('mogul', 64.05), ('guild', 63.67), ('slump', 62.96), ('clung', 62.83), ('vinyl', 62.51), ('slosh', 62.38), ('build', 62.06), ('blush', 61.99), ('would', 61.86), ('dimly', 61.61), ('c

In [142]:
suffix_freq_dist = {}
prefix_freq_dist = {}

for word in official_words:
    prefix = word[:2] # first 2 letters
    suffix = word[-2:] # last 2 letters
    if prefix not in prefix_freq_dist:
        prefix_freq_dist[prefix] = 1
    else:
        prefix_freq_dist[prefix] += 1

    if suffix not in suffix_freq_dist:
        suffix_freq_dist[suffix] = 1
    else:
        suffix_freq_dist[suffix] += 1

suffix_types = [key for key in suffix_freq_dist.keys()]
prefix_types = [key for key in prefix_freq_dist.keys()]

sorted_prefix_dist = sorted(prefix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_suffix_dist = sorted(suffix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)

print("Prefixes:")
print(len(sorted_prefix_dist))
print(sorted_prefix_dist[:10])
print("-----")
print("Suffixes:")
print(len(sorted_suffix_dist))
print(sorted_suffix_dist[:10])

Prefixes:
214
[('st', 65), ('sh', 52), ('cr', 45), ('sp', 45), ('ch', 40), ('gr', 38), ('re', 37), ('tr', 37), ('fl', 36), ('br', 35)]
-----
Suffixes:
207
[('er', 142), ('ch', 57), ('ly', 56), ('se', 52), ('al', 49), ('ck', 47), ('ty', 46), ('te', 39), ('el', 38), ('dy', 38)]


In [143]:
grams_freq_dist = {}
gram_len = 3

for word in official_words:
    for i in range(0, len(word) - (gram_len - 1)): # so it doesn't index out of range
        gram = word[i:i + gram_len]

        if gram not in grams_freq_dist:
            grams_freq_dist[gram] = 1
        else:
            grams_freq_dist[gram] += 1

print(len(grams_freq_dist))
sorted_gram_dist = sorted(grams_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_gram_dist[:15]

2210


[('ing', 31),
 ('lly', 22),
 ('ove', 21),
 ('ver', 21),
 ('sta', 21),
 ('ast', 20),
 ('lea', 19),
 ('ter', 19),
 ('tch', 19),
 ('unt', 19),
 ('sha', 18),
 ('ine', 18),
 ('ate', 18),
 ('sto', 18),
 ('ide', 18)]