## Imports

In [1]:
import numpy as np
import random
import operator
import time
import pandas as pd
import Levenshtein
from wordle_functions import *

## Importing datasets

### official words
- official wordle word list

In [2]:
### Official list
official_words = []

with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    for word in f.read().split("\n"):
        if word.isalpha():
            official_words.append(word)

f.close() # closes connection to file

print(len(official_words))
official_words[:5]

2309


['wince', 'thyme', 'mower', 'horde', 'heard']

In [3]:
for word in official_words:
    if word[1:4] == "ach":
        print(word)

cache
yacht
macho


## Prefix/Suffix bias

In [4]:
suffix_freq_dist = {}
prefix_freq_dist = {}

for word in official_words:
    prefix = word[:2] # first 2 letters
    suffix = word[-2:] # last 2 letters
    if prefix not in prefix_freq_dist:
        prefix_freq_dist[prefix] = 1
    else:
        prefix_freq_dist[prefix] += 1

    if suffix not in suffix_freq_dist:
        suffix_freq_dist[suffix] = 1
    else:
        suffix_freq_dist[suffix] += 1

suffix_types = [key for key in suffix_freq_dist.keys()]
prefix_types = [key for key in prefix_freq_dist.keys()]

sorted_prefix_dist = sorted(prefix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_suffix_dist = sorted(suffix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)

print("Prefixes:")
print(len(sorted_prefix_dist))
print(sorted_prefix_dist[:10])
print("-----")
print("Suffixes:")
print(len(sorted_suffix_dist))
print(sorted_suffix_dist[:10])

Prefixes:
214
[('st', 65), ('sh', 52), ('cr', 45), ('sp', 45), ('ch', 40), ('gr', 38), ('fl', 36), ('re', 36), ('tr', 36), ('br', 35)]
-----
Suffixes:
202
[('er', 141), ('ly', 56), ('ch', 56), ('se', 52), ('al', 49), ('ck', 47), ('ty', 46), ('te', 39), ('el', 38), ('dy', 38)]


In [5]:
grams_freq_dist = {}
gram_len = 3

for word in official_words:
    for i in range(0, len(word) - (gram_len - 1)): # so it doesn't index out of range
        gram = word[i:i + gram_len]

        if gram not in grams_freq_dist:
            grams_freq_dist[gram] = 1
        else:
            grams_freq_dist[gram] += 1

print(len(grams_freq_dist))
sorted_gram_dist = sorted(grams_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
sorted_gram_dist[:15]

2197


[('ing', 31),
 ('lly', 22),
 ('ove', 21),
 ('ver', 21),
 ('sta', 21),
 ('ast', 20),
 ('lea', 19),
 ('ter', 19),
 ('tch', 19),
 ('sha', 18),
 ('ine', 18),
 ('ate', 18),
 ('sto', 18),
 ('ide', 18),
 ('out', 18)]

## New functions

In [6]:
def get_possible_guesses(word_list: list, perf_letters: list, incorr_pos: list, wrong_letters: list):
    """
    This function takes a single Wordle guess and feedback returned by the game about that guess, and returns a list of possible remaining guesses.

    Parameters:
    -----
    `word_list`: list
        list of all possible words immediately prior to this stage's guess
    `perf_letters_letters`: list
        list of tuples, where the structure of each tuple is: ("correct_letter", int of letter position). Example listed below
    `incorr_pos`: list
        list of tuples, same structure as `perf_letters_letters`, but with letters and their incorrect positions
    `wrong_letters`: list
        list of individual letters that are not in the target word whatsoever

    Returns:
    -----
    `potentials_list`: list
        list of words that remain after eliminating all words based on the information provided

    Examples of inputs:
    -----
    word_list = official_words # something like ['wince', 'thyme', 'mower', 'horde', 'heard', 'tenor', 'zonal', 'parry', 'shied', 'fizzy']
    
    perf_letters = [("r", 2)] # could be any number of tuples
    
    incorr_pos = [("t", 2), ("r", 4)] # could be any number of tuples
    
    wrong_letters_letters = ["l", "a", "e"] # could be any number of items
    """
    
    incorr_words = set() # set of all words that the target could not possibly be the target
    for word in word_list:
        if len(incorr_pos) > 0: # sometimes there are none
            for incorr_letter, pos in incorr_pos: # adding words that have words of incorr letter positions (but not words that have these letters altogether - they could just be in a different spot)
                if 0 <= pos <= 4:
                    # print (word, pos)
                    if word[pos] == incorr_letter:
                        # print (word[pos])
                        incorr_words.add(word)
        if len(wrong_letters) > 0: # sometimes there are none
            for bad_letter in wrong_letters: # adding words that have completely wrong letters in them
                if bad_letter in word:
                    incorr_words.add(word)

    intermediate_list = set(word_list).difference(incorr_words) # difference between all impossible words and entire original passed word_list
    # print(intermediate_list)
    potentials_list = set()

    for word in intermediate_list:
        if len(perf_letters) > 0: # sometimes there are none
            good_letters = []
            for letter, pos in perf_letters:
                if 0 <= pos <= 4:
                    if word[pos] == letter:
                        good_letters.append(letter)
                if len(good_letters) == len(perf_letters):
                    potentials_list.add(word)
                else:
                    pass # skip to the next word
    
    if len(potentials_list) > 0:
        potentials_list = list(potentials_list)
    else:
        potentials_list = list(intermediate_list)

    return list(potentials_list)

In [7]:
corr_pos = [("p", 3)]
incor_pos = [('t', 1), ('a', 2), ('e', 4)]
wrong_letts = ['t']

pos_list = get_possible_guesses(word_list = official_words, perf_letters = corr_pos, incorr_pos = incor_pos, wrong_letters = wrong_letts)

test_ratings = get_word_meaning(word_list = official_words, words_to_rate = official_words, normalized = False, unique = True)
test_guess = "arose"

opposite_guesses = set()
for word, rating in test_ratings:
    if len(set(word).difference(set(test_guess))) == 5:
        if word != test_guess: # to make sure it doesn't loop infinitely
            opposite_guesses.add(word)

opposite_ratings = get_word_meaning(word_list = official_words, words_to_rate = list(opposite_guesses), normalized = False, unique = True)
opposite_ratings[:10]

#### NEXT
## if len(opposite_ratings) == 0, then use the highest rated word with common letters (or could try going to next highest rated word)
# (or could try going to next highest rated word that has 4 in common, instead of 5. If len(that list) == 0, go to 3, etc


[('until', 27.58),
 ('unlit', 27.58),
 ('glint', 26.12),
 ('tunic', 25.73),
 ('tulip', 25.69),
 ('unity', 25.45),
 ('guilt', 25.26),
 ('flint', 25.25),
 ('built', 24.96),
 ('input', 24.79)]

In [8]:
# asdf

In [9]:
def create_positional_dists(word_list: list, rating_method: str = "normalized"):
    """
    Given a list of words of the same length, creates a list of lists of tuples of (letter, count) in that position of the word. Returned list's indexing corresponds to letter positions of length of words within passed word list.

    Parameters:
    -----
    `word_list`: list
        list of words (str) of the same length
    `rating_method`: bool
        if "normalized", new ratings are normalized in the same way previous ratings were. If "ranked", new ratings are just a percentile score relative to their ranking within the list after multiplier is applied.

    Returns:
    -----
    `dists_list`: list
        list of lists of sorted letters distributions (tuples)
    """

    dists_list = []

    for i in range(0, len(word_list[0])):

        test_dict = {}

        for word in word_list:
            # if word.islpha():
            if word.isalpha():
                if word[i] in test_dict:
                    test_dict[word[i]] += 1
                else:
                    test_dict[word[i]] = 1

        sorted_dist = sorted(test_dict.items(), key = operator.itemgetter(1), reverse = True) # sorted descending

        # ratings calculated as normalized new ratings
        if rating_method == "normalized":
            if len(sorted_dist) > 1:
                sort_dist_normd = []
                for tup in sorted_dist:
                    # try:
                    normd = (tup[1] - sorted_dist[-1][1]) / (sorted_dist[0][1] - sorted_dist[-1][1])
                    sort_dist_normd.append((tup[0], normd))
                    # except:
                    #     ZeroDivisionError
                    #     # print(sorted_dist[-1][1])
                    #     print("TESTESTSETST")
                    #     sort_dist_normd.append((tup[0], 0.0))   
            # print (sort_dist_normd[-2][1])
            # if sort_dist_normd[-1][1] == 0:
            #     sort_dist_normd[-1][1] = sort_dist_normd[-2][1] / 2
            #     new_tup = sort_dist_normd[-1][1]
                
                    
        # ratings calculated according to ranking percentile
        if rating_method == "ranked":
            sort_dist_normd = []
            for tup in sorted_dist:
                letter = tup[0]
                rating = float((len(sorted_dist) - sorted_dist.index(tup)) / (len(sorted_dist)))
                sort_dist_normd.append((letter, rating))
            
            sort_dist_normd = sorted(sort_dist_normd, key = operator.itemgetter(1), reverse = True) # sorted descending
        
        dists_list.append(sort_dist_normd)

    return dists_list

In [10]:
create_positional_dists(['tests', 'party'], rating_method = "ranked")

[[('t', 1.0), ('p', 0.5)],
 [('e', 1.0), ('a', 0.5)],
 [('s', 1.0), ('r', 0.5)],
 [('t', 1.0)],
 [('s', 1.0), ('y', 0.5)]]

In [11]:
def intensify_by_positions(word_list: list, ratings_list: list, strength: float = 1, rounding: int = None):
    """
    Given a passed list of tuples containing words of equal length and a rating for each word, this function multiplies each word's passed rating by an evaluation of how likely each letter in the word is to appear in its current position.

    Parameters:
    ------
    `ratings_list`: list
        list of tuples. Structure is ('letter', rating)
    `strength`: float
        how strongly the intensifier should affect the current ratings. Default is 1 ("normal" strength)
    `rounding`: int
        Number of decimal places to round to
    Returns:
    ------
    `sorted_new_ratings`: list
        list of tuples of the same words that were passed, but ratings have been multiplied according to the average likelihood for the each letter in the word to appear in its position.
    """

    pos_dists = create_positional_dists(word_list = word_list, rating_method = "normalized")

    new_ratings = []

    for tup in ratings_list:
        word = tup[0]
        curr_rating = tup[1]
        
        intensifiers_sum = 0
        
        for char_id in range(0, len(word)): # iterate through chars in word

            char = word[char_id]
            # for ex_tup in example[char_id]: # iterate through the list that corresponds to the character position we're currently in of the word 
            for ex_tup in pos_dists[char_id]: # iterate through the list that corresponds to the character position we're currently in of the word 
                if ex_tup[0] == char:
                    intensifiers_sum += ex_tup[1] # add the intensifier for that letter from that list to the sum
                    # print(char, ex_tup[1])
                    break

        intensifier = float(intensifiers_sum / len(word))
        # print(word, intensifier)
        # final_rating = intensifier / curr_rating * strength
        final_rating = (1 - intensifier) / curr_rating * strength

        if rounding:
            final_rating = round(final_rating, rounding)
        # else:
        #     final_rating = final_rating

        new_ratings.append((word, final_rating))
        sorted_new_ratings = sorted(new_ratings, key = operator.itemgetter(1), reverse = True) # sorted descending

    return sorted_new_ratings        

In [12]:
tests = [("tests", 89), ("party", 49), ("later", 34)]

intensify_by_positions(word_list = official_words, ratings_list = tests, strength = 2, rounding = None)

[('later', 0.022424851940517614),
 ('party', 0.014627427808230845),
 ('tests', 0.013616171978570898)]

## wordle_wizard() 2.0

In [13]:
def get_grams(word_list: list, length: int = 2, from_start: bool = False, perc: bool = True, rounding: int = None):
    """
    Get a distribution of all grams of indicated length from passed list of words. Words must be each same length or else indexing issues may occur.

    Parameters:
    ------
    `word_list`: list
        list of words of the same length
    `length`: int
        length of desired gram
    `from_start`: bool
        if True, gram length is counted from the start of each word. If False, length is counted from the end
    `perc`: bool
        if True, normalizes returned values as a percentage of sum of all counts. If False, returns respective counts
    `rounding`: int
        if not None, rounds returned values to nearest int decimal place
    
    Returns:
    ------
    `gram_dist`: list
        list of tuples in structure [(gram, count), (gram, count), ...] in descending order of counts
    """

    gram_dist = {}
    for word in word_list:
        if from_start == True: # if prefix
            gram = word[:length]
        else: # if suffix
            gram = word[-length:]
        if gram not in gram_dist:
            gram_dist[gram] = 1
        else:
            gram_dist[gram] += 1

    gram_dist = sorted(gram_dist.items(), key = operator.itemgetter(1), reverse = True)
    gram_dist[:10]
    perc_dist = []

    if perc == True:    
        for gram, count in gram_dist:
            if rounding: # round result to indicated decimal place
                norm_score = round(((count / sum([c for g, c in gram_dist])) * 100), rounding)
            else: # don't round at all
                norm_score = (count / sum([c for g, c in gram_dist])) * 100
            perc_dist.append((gram, norm_score))

        gram_dist = sorted(perc_dist, key = operator.itemgetter(1), reverse = True)

    return gram_dist

In [14]:
get_grams(word_list = official_words, length = 2, from_start = False, perc = False, rounding = 4)[:5]

[('er', 141), ('ly', 56), ('ch', 56), ('se', 52), ('al', 49)]

In [131]:
def wordle_wizard2(guess: str, target: str, word_list: list, opt_thres: int = 100, max_guesses: int = 6, gram_bias: str = None, intensifier: bool = False, mult_strength: float = 1, return_stats: bool = False, drama: float = 0, verbose: bool = False):
    """
    Wordle Wizard 2.0. New and improved.

    Parameters:
    ------
    `guess`: str
        starting word
    `target`: str
        word for the model to find a path to
    `word_list`: list
        list of possible words. All words must be of same length
    `opt_thres`: int > 1
        after each guess is evaluated and all remaining possible words are found, opt_thres is the "optimizer" threshold above which the model will choose a new maximally diverse, highest rated word with all new letters, instead of using known correct letters
    `max_guesses`: int
        max number of guesses allowed before puzzle is considered unsolved. Default value is 6, matching official game
    `gram_bias`: str
        if "suffix", biases guessing towards words with more common suffixes. If "prefix", biases guessing towards words with more common prefixes
    `intensifier`: bool
        if True, ratings of each word are not just calculated based on letter diversity of each word, but also positionality of letters in each word
    `mult_strength`: float
        float value that affects the strength of the intensifier of the word ratings of all possible remaining words generated after each guess
    `return_stats`: bool
        if True, returns a dictionary of statistics and information about playthrough
    `verbose`: bool
        if True, print extra information as solution as found. If False, prints only guesses and found target word
    `drama`: float
        amount of time to wait at certain points as the functions runs. For dramatic effect and literally nothing else
    
    Returns:
    ------
    `stats_dict`: dict
        dictionary of metrics tracked about a wordle playthrough to make playthroughs comparable
    """

    guess = guess.lower() # lowering for consistency
    target = target.lower() # lowering for consistency
   
    if guess not in word_list and len(guess) == 5: word_list.append(guess)
    if target not in word_list and len(target) == 5: word_list.append(target)

    stats_dict = {}
    stats_dict['first_guess'] = guess
    stats_dict['target'] = target
    stats_dict['intensifier'] = intensifier
    stats_dict['mult_strength'] = mult_strength
    stats_dict['opt_thresh'] = opt_thres
    stats_dict['first_guess_vows'] = count_vows_cons(guess, y_vow = True)['vows']
    stats_dict['first_guess_cons'] = count_vows_cons(guess, y_vow = True)['cons']
    stats_dict['target_vows'] = count_vows_cons(target, y_vow = True)['vows']
    stats_dict['target_cons'] = count_vows_cons(target, y_vow = True)['cons']
    
    avg_guess_meaning = []
    all_guessed_letts = set() # running total of all unique guessed letters (corrent and incorrect) -- used for optimal mode

    wordlen = len(guess)
    ideal_letter_diversity = len(guess) # making a second one because this variable is decremented and I dont' want wordlen to be affected
    
    letter_positions = set(i for i in range(0, wordlen))
    
    guess_num = 0

    guessed_words = []
    perf_letters = []
    incorr_pos = []
    wrong_letters = []
    vows_per_guess = []
    cons_per_guess = []
    words_remaining = []
    if gram_bias == "suffix":
        word_list_grams = get_grams(word_list = official_words, length = 2, from_start = False, perc = True, rounding = 4)[:5]
    elif gram_bias == "prefix":
        word_list_grams = get_grams(word_list = official_words, length = 2, from_start = True, perc = True, rounding = 4)[:5]

    while guess:

        guess_num += 1

        if return_stats == False:
            time.sleep(drama)
            if guess_num == 1:
                print("------------------")
            # if verbose ==
            
            print(f"\nGuess {guess_num}: '{guess}'\n")
            if verbose == False:
                print("------------------")
        
        # stats tracked about each guess
        guessed_words.append(guess)
        all_guessed_letts.update(set(guess)) # set of all used letters -- used in "optimal" mode
        vows_per_guess.append(count_vows_cons(guess, y_vow = True)['vows'])
        cons_per_guess.append(count_vows_cons(guess, y_vow = True)['cons'])

        ### once target has been guessed, do all of these
        if guess == target:
            if return_stats == False:
                if verbose == False:
                    print(f"\nThe puzzle was solved after {guess_num} guesses.")
                else:
                    print(f"The puzzle was solved after {guess_num} guesses.")
                print(f"The target word was '{target}'.\n")
                print("------------------")
            if return_stats == True:
                stats_dict['num_guesses'] = guess_num
                stats_dict['avg_guess_meaning'] =  0 if sum(avg_guess_meaning) == 0 else round(sum(avg_guess_meaning) / len(avg_guess_meaning), 2)
                stats_dict['avg_vows_per_guess'] = sum(vows_per_guess) / len(vows_per_guess)
                stats_dict['avg_cons_per_guess'] = sum(cons_per_guess) / len(cons_per_guess)
                stats_dict['valid_solution'] = True if guess_num <= 6 else False
                stats_dict['gram_bias'] = str(gram_bias)

                if len(guessed_words) > 1:
                    lev_dists = []
                    for i in range(1, len(guessed_words)):
                        dist = Levenshtein.jaro_winkler(guessed_words[i], guessed_words[i-1])
                        lev_dists.append(dist)
                    stats_dict['avg_similarity'] = sum(lev_dists) / len(lev_dists)
                    stats_dict['luck'] = (sum(lev_dists) / len(lev_dists)) / guess_num
                    stats_dict['avg_words_remaining'] = sum(words_remaining) / len(words_remaining)
                else: # if first guess is target
                    stats_dict['avg_similarity'] = 1
                    stats_dict['luck'] = 1
                    stats_dict['avg_words_remaining'] = 0

                return stats_dict
            break
        
        ### if max_guesses is reached
        if guess_num == max_guesses:            
            if return_stats == False:
                if verbose == False:
                    print("\n")
                print(f"The puzzle could not be solved within the maximum number of guesses.")
                print(f"The target word was '{target}'.\n")
                print("------------------")            
            else:
                stats_dict['num_guesses'] = guess_num
                stats_dict['avg_guess_meaning'] =  0 if sum(avg_guess_meaning) == 0 else round(sum(avg_guess_meaning) / len(avg_guess_meaning), 2)
                stats_dict['avg_vows_per_guess'] = sum(vows_per_guess) / len(vows_per_guess)
                stats_dict['avg_cons_per_guess'] = sum(cons_per_guess) / len(cons_per_guess)
                stats_dict['valid_solution'] = True if guess_num <= 6 else False
                stats_dict['gram_bias'] = str(gram_bias)

                if len(guessed_words) > 1:
                    lev_dists = []
                    for i in range(1, len(guessed_words)):
                        dist = Levenshtein.jaro_winkler(guessed_words[i], guessed_words[i-1])
                        lev_dists.append(dist)
                    stats_dict['avg_similarity'] = sum(lev_dists) / len(lev_dists)
                    stats_dict['luck'] = (sum(lev_dists) / len(lev_dists)) / guess_num
                    stats_dict['avg_words_remaining'] = sum(words_remaining) / len(words_remaining)
                else: # if first guess is target
                    stats_dict['avg_similarity'] = 1
                    stats_dict['luck'] = 1
                    stats_dict['avg_words_remaining'] = 0
                
                return stats_dict
            break
        
        #### Evaluating current guess according to 3 checks of the game
        for i in letter_positions: # number of letters in each word (current word and target word)

            if guess[i] == target[i]:
                if (guess[i], i) not in perf_letters:
                    perf_letters.append((guess[i], i))
            elif guess[i] != target[i] and guess[i] in target:
                if (guess[i], i) not in incorr_pos:
                    incorr_pos.append((guess[i], i))
            elif guess[i] not in target:
                if (guess[i]) not in wrong_letters:
                    # wrong_letters.add(guess[i])
                    wrong_letters.append(guess[i])
                    wrong_letters = list(set(wrong_letters))

        perf_letters = sorted(perf_letters, key = operator.itemgetter(1), reverse = False)
        incorr_pos = sorted(incorr_pos, key = operator.itemgetter(1), reverse = False)
        wrong_letters = sorted(wrong_letters, key = operator.itemgetter(0), reverse = False)

        if return_stats == False:
            if verbose == True:
                # print("\n------\nStats:")
                # print("----------")
                print(f"Letters in correct positions: {perf_letters}")
                print(f"Letters in incorrect positions: {incorr_pos}")
                print(f"Incorrect letters: {wrong_letters}")
                # print("----------\n")

        potential_next_words = get_possible_guesses(word_list = word_list, perf_letters = perf_letters, incorr_pos = incorr_pos, wrong_letters = wrong_letters)
        potential_next_words = [word for word in potential_next_words if word not in guessed_words] # excludes already guessed words, or else function runs infinitely

        if return_stats == False:
            if verbose == True:
                time.sleep(drama / 1.5)
                if len(potential_next_words) != 1:
                    print(f"\n{len(potential_next_words)} possible words remaining.\n")
                else:
                    print(f"\n{len(potential_next_words)} possible word remaining.\n")

        if len(potential_next_words) > opt_thres:
            
            if return_stats == False:
                if verbose == True:
                    print("------------------")    
                    time.sleep(drama / 1.5)
                    print(f"\nNot enough related information in the previous guess.\n")
                    time.sleep(drama)
                    print(f"Finding best next guess with new letters...\n")
                    time.sleep(drama / 1.5)
                    # print("------------------")    

            while ideal_letter_diversity:
            
                remaining_opposites = set()
                for word in official_words:
                    if word not in guessed_words: # eliminate previously guessed words
                        if len(set(word).difference(set(all_guessed_letts))) == wordlen:
                            remaining_opposites.add(word)

                if len(remaining_opposites) > 0: # if it can find at least one word
                    guess_ratings = get_word_meaning(words_to_rate = list(remaining_opposites), word_list = word_list, normalized = False, ascending = False)
                    break
                else:
                    ideal_letter_diversity -= 1 # try with 4 new unique letters, then 3, then 2, etc
                
                if ideal_letter_diversity == 0: # if it can't find any words with at least 1 new letter (this should be impossible, but including this just in case, as a killswitch)
                    guess_ratings = get_word_meaning(words_to_rate = potential_next_words, word_list = word_list, normalized = False, ascending = False)
                    break
                else: # 
                    guess_ratings = get_word_meaning(words_to_rate = list(remaining_opposites), word_list = word_list, normalized = False, ascending = False)

        else: # if there are less than optim_thresh remaining possible words, only count remaining words as ones that satisfy all 3 of the checked criteria
            
            # checking for perfect letters
            if len(perf_letters) > 0:
                keeps = set()
                for word in potential_next_words:
                    for goodlett, goodpos in perf_letters:
                        if word[goodpos] == goodlett: # word letters ARE those letters in those spots
                            keeps.add(word)

            # checking for words with letters in incorrect pos, adding them to exclusions set
            if len(incor_pos) > 0:
                excludes = set()
                for word in potential_next_words:
                    for badlett, badpos in incor_pos:
                        if word[badpos] == badlett: 
                            excludes.add(word)

            # checking for words with bad letters anywhere in them, adding them to exclusions set
            if len(wrong_letters) > 0:
                for word in potential_next_words:
                    if len(set(word).difference(set(wrong_letters))) < len(set(word)): # if the difference is less than the number of unique chars in the word (there aren't always 5 unique chars)
                        excludes.add(word)

            potential_next_words = list(potential_next_words)
            guess_ratings = get_word_meaning(words_to_rate = potential_next_words, word_list = word_list, normalized = False, ascending = False)

        if intensifier == True:
            guess_ratings = intensify_by_positions(word_list = official_words, ratings_list = guess_ratings, strength = mult_strength, rounding = 2)

        words_remaining.append(len(guess_ratings))
        guess = guess_ratings[0][0] # word in [(word, rating)] structure. Chooses first word in first tuple
        avg_guess_meaning.append(guess_ratings[0][1])
        if return_stats == False:
            if verbose == True:
                time.sleep(drama / 1.5)
                print(f"Next guess: '{guess}'\n")
                print("------------------")

In [132]:
"""
NEXT STEPS:
- add prefix/suffix bias (whichever is currently in ww1.0)


"""

wordlist = official_words.copy() # so the list doesn't grow with each OOV word

starting_word = "irate"
target_word = "thyme"

# starting_word = random.choice(official_words)
# target_word = random.choice(official_words)

wordle_wizard2(guess = starting_word, target = target_word, word_list = wordlist,
               opt_thres = 100, max_guesses = 8, gram_bias = "suffix",
               intensifier = False, mult_strength = 10,
               return_stats = False, drama = 0.0, verbose = False)

------------------

Guess 1: 'irate'

------------------

Guess 2: 'stole'

------------------

Guess 3: 'dunce'

------------------

Guess 4: 'thyme'

------------------

The puzzle was solved after 4 guesses.
The target word was 'thyme'.

------------------


In [127]:
best_guess_words(word_list = official_words, show_letters = False)

[('arose', 38.02), ('adore', 35.72), ('opera', 35.49)]

In [130]:
word_pairs = []

for word1 in official_words:
    for word2 in official_words:
        if Levenshtein.distance(word1, word2) == 1:
            word_pairs.append((word1, word2))

len(list(set(word_pairs)))

5124

In [86]:
# len(word_pairs)
# sorted(word_pairs, key = operator.itemgetter(1), reverse = False) 
ones_counts = {}
for word1, word2 in word_pairs:
    one = word1 + "1"
    if one not in ones_counts:
        ones_counts[one] = 1
    else:
        ones_counts[one] += 1

sorted(ones_counts.items(), key = operator.itemgetter(1), reverse = True)[:10]

[('share1', 15),
 ('store1', 13),
 ('stare1', 12),
 ('shore1', 12),
 ('shale1', 11),
 ('stale1', 11),
 ('stack1', 10),
 ('patty1', 10),
 ('cover1', 9),
 ('spare1', 9)]

In [93]:
for word in official_words:
    if Levenshtein.distance("share", word) == 1:
        print(word)

snare
stare
scare
shard
spare
shape
shake
shade
shore
shame
shale
shire
sharp
shave
shark


In [18]:
asdf

NameError: name 'asdf' is not defined

In [None]:
len(official_words) // 10
len(official_words) // 100

1679360070.10414
1679360070.104507


## Simulations

In [110]:
import time
## Takes about 7min20s for 18.5k iterations

excepts = [] # keeping track of word combinations that don't work for some reason

stats_master = {}

for start_word in ["irate", "fuzzy"]:
    

    for bias in ["suffix", "prefix", None]:
        # for timing iterations
        start = time.time()

        for target_word in official_words:

            try:
                complete = wordle_wizard2(guess = start_word, target = target_word, word_list = official_words,
                opt_thres = 100, max_guesses = 15, gram_bias = bias,
                intensifier = False, mult_strength = 10,
                return_stats = True, drama = 0.0, verbose = False)

            except:
                ZeroDivisionError
                excepts.append((complete["first_guess"], complete["target"]))
                
            for metric, result in complete.items():
                if metric in stats_master.keys():
                    stats_master[metric].append(result)
                else:
                    stats_master[metric] = []

        # for timing iterations
        stop = time.time()
        print(((bias, start_word, round(stop - start, 2)), f"{round((stop - start) / len(official_words), 2)} seconds per bias"))     

### df creation and csv writing 
sims_df = pd.DataFrame(stats_master)

print(f"{len(sims_df)} iterations run. {len(excepts)} combinations excepted.")
# print(excepts[:10])
# print(sims_df['first_guess'].unique().tolist())

print(sims_df.shape)
sims_df.head()

(('suffix', 'irate', 86.18), '0.04 seconds per bias')
(('prefix', 'irate', 88.0), '0.04 seconds per bias')
((None, 'irate', 83.24), '0.04 seconds per bias')
(('suffix', 'fuzzy', 104.81), '0.05 seconds per bias')
(('prefix', 'fuzzy', 108.68), '0.05 seconds per bias')
((None, 'fuzzy', 98.44), '0.04 seconds per bias')
13853 iterations run. 123 combinations excepted.
(13853, 18)


Unnamed: 0,first_guess,target,intensifier,mult_strength,opt_thresh,first_guess_vows,first_guess_cons,target_vows,target_cons,num_guesses,avg_guess_meaning,avg_vows_per_guess,avg_cons_per_guess,valid_solution,gram_bias,avg_similarity,luck,avg_words_remaining
0,irate,thyme,False,10,100,3,2,2,3,15,31.32,2.066667,2.933333,False,suffix,0.92381,0.061587,57.0
1,irate,mower,False,10,100,3,2,2,3,7,30.03,2.285714,2.714286,False,suffix,0.566667,0.080952,30.166667
2,irate,horde,False,10,100,3,2,2,3,15,29.49,2.066667,2.933333,False,suffix,0.890476,0.059365,49.0
3,irate,heard,False,10,100,3,2,2,3,15,31.08,2.933333,2.066667,False,suffix,0.857143,0.057143,52.0
4,irate,tenor,False,10,100,3,2,2,3,3,31.44,2.333333,2.666667,True,suffix,0.0,0.0,84.5


In [133]:
# excepts

In [147]:
# print(sims_df.query("opt_thres == 230")['num_guesses'].mean()) # 4.28
# print(sims_df.query("opt_thres == 23")['num_guesses'].mean()) # 4.30
for word in sims_df["first_guess"].unique().tolist(): 
    for bias in sims_df["gram_bias"].unique().tolist(): 
        print("------")
        print((word, bias))
        print(sims_df.query(f"first_guess == '{word}' & 'gram_bias' == '{bias}'")['num_guesses'].mean())

------
('irate', 'suffix')
nan
------
('irate', 'prefix')
nan
------
('irate', 'None')
nan
------
('fuzzy', 'suffix')
nan
------
('fuzzy', 'prefix')
nan
------
('fuzzy', 'None')
nan


## Testing wordle_wizard2()

In [None]:
[round(i, 1) for i in np.arange(1, 2, 0.1)]

In [None]:
# print(f'False: {sims_df.query("multiplier == False")["num_guesses"].mean()}')
# print(f'False: {sims_df.query("multiplier == True")["num_guesses"].mean()}')
print(sims_df.query("first_guess == 'later' & multiplier == True")["num_guesses"].mean())
print(sims_df.query("first_guess == 'later' & multiplier == False")["num_guesses"].mean())
print("-----")
print(sims_df.query("first_guess == 'fuzzy' & multiplier == True")["num_guesses"].mean())
print(sims_df.query("first_guess == 'fuzzy' & multiplier == False")["num_guesses"].mean())

In [None]:
create_positional_dists(word_list = ["tests"], rating_method="ranked")

In [None]:
def get_word_entropy(word: str, return_prob: bool = True):

    char_freqs = {}

    for char in word:
        if char not in char_freqs:
            char_freqs[char] = 1
        else:
            char_freqs[char] += 1

    if return_prob == True:
        prob_dist = []
        for char, freq in char_freqs.items():
            prob_dist.append((char, (freq/len(char_freqs))))        
        sorted_freqs = sorted(prob_dist, key = operator.itemgetter(1), reverse = True)
    else:
        sorted_freqs = sorted(char_freqs.items(), key = operator.itemgetter(1), reverse = True)

    return sorted_freqs

result = get_word_entropy("abcdefghijklmnopqrstuvwxyz", return_prob = True)
result_letters = [letter for letter, prob in result]
result_dist = [round(prob, 2) for letter, prob in result]
print(result_letters)
print(result_dist)

from scipy.stats import entropy
entropy(pk = result_dist) # only measures how diverse a word is

In [None]:
asdf

## Testing Best and Worst Words Against all Wordle Words

In [None]:
excepts = [] # keeping track of word combinations that don't work for some reason

stats_master = {}

for start_word in ["later", "fuzzy"]:

    for target_word in official_words:

        ## only run the combination if it hasn't been done already
        try:
            complete = wordle_wizard(word_list = official_words, max_guesses = 15, 
                guess = start_word, target = target_word,
                random_guess = False, random_target = False, 
                    verbose = False, drama = 0, return_stats = True)

        except:
            IndexError
            excepts.append((complete["first_guess"], complete["target_word"]))
            
        for metric, result in complete.items():
            if metric in stats_master.keys():
                stats_master[metric].append(result)
            else:
                stats_master[metric] = []

### df creation and csv writing 
sims_df = pd.DataFrame(stats_master)

print(f"{len(sims_df)} iterations run. {len(excepts)} combinations excepted.")
print(excepts[:10])

print(sims_df.shape)
print(sims_df['first_guess'].unique().tolist())
sims_df.head()

In [None]:
later_guesses = sims_df.query("first_guess == 'later'")[['first_guess', 'num_guesses']]
sims_df.query("first_guess == 'later'")['num_guesses'].mean() # later : ~3.81 avg guesses

for word in sims_df["first_guess"].unique().tolist():
    mean_guesses = round(sims_df.query(f"first_guess == '{word}'")['num_guesses'].mean(), 2) # later : ~3.81 avg guesses, fuzzy : ~4.36 avg guesses
    word_df = sims_df.query(f"first_guess == '{word}'")['num_guesses']
    
    word_df_dist_plot = px.histogram(word_df, x = "num_guesses", title = f"Distribution of Guesses with '{word}' as Starting Word",
                                        labels = {"num_guesses": "Number of Guesses"})
    word_df_dist_plot.add_vline(x = mean_guesses, line_width = 4, line_dash = "dot", line_color = "black",
                                    annotation_text = f"Mean = {mean_guesses}", annotation_font_size = 20,
                                    annotation_font_color = "black", annotation_position = "right")
    word_df_dist_plot.update_layout(title_font_size = 22)
    word_df_dist_plot.update_traces(marker_color = "#6ca965")

    word_df_dist_plot.show()