In [None]:
# The purpose of this project is to see if nltk can help me find a better 
# starting word for Wordle. 

In [None]:
# %pip install nltk

In [None]:
# Downloads all packages from the nltk framework; this saves time down the road.
# NOTE(review): the cells visible in this notebook only read the 'words' and
# 'names' corpora, so nltk.download('words') and nltk.download('names') would
# presumably be enough -- confirm nothing else relies on the full download
# before narrowing it.
import nltk
nltk.download('all')

In [None]:
# Import packages.
import itertools
from nltk.corpus import words
from nltk.corpus import names

In [None]:
# Build the candidate vocabulary: every entry of nltk.corpus.words, upper-cased,
# with every upper-cased entry of nltk.corpus.names removed (the word corpus
# contains common English names, which we do not want to keep).
# NOTE: `words` and `names` are rebound here from the imported corpus objects
# to plain lists, so re-run the import cell before re-running this one.
names = [name.upper() for name in names.words()]
words = sorted({word.upper() for word in words.words()} - set(names))

In [None]:
# Collect every qualifying word for your version of Wordle. Standard Wordle
# uses 5 letters, but if you are playing with more or fewer, adjust the
# parameter below as needed.
number_of_letters = 5

# sorted() hands the matches back in ascending (alphabetical) order; the
# ordering is purely cosmetic.
wordle_words = sorted(word for word in words if len(word) == number_of_letters)

In [None]:
# Now we think about the problem from a meta level. Given a fixed number of 
# characters, there are a few considerations to choosing a 'best' starting 
# word:
# 
#     1.  You want the word to have distinct characters. If characters are
#         repeated, then you are wasting a good guess (remember, traditional
#         Wordle has a fixed number of guesses).
#     2.  You need to choose a method that maximizes the number of 'good' 
#         letters. A 'good' letter should be thought of as frequently occurring.
#         This means you need a definition of 'frequently occurring'. The 
#         obvious ways to do it are:
#             (i)   Given the set of possible words, figure out which letters  
#                   show up the most across all words, counting repeats.
#                   Letters with a higher count mean that the letter occurs 
#                   more often than those with a low count.
#             (ii)  Given the set of possible words, figure out how many words
#                   contain each letter. Letters with a higher
#                   count of words should be better choices than those with a 
#                   low count.
#         Each has its own advantage.
# 
# My strategy for approaching this will be to look for words that satisfy the
# intersection of these definitions and happen to have no repeated letters.

In [None]:
# Let's start with definition 2.i.

# Builds a list of tuples with the structure
# (letter, total number of times the letter occurred across all words),
# ordered from the largest total to the smallest. Every occurrence counts,
# including repeats inside a single word. For instance, given the list of
# words ['abra','cadabra','alakazam'], the tuple for 'a' would be ('a', 9)
# even though there are only 3 words.

letter_totals = {}

for word in wordle_words:
    for letter in word:
        # .get() supplies 0 the first time a letter is encountered.
        letter_totals[letter] = letter_totals.get(letter, 0) + 1

# The sort is stable, so letters with equal totals keep first-seen order.
wordle_letter_frequency = sorted(
    letter_totals.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Now let's tackle definition 2.ii.

# Builds a list of tuples with the structure
# (letter, number of words the letter was in), ordered from the largest
# count to the smallest, with ties broken by letter in reverse alphabetical
# order. A letter is counted at most once per word, however many times it
# repeats inside that word. For instance, given the list of words
# ['abra','cadabra','alakazam'], the tuple for 'a' would be ('a', 3)
# despite 'a' showing up 9 times in the 3 words.

words_per_letter = {}

for word in wordle_words:
    # dict.fromkeys(word) yields each distinct letter of the word once, in
    # first-seen order, so a repeated letter cannot be counted twice.
    for letter in dict.fromkeys(word):
        words_per_letter[letter] = words_per_letter.get(letter, 0) + 1

wordle_word_contains_letter_frequency = sorted(
    words_per_letter.items(),
    key=lambda pair: (pair[1], pair[0]),
    reverse=True,
)

In [None]:
def combinations_of_letters(list_of_letter_tuples, n):
    '''
    Generate every n-letter combination, best combinations first.

    Each combination is returned as a list of (letter, count) tuples ordered
    from the most frequent letter to the least frequent one. The combinations
    themselves are ranked in descending order of their least frequent
    letter's count, then the next least frequent, and so on, so combinations
    built only from frequently occurring letters come first and the least
    promising ones sink to the bottom (so we spend less time looking for
    matches in those).

    params:
        list_of_letter_tuples : list
            A list of (letter, number of occurrences) tuples.

        n : int
            The number of letters per combination, i.e. the overall length
            of the wordle word you are looking for.

    returns:
        list
            A list of combinations, each a list of n (letter, count) tuples.
            Empty when n exceeds the number of available letters.
    '''

    # The number of combinations generated is LEN_CHOOSE_N, where LEN is the
    # number of letter tuples supplied. Use the `n` argument (not a
    # module-level setting) so the function honours what the caller asked for.
    # Within each combination, order the letters most-frequent to
    # least-frequent.
    ordered_combinations = [
        sorted(combination, key=lambda letter_tuple: letter_tuple[1], reverse=True)
        for combination in itertools.combinations(list_of_letter_tuples, n)
    ]

    # Rank the combinations. The key reads the counts from the least frequent
    # letter back to the most frequent one; building it from the combination
    # itself (rather than fixed indices) makes it work for any n.
    ordered_combinations.sort(
        key=lambda combination: tuple(
            letter_tuple[1] for letter_tuple in reversed(combination)
        ),
        reverse=True,
    )

    return ordered_combinations

In [None]:
# Generate the possible unique letter combinations for the configured word
# length. Notice that the two lists of combinations generated below must be
# of equal length, because the search loop walks them side by side by index.
letter_freq_comb = combinations_of_letters(wordle_letter_frequency, number_of_letters)
contains_letter_freq_comb = combinations_of_letters(wordle_word_contains_letter_frequency, number_of_letters)

In [None]:
# Now we build the intersection of the definitions.

# Running lists of every word matched so far under each definition, plus the
# first non-empty batch of matches each definition produced.
words_from_letter_tuples_1 = []
words_from_letter_tuples_2 = []
first_words_from_letter_tuples_1 = []
first_words_from_letter_tuples_2 = []

# Walk both combination lists side by side, one combination from each per pass.
for i, combination_1 in enumerate(letter_freq_comb):

    # Definition 2.i: keep only the words that contain EVERY letter of the
    # combination. As long as a combination has as many (distinct) letters as
    # the words have characters, every match MUST have distinct letters.
    possible_words = [
        candidate
        for candidate in wordle_words
        if all(letter[0] in candidate for letter in combination_1)
    ]
    words_from_letter_tuples_1.extend(possible_words)

    # Remember the very first batch of matches for this definition.
    if possible_words and not first_words_from_letter_tuples_1:
        first_words_from_letter_tuples_1 = list(possible_words)

    # Definition 2.ii: exactly the same process with the second list.
    combination_2 = contains_letter_freq_comb[i]
    possible_words = [
        candidate
        for candidate in wordle_words
        if all(letter[0] in candidate for letter in combination_2)
    ]
    words_from_letter_tuples_2.extend(possible_words)
    if possible_words and not first_words_from_letter_tuples_2:
        first_words_from_letter_tuples_2 = list(possible_words)

    # Having taken one combination from each list, see whether the two
    # definitions now share at least one word.
    overlap = set(words_from_letter_tuples_1) & set(words_from_letter_tuples_2)
    if overlap:
        # The definitions agree on a word: report the findings and stop
        # looking for new words.
        print('most frequent letter approach - {}'.format(first_words_from_letter_tuples_1))
        print('contained in most words approach - {}'.format(first_words_from_letter_tuples_2))
        print('first overlap in definitions - {}'.format(sorted(overlap)))
        break

# Show the six highest-ranked letters under each definition.
print(wordle_letter_frequency[:6])
print(wordle_word_contains_letter_frequency[:6])

In [None]:
# most frequent letter approach - ['AROSE', 'OREAS']
#     AROSE means 'emerged, became apparent // to have stood or gotten up'
#     OREAS means 'mountain or hill'
# contained in most words approach - ['ARIES', 'ARISE', 'RAISE', 'SERAI']
#     ARIES means 'first sign of the Zodiac'
#     ARISE means 'emerges, becomes apparent // to stand or get up'
#     RAISE means 'to lift to a higher position // an increase in salary // to increase strength or amount of'
#     SERAI means 'an inn // a Turkish palace'
# first overlap in definitions - ['ARIES', 'ARISE', 'AROSE', 'OREAS', 'RAISE', 'SERAI']

# This is so surprising to me. If we assume that SERAI and OREAS have non-English 
# etymologies (and they do), then this would exclude them from Wordle words. That 
# leaves us with ['ARIES', 'ARISE', 'AROSE', 'RAISE']. If we assume that ARIES 
# is a proper noun (and it could be), then that leaves us with 
# ['ARISE', 'AROSE', 'RAISE']. All three of these words have a connotation of 
# emerging or getting stronger. Do you find it weird that choosing a starting
# word for standard 5-letter Wordle is tied to the idea of emerging or 
# getting stronger? I do...