In [2]:
'''
GAME PLAN

for each word in guesses:
  add up the frequencies of each letter from letters_freq
  call it total_freq
end with a dict with each word having a total_freq
word in guesses with the highest total_freq is the best one


when clue comes in:
1 - narrow down guesses
2 - find the one with the most frequent letters (same method as above)
3 - repeat
'''

'\nGAME PLAN\n\nfor each word in guesses:\n  add up the frequencies of each letter from letters_freq\n  call it total_freq\nend with a dict with each word having a total_freq\nword in guesses with the highest total_freq is the best one\n\n\nwhen clue comes in:\n1 - narrow down guesses\n2 - find the one with the most frequent letters (same method as above)\n3 - repeat\n'

In [3]:
import numpy as np
import pandas as pd
from time import sleep

In [4]:
gPath = 'wordle_data/wordle-allowed-guesses.txt'
aPath = 'wordle_data/wordle-answers-alphabetical.txt'

In [5]:
# read files

gFile = open(gPath, 'r')
guesses = gFile.read()
aFile = open(aPath, 'r')
answers = aFile.read()
guesses += '\n' + answers
guesses, answers = np.array(guesses.split('\n')), np.array(answers.split('\n'))
guesses, answers= np.sort(guesses), np.sort(answers)

In [6]:
# log letter frequencies

letters = ''.join(answers)
# print(letters)
# remove everything but the letters
# letters = str(letters)
# letters = ''.join(letters.split())
# letters = letters.strip('[]')
# letters = letters.replace("'", "")

letters_freq = {i:letters.count(i) for i in set(letters)} # set() gets the unique values
letters_freq = {k: v for k, v in sorted(letters_freq.items(), key=lambda item: item[1], reverse=True)} # sort by most frequent
print(letters_freq)

{'e': 1233, 'a': 979, 'r': 899, 'o': 754, 't': 729, 'l': 719, 'i': 671, 's': 669, 'n': 575, 'c': 477, 'u': 467, 'y': 425, 'd': 393, 'h': 389, 'p': 367, 'm': 316, 'g': 311, 'b': 281, 'f': 230, 'k': 210, 'w': 195, 'v': 153, 'z': 40, 'x': 37, 'q': 29, 'j': 27}


In [7]:
# get top 5 most frequent letters
top5 = dict(list(letters_freq.items())[:5]) # slice list: first 5
print(top5)

{'e': 1233, 'a': 979, 'r': 899, 'o': 754, 't': 729}


In [8]:
def generateClue(guess, answer):
  '''
  returns a list of clues based on given guess and correct word, and the guess used
  PRECONDITIONS: 
  - guess: 5-letter string in guesses
  - answer: 5-letter string in answers
  0 = GREY
  1 = YELLOW
  2 = GREEN
  '''
  # nuance: duplicate letters in guess should be grey if the other is green BUT they show as yellow instead
  clues = [0, 0, 0, 0, 0]
  for i in range(5):
    if guess[i] in answer:
      if guess[i] == answer[i]:
        clues[i] = 2
      else:
        clues[i] = 1
    else:
      clues[i] = 0
  
  return clues  

In [9]:
def narrowGuesses(clues, guess, currentGuesses):
  '''
  returns a narrowed list of possible guesses based on clues given for a guess
  PRECONDITIONS:
  - clues: list length 5 of only 0, 1, or 2
  - guess: 5-letter string in guesses (not necessarily in currentGuesses)
  - currentGuesses: np array of 5-letter strings, sublist of guesses
  '''
  # first remove the current guess
  currentGuesses = np.delete(currentGuesses, np.where(currentGuesses == guess))
  for i in range(5):
    letter = guess[i]
    '''
    clues: [0 1 1 2 0]
    asnwer: terror
    guess: train
    letter: a
    clue: yellow (1)
    '''

    if clues[i] == 0: # grey
      # remove any words in currentGuesses that HAVE the current letter
      for word in currentGuesses:
        if letter in word:
          currentGuesses = np.delete(currentGuesses, np.where(currentGuesses == word))

    elif clues[i] == 1: # yellow
      # remove any words that DON'T HAVE the current letter OR have the letter in the SAME PLACE
      for word in currentGuesses:
        if (letter == word[i]) or (letter not in word):
          currentGuesses = np.delete(currentGuesses, np.where(currentGuesses == word))

    elif clues[i] == 2: # green
      # remove any words that DON'T HAVE the current letter AT POSITION i
      for word in currentGuesses:
        if (not letter == word[i]) or (letter not in word):
          currentGuesses = np.delete(currentGuesses, np.where(currentGuesses == word))      
  
  return currentGuesses

In [10]:
def bestGuess(currentGuesses, duplicatePenalty=1):
  '''
  returns the best guess (one with the most frequent letters) based on the given options, and a list of the best options
  PRECONDITIONS:
  - currentGuesses: np array of 5-letter strings
  '''
  # convert currentGuesses to a dict so each word can be given a value
  freqDict = dict(enumerate(currentGuesses, 1))
  freqDict = {value:key for key, value in freqDict.items()} # reverse keys and values
  for word in currentGuesses:
    totalFreq = 0;
    for letter in word:
      totalFreq += letters_freq.get(letter) * (duplicatePenalty/word.count(letter))
      # totalFreq += letters_freq.get(letter)
    freqDict[word] = totalFreq
  # sort by highest value'
  freqDict = {k: v for k, v in sorted(freqDict.items(), key=lambda item: item[1], reverse=True)}
  
  # print(freqDict)
  best = list(freqDict.keys())[0] # fetch highest value (should be first)
  bestWords = list(freqDict.keys())
  return best, bestWords;

In [11]:
def runWordle(numTimes=1, display=True, firstWord=bestGuess(guesses)[0]):
  '''
  runs the game a given number of times (default=1)
  returns: np array of # tries it takes each time (for data analysis purposes)
  '''
  triesLog = np.array([])

  for i in range(numTimes):
    # game loop
    
    # setup
    answer = np.random.choice(answers) # choose random answer
    # answer='hoard'
    clues = [] # initiate clues, currentGuesses, guess
    guess = ''
    currentGuesses = guesses
    bestWords = {}
    tries = 0

    if display: print('Chosen answer: ', answer)
    while not clues == [2, 2, 2, 2, 2]:
      guess, bestWords = bestGuess(currentGuesses)
      if display:
        print('Best guesses: ', bestWords[:20])
        print('Current guess: ', guess)
        sleep(0.5)
      if tries == 0:
        clues = generateClue(firstWord, answer)
      else:
        clues = generateClue(guess, answer)
      if display:
        print('Current clues: ', clues)
        sleep(0.5)
      currentGuesses = narrowGuesses(clues, guess, currentGuesses)
      tries+=1

    triesLog = np.append(triesLog, tries)
    if display:
      print('Successfully guessed the right word in', tries, 'tries ', end='')
      if tries <= 6: print(':)')
      else: print(':(')
      print()
      sleep(1)

  return triesLog

In [12]:
'''
Problems observed:
1. Starting word is "areae" because it has many frequent letters. Should find a way to somewhat (not completely) penalize duplicate letters
2. Same problem is observed with many of the guesses. almost all of them have duplicate letters. Maybe find a way to reward getting more clues?
'''

'\nProblems observed:\n1. Starting word is "areae" because it has many frequent letters. Should find a way to somewhat (not completely) penalize duplicate letters\n2. Same problem is observed with many of the guesses. almost all of them have duplicate letters. Maybe find a way to reward getting more clues?\n'

In [13]:
def analyze(array):
  arr_df = pd.DataFrame(array)
  print('Analysis of performance:')
  print(arr_df.describe())

In [14]:
'''
Analysis of performance:
count  500.000000
mean     4.630000
std      1.448204
min      2.000000
25%      4.000000
50%      4.000000
75%      5.000000
max     10.000000
'''

'\nAnalysis of performance:\ncount  500.000000\nmean     4.630000\nstd      1.448204\nmin      2.000000\n25%      4.000000\n50%      4.000000\n75%      5.000000\nmax     10.000000\n'

In [15]:
'''
Observations: program spends most time in narrowGuesses(), more specifically running np.delete(). 
proposal: finding a more efficient alternative?
Also, there are words that have the same letters (therefore same total_freq). So maybe we could find a way to reward letter placement as well.

NEW IDEA:
Although the program is using the letters that occur most frequent in the list of answers for its first word,
oater is not necessarily the best because it does not account for the positions of the letters.
To find the ACTUAL best word, I should run tests.
1. Get a list of possible best first words (don't know from where I will get this. If it was efficient I would run tests on all possible words)
2. Decide on a good number of times to run wordle with each first word to get a stable average # of tries
3. In theory, the first word with the lowest average # of tries SHOULD BE THE BEST ONE.
'''

"\nObservations: program spends most time in narrowGuesses(), more specifically running np.delete(). \nproposal: finding a more efficient alternative?\nAlso, there are words that have the same letters (therefore same total_freq). So maybe we could find a way to reward letter placement as well.\n\nNEW IDEA:\nAlthough the program is using the letters that occur most frequent in the list of answers for its first word,\noater is not necessarily the best because it does not account for the positions of the letters.\nTo find the ACTUAL best word, I should run tests.\n1. Get a list of possible best first words (don't know from where I will get this. If it was efficient I would run tests on all possible words)\n2. Decide on a good number of times to run wordle with each first word to get a stable average # of tries\n3. In theory, the first word with the lowest average # of tries SHOULD BE THE BEST ONE.\n"

In [16]:
def solve(currentGuesses=guesses):
    guess = input('Enter first guess:\n').lower()
    if guess == '':
        guess = bestGuess(currentGuesses)[0]
        print('First guess: ' + guess)
    while True:
        clues = []
        for i in range(5):
            prompt = 'Enter clue for letter ' + guess[i] + '\n'
            userIn = input(prompt)
            if userIn == 'quit':
                break
            clues.append(int(userIn))
        if len(clues) < 5:
            break
        if clues == [2,2,2,2,2]:
            print('Correct word has been found')
            break
        currentGuesses = narrowGuesses(clues, guess, currentGuesses)
        guess, bestWords = bestGuess(currentGuesses)
        print('Best guesses:', bestWords[:5])
        print('Best Guess:', guess)
        if len(bestWords) == 1:
            print('Correct word has been found')
            break
        sleep(0.5)


In [17]:
# triesLog = runWordle(3, display=True) # custom first word DOES NOT WORK
# analyze(triesLog)

In [19]:
solve()

First guess: oater
Best guesses: ['nails', 'laics', 'salic', 'dalis', 'laids']
Best Guess: nails
Best guesses: ['daych', 'gaucy', 'gauch', 'cadgy', 'gaudy']
Best Guess: daych


ValueError: invalid literal for int() with base 10: ''