### Import and install

In [135]:
import nltk
import string
import pandas as pd
import numpy as np
import copy
import re

from collections import defaultdict, Counter
nltk.download('words')

[nltk_data] Downloading package words to /Users/jackcook/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Get 5 letter words

The word lists were downloaded from https://www.powerlanguage.co.uk/wordle/main.e65ce0a5.js, which contains two arrays. The first (shorter) array of ~2,500 words is [apparently the list of possible answers](https://puzzling.stackexchange.com/questions/114419/what-dictionary-is-wordle-based-on), whereas the second array holds ~10,000 other words that are valid guesses.

For our use case, we only care about answers as potential words.

In [131]:
# Load the raw answer file; the answers are parsed from its first line only,
# where they are stored as one comma-separated list
with open('answers.txt') as file:
    possible_answers = list(file)

first_line_tokens = possible_answers[0].split(',')

# Upper-case every token and strip anything that is not a letter (quotes, brackets, newlines)
list_possible_answers = sorted(
    re.sub(r'[^A-Z]', '', token.upper()) for token in first_line_tokens
)
print(len(list_possible_answers), list_possible_answers[:5])

2315 ['ABACK', 'ABASE', 'ABATE', 'ABBEY', 'ABBOT']


Arrange as a DataFrame

In [136]:
# One row per answer: a column for each letter position plus the full word
letter_columns = [f'letter_{position}' for position in range(1, 6)]
arr_words_5l = np.array([list(word) for word in list_possible_answers])
df_words_5l = (
    pd.DataFrame(arr_words_5l, columns=letter_columns)
    .assign(word=list_possible_answers)
)
df_words_5l.head()

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word
0,A,B,A,C,K,ABACK
1,A,B,A,S,E,ABASE
2,A,B,A,T,E,ABATE
3,A,B,B,E,Y,ABBEY
4,A,B,B,O,T,ABBOT


Assess how frequent letters are at each position

In [137]:
# Tally how often each letter occurs at each of the five positions (keys are 1-indexed)
test_dict_letter_counts = Counter()
for position in range(1, 6):
    column_name = f'letter_{position}'
    test_dict_letter_counts[position] = Counter(df_words_5l[column_name])

test_dict_letter_counts[1]

Counter({'A': 141,
         'B': 173,
         'C': 198,
         'D': 111,
         'E': 72,
         'F': 136,
         'G': 115,
         'H': 69,
         'I': 34,
         'J': 20,
         'K': 20,
         'L': 88,
         'M': 107,
         'N': 37,
         'O': 41,
         'P': 142,
         'Q': 23,
         'R': 105,
         'S': 366,
         'T': 149,
         'U': 33,
         'V': 43,
         'W': 83,
         'Y': 6,
         'Z': 3})

Initialise game

In [198]:
# Scratch check: a string unpacks into a list of its characters, so index 0 is the first letter
[*'acbc'][0]

'a'

In [263]:
# The 26 uppercase letters A-Z, one list element per letter
alpha_list = [*string.ascii_uppercase]

class Game:
    '''
    State of a single Wordle game, plus a frequency-based ranking of the answers
    that are still possible.

    Attributes
    ----------
    possible_letters : list
        Letters that may still appear in a position that has not been solved.
    dict_misplaced_letters : Counter
        letter -> minimum number of copies that must still sit in unsolved positions.
    df_possible_5l_words : DataFrame
        Remaining candidate answers (columns letter_1..letter_5 and word; a
        freq_score column is added by guess()).
    dict_letters : defaultdict
        position (1-5) -> solved letter, or None while the position is unsolved.
    dict_letter_counts : defaultdict
        position (1-5) -> Counter of the letters at that position among the
        remaining candidates.
    '''

    def __init__(self, df_all_5l_words, list_possible_letters):
        '''
        df_all_5l_words: DataFrame with columns letter_1..letter_5 and word.
        list_possible_letters: letters every position may start out as. It is copied,
        so the caller's list is never mutated when letters are eliminated.
        '''

        # Start with the supplied alphabet as the list of possible letters in the word
        self.possible_letters = list(list_possible_letters)

        # To store guessed letters that are correct, but in the wrong location
        self.dict_misplaced_letters = Counter()

        # Answer space (deep copy, the caller's frame is left untouched)
        self.df_possible_5l_words = df_all_5l_words.copy(deep=True)

        # Dictionary of answers, initialised as empty
        self.dict_letters = defaultdict(str)
        for i in range(5):
            self.dict_letters[i+1] = None

        # Dictionary of letter counts at each position, updated as we play the game.
        # A position with no data scores every letter as 0.
        self.dict_letter_counts = defaultdict(Counter)
        self._refresh_letter_counts()

    def _refresh_letter_counts(self):
        '''Recount the letters at each position over the remaining candidate words.'''
        for i in range(5):
            self.dict_letter_counts[i+1] = Counter(self.df_possible_5l_words[f'letter_{i+1}'])

    def calculate_freq_score(self, letters):
        '''
        Based on a set of letters, sum their frequency score at each position.

        The score of a letter is the number of remaining candidates that have that
        letter in the same position. Input is upper-cased before scoring.
        '''

        score = 0
        for i, l in enumerate(letters.upper()):
            score += self.dict_letter_counts[i+1][l]

        return score

    def guess(self):
        '''
        Given a game state, ranks the remaining candidates by their frequency score.

        The list of possible words is already filtered for correct answers, and for
        letters that fall within the list of possible letters for the remaining
        positions.

        Returns the candidate DataFrame sorted by freq_score, best guess first. The
        sorted frame is also stored back on self.df_possible_5l_words.
        '''

        # First, reset letter counts so scores reflect the current candidates
        self._refresh_letter_counts()

        # A plain loop (rather than np.vectorize) also works when no candidates are left
        scores = [self.calculate_freq_score(word) for word in self.df_possible_5l_words['word']]

        # assign() builds a new frame, so no column is ever written onto a filtered view
        self.df_possible_5l_words = (
            self.df_possible_5l_words
            .assign(freq_score=np.array(scores, dtype=int))
            .sort_values(by='freq_score', ascending=False)
        )
        return self.df_possible_5l_words

    def check_misplaced_letters(self, word):
        '''
        Checks that a word contains at least the misplaced letters in its unsolved positions.

        e.g. If I guess 'GREET' and the E in position 3 is green, the possible words are
        filtered for position 3 == "E". If the E in position 4 is yellow, position 4 != "E"
        is filtered outside of this function, and this function checks that the word has
        at least one "E" in the positions that are still unsolved.

        Returns a boolean, where True implies that the word has AT LEAST the letters
        contained in self.dict_misplaced_letters (having more copies is acceptable).
        '''

        # Indices (1 indexed) of positions that have not yet been solved
        not_solved = [key for key, value in self.dict_letters.items() if value is None]

        # Count the word's letters in those positions (shift by 1, positions are 1 indexed)
        dict_count_letters = Counter(word[i-1] for i in not_solved)

        # Every misplaced letter must be present at least as often as required
        return all(dict_count_letters[check_key] >= check_value
                   for check_key, check_value in self.dict_misplaced_letters.items())

    def update(self, guess, results):
        '''
        Takes a 5 letter guess as a string, and a list of 5 results in the format:
        0 - incorrect
        1 - right letter, wrong place
        2 - right letter, right place

        Updates the game state (possible letters, misplaced letters, answers, candidate
        words and positional letter counts).

        Raises AssertionError if guess or results is not 5 long.

        Doesn't return anything
        '''

        assert len(guess) == 5, 'Guess must be 5 characters long'

        results = list(results)
        assert len(results) == 5, 'Results must contain 5 values'

        # (position, letter, result) triples, positions 1 indexed
        list_guess = list(guess.upper())
        feedback = list(zip(range(1, 6), list_guess, results))

        # Snapshot of the positions solved BEFORE this guess
        solved_before = {pos: letter for pos, letter in self.dict_letters.items() if letter is not None}

        candidates = self.df_possible_5l_words

        # Correct answers
        for pos, letter, result in feedback:
            # Prevent updates for previously solved letters
            if result != 2 or pos in solved_before:
                continue

            self.dict_letters[pos] = letter

            # A newly placed letter satisfies one outstanding 'misplaced' requirement.
            # Entries that reach zero are dropped so that a later grey result for the
            # same letter is no longer mistaken for 'still needed elsewhere'.
            remaining = self.dict_misplaced_letters[letter] - 1
            if remaining > 0:
                self.dict_misplaced_letters[letter] = remaining
            else:
                self.dict_misplaced_letters.pop(letter, None)

            candidates = candidates[candidates[f'letter_{pos}'] == letter]

        # Misplaced letters: the letter cannot be in the guessed position
        yellow_counts = Counter()
        for pos, letter, result in feedback:
            if result == 1:
                yellow_counts[letter] += 1
                candidates = candidates[candidates[f'letter_{pos}'] != letter]

        for letter, n_yellow in yellow_counts.items():
            # A copy at an already solved position that this guess did not reuse can
            # itself be the source of a yellow, so it must not be demanded again in
            # the unsolved positions.
            not_reused = sum(1 for pos, solved in solved_before.items()
                             if solved == letter and list_guess[pos-1] != letter)
            required = n_yellow - not_reused

            # Never weaken a requirement learnt from an earlier guess
            if required > self.dict_misplaced_letters[letter]:
                self.dict_misplaced_letters[letter] = required

        if yellow_counts:
            # Keep words containing at least the required count of the misplaced letters
            keep = candidates['word'].map(self.check_misplaced_letters).astype(bool)
            candidates = candidates[keep]

        # Incorrect letters
        for pos, letter, result in feedback:
            if result != 0:
                continue

            # A grey letter can never be the answer's letter at that same position
            candidates = candidates[candidates[f'letter_{pos}'] != letter]

            # Stop guessing the letter altogether unless it is still needed elsewhere.
            # The membership test makes a repeated grey letter harmless.
            if self.dict_misplaced_letters[letter] <= 0 and letter in self.possible_letters:
                self.possible_letters.remove(letter)

        # Finally, for positions yet to be solved, drop words whose letter is no longer possible
        for pos, letter in self.dict_letters.items():
            if letter is None:
                candidates = candidates[candidates[f'letter_{pos}'].isin(self.possible_letters)]

        # Own the filtered rows, so that later column assignments are safe
        self.df_possible_5l_words = candidates.copy(deep=True)

        # And recalibrate the counter dictionary. dict_most_likely is the name this
        # method wrote to historically; it is kept as an alias of dict_letter_counts.
        self._refresh_letter_counts()
        self.dict_most_likely = self.dict_letter_counts
        
# Start a game over the full answer list with the whole alphabet available
MyGame = Game(df_all_5l_words=df_words_5l, list_possible_letters=alpha_list)

**TODO:** weight the score to avoid guesses with repeated letters, and make sure guesses are only picked from words that contain the misplaced letters.

In [289]:
# Look up the row of the word we will try to solve below
pinch_filter = 'word=="PINCH"'
df_words_5l.query(pinch_filter)

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word
1413,P,I,N,C,H,PINCH


In [290]:
# Fresh game: rank every answer by positional letter frequency and show the five best openers
MyGame = Game(df_all_5l_words=df_words_5l, list_possible_letters=alpha_list)
MyGame.guess().iloc[:5]

Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word,freq_score
1777,S,L,A,T,E,SLATE,1437
1648,S,A,U,C,E,SAUCE,1411
1783,S,L,I,C,E,SLICE,1409
1704,S,H,A,L,E,SHALE,1403
1651,S,A,U,T,E,SAUTE,1398


In [291]:
# Feed back an all-grey result for SLATE, then inspect what is left
MyGame.update(guess='slate', results=[0] * 5)
shape_remaining = MyGame.guess().shape
shape_elder = MyGame.df_possible_5l_words.query('word=="ELDER"').shape
print(shape_remaining, shape_elder)
MyGame.guess().head(5)

(221, 7) (0, 7)


Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word,freq_score
487,C,R,O,N,Y,CRONY,237
442,C,O,R,N,Y,CORNY,225
280,B,R,I,N,Y,BRINY,214
303,B,U,N,N,Y,BUNNY,212
1044,I,R,O,N,Y,IRONY,209


In [292]:
# CRONY: C and N are in the word but misplaced, R, O and Y are absent
crony_results = [1, 0, 0, 1, 0]
MyGame.update(guess='crony', results=crony_results)
shape_remaining = MyGame.guess().shape
shape_elder = MyGame.df_possible_5l_words.query('word=="ELDER"').shape
print(shape_remaining, shape_elder)
MyGame.guess().head(5)

(7, 7) (0, 7)


Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word,freq_score
1493,P,U,N,C,H,PUNCH,27
302,B,U,N,C,H,BUNCH,26
1276,M,U,N,C,H,MUNCH,26
1005,H,U,N,C,H,HUNCH,26
1413,P,I,N,C,H,PINCH,26


In [293]:
# PUNCH: every letter is placed correctly except the U in position 2
punch_results = [2, 0, 2, 2, 2]
MyGame.update(guess='punch', results=punch_results)
shape_remaining = MyGame.guess().shape
shape_elder = MyGame.df_possible_5l_words.query('word=="ELDER"').shape
print(shape_remaining, shape_elder)
MyGame.guess().head(5)

(1, 7) (0, 7)


Unnamed: 0,letter_1,letter_2,letter_3,letter_4,letter_5,word,freq_score
1413,P,I,N,C,H,PINCH,5
