In [1]:
# Imports 
import numpy as np
import gym 
from collections import defaultdict, deque
import re
from stable_baselines3 import PPO

In [3]:
# Get wordle words
# The word list is stored as one comma-separated file. Read it inside a
# context manager so the file handle is closed deterministically instead of
# being left for the garbage collector.
with open("scripts/wordle_words.txt", "r") as words_file:
    raw_words = words_file.read().split(",")

# Strip embedded newlines and drop empty tokens (e.g. from a trailing comma or
# a final newline) so that no zero-length "word" ends up in the action space.
wordle_words = [word for word in (token.replace('\n', '') for token in raw_words) if word]

In [6]:
class WordleSimple(gym.Env): 
    """Simplified Wordle environment for reinforcement learning.

    The action is an index into ``valid_words`` (the guessed word). The
    observation is a 0/1 vector with one entry per valid word, where 1 means
    the word is still consistent with the feedback gathered so far.

    The reward for a guess is the fraction of remaining candidate words it
    eliminated, plus 1 for a winning guess or minus 1 otherwise, with an
    additional minus 1 if the guess had already been ruled out.
    """
    
    def __init__(self, 
                 n_letters: int = 5, 
                 n_guesses: int = 6, 
                 answer: str = None, 
                 valid_words: list = wordle_words, 
                 keep_answer_on_reset: bool = False): 
        """
        :param n_letters: number of letters per word
        :param n_guesses: maximum number of guesses per game
        :param answer: the word to find; drawn at random from ``valid_words`` when None
        :param valid_words: list of words that can be guessed (defines the action space)
        :param keep_answer_on_reset: if True the same answer is reused for every game;
            if False a new answer is drawn by ``reset()`` after each played game
        """
        
        # Store attributes 
        self.n_letters = n_letters
        self.n_guesses = n_guesses
        self.valid_words = valid_words
        self.n_valid_words = len(self.valid_words)
        self.answer = answer if answer is not None else np.random.choice(self.valid_words)
        self.keep_answer_on_reset = keep_answer_on_reset
        
        # Action + Observation Space
        self.action_space = gym.spaces.Discrete(self.n_valid_words)
        self.observation_space = gym.spaces.Box(low = 0, 
                                                high = 1, 
                                                shape = (self.n_valid_words,), 
                                                dtype = int)

        #  self.observation_space = gym.spaces.MultiDiscrete([2] * self.n_valid_words)
        
        # Init Stuff 
        self.state = np.ones(self.n_valid_words, dtype = int)
        self.guess_count = 0
        self.alphabet = list('abcdefghijklmnopqrstuvwxyz')
        self.possible_words = self.valid_words
        self.n_possible_words = len(self.possible_words)
        
        # Outcome (True = win) of the most recent games, used for the win ratio
        self.victory_buffer = deque(maxlen = 100)
        self.win = False
        
    def _compute_reward(self, guess): 
        """
        Scores a guess and narrows down the candidate words.

        Side effect: letters found to be absent from the answer are removed
        from ``self.alphabet`` (restored by ``reset()``).

        :param guess: the guessed word
        :return: tuple ``(reward, won, new_possible_words)`` where ``reward`` is the
            fraction of candidates eliminated by this guess, ``won`` tells whether
            the guess equals the answer and ``new_possible_words`` is the list of
            candidates still consistent with the feedback
        """
        
        # Classify every guessed letter:
        #   green  -> right letter in the right position
        #   yellow -> letter occurs in the answer, but not at this position
        #   gray   -> letter does not occur in the answer at all
        greens = {}
        yellows = {}
        grays = set()
        
        for idx, (guess_letter, answer_letter) in enumerate(zip(guess, self.answer)): 
            
            if guess_letter == answer_letter: 
                greens[idx] = guess_letter
            elif guess_letter in self.answer: 
                yellows[idx] = guess_letter
            else: 
                grays.add(guess_letter)
                
        # Remove gray letters from the alphabet. The result has to be assigned
        # back, otherwise the gray letters never restrict the candidates.
        self.alphabet = sorted(set(self.alphabet) - grays)
        
        # Allowed letters per position
        allowed_letters = []
        for i in range(self.n_letters):

            if i in greens:
                # if green then it should just be that letter as the only option
                allowed_letters.append(greens[i])

            elif i in yellows:
                # if yellow then it's the alphabet minus the letter that can't be there
                allowed_letters.append(''.join(letter for letter in self.alphabet
                                               if letter != yellows[i]))

            else:
                # otherwise just the remaining alphabet
                allowed_letters.append(''.join(self.alphabet))

        # A yellow letter is somewhere in the answer, so candidates must contain it
        required_letters = set(yellows.values())

        # Filter possible words 
        if all(allowed_letters):
            pattern = re.compile(''.join('[' + letters + ']' for letters in allowed_letters))
            new_possible_words = [word for word in self.possible_words
                                  if pattern.match(word) and required_letters.issubset(word)]
        else:
            # An empty character class would be an invalid/misparsed regex;
            # a position without any allowed letter means nothing can match.
            new_possible_words = []

        # Compute reward (guard against an already empty candidate list)
        n_before = len(self.possible_words)
        if n_before > 0:
            reward = (n_before - len(new_possible_words))/n_before
        else:
            reward = 0.0
        
        # Check if won 
        won = bool(guess == self.answer)
            
        return reward, won, new_possible_words
                
    def step(self, action): 
        """
        Plays one guess.

        :param action: index of the guessed word in ``valid_words``
        :return: tuple ``(state, reward, done, info)``; ``done`` is True after a win
            or once ``n_guesses`` guesses were made, ``info`` holds the guess count
            and whether the game was won
        """
        
        # Grab decoded word 
        guess = self.valid_words[action]
        
        # Compute reward
        reward, win, new_possible_words = self._compute_reward(guess)
        
        # Add win/loss penalty
        if win: 
            reward += 1
        else: 
            reward -= 1
        
        # Add possible word penalty (checked against the candidates *before* this guess)
        if guess not in self.possible_words: 
            reward -= 1
        
        # Update state; a set makes the per-word membership test constant time
        remaining = set(new_possible_words)
        self.state = np.array([1 if word in remaining else 0 for word in self.valid_words], dtype=int)
        assert(self.state.shape == self.observation_space.shape), f'{self.state.shape}'
        self.possible_words = new_possible_words
        self.n_possible_words = len(self.possible_words)
        
        # Increment guess count 
        self.guess_count += 1
        
        # Check if done
        done = (win) or (self.guess_count == self.n_guesses)
        
        # Info 
        info = {'guess_count': self.guess_count, 'won': win}
        
        self.win = win
                
        return self.state, reward, done, info
        
    def reset(self): 
        """
        Starts a new game.

        If at least one guess was made since the previous reset, the outcome of
        that game is stored in the victory buffer and, unless
        ``keep_answer_on_reset`` is set, a new answer is drawn. A reset before
        any guess (e.g. the initial reset required before the first step) neither
        records a result nor replaces the answer.

        :return: the initial state (every valid word is still possible)
        """
        
        if self.guess_count > 0:
            self.victory_buffer.append(self.win)
            
            if not self.keep_answer_on_reset:
                self.answer = np.random.choice(self.valid_words)
        
        self.win = False
       
        # Reset possible words = all valid words
        self.possible_words = self.valid_words
        self.n_possible_words = len(self.possible_words)
        
        # Reset alphabet, state and guess count
        self.alphabet = list('abcdefghijklmnopqrstuvwxyz')
        self.state = np.ones(self.n_valid_words, dtype = int)
        self.guess_count = 0
        
        return self.state
    
    def _compute_win_ratio(self):
        """
        Computes the win ratio of games currently in the victory buffer.
        :return: the win ratio, or 0.0 if no game has been recorded yet
        """
        if len(self.victory_buffer) == 0:
            return 0.0
        wins = sum(self.victory_buffer)
        return wins/len(self.victory_buffer)
    
    def do_logging(self, logs, num_games):
        """
        Prints the given values and forwards them to ``self.logger`` if one is set.

        The environment does not create a logger itself; scalar logging is
        skipped unless a ``logger`` attribute providing
        ``log_scalar(value, key, step)`` has been attached to the instance.

        :param logs: dictionary containing values to be logged
        :param num_games: the number of games
        """
        logger = getattr(self, 'logger', None)
        
        print(f"Number of Games: {num_games}")
        for key, value in logs.items():
            print('{} : {}'.format(key, value))
            if logger is not None:
                logger.log_scalar(value, key, num_games)
        print("\n")

In [7]:
# Build the environment with its default settings: 5-letter words, 6 guesses
# and an answer drawn at random from the loaded word list.
env = WordleSimple()

In [None]:
# Build a PPO agent with an MLP policy acting on the Wordle environment.
agent = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    verbose=1,
)

# Note: total_timesteps counts individual guesses (environment steps),
# NOT the number of games played.
agent.learn(total_timesteps=100, log_interval=1)

# Persist the trained agent under the name 'simple'.
agent.save('simple')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
