In [1]:
import json, pathlib, random, time
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing as mp

from environment import Env, validate_against_hint, load_word_lists, construct_word_df


In [2]:
df = construct_word_df(*load_word_lists())

In [3]:


def construct_state_tensor(guesses, history):
        #print(history)
        #so the state is going to be:
            #  The number of green locations we know
            #  The number of other letters we know to be in the word
            #  The sequence number of the guess (1st guess, 2nd guess etc.)

        #the number of locations which were green at some point in the history
        num_green_locs = np.count_nonzero(history.max(axis=0) == 2)

        green_chars = [guesses[x][y] for x,y in np.argwhere(history == 2) ]
        orange_chars = [guesses[x][y] for x,y in np.argwhere(history == 1) ]
        black_chars = [guesses[x][y] for x,y in np.argwhere(history == 0) ]
        num_other_letters = len(set(orange_chars) - set(green_chars))
        num_black_letters = len(set(black_chars))

        sequence_number = int(history.size / 5)
        #print(f'construct_state() with seqno {sequence_number}')

        sequence_number_onehot = np.zeros(Env.num_guesses)
        sequence_number_onehot[sequence_number] = 1.0
        arr = np.concatenate((np.array([num_green_locs, num_other_letters, num_black_letters])/5, sequence_number_onehot))
        return torch.tensor(arr, device=device, dtype=torch.float)
        

    

The aim here is to use a NN to represent the policy, rather than the value function.  We will shrink the action space (ie, so that we have a few actions, rather than 12000).  This will remove the model's ability to learn novel strategies, rather it will just be learning when to employ the different strategies (actions) that I give it.  Start w
ith these 3 word selection tactics:

1. choose words which match the current history
1. choose words which contain the greatest number of new letters
1. choose words which have the highest frequency score

then we will construct 6 actions by choosing every possible order of these strategies
1. 1,2,3
1. 1,3,2
1. 2,1,3
1. 2,3,1
1. 3,1,2
1. 3,2,1

for all these actions there may be multiple words, so sample a random one.  The policy then becomes a logistic regressor which selects one of these actions to execute.  The loss to train the regressor will be derived using the policy gradiet theorem.


In [4]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count, permutations

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical


# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

#plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def clear(self):
        self.memory.clear()
        
    def __len__(self):
        return len(self.memory)
    

In [6]:
class PolicyNetNN(nn.Module):

    def __init__(self, num_inputs, num_actions):
        super(PolicyNetNN, self).__init__()
        self.fc1 = nn.Linear(num_inputs, 20)
        self.head = nn.Linear(20, num_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        return F.softmax(self.head(x))

In [7]:
class PolicyNetLinear(nn.Module):

    def __init__(self, num_inputs, num_actions):
        super(PolicyNetLinear, self).__init__()
        self.head = nn.Linear(num_inputs, num_actions)
        #print(f'PolicyNetLinear {num_inputs}, {num_actions}')

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        print(f'forward() {x.shape}')
        x = x.to(device)
        return F.softmax(self.head(x), dim=0)

In [8]:
#define the word-selection tactics
n_state_features = 9

class PolicyHelper:
    def __init__(self, env):
        self.env = env
        actions = [[env.find_words_matching_current_history]]
        actions.append([env.find_words_matching_current_history, env.find_words_with_highest_freq_score])
        actions.append([env.find_words_matching_current_history, env.find_words_with_most_new_letters])
        actions.append([env.find_words_matching_current_history, env.find_words_with_most_new_letters, env.find_words_with_highest_freq_score])
        actions.append([env.find_words_with_most_new_letters, env.find_words_with_highest_freq_score])
        actions.append([env.find_words_with_most_new_letters])
        
        self.actions = actions.copy()
        for a in actions:
                self.actions.append([env.find_target_words] + a)
                
        self.num_actions = len(self.actions)
        #self.net = PolicyNetLinear(n_state_features, len(self.actions))
        
    def perform_action(self, action_idx):
        tactic_tuple = self.actions[action_idx]
        df = self.env.df
        for tactic in tactic_tuple: # apply all the tactics in the given order
            newdf = tactic(df)
            if not newdf.empty: #if that tactic produced no results, then quit
                df = newdf
        return df.sample()['word'][0] # then pick a random word from what is left
    
    

In [9]:





def plot_values(vals, axes=['duration', 'episode']):
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel(axes[1])
    plt.ylabel(axes[0])
    plt.plot(np.array(vals))
    # Take 20 episode averages and plot them too
    window_width = 20
    if len(vals) >= window_width:
        cumsum_vec = np.cumsum(np.insert(vals, 0, 0)) 
        ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
        plt.plot(np.insert(ma_vec, 0, [None]*int(window_width/2)))

    plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    
def plot_all(episode_durations, episode_rewards, losses, epsilons, gammas):
    plot_values(episode_durations, axes=['duration', 'episode'])
    plot_values(episode_rewards, axes=['reward', 'episode'])
    if losses: plot_values(losses, axes=['loss', 'step'])
    if epsilons: plot_values(epsilons, axes=['epsilon', 'step'])
    if gammas: plot_values(gammas, axes=['gamma', 'step'])
    #plt.ioff()
    plt.show()

In [43]:
def optimize_model(model, optimizer, memory, batch_size=128):
    transitions = memory.sample(batch_size)

    state_batch = torch.stack([tr.state for tr in transitions])
    action_batch = torch.stack([tr.action for tr in transitions])
    reward_batch = np.stack([tr.reward for tr in transitions])
    
    
    # calculate gradient
    probs = model(state_batch)
    sampler = Categorical(probs)
    print(f'optimize_model() {action_batch.shape}\n{reward_batch}')
    log_probs = -sampler.log_prob(action_batch)   # "-" because it was built to work with gradient descent, but we are using gradient ascent
    
    print(f'optimize_model() {log_probs}')
    pseudo_loss = torch.sum(log_probs * reward_batch) # loss that when differentiated with autograd gives the gradient of J(θ)
    # update policy weights
    optimizer.zero_grad()
    pseudo_loss.backward()
    optimizer.step()
    
    return pseudo_loss

class TrainConfig():
    def __init__(self, train_interval=128, batch_size=128, clear_memory=False, lr=0.01):
        self.train_interval = train_interval
        self.batch_size = batch_size
        self.clear_memory = clear_memory
        self.lr = lr
        
class ValueConfig():
    def __init__(self, name='reward', gamma=[0.9, 0.05, 200]):
        self.name = name
        self.gamma = gamma
        
class ModelConfig():
    def __init__(self, name='naive', startword=None, target_list_only=None):
        self.name = name
        self.startword = startword
        self.target_list_only = target_list_only

In [51]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
def run_experiment(model=ModelConfig(name='naive', startword=None, target_list_only=False),
                   num_episodes=128,
                   eps=[0.9, 0.05, 200],
                   value_function=ValueConfig(name='reward',gamma=[0.0, 1.0, 200]),
                   training=TrainConfig(clear_memory=False, batch_size=128, train_interval=128)):
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    GAMMA_START, GAMMA_END, GAMMA_DECAY = value_function.gamma
    env = Env(df)
    memory = ReplayMemory(10000)
    print('run_experiment()')
    starting_state = construct_state_tensor(env.guesses, env.history)

    print('constructed state tensor')
    steps_done = 0
    last_training = 0
    losses = []
    episode_rewards = []
    episode_durations = []
    epsilons = []
    gammas = []
    
    policy_helper = PolicyHelper(env)
    
    print('made policy helper')
    
    if model.name == 'linear':
        policy_net = PolicyNetLinear(n_state_features, len(policy_helper.actions)).to(device)
        optimizer = optim.Adam(policy_net.parameters(), lr=training.lr)
    else:
        policy_net = PolicyNetNN(n_state_features, len(policy_helper.actions)).to(device)
        optimizer = optim.Adam(policy_net.parameters(), lr=training.lr)

    print(f'pn params {list(policy_net.parameters())}')
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        print(f'=========================episode {i_episode} {env.target}======================')

        episode_memory = []
        state = starting_state
        guesses = []
        for t in count():
            GAMMA = GAMMA_END + (GAMMA_START - GAMMA_END) * math.exp(-1. * steps_done / GAMMA_DECAY)
            gammas.append(GAMMA)
            steps_done += 1
            # Select and perform an action
            #print(state, actions)
            probs = policy_net(state)
            sampler = Categorical(probs)
            action_idx = sampler.sample()
            chosen_word = policy_helper.perform_action(action_idx)
            guesses.append(chosen_word)
            print(f'------guess {t} {action_idx} {guesses[-1]}-------')
            history, reward, done = env.step(chosen_word)
            #here next_state == env.history
            if not done:
                next_state = construct_state_tensor(guesses, history)
            
            #action_tensor = action_idx.clone().detach()
            action = F.one_hot(action_idx, num_classes=policy_helper.num_actions)
                
            print(f'reward {reward} done {done} action shape {action_idx.__class__} {action.shape} {action}')
            #reward = np.array([reward])

            # Store the transition in memory
            #memory.push(state, action_idx, reward)
            episode_memory.append([state, action, reward])
            # Move to the next state
            state = next_state

            if done:
                episode_durations.append(t + 1)
                episode_reward = sum([tr[2] for tr in episode_memory])
                print(f'episode {i_episode} finished.  reward {episode_reward}  eps {eps}  gamma {GAMMA}  steps {steps_done}  memory {len(memory)}')
                episode_rewards.append(episode_reward)
                
                for tr in episode_memory:
                    print(f'pushing episode_reward {episode_reward} {episode_reward.__class__}')
                    memory.push(tr[0], tr[1], episode_reward)
                    episode_reward -= tr[2]
                
                # If we have gathered enough data, Perform one step of the optimization (on the policy network)
                if len(memory) >= training.batch_size \
                    and steps_done - last_training > training.train_interval:
                    loss = optimize_model(policy_net, optimizer, memory, batch_size=training.batch_size)
                    losses.append(loss)
                    if training.clear_memory: memory.clear()
                    last_training = steps_done
                #plot_durations()
                break

    print('Complete')
    
    return episode_durations, episode_rewards, losses, epsilons, gammas

#env.render()
#env.close()
#plt.ioff()
#plt.show()

In [52]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))

run_experiment()
constructed state tensor
made policy helper
pn params [Parameter containing:
tensor([[-0.0025,  0.1788, -0.2743, -0.2453, -0.1284,  0.0894, -0.0066,  0.2643,
         -0.0296],
        [ 0.0882, -0.1007, -0.0655, -0.3184, -0.2208, -0.1374,  0.0123,  0.1318,
          0.2000],
        [-0.2260, -0.1452,  0.1211,  0.2768, -0.0686,  0.2494, -0.0537,  0.0353,
          0.3018],
        [-0.3092, -0.2098, -0.0844, -0.1299,  0.2880, -0.2161, -0.1534, -0.2329,
         -0.3122],
        [-0.1946,  0.2865,  0.1487,  0.1616,  0.0175, -0.1709,  0.0564, -0.3112,
         -0.2409],
        [-0.1718,  0.2103,  0.1954, -0.1478, -0.0120,  0.2132,  0.3314,  0.1323,
          0.0450],
        [ 0.2235, -0.1963,  0.0621, -0.2584, -0.2310, -0.1722,  0.1508,  0.1341,
         -0.1975],
        [ 0.1007,  0.1830, -0.0421,  0.0127,  0.0772,  0.2068,  0.3201, -0.2569,
         -0.1222],
        [ 0.1310,  0.2762,  0.2901,  0.2941,  0.0663, -0.2899,  0.0307, -0.2085,
         -0.3107],
      

reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 1 6 strap-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 2 2 syrah-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 3 5 clung-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 4 10 media-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])
forward() torch.Size([9])
------guess 5 11 borax-------
reward -1.0 done True action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

------guess 1 10 salon-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])
forward() torch.Size([9])
------guess 2 0 inbye-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 3 11 dutch-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
forward() torch.Size([9])
------guess 4 5 swamp-------
reward -1.0 done False action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
forward() torch.Size([9])
------guess 5 8 twine-------
reward 0.0 done True action shape <class 'torch.Tensor'> torch.Size([12]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])
episode 13 finished.  reward -5.0  eps [0.0, 0.0, 400]  gamma 0.0  steps 68  memory 62
pushing episode_reward -5.0 <class 'float'>
pushing episode

ValueError: Value is not broadcastable with batch_shape+event_shape: torch.Size([64, 12]) vs torch.Size([64]).

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.03)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.1)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.3, 0.3, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.6, 0.6, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))