In [1]:
import json, pathlib, random, time
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing as mp

from environment import Env, validate_against_hint, load_word_lists, construct_word_df


In [2]:
# Build the module-level word table: the word lists returned by load_word_lists()
# are passed positionally to construct_word_df(). Functions below (word_to_action,
# construct_action_vectors_global, run_experiment) read this global `df`.
df = construct_word_df(*load_word_lists())

In [3]:

def word_to_action(word, guesses, history):
    """Build the action feature vector for `word`.

    Looks the word up by label in the global `df` and delegates to
    dfword_to_action() with the resulting (word, row) pair.
    """
    word_row = df.loc[word]
    return dfword_to_action((word, word_row), guesses, history)
    
def dfword_to_action(dfword, guesses, history):
    """Encode a candidate word as the 5-element action vector fed to the model.

    Args:
        dfword: a (word, row) pair, e.g. as yielded by DataFrame.iterrows(). The
            row must provide 'freq_score', 'uniq_score' and 'is_guess_word'.
        guesses: the words guessed so far (may be empty).
        history: indexable by guess position; history[i] is the hint for guesses[i].

    Returns:
        np.ndarray with, in this order:
          0. fraction of earlier guesses for which
             validate_against_hint(word, guess, hint) is truthy (1.0 before any guess)
          1. number of distinct letters not present in any earlier guess, divided by 5
          2. row['freq_score']
          3. row['uniq_score']
          4. row['is_guess_word']
    """
    candidate, row = dfword[0], dfword[1]

    if guesses:
        matches = 0
        for idx, past_guess in enumerate(guesses):
            matches += int(validate_against_hint(candidate, past_guess, history[idx]))
        conforms_to_history = matches / len(guesses)
    else:
        # Nothing has been guessed yet, so there is no hint to contradict.
        conforms_to_history = 1.0

    tried_letters = set(''.join(guesses))
    num_untried_letters = len(set(candidate) - tried_letters) / 5  # normalise to 1

    features = [
        conforms_to_history,
        num_untried_letters,
        row['freq_score'],
        row['uniq_score'],
        row['is_guess_word'],
    ]
    return np.array(features)
    

def construct_action_vectors_global(arg): #guesses, history, start_idx, end_idx):
    """Pool worker: build action vectors for one positional slice of the global `df`.

    Args:
        arg: a single (guesses, history, start_idx, end_idx) tuple, packed into
            one argument so the function can be mapped over a list of work items.

    Returns:
        np.ndarray with one dfword_to_action() row per word in
        df.iloc[start_idx:end_idx].
    """
    # The unused `st = time.time()` timing local was removed; nothing read it.
    guesses, history, start_idx, end_idx = arg
    rows = df.iloc[start_idx:end_idx].iterrows()
    return np.array([dfword_to_action(dfword, guesses, history) for dfword in rows])
           
def construct_action_vectors(guesses, history, df):
    """Build the action vector of every row of `df` in the current process.

    Returns an array with one dfword_to_action() row per row of `df`, in
    iteration order. Note that the `df` parameter shadows the module-level table.
    """
    vectors = []
    for dfword in df.iterrows():
        vectors.append(dfword_to_action(dfword, guesses, history))
    return np.array(vectors)
    
# Leave one core free, but never drop below one worker: cpu_count() - 1 is 0 on a
# single-core machine, which would break both the chunk-size division and Pool().
NUM_PROCESSES = max(1, mp.cpu_count() - 1)


def construct_action_vectors_mp(guesses, history, df):
    """Build the action vector of every word, spread over a process pool.

    The rows are split into at most NUM_PROCESSES contiguous index ranges and each
    range is handed to construct_action_vectors_global().

    Args:
        guesses: the words guessed so far.
        history: the hints received for those guesses.
        df: the word table; only its length is used here to compute the ranges.
            NOTE(review): the workers read the module-level `df`, not this
            argument, so both are presumably expected to be the same table - confirm.

    Returns:
        np.ndarray of the per-range results concatenated in row order. An empty
        table gives an empty array, as construct_action_vectors() does.
    """
    num_rows = len(df)
    if num_rows == 0:
        # No work items: np.concatenate([]) would raise, and no pool is needed.
        return np.array([])

    chunk_size = int(num_rows / NUM_PROCESSES) + 1
    start_offsets = list(range(0, num_rows, chunk_size))
    end_offsets = start_offsets[1:] + [num_rows]
    grp_lst_args = [(guesses, history, start, end)
                    for start, end in zip(start_offsets, end_offsets)]

    # The context manager releases the workers even if map() raises.
    with mp.Pool(processes=NUM_PROCESSES) as pool:
        results = pool.map(construct_action_vectors_global, grp_lst_args)
    return np.concatenate(results)
    
    
def construct_state_vector(guesses, history):
    """Summarise the game so far as a fixed-length state vector.

    Args:
        guesses: the words guessed so far; guesses[row][col] is the letter a hint
            cell refers to.
        history: numpy array of hint codes, one row per guess; cells equal to
            2 / 1 / 0 are counted as green / orange / black below.

    Returns:
        1-D array made of three counts divided by 5:
          * columns that were green (2) in at least one guess
          * distinct orange letters that were never green
          * distinct black letters
        followed by a one-hot vector of length Env.num_guesses marking how many
        guesses have been made (history.size / 5).
    """
    def letters_with_hint(hint_code):
        # Distinct letters whose hint cell equals `hint_code`.
        return {guesses[row][col] for row, col in np.argwhere(history == hint_code)}

    # A column counts as known once any guess has scored green there.
    green_locations = np.count_nonzero(history.max(axis=0) == 2)

    green_letters = letters_with_hint(2)
    orange_letters = letters_with_hint(1)
    black_letters = letters_with_hint(0)

    counts = np.array([
        green_locations,
        len(orange_letters - green_letters),
        len(black_letters),
    ]) / 5

    guess_slot = np.zeros(Env.num_guesses)
    guess_slot[int(history.size / 5)] = 1.0
    return np.concatenate((counts, guess_slot))


    

   
The state is going to be:
* The number of green locations we know
* The number of other letters we know to be in the word
* The number of letters we know not to be in the word
* The sequence number of the guess (1st guess, 2nd guess, etc.), one-hot encoded

The action is going to be the word that we will submit next.
For the purposes of feeding it into the model, we will represent the action word as:
* the fraction of the entries in the hint history that it conforms to
* how many new (untried) letters it gives us
* the number of unique letters in the word
* the frequency of the letters in the word
* whether or not the word is in the guess list (as opposed to the target list)

The reward is going to be:
* the score improvement (if any) gained on the last guess
* the score is calculated as 2 * num_green_letters + num_orange_letters

In [4]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# set up matplotlib
# is_ipython is True when the active matplotlib backend name contains 'inline';
# only then is IPython's display module imported.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

#plt.ion()

# Run on the GPU when CUDA is available, otherwise on the CPU. The models and
# every tensor built below are placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# One stored experience: the state vector, the chosen action vector and its reward.
Transition = namedtuple('Transition', ['state', 'action', 'reward'])


class ReplayMemory:
    """Bounded FIFO store of Transition records with uniform random sampling."""

    def __init__(self, capacity):
        # Once `capacity` entries are held, appending drops the oldest one.
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        return len(self.memory)

    def push(self, *args):
        """Save a transition built from (state, action, reward)."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def clear(self):
        """Drop every stored transition."""
        self.memory.clear()
    

In [6]:
# Width of the network input: one row is a state vector followed by an action vector.
# n_action_features matches the 5-element array built by dfword_to_action().
# n_state_features must equal the length of construct_state_vector()'s result,
# i.e. 3 counts plus Env.num_guesses one-hot slots (presumably 6 - confirm in Env).
n_action_features = 5
n_state_features = 9
n_input_features = n_action_features + n_state_features


def select_action(policy_net, state, actions, eps_threshold):
    """Epsilon-greedy choice of the next action, returned as a row index.

    Args:
        policy_net: callable mapping an (n, state + action features) float tensor
            to one value estimate per row.
        state: 1-D state vector shared by all candidate actions.
        actions: (n, action features) array with one row per candidate word.
        eps_threshold: exploration threshold; a random index is returned unless
            a uniform draw from [0, 1) exceeds it.

    Returns:
        int index into `actions`.
    """
    draw = random.random()
    if not (draw > eps_threshold):
        # Explore: pick any candidate uniformly.
        randindex = random.randrange(len(actions))
        print(f'returning random index {randindex}')
        return randindex

    # Exploit: score every (state, action) row and take the best one.
    with torch.no_grad():
        num_actions = actions.shape[0]
        # Copy the state once per candidate so it can sit next to each action row.
        tiled_state = np.repeat(np.asanyarray(state)[np.newaxis], num_actions, axis=0)
        network_input = np.concatenate((tiled_state, actions), axis=1)
        estimate = policy_net(torch.tensor(network_input, device=device, dtype=torch.float))
        # The maximum over rows (dim 0) yields the index of the best-valued row.
        _, best_row = estimate.max(0)
        return best_row.item()



def plot_values(vals, axes=('duration', 'episode')):
    """Plot a series and, once it is long enough, its 20-point moving average.

    Args:
        vals: sequence of numbers; single-element torch tensors (for example
            loss values) are accepted as well.
        axes: (y label, x label) pair. The default is an immutable tuple so it
            cannot be modified between calls.
    """
    # float() unwraps single-element tensors; np.array() on autograd-tracked
    # tensors would raise instead.
    series = np.array([float(v) for v in vals])

    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel(axes[1])
    plt.ylabel(axes[0])
    plt.plot(series)
    # Take 20 episode averages and plot them too
    window_width = 20
    if len(series) >= window_width:
        cumsum_vec = np.cumsum(np.insert(series, 0, 0))
        ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
        # Pad with NaN (not drawn) to shift the average right by half a window.
        padding = np.full(window_width // 2, np.nan)
        plt.plot(np.concatenate((padding, ma_vec)))

    plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    
def plot_all(episode_durations, episode_rewards, losses, epsilons, gammas):
    """Plot every series returned by run_experiment(), then show the figure.

    Durations and rewards are always plotted; losses, epsilons and gammas are
    plotted only when they are non-empty.
    """
    always_plotted = [
        (episode_durations, 'duration', 'episode'),
        (episode_rewards, 'reward', 'episode'),
    ]
    plotted_if_present = [
        (losses, 'loss', 'step'),
        (epsilons, 'epsilon', 'step'),
        (gammas, 'gamma', 'step'),
    ]

    for series, y_label, x_label in always_plotted:
        plot_values(series, axes=[y_label, x_label])
    for series, y_label, x_label in plotted_if_present:
        if series:
            plot_values(series, axes=[y_label, x_label])
    #plt.ioff()
    plt.show()

In [7]:
class DQN(nn.Module):
    """Small fully connected value network: inputs -> 20 -> 16 -> 20 -> 1."""

    def __init__(self, inputs):
        super().__init__()
        self.fc1 = nn.Linear(inputs, 20)
        self.fc2 = nn.Linear(20, 16)
        self.fc3 = nn.Linear(16, 20)
        self.head = nn.Linear(20, 1)

    def forward(self, x):
        """Return one value estimate per input row (last dimension of size 1).

        Called with a batch of concatenated state/action rows, both when
        choosing an action and during optimisation.
        """
        hidden = x.to(device)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.head(hidden)

In [8]:
class LinearQ(nn.Module):
    """Linear value model: a single affine layer from the input features to one value."""

    def __init__(self, inputs):
        super().__init__()
        self.head = nn.Linear(inputs, 1)

    def forward(self, x):
        """Return one value estimate per input row (last dimension of size 1).

        Called with a batch of concatenated state/action rows, both when
        choosing an action and during optimisation.
        """
        return self.head(x.to(device))

In [9]:
def optimize_model(model, optimizer, memory, batch_size=128):
    """Run one optimisation step on a random minibatch drawn from `memory`.

    Each sampled transition contributes one input row (state followed by action)
    and one regression target (its stored reward); the model is fitted to the
    targets with the Huber (smooth L1) loss.

    Args:
        model: network mapping (batch, features) rows to (batch, 1) estimates.
        optimizer: torch optimizer over the model's parameters.
        memory: object whose sample(batch_size) returns transitions exposing
            .state, .action and .reward.
        batch_size: number of transitions to sample.

    Returns:
        The scalar loss as a tensor detached from the autograd graph, so callers
        can store it without keeping the graph alive.
    """
    transitions = memory.sample(batch_size)

    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    state_batch = np.stack([tr.state for tr in transitions])
    action_batch = np.stack([tr.action for tr in transitions])
    reward_batch = np.stack([tr.reward for tr in transitions])
    state_action_batch = np.concatenate((state_batch, action_batch), axis=1)

    # Value estimate of every sampled (state, action) pair.
    state_action_value_estimates = model(torch.tensor(state_action_batch, device=device, dtype=torch.float))

    expected_state_action_values = torch.tensor(reward_batch, device=device, dtype=torch.float)
    print('loss shapes')
    print(state_action_value_estimates.shape)
    print(expected_state_action_values.shape)
    # Pin the targets to the estimates' shape: a flat (batch,) target against
    # (batch, 1) estimates would otherwise broadcast to (batch, batch) silently.
    expected_state_action_values = expected_state_action_values.reshape(
        state_action_value_estimates.shape)

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_value_estimates, expected_state_action_values)

    print(f'loss {loss}')

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    #for param in model.parameters():
        #param.grad.data.clamp_(-1, 1)
    optimizer.step()

    return loss.detach()

class TrainConfig:
    """Training hyper-parameters read by run_experiment().

    Attributes (all plain values stored as given):
        train_interval: steps that must elapse between optimisation steps.
        batch_size: minibatch size, also the minimum memory size before training.
        clear_memory: whether the replay memory is emptied after each optimisation step.
        lr: learning rate passed to the optimizer.
    """

    def __init__(self, train_interval=128, batch_size=128, clear_memory=False, lr=0.01):
        self.train_interval, self.batch_size = train_interval, batch_size
        self.clear_memory, self.lr = clear_memory, lr
        
class ValueConfig():
    """Value-function settings read by run_experiment().

    Attributes:
        name: label of the value function.
        gamma: [start, end, decay] triple of the gamma schedule. Stored as a
            fresh list so that changing one instance's schedule can never alter
            the default (or the caller's sequence) for other instances.
    """

    def __init__(self, name='reward', gamma=(0.9, 0.05, 200)):
        self.name = name
        # Copy: the default must not be shared by reference between instances.
        self.gamma = list(gamma)
        
class ModelConfig:
    """Model selection settings read by run_experiment().

    Attributes (stored as given):
        name: model kind; run_experiment() builds LinearQ for 'linear' and DQN otherwise.
        startword: stored only; not read anywhere in the visible code.
        target_list_only: stored only; not read anywhere in the visible code.
    """

    def __init__(self, name='naive', startword=None, target_list_only=None):
        self.name, self.startword = name, startword
        self.target_list_only = target_list_only

In [10]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
def run_experiment(model=ModelConfig(name='naive', startword=None, target_list_only=False),
                   num_episodes=128,
                   eps=[0.9, 0.05, 200],
                   value_function=ValueConfig(name='reward',gamma=[0.0, 1.0, 200]),
                   training=TrainConfig(clear_memory=False, batch_size=128, train_interval=128),
                   seed=0,
                   run_test=False):
    """Train a value network to play the word game and optionally evaluate it.

    Each episode plays one game with epsilon-greedy action selection. When the
    game ends, every guess is stored in the replay memory with a shaped reward:
    the first guess gets its hint score (sum of its hint row), each later guess
    gets the improvement of its score over the best earlier score, floored at 0.
    The network is then optimised if enough transitions and steps have accumulated.

    Args:
        model: ModelConfig; name 'linear' selects LinearQ, anything else DQN.
        num_episodes: number of training games.
        eps: [start, end, decay] of the exponentially decaying exploration rate.
        value_function: ValueConfig; its gamma triple drives a schedule that is
            only recorded and printed here.
        training: TrainConfig with train_interval, batch_size, clear_memory, lr.
        seed: seed applied to torch, random and numpy.
        run_test: when true, afterwards play greedily once for every item of
            env.foreach_target_word() and print a histogram of the number of
            guesses needed (index 0 counts failed games).

    Returns:
        (episode_durations, episode_rewards, losses, epsilons, gammas); losses
        holds plain floats, one per optimisation step.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    EPS_START = eps[0]
    EPS_END = eps[1]
    EPS_DECAY = eps[2]
    GAMMA_START, GAMMA_END, GAMMA_DECAY = value_function.gamma
    env = Env(df)
    memory = ReplayMemory(10000)
    # The opening position is the same for every game, so compute it once.
    starting_actions = construct_action_vectors(env.guesses, env.history, env.df)
    starting_state = construct_state_vector(env.guesses, env.history)

    steps_done = 0
    last_training = 0
    losses = []
    episode_rewards = []
    episode_durations = []
    epsilons = []
    gammas = []

    if model.name == 'linear':
        policy_net = LinearQ(n_input_features).to(device)
    else:
        policy_net = DQN(n_input_features).to(device)
    optimizer = optim.RMSprop(policy_net.parameters(), lr=training.lr)

    print(f'pn params {list(policy_net.parameters())}')
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        print(f'=========================episode {i_episode} {env.target}======================')

        episode_memory = []
        state = starting_state
        actions = starting_actions
        guesses = []
        for t in count():
            # `epsilon` (not `eps`) so the schedule parameter is not shadowed.
            epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
            GAMMA = GAMMA_END + (GAMMA_START - GAMMA_END) * math.exp(-1. * steps_done / GAMMA_DECAY)

            epsilons.append(epsilon)
            gammas.append(GAMMA)
            steps_done += 1
            # Select and perform an action
            action_idx = select_action(policy_net, state, actions, epsilon)
            selected_action = actions[action_idx]
            guesses.append(env.word_from_index(action_idx))
            print(f'------guess {t} {action_idx} {guesses[-1]} {selected_action}-------')

            history, reward, done = env.step_by_index(action_idx)
            if done:
                # Terminal step: there is no successor state. Binding the name
                # here avoids an UnboundLocalError when the very first guess of
                # the first episode ends the game.
                next_state = None
            else:
                next_state = construct_state_vector(guesses, history)
                actions = construct_action_vectors_mp(guesses, history, env.df)

            print(f'reward {reward} done {done} ')

            # Keep the transition until the episode ends; rewards are reshaped then.
            episode_memory.append([state, selected_action, reward])
            # Move to the next state
            state = next_state

            if done:
                episode_durations.append(t + 1)
                scores = history.sum(axis=1) # calc the score for each guess from the history
                episode_reward = sum(tr[2] for tr in episode_memory)
                print(f'episode {i_episode} finished.  reward {episode_reward}  eps {epsilon}  gamma {GAMMA}  steps {steps_done}  memory {len(memory)}')
                episode_rewards.append(episode_reward)

                # Shaped reward: first guess keeps its score, later guesses get
                # max(0, own score - best earlier score).
                for i,tr in enumerate(episode_memory):
                    if i == 0:
                        score = scores[i]
                    else:
                        score = max(0, scores[i] - scores[0:i].max())
                    print(f'{guesses[i]} {history[i]} {score}')

                    memory.push(tr[0], tr[1], [score])

                # If we have gathered enough data, Perform one step of the optimization (on the policy network)
                if len(memory) >= training.batch_size \
                    and steps_done - last_training > training.train_interval:
                    loss = optimize_model(policy_net, optimizer, memory, batch_size=training.batch_size)
                    # Store a plain float: a graph-attached tensor cannot be
                    # converted by numpy for plotting and keeps its graph alive.
                    losses.append(float(loss))
                    if training.clear_memory: memory.clear()
                    last_training = steps_done
                break

    print('Complete')

    if run_test:
        # Index = number of guesses needed; index 0 counts games that were lost.
        performance_hist = [0] * (1 + env.num_guesses)
        for e in env.foreach_target_word():
            state = starting_state
            actions = starting_actions
            done = False
            reward = 0
            guesses = []
            while not done:
                # Threshold 0: purely greedy play.
                action_idx = select_action(policy_net, state, actions, 0)
                guesses.append(env.word_from_index(action_idx))
                history, reward, done = env.step_by_index(action_idx)

                if not done:
                    state = construct_state_vector(guesses, history)
                    actions = construct_action_vectors_mp(guesses, history, env.df)

            # All guesses used and the last one still penalised: the game was lost.
            if len(guesses) == env.num_guesses and reward == -1:
                num_guesses = 0
            else:
                num_guesses = len(guesses)
            print(f'{e.target} {num_guesses}')
            performance_hist[num_guesses] += 1

        for i,p in enumerate(performance_hist):
            print(f'{i}: {p}')

    return episode_durations, episode_rewards, losses, epsilons, gammas

#env.render()
#env.close()
#plt.ioff()
#plt.show()

In [None]:

# Train the linear model for 256 episodes, evaluate it afterwards (run_test=True),
# and plot the returned series. Both the start and end epsilon are 0.0, so
# select_action() only explores on a uniform draw of exactly 0.
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=256,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07),
    seed=0,
    run_test=True
    ))

pn params [Parameter containing:
tensor([[-0.0020,  0.1434, -0.2200, -0.1967, -0.1029,  0.0717, -0.0053,  0.2119,
         -0.0237,  0.0707, -0.0808, -0.0525, -0.2553, -0.1770]],
       requires_grad=True), Parameter containing:
tensor([-0.1102], requires_grad=True)]
------guess 0 12621 jumpy [1.         1.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 1 11041 gawky [1.         0.8        0.34387672 0.         0.        ]-------
reward -1 done False 
------guess 2 12621 jumpy [0.5        0.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 3 11041 gawky [0.66666667 0.         0.34387672 0.         0.        ]-------
reward -1 done False 
------guess 4 12621 jumpy [0.5        0.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 5 11041 gawky [0.6        0.         0.34387672 0.         0.        ]-------
reward -1 done True 
episode 0 finished.  reward -6  eps 0.0  gamma 0.0  steps 6  

reward -1 done False 
------guess 3 11041 gawky [0.66666667 0.         0.34387672 0.         0.        ]-------
reward -1 done False 
------guess 4 12621 jumpy [0.5        0.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 5 11041 gawky [0.6        0.         0.34387672 0.         0.        ]-------
reward -1 done True 
episode 8 finished.  reward -6  eps 0.0  gamma 0.0  steps 54  memory 48
jumpy [0. 0. 0. 0. 2.] 2.0
gawky [0. 0. 0. 0. 2.] 0
jumpy [0. 0. 0. 0. 2.] 0
gawky [0. 0. 0. 0. 2.] 0
jumpy [0. 0. 0. 0. 2.] 0
gawky [0. 0. 0. 0. 2.] 0
------guess 0 12621 jumpy [1.         1.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 1 11260 quick [1.         0.8        0.30072993 0.         0.        ]-------
reward -1 done False 
------guess 2 12621 jumpy [0.5        0.         0.25985401 0.         0.        ]-------
reward -1 done False 
------guess 3 11260 quick [0.66666667 0.         0.30072993 0.         0.        ]-

reward -1 done False 
------guess 3 12794 mamma [0.         0.         0.47137064 1.         0.        ]-------
reward -1 done False 
------guess 4 12794 mamma [0.         0.         0.47137064 1.         0.        ]-------
reward -1 done False 
------guess 5 12794 mamma [0.         0.         0.47137064 1.         0.        ]-------
reward -1 done True 
episode 18 finished.  reward -6  eps 0.0  gamma 0.0  steps 103  memory 97
mamma [0. 1. 0. 2. 0.] 3.0
mamma [0. 1. 0. 2. 0.] 0
mamma [0. 1. 0. 2. 0.] 0
mamma [0. 1. 0. 2. 0.] 0
mamma [0. 1. 0. 2. 0.] 0
mamma [0. 1. 0. 2. 0.] 0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 0.49001291394233704
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 1610 chuse [1.         0.6        0.52473642 0.         1.        ]-------
reward -1 done False 
-----

reward -1 done False 
------guess 1 11563 poser [1.         0.6        0.63617194 0.         0.        ]-------
reward -1 done False 
------guess 2 11382 nicer [1.         0.6        0.62530414 0.         0.        ]-------
reward -1 done False 
------guess 3 11751 buyer [1.         0.6        0.53609084 0.         0.        ]-------
reward -1 done False 
------guess 4 11806 dryer [1.         0.2        0.6243309  0.33333333 0.        ]-------
reward 0 done True 
episode 30 finished.  reward -4  eps 0.0  gamma 0.0  steps 151  memory 146
later [0. 0. 0. 2. 2.] 4.0
poser [0. 0. 0. 2. 2.] 0
nicer [0. 0. 0. 2. 2.] 0
buyer [0. 0. 2. 2. 2.] 2.0
dryer [2. 2. 2. 2. 2.] 4.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11527 ocean [1.         0.6        0.65174371 0.         0.        ]-------
reward -1 done False 
------guess 2 10697 adobe [1.         0.4        0.59042985 0.         0.        ]-------
reward -1 do

reward -1 done False 
------guess 1 11147 scorn [1.         0.8        0.54728305 0.         0.        ]-------
reward -1 done False 
------guess 2 12244 curio [1.         0.4        0.53008921 0.         0.        ]-------
reward -1 done False 
------guess 3 11956 rocky [1.         0.4        0.44849959 0.         0.        ]-------
reward 0 done True 
episode 41 finished.  reward -3  eps 0.0  gamma 0.0  steps 198  memory 194
later [0. 0. 0. 0. 1.] 1.0
scorn [0. 1. 1. 1. 0.] 2.0
curio [1. 0. 1. 0. 1.] 0
rocky [2. 2. 2. 2. 2.] 7.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 0.7734197378158569
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 10929 cater [1.         0.2        0.70024331 0.         0.        ]-------
reward -1 done False 
------guess 2 11364 hater [1.         0.2        0.68596918 0.         0.        ]-------
reward 0 done True 
episode 42 finished.  reward -2  eps 0.0  gamma 0.0  

reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 11558 since [1.         0.2        0.58799676 0.         0.        ]-------
reward -1 done False 
------guess 3 11285 singe [1.         0.2        0.56107056 0.         0.        ]-------
reward 0 done True 
episode 54 finished.  reward -3  eps 0.0  gamma 0.0  steps 243  memory 239
later [0. 0. 0. 1. 0.] 1.0
noise [1. 0. 1. 1. 2.] 4.0
since [2. 2. 2. 0. 2.] 3.0
singe [2. 2. 2. 2. 2.] 2.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11628 salon [1.         0.6        0.59951338 0.         0.        ]-------
reward -1 done False 
------guess 2 12601 daily [1.         0.6        0.51695053 0.         0.        ]-------
reward -1 done False 
------guess 3 12832 madly [1.        0.2       0.4593674 0.        0.       ]-------
reward 0 done True 
episode 55 finished.

reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 11629 nudge [1.         0.6        0.48321168 0.         0.        ]-------
reward 0 done True 
episode 66 finished.  reward -2  eps 0.0  gamma 0.0  steps 289  memory 286
later [0. 0. 0. 1. 0.] 1.0
noise [2. 0. 0. 0. 2.] 3.0
nudge [2. 2. 2. 2. 2.] 6.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 1.0498234033584595
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12523 sonar [1.         0.6        0.62871046 0.         0.        ]-------
reward -1 done False 
------guess 2 10657 cigar [1.         0.6        0.54128143 0.         0.        ]-------
reward -1 done False 
------guess 3 10792 briar [1.         0.2        0.60486618 0.33333333 0.        ]-------
reward -1 done False 
------guess 4 11685 friar [1.         0.2        0.59659367 0.33333333 0.   

reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 12421 chide [1.         0.6        0.51305758 0.         0.        ]-------
reward -1 done False 
------guess 3 12327 guide [1.         0.4        0.49878345 0.         0.        ]-------
reward 0 done True 
episode 79 finished.  reward -3  eps 0.0  gamma 0.0  steps 336  memory 332
later [0. 0. 0. 1. 0.] 1.0
noise [0. 0. 2. 0. 2.] 3.0
chide [0. 0. 2. 2. 2.] 2.0
guide [2. 2. 2. 2. 2.] 4.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 10834 sonic [1.         1.         0.51030008 0.         0.        ]-------
reward -1 done False 
------guess 2 11713 pouch [1.         0.6        0.39805353 0.         0.        ]-------
reward -1 done False 
------guess 3 11005 vouch [1.         0.2        0.36334144 0.         0.        ]-------
reward -1 done False 
------guess 4

reward -1 done False 
------guess 2 11762 modal [1.         0.6        0.51273317 0.         0.        ]-------
reward -1 done False 
------guess 3 10726 offal [1.         0.2        0.47234388 0.33333333 0.        ]-------
reward 0 done True 
episode 90 finished.  reward -3  eps 0.0  gamma 0.0  steps 383  memory 379
later [1. 1. 0. 0. 0.] 2.0
snail [0. 0. 1. 0. 2.] 1.0
modal [0. 1. 0. 2. 2.] 2.0
offal [2. 2. 2. 2. 2.] 5.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 0.9400404691696167
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11537 solid [1.         0.8        0.52003244 0.         0.        ]-------
reward -1 done False 
------guess 2 12541 scowl [1.         0.4        0.45644769 0.         0.        ]-------
reward -1 done False 
------guess 3 12530 sloop [1.         0.2        0.52927818 0.33333333 0.        ]-------
reward -1 done False 
------guess 4 10936 slosh [1.         0.2        

reward -1 done False 
------guess 1 12063 stoic [1.         0.8        0.53527981 0.         0.        ]-------
reward -1 done False 
------guess 2 12130 dusty [1.        0.6       0.4351987 0.        0.       ]-------
reward -1 done False 
------guess 3 11213 musty [1.         0.2        0.42270884 0.         0.        ]-------
reward 0 done True 
episode 103 finished.  reward -3  eps 0.0  gamma 0.0  steps 431  memory 427
later [0. 0. 1. 0. 0.] 1.0
stoic [1. 1. 0. 0. 0.] 1.0
dusty [0. 2. 2. 2. 2.] 6.0
musty [2. 2. 2. 2. 2.] 2.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12908 snore [1.         0.6        0.66991079 0.         0.        ]-------
reward -1 done False 
------guess 2 11286 price [1.         0.6        0.59156529 0.         0.        ]-------
reward -1 done False 
------guess 3 10687 pride [1.         0.2        0.57793998 0.         0.        ]-------
reward -1 done False 
------guess 4 116

reward -1 done False 
------guess 1 12063 stoic [1.         0.8        0.53527981 0.         0.        ]-------
reward -1 done False 
------guess 2 11228 count [1.         0.4        0.48694242 0.         0.        ]-------
reward -1 done False 
------guess 3 11774 touch [1.        0.2       0.4567721 0.        0.       ]-------
reward 0 done True 
episode 116 finished.  reward -3  eps 0.0  gamma 0.0  steps 476  memory 472
later [0. 0. 1. 0. 0.] 1.0
stoic [0. 1. 1. 0. 1.] 2.0
count [1. 2. 2. 0. 1.] 3.0
touch [2. 2. 2. 2. 2.] 4.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 0.9539308547973633
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 10834 sonic [1.         1.         0.51030008 0.         0.        ]-------
reward -1 done False 
------guess 2 11362 pushy [1.         0.8        0.37583131 0.         0.        ]-------
reward -1 done False 
------guess 3 11650 husky [1.         0.2        0.35

reward -1 done False 
------guess 3 12200 shale [1.         0.2        0.64703974 0.         0.        ]-------
reward 0 done True 
episode 127 finished.  reward -3  eps 0.0  gamma 0.0  steps 521  memory 517
later [1. 1. 0. 1. 0.] 3.0
aisle [1. 0. 1. 2. 2.] 3.0
scale [2. 0. 2. 2. 2.] 2.0
shale [2. 2. 2. 2. 2.] 2.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 11168 denim [1.         0.4        0.51711273 0.         0.        ]-------
reward 0 done True 
episode 128 finished.  reward -2  eps 0.0  gamma 0.0  steps 524  memory 521
later [0. 0. 0. 1. 0.] 1.0
noise [1. 0. 1. 0. 1.] 2.0
denim [2. 2. 2. 2. 2.] 7.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11912 broil [1.         0.6        0.53917275 0.         0.        

reward -1 done False 
------guess 1 10834 sonic [1.         1.         0.51030008 0.         0.        ]-------
reward -1 done False 
------guess 2 11637 unzip [1.         0.6        0.34387672 0.         0.        ]-------
reward 0 done True 
episode 139 finished.  reward -2  eps 0.0  gamma 0.0  steps 568  memory 565
later [0. 0. 0. 0. 0.] 0.0
sonic [0. 0. 1. 2. 0.] 3.0
unzip [2. 2. 2. 2. 2.] 7.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11563 poser [1.         0.6        0.63617194 0.         0.        ]-------
reward -1 done False 
------guess 2 11382 nicer [1.         0.6        0.62530414 0.         0.        ]-------
reward -1 done False 
------guess 3 11787 cyber [1.        0.4       0.5377129 0.        0.       ]-------
reward -1 done False 
------guess 4 12363 cheer [1.         0.2        0.68629359 0.33333333 0.        ]-------
reward 0 done True 
episode 140 finished.  reward -4  eps 0.0  gam

reward -1 done False 
------guess 1 10834 sonic [1.         1.         0.51030008 0.         0.        ]-------
reward -1 done False 
------guess 2 11441 dumpy [0.5        1.         0.31922141 0.         0.        ]-------
reward -1 done False 
------guess 3 12204 whiff [0.66666667 0.6        0.27818329 0.33333333 0.        ]-------
reward -1 done False 
------guess 4 11481 scion [1.         0.         0.51030008 0.         0.        ]-------
reward 0 done True 
episode 151 finished.  reward -4  eps 0.0  gamma 0.0  steps 617  memory 612
later [0. 0. 0. 0. 0.] 0.0
sonic [2. 1. 1. 1. 1.] 6.0
dumpy [0. 0. 0. 0. 0.] 0
whiff [0. 0. 2. 0. 0.] 0
scion [2. 2. 2. 2. 2.] 4.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12338 olden [1.         0.6        0.59594485 0.         0.        ]-------
reward -1 done False 
------guess 2 10744 golem [1.        0.4       0.5406326 0.        0.       ]-------
reward -1 done F

reward -1 done False 
------guess 1 11249 canoe [1.         0.6        0.65174371 0.         0.        ]-------
reward -1 done False 
------guess 2 10893 pause [1.        0.6       0.6025953 0.        0.       ]-------
reward -1 done False 
------guess 3 12702 mauve [1.         0.4        0.51062449 0.         0.        ]-------
reward 0 done True 
episode 164 finished.  reward -3  eps 0.0  gamma 0.0  steps 662  memory 658
later [0. 2. 0. 1. 0.] 3.0
canoe [0. 2. 0. 0. 2.] 1.0
pause [0. 2. 2. 0. 2.] 2.0
mauve [2. 2. 2. 2. 2.] 4.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 10834 sonic [1.         1.         0.51030008 0.         0.        ]-------
reward -1 done False 
------guess 2 12915 hound [1.         0.6        0.41816707 0.         0.        ]-------
reward 0 done True 
episode 165 finished.  reward -2  eps 0.0  gamma 0.0  steps 665  memory 662
later [0. 0. 0. 0. 0.] 0.0
sonic [0. 2. 1. 0. 0.] 3.0
h

reward -1 done False 
------guess 3 11328 idiot [1.         0.2        0.52197891 0.33333333 0.        ]-------
reward 0 done True 
episode 176 finished.  reward -3  eps 0.0  gamma 0.0  steps 706  memory 702
later [0. 0. 1. 0. 0.] 1.0
stoic [0. 1. 1. 1. 0.] 2.0
point [0. 1. 2. 0. 2.] 2.0
idiot [2. 2. 2. 2. 2.] 5.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11490 stain [1.         0.6        0.58767234 0.         0.        ]-------
reward -1 done False 
------guess 2 12537 scant [1.         0.2        0.55620438 0.         0.        ]-------
reward 0 done True 
episode 177 finished.  reward -2  eps 0.0  gamma 0.0  steps 709  memory 706
later [0. 1. 1. 0. 0.] 2.0
stain [2. 1. 2. 0. 1.] 4.0
scant [2. 2. 2. 2. 2.] 4.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12917 tonal [1.         0.4        0.60924574 0.         0.        

reward -1 done False 
------guess 3 12695 slyly [1.         0.2        0.47964315 0.66666667 0.        ]-------
reward 0 done True 
episode 189 finished.  reward -3  eps 0.0  gamma 0.0  steps 751  memory 747
later [1. 0. 0. 0. 0.] 1.0
solid [2. 0. 1. 0. 0.] 2.0
slung [2. 2. 0. 0. 0.] 1.0
slyly [2. 2. 2. 2. 2.] 6.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12049 snort [1.         0.6        0.58815896 0.         0.        ]-------
reward -1 done False 
------guess 2 12740 droit [1.         0.4        0.55896188 0.         0.        ]-------
reward -1 done False 
------guess 3 12153 grout [1.         0.4        0.51257097 0.         0.        ]-------
reward -1 done False 
------guess 4 11639 trout [1.         0.         0.58037307 0.33333333 0.        ]-------
reward 0 done True 
episode 190 finished.  reward -4  eps 0.0  gamma 0.0  steps 756  memory 751
later [0. 0. 1. 0. 1.] 2.0
snort [0. 0. 2. 1. 2.] 

reward -1 done False 
------guess 2 11286 price [1.         0.6        0.59156529 0.         0.        ]-------
reward 0 done True 
episode 201 finished.  reward -2  eps 0.0  gamma 0.0  steps 797  memory 794
later [0. 0. 0. 1. 1.] 2.0
snore [0. 0. 0. 1. 2.] 1.0
price [2. 2. 2. 2. 2.] 7.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 1.059760570526123
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12646 noise [1.         0.8        0.63292782 0.         0.        ]-------
reward -1 done False 
------guess 2 12873 whine [1.         0.4        0.49683698 0.         0.        ]-------
reward 0 done True 
episode 202 finished.  reward -2  eps 0.0  gamma 0.0  steps 800  memory 797
later [0. 0. 0. 1. 0.] 1.0
noise [1. 0. 2. 0. 2.] 4.0
whine [2. 2. 2. 2. 2.] 5.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12908 snore [1.    

reward -1 done False 
------guess 2 11441 dumpy [1.         1.         0.31922141 0.         0.        ]-------
reward -1 done False 
------guess 3 12621 jumpy [1.         0.2        0.25985401 0.         0.        ]-------
reward 0 done True 
episode 214 finished.  reward -3  eps 0.0  gamma 0.0  steps 843  memory 839
later [0. 0. 0. 0. 0.] 0.0
sonic [0. 0. 0. 0. 0.] 0
dumpy [0. 2. 2. 2. 2.] 8.0
jumpy [2. 2. 2. 2. 2.] 2.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 12908 snore [1.         0.6        0.66991079 0.         0.        ]-------
reward -1 done False 
------guess 2 10853 rebus [1.        0.4       0.5756691 0.        0.       ]-------
reward -1 done False 
------guess 3 11393 cress [1.         0.2        0.64022709 0.33333333 0.        ]-------
reward -1 done False 
------guess 4 12455 dress [1.         0.2        0.62660178 0.33333333 0.        ]-------
reward -1 done False 
------guess 5 12892

reward -1 done False 
------guess 1 10681 stool [1.         0.6        0.58799676 0.33333333 0.        ]-------
reward -1 done False 
------guess 2 11388 tunic [1.         0.8        0.47347932 0.         0.        ]-------
reward -1 done False 
------guess 3 10701 digit [1.         0.4        0.45012165 0.33333333 0.        ]-------
reward 0 done True 
episode 226 finished.  reward -3  eps 0.0  gamma 0.0  steps 891  memory 887
eater [0. 0. 1. 0. 0.] 1.0
stool [0. 1. 0. 0. 0.] 0
tunic [1. 0. 0. 2. 0.] 2.0
digit [2. 2. 2. 2. 2.] 7.0
loss shapes
torch.Size([64, 1])
torch.Size([64, 1])
loss 0.9854925870895386
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11822 piano [1.         0.8        0.54274128 0.         0.        ]-------
reward -1 done False 
------guess 2 11232 mocha [1.         0.6        0.47283049 0.         0.        ]-------
reward -1 done False 
------guess 3 10953 comma [1.         0.         0

reward -1 done False 
------guess 1 10929 cater [1.         0.2        0.70024331 0.         0.        ]-------
reward -1 done False 
------guess 2 11364 hater [1.         0.2        0.68596918 0.         0.        ]-------
reward -1 done False 
------guess 3 12308 water [1.         0.2        0.65450122 0.         0.        ]-------
reward 0 done True 
episode 239 finished.  reward -3  eps 0.0  gamma 0.0  steps 937  memory 933
later [0. 2. 2. 2. 2.] 8.0
cater [0. 2. 2. 2. 2.] 0
hater [0. 2. 2. 2. 2.] 0
water [2. 2. 2. 2. 2.] 2.0
------guess 0 11530 later [1.         1.         0.73949716 0.         0.        ]-------
reward -1 done False 
------guess 1 11527 ocean [1.         0.6        0.65174371 0.         0.        ]-------
reward -1 done False 
------guess 2 11309 chase [1.         0.4        0.60778589 0.         0.        ]-------
reward -1 done False 
------guess 3 11888 space [1.         0.2        0.60421736 0.         0.        ]-------
reward 0 done True 
episode 240 finish

reward -1 done False 
------guess 2 11414 nerve [1.         0.4        0.66390916 0.33333333 0.        ]-------
reward 0 done True 
episode 250 finished.  reward -2  eps 0.0  gamma 0.0  steps 982  memory 979
eerie [0. 2. 2. 0. 2.] 6.0
terse [0. 2. 2. 0. 2.] 0
nerve [2. 2. 2. 2. 2.] 4.0
------guess 0 11277 eerie [1.         0.6        0.85466342 0.66666667 0.        ]-------
reward -1 done False 
------guess 1 12266 salsa [1.         0.6        0.6512571  0.66666667 0.        ]-------
reward -1 done False 
------guess 2 11842 tooth [1.         0.6        0.54420114 0.66666667 0.        ]-------
reward -1 done False 
------guess 3 11869 bunny [1.         0.8        0.37680454 0.33333333 0.        ]-------
reward -1 done False 
------guess 4 11479 buddy [1.         0.2        0.31776156 0.33333333 0.        ]-------
reward 0 done True 
episode 251 finished.  reward -4  eps 0.0  gamma 0.0  steps 987  memory 982
eerie [0. 0. 0. 0. 0.] 0.0
salsa [0. 0. 0. 0. 0.] 0
tooth [0. 0. 0. 0. 0.] 0
bu

dream 5
stale 3
vomit 4
ombre 4
fanny 6
unite 4
snarl 4
baker 0
there 3
glyph 4
pooch 4
hippy 5
spell 5
folly 0
louse 4
gulch 5
vault 5
godly 0
threw 4
fleet 3
grave 0
inane 3
shock 6
crave 5
spite 4
valve 5
skimp 6
claim 3
rainy 4
musty 5
pique 5
daddy 5
quasi 5
arise 3
aging 4
valet 3
opium 4
avert 4
stuck 6
recut 4
mulch 4
genre 3
plume 4
rifle 4
count 4
incur 4
total 3
wrest 4
mocha 4
deter 3
study 5
lover 6
safer 6
rivet 4
funny 5
smoke 6
mound 5
undue 4
sedan 3
pagan 4
swine 6
guile 4
gusty 6
equip 4
tough 5
canoe 4
chaos 4
covet 5
human 5
udder 6
lunch 4
blast 3
stray 3
manga 4
melee 3
lefty 3
quick 6
paste 5
given 4
octet 3
risen 3
groan 4
leaky 5
grind 5
carve 5
loose 3
sadly 3
spilt 4
apple 4
slack 5
honey 5
final 3
sheen 3
eerie 1
minty 5
slick 6
derby 5
wharf 6
spelt 3
coach 4
erupt 3
singe 5
price 4
spawn 5
fairy 6
jiffy 6
filmy 5
stack 4
chose 4
sleep 3
ardor 3
nanny 4
niece 2
woozy 0
handy 0
grace 4
ditto 3
stank 5
cream 5
usual 3
diode 3
valor 5
angle 4
ninja 4
muddy 0


In [None]:

# Experiment cell: runs run_experiment with the 'linear' model and the 'hybrid'
# value function for 150 episodes (seed 0), then unpacks whatever it returns
# positionally into plot_all.
# NOTE(review): gamma and eps are each given as a 3-element list; these look
# like [start, end, duration] schedules with start == end == 0.0 -- confirm
# against run_experiment / ValueConfig before relying on that reading.
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    # presumably train_interval is measured in episodes or steps -- TODO confirm;
    # clear_memory=False here, unlike the following cell which passes True.
    training=TrainConfig(train_interval=16, batch_size=128, clear_memory=False, lr=0.07),
    seed=0,
    run_test=True
    ))

In [None]:
# Experiment cell: same model ('linear'), value function ('hybrid'), gamma, eps,
# batch_size and lr arguments as the preceding cell, but with num_episodes=512,
# train_interval=32, clear_memory=True and seed=1. The return value of
# run_experiment is unpacked positionally into plot_all.
# NOTE(review): gamma and eps are each given as a 3-element list; these look
# like [start, end, duration] schedules with start == end == 0.0 -- confirm
# against run_experiment / ValueConfig before relying on that reading.
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=512,
    # presumably clear_memory=True empties the replay memory at some point during
    # training -- TODO confirm the exact semantics in TrainConfig's consumer.
    training=TrainConfig(train_interval=32, batch_size=128, clear_memory=True, lr=0.07),
    seed=1,
    run_test=True))