In [1]:
import json, pathlib, random, time
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing as mp

from environment import Env, validate_against_hint, load_word_lists, construct_word_df


In [2]:
df = construct_word_df(*load_word_lists())

In [3]:


def construct_state_tensor(guesses, history):
        #print(history)
        #so the state is going to be:
            #  The number of green locations we know
            #  The number of other letters we know to be in the word
            #  The sequence number of the guess (1st guess, 2nd guess etc.)

        #the number of locations which were green at some point in the history
        num_green_locs = np.count_nonzero(history.max(axis=0) == 2)

        green_chars = [guesses[x][y] for x,y in np.argwhere(history == 2) ]
        orange_chars = [guesses[x][y] for x,y in np.argwhere(history == 1) ]
        black_chars = [guesses[x][y] for x,y in np.argwhere(history == 0) ]
        num_other_letters = len(set(orange_chars) - set(green_chars))
        num_black_letters = len(set(black_chars))

        sequence_number = int(history.size / 5)
        #print(f'construct_state() with seqno {sequence_number}')

        sequence_number_onehot = np.zeros(Env.num_guesses)
        sequence_number_onehot[sequence_number] = 1.0
        arr = np.concatenate((np.array([num_green_locs, num_other_letters, num_black_letters])/5, sequence_number_onehot))
        return torch.tensor(arr, device=device, dtype=torch.float)
        

    

The aim here is to use a NN to represent the policy, rather than the value function.  We will shrink the action space (ie, so that we have a few actions, rather than 12000).  This will remove the model's ability to learn novel strategies, rather it will just be learning when to employ the different strategies (actions) that I give it.  Start w
ith these 3 word selection tactics:

1. choose words which match the current history
1. choose words which contain the greatest number of new letters
1. choose words which have the highest frequency score

then we will construct 6 actions by choosing every possible order of these strategies
1. 1,2,3
1. 1,3,2
1. 2,1,3
1. 2,3,1
1. 3,1,2
1. 3,2,1

for all these actions there may be multiple words, so sample a random one.  The policy then becomes a logistic regressor which selects one of these actions to execute.  The loss to train the regressor will be derived using the policy gradiet theorem.


In [4]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count, permutations

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical


# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

#plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def clear(self):
        self.memory.clear()
        
    def __len__(self):
        return len(self.memory)
    

In [6]:
class PolicyNetNN(nn.Module):

    def __init__(self, num_inputs, num_actions):
        super(PolicyNetNN, self).__init__()
        self.fc1 = nn.Linear(num_inputs, 20)
        self.head = nn.Linear(20, num_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        return F.softmax(self.head(x), dim=0)

In [7]:
class PolicyNetLinear(nn.Module):

    def __init__(self, num_inputs, num_actions):
        super(PolicyNetLinear, self).__init__()
        self.head = nn.Linear(num_inputs, num_actions)
        #print(f'PolicyNetLinear {num_inputs}, {num_actions}')

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        return F.softmax(self.head(x), dim=0)
    
class PolicyHybrid(nn.Module):
    def __init__(self, num_guesses, num_inputs, num_actions):
        super(self.__class__, self).__init__()
        self.functions = torch.nn.ModuleList()
        self.function0 = torch.nn.Parameter(torch.ones((num_actions), dtype=float), requires_grad=True)
        for i in range(1, num_guesses):
            self.functions.append(nn.Linear(num_inputs - num_guesses, num_actions))
        self.x = torch.Tensor([1.0,1.0])

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, state):
        onehot = state[3:]
        step_idx = torch.argmax(onehot, dim=0)
        if step_idx == 0:
            y = self.x.mul(self.function0)
        else:
            x = state[0:3].to(device)
            y = self.functions[step_idx-1](x)
        return F.softmax(y, dim=0)
    
class PolicyMonteCarlo(nn.Module):
    def __init__(self, num_guesses, num_actions):
        super(PolicyMonteCarlo, self).__init__()
        #self.weights = torch.nn.Parameter(torch.rand((num_guesses, num_actions), dtype=float), requires_grad=True)
        self.weights = torch.nn.Parameter(torch.ones((num_guesses, num_actions), dtype=float), requires_grad=True)
        #self.weights.require_grad = True
        self.x = torch.Tensor([1.0,1.0])
        
    def forward(self, state):
        onehot = state[3:]
        step_idx = torch.argmax(onehot, dim=0)
        
        y = F.softmax(self.x.mul(self.weights[step_idx]), dim=0)
        #print(f'onehot {onehot}, step_idx {step_idx}')
        #print(self.weights)
        #print(self.weights[step_idx])
        #print(y)
        return y
    
class PolicyAvgReward():
    def __init__(self, num_guesses, num_actions):
        #self.weights = torch.Tensor([[.5,.5],[1,0],[1,0],[1,0],[1,0],[1,0]])
        self.weights = torch.rand((num_guesses, num_actions), dtype=float)
        
        self.reward_stats = [(defaultdict(int),defaultdict(int)) for _ in range(num_guesses)]
        self.num_guesses = num_guesses
        self.num_actions = num_actions
        
    def __call__(self, state):
        onehot = state[3:]
        step_idx = torch.argmax(onehot, dim=0)
        #if step_idx == 0:
        return F.softmax(self.weights[step_idx], dim=0)
        #else:
        #    return self.weights[step_idx]
    
    def calc_avgs(self):
        for action in range(self.num_actions):
            for step_idx in range(self.num_guesses):
                if self.reward_stats[step_idx][action]['count'] > 0:
                    self.weights[step_idx][action] = self.reward_stats[step_idx][action]['total'] / self.reward_stats[step_idx][action]['count']
        

In [8]:
#define the word-selection tactics
n_state_features = 9

class PolicyHelper:
    def __init__(self, env):
        self.env = env
        self.actions = [[env.find_target_words, env.find_words_matching_current_history]]
        self.actions.append([env.find_words_with_highest_new_letter_freq_score])
                
        self.num_actions = len(self.actions)
        #self.net = PolicyNetLinear(n_state_features, len(self.actions))
        
    def perform_action(self, action_idx):
        tactic_tuple = self.actions[action_idx]
        df = self.env.df
        for tactic in tactic_tuple: # apply all the tactics in the given order
            newdf = tactic(df)
            if not newdf.empty: #if that tactic produced no results, then quit
                df = newdf
        return df.sample()['word'][0] # then pick a random word from what is left
    
    

In [9]:





def plot_values(vals, axes=['duration', 'episode']):
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel(axes[1])
    plt.ylabel(axes[0])
    plt.plot(np.array(vals))
    # Take 20 episode averages and plot them too
    window_width = 20
    if len(vals) >= window_width:
        cumsum_vec = np.cumsum(np.insert(vals, 0, 0)) 
        ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
        plt.plot(np.insert(ma_vec, 0, [None]*int(window_width/2)))

    plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    
def plot_all(episode_durations, episode_rewards, losses, epsilons, gammas):
    plot_values(episode_durations, axes=['duration', 'episode'])
    plot_values(episode_rewards, axes=['reward', 'episode'])
    if losses: plot_values(losses, axes=['loss', 'step'])
    if epsilons: plot_values(epsilons, axes=['epsilon', 'step'])
    if gammas: plot_values(gammas, axes=['gamma', 'step'])
    #plt.ioff()
    plt.show()

In [10]:
def optimize_model_batch(model, optimizer, memory, batch_size=128):
    if batch_size <= 0:
        transitions = memory.memory
    else:
        transitions = memory.sample(batch_size)
    print(f'optimize_model_batch {batch_size} {len(transitions)}')
    losses = [optimize_model_single(model, optimizer, tr.state, tr.action, tr.reward) for tr in transitions]
    
    return losses

optimizations_run = 0

def optimize_model_single(model, optimizer, state, action, reward):
    global optimizations_run
    optimizations_run += 1
    if isinstance(model, PolicyAvgReward):
        onehot = state[3:]
        step_idx = torch.argmax(onehot, dim=0)
        model.reward_stats[step_idx][action]['count'] += 1
        model.reward_stats[step_idx][action]['total'] += reward
        model.calc_avgs()
        return reward
    # calculate gradient
    probs = model(state)
    sampler = Categorical(probs)
    #print(f'sampler {sampler}')
    log_probs = -sampler.log_prob(action)   # "-" because it was built to work with gradient descent, but we are using gradient ascent

    pseudo_loss = log_probs * reward # loss that when differentiated with autograd gives the gradient of J(θ)
    #print(f'log_prob {log_probs}, reward {reward}, loss {pseudo_loss} ')
    # update policy weights
    optimizer.zero_grad()
    pseudo_loss.backward()
    optimizer.step()

    return pseudo_loss


class TrainConfig():
    def __init__(self, optimizer='adam', batch_size=64, train_interval=64, clear_memory=False, lr=0.01):
        self.optimizer = optimizer
        self.clear_memory = clear_memory
        self.lr = lr
        self.train_interval = train_interval
        self.batch_size = batch_size
        
class ValueConfig():
    def __init__(self, name='reward', gamma=[0.9, 0.05, 200]):
        self.name = name
        self.gamma = gamma
        
class ModelConfig():
    def __init__(self, name='naive', startword=None, target_list_only=None):
        self.name = name
        self.startword = startword
        self.target_list_only = target_list_only

In [11]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
def run_experiment(model=ModelConfig(name='naive', startword=None, target_list_only=False),
                   num_episodes=128,
                   eps=[0.9, 0.05, 200],
                   value_function=ValueConfig(name='reward',gamma=[0.0, 1.0, 200]),
                   training=TrainConfig(),
                   seed=0,
                   run_test=False):
    global optimizations_run
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    GAMMA_START, GAMMA_END, GAMMA_DECAY = value_function.gamma
    env = Env(df)
    memory = ReplayMemory(10000)
    starting_state = construct_state_tensor(env.guesses, env.history)

    steps_done = 0
    last_training = 0
    losses = []
    episode_rewards = []
    episode_durations = []
    epsilons = []
    gammas = []
    reward_stats = [(defaultdict(int),defaultdict(int)) for _ in range(env.num_guesses)]
    transitions_added_to_memory = 0
    optimizations_run = 0
    
    policy_helper = PolicyHelper(env)
    
    if model.name == 'linear':
        policy_net = PolicyNetLinear(n_state_features, len(policy_helper.actions)).to(device)
    elif model.name == 'monte':
        policy_net = PolicyMonteCarlo(env.num_guesses, len(policy_helper.actions)).to(device)
        print('monte weights')
        print(policy_net.weights)
        print(F.softmax(policy_net.weights, dim=1))
    elif model.name == 'avg_reward':
        policy_net = PolicyAvgReward(env.num_guesses, len(policy_helper.actions))
    elif model.name == 'hybrid':
        policy_net = PolicyHybrid(env.num_guesses, n_state_features, len(policy_helper.actions))
    else:
        policy_net = PolicyNetNN(n_state_features, len(policy_helper.actions)).to(device)
    
    if model.name == 'avg_reward':
        optimizer = None
    elif training.optimizer == 'rmsprop':
            optimizer = optim.RMSprop(policy_net.parameters(), lr=training.lr)
    elif training.optimizer == 'sgd':
            optimizer = optim.SGD(policy_net.parameters(), lr=training.lr)
    else:
            optimizer = optim.Adam(policy_net.parameters(), lr=training.lr)

        #print(f'pn params {list(policy_net.parameters())}')
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        print(f'=========================episode {i_episode} {env.target}======================')

        episode_memory = []
        state = starting_state
        guesses = []
        for t in count():
            GAMMA = GAMMA_END + (GAMMA_START - GAMMA_END) * math.exp(-1. * steps_done / GAMMA_DECAY)
            gammas.append(GAMMA)
            steps_done += 1
            # Select and perform an action
            #print(state, actions)
            probs = policy_net(state)
            sampler = Categorical(probs)
            action_idx = sampler.sample()
            chosen_word = policy_helper.perform_action(action_idx)
            guesses.append(chosen_word)
            print(f'------guess {t} {action_idx} {guesses[-1]}-------')
            history, reward, done = env.step(chosen_word)
            #here next_state == env.history
            if not done:
                next_state = construct_state_tensor(guesses, history)
            
            #action_tensor = action_idx.clone().detach()
            action = action_idx #F.one_hot(action_idx, num_classes=policy_helper.num_actions)
                
            print(f'reward {reward} done {done} action {action}')
            #reward = np.array([reward])

            # Store the transition in memory
            #memory.push(state, action_idx, reward)
            episode_memory.append([state, action, reward])
            # Move to the next state
            state = next_state

            if done:
                episode_durations.append(t + 1)
                episode_reward = sum([tr[2] for tr in episode_memory])
                print(f'episode {i_episode} finished.  reward {episode_reward}  eps {eps}  gamma {GAMMA}  steps {steps_done}  memory {len(memory)}')
                episode_rewards.append(episode_reward)
                
                for idx,tr in enumerate(episode_memory):
                    #print(f'pushing episode_reward {episode_reward} {episode_reward.__class__}')
                    transitions_added_to_memory += 1
                    memory.push(tr[0], tr[1], episode_reward)
                    #loss = optimize_model_single(policy_net, optimizer, tr[0], tr[1], episode_reward)
                    #losses.append(loss)
                
                    reward_stats[idx][tr[1]]['count'] += 1
                    reward_stats[idx][tr[1]]['total'] += episode_reward
                    episode_reward -= tr[2]
                    
                    
                # If we have gathered enough data, Perform one step of the optimization (on the policy network)
                if len(memory) >= max(1,training.batch_size) \
                    and (i_episode + 1) % training.train_interval == 0:
                    losses += optimize_model_batch(policy_net, optimizer, memory, batch_size=training.batch_size)
                    if training.clear_memory: memory.clear()
                        
                    if model.name == 'monte':
                        print('monte weights')
                        print(policy_net.weights)
                        print(F.softmax(policy_net.weights, dim=1))
                        for rs in reward_stats:
                            if rs[0]['count'] > 0:
                                rs[0]['avg'] = rs[0]['total'] / rs[0]['count']
                            if rs[1]['count'] > 0:
                                rs[1]['avg'] = rs[1]['total'] / rs[1]['count']
                            print(dict(rs[0]), dict(rs[1]))
                    elif model.name == 'avg_reward':
                        print('avg_reward weights')
                        print(policy_net.weights)
                        print(F.softmax(policy_net.weights, dim=1))
                        for rs in policy_net.reward_stats:
                            if rs[0]['count'] > 0:
                                rs[0]['avg'] = rs[0]['total'] / rs[0]['count']
                            if rs[1]['count'] > 0:
                                rs[1]['avg'] = rs[1]['total'] / rs[1]['count']
                            print(dict(rs[0]), dict(rs[1]))
                    elif model.name == 'hybrid':
                        print('hybrid weights')
                        print(policy_net.function0)
                        print(F.softmax(policy_net.function0, dim=0))
                    print(f'done {optimizations_run} optimizations, {transitions_added_to_memory} transitions added to memory')
                    
                        
                        
                
                #plot_durations()
                break

    print('Training Complete')
    
    if run_test:
        performance_hist = [0] * (1 + env.num_guesses)
        for e in env.foreach_target_word():
            state = starting_state
            done = False
            reward = 0
            num_guesses = 0
            while not done:
                probs = policy_net(state)
                sampler = Categorical(probs)
                action_idx = sampler.sample()
                chosen_word = policy_helper.perform_action(action_idx)
                _, reward, done = e.step(chosen_word)
                num_guesses += 1
            
            if num_guesses == 6 and reward == -1:
                num_guesses = 0
            print(f'{e.target} {num_guesses}')    
            performance_hist[num_guesses] += 1
            
        for i,p in enumerate(performance_hist):
            print(f'{i}: {p}')
    
    return episode_durations, episode_rewards, losses, epsilons, gammas

#env.render()
#env.close()
#plt.ioff()
#plt.show()

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='avg_reward'),
    num_episodes=512,
    training=TrainConfig(optimizer='sgd', lr=0.01, batch_size=-1, train_interval=128, clear_memory=True),
    seed=1,
    run_test=True
    ))

------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 mammy-------
reward -1 done False action 0
------guess 2 0 gassy-------
reward -1 done False action 0
------guess 3 0 happy-------
reward -1 done False action 0
------guess 4 1 unlid-------
reward -1 done False action 1
------guess 5 0 fancy-------
reward -1 done True action 0
episode 0 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.024690087971667385  steps 6  memory 0
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 1 budge-------
reward -1 done False action 1
------guess 4 1 wakfs-------
reward -1 done False action 1
------guess 5 1 avyze-------
reward -1 done True action 1
episode 1 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.05351485204651618  steps 12  memory 6
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 quart-------
reward

------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 pouch-------
reward -1 done False action 0
------guess 3 0 cough-------
reward 0 done True action 0
episode 20 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.36237184837822667  steps 91  memory 87
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 0 aloft-------
reward -1 done False action 0
------guess 4 0 float-------
reward 0 done True action 0
episode 21 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.3781149435349799  steps 96  memory 91
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 1 budge-------
reward -1 done False action 1
------guess 4 1 wakfs-------
rewar

------guess 2 0 poser-------
reward -1 done False action 0
------guess 3 1 duchy-------
reward -1 done False action 1
------guess 4 0 sower-------
reward -1 done False action 0
------guess 5 1 bumfs-------
reward -1 done True action 1
episode 39 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.6074141344684816  steps 188  memory 182
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 paddy-------
reward -1 done False action 0
------guess 2 0 canal-------
reward -1 done False action 0
------guess 3 1 gibus-------
reward -1 done False action 1
------guess 4 1 fehme-------
reward -1 done False action 1
------guess 5 0 sauna-------
reward 0 done True action 0
episode 40 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.6190168002606627  steps 194  memory 188
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 cover-------
reward -1 done False action 0
------guess 2 1 lysin-------
reward -1 done False action 1
------guess 3 1 bumph-------
r

------guess 4 1 gamba-------
reward -1 done False action 1
------guess 5 1 wakfs-------
reward -1 done True action 1
episode 58 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.7471604041952535  steps 276  memory 270
------guess 0 0 ingot-------
reward -1 done False action 0
------guess 1 0 endow-------
reward -1 done False action 0
------guess 2 1 carls-------
reward -1 done False action 1
------guess 3 1 humpy-------
reward -1 done False action 1
------guess 4 0 envoy-------
reward -1 done False action 0
------guess 5 1 bumfs-------
reward -1 done True action 1
episode 59 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.7546329435470737  steps 282  memory 276
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 0 vomit-------
reward -1 done False action 0
------guess 4 1 badge-------
reward -1 done False action 1
------guess 5 1 wakfs-------


------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 edify-------
reward -1 done False action 0
------guess 2 0 geese-------
reward -1 done False action 0
------guess 3 0 spell-------
reward -1 done False action 0
------guess 4 0 swell-------
reward 0 done True action 0
episode 76 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.8419746791103522  steps 370  memory 365
------guess 0 0 murky-------
reward -1 done False action 0
------guess 1 1 toeas-------
reward -1 done False action 1
------guess 2 0 force-------
reward -1 done False action 0
------guess 3 1 pling-------
reward -1 done False action 1
------guess 4 0 gorge-------
reward 0 done True action 0
episode 77 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.8458763381848686  steps 375  memory 370
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 title-------
reward 0 done True action 0
episode 78 finished.  reward -2  

------guess 4 0 comic-------
reward 0 done True action 0
episode 95 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.8972030915647136  steps 456  memory 451
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 0 stuck-------
reward -1 done False action 0
------guess 2 0 sting-------
reward -1 done False action 0
------guess 3 1 lymph-------
reward -1 done False action 1
------guess 4 1 bifid-------
reward -1 done False action 1
------guess 5 0 stiff-------
reward 0 done True action 0
episode 96 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9002411993463814  steps 462  memory 456
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 allow-------
reward -1 done False action 0
------guess 2 0 zonal-------
reward -1 done False action 0
------guess 3 1 scudi-------
reward -1 done False action 1
------guess 4 1 nymph-------
reward -1 done False action 1
------guess 5 0 vocal-------
reward -1 done True action 0
episode 97 finished.  reward -6 

------guess 3 0 wafer-------
reward -1 done False action 0
------guess 4 0 caper-------
reward -1 done False action 0
------guess 5 0 gazer-------
reward -1 done True action 0
episode 115 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.9347807103318725  steps 547  memory 541
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 mogul-------
reward -1 done False action 0
------guess 3 1 chawk-------
reward -1 done False action 1
------guess 4 1 biped-------
reward -1 done False action 1
------guess 5 0 cloud-------
reward 0 done True action 0
episode 116 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9367082316403592  steps 553  memory 547
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 moral-------
reward -1 done False action 0
------guess 2 0 abhor-------
reward -1 done False action 0
------guess 3 1 incus-------
reward -1 done False action 1
------guess 4 1 pudgy-------

------guess 3 1 nymph-------
reward -1 done False action 1
------guess 4 1 fudge-------
reward -1 done False action 1
------guess 5 0 quilt-------
reward 0 done True action 0
episode 130 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9558428315803071  steps 625  memory 8
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 fairy-------
reward -1 done False action 0
------guess 2 0 radii-------
reward -1 done False action 0
------guess 3 0 rabbi-------
reward 0 done True action 0
episode 131 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9567172020980341  steps 629  memory 14
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 fetus-------
reward -1 done False action 0
------guess 2 1 linch-------
reward -1 done False action 1
------guess 3 1 dampy-------
reward -1 done False action 1
------guess 4 0 teddy-------
reward 0 done True action 0
episode 132 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9577858581858936  steps 634  

------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 first-------
reward -1 done False action 0
------guess 2 0 crust-------
reward -1 done False action 0
------guess 3 0 trust-------
reward 0 done True action 0
episode 150 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9714186250837188  steps 712  memory 97
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 duchy-------
reward -1 done False action 0
------guess 2 0 hippy-------
reward -1 done False action 0
------guess 3 1 limns-------
reward -1 done False action 1
------guess 4 0 hilly-------
reward 0 done True action 0
episode 151 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.972124301744753  steps 717  memory 101
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 great-------
reward -1 done False action 0
------guess 3 0 treat-------
reward 0 done True action 0
episode 152 finished.  reward -3 

------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 gusty-------
reward -1 done False action 0
------guess 2 1 linch-------
reward -1 done False action 1
------guess 3 0 witty-------
reward -1 done False action 0
------guess 4 0 ditty-------
reward -1 done False action 0
------guess 5 0 bitty-------
reward -1 done True action 0
episode 170 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.9815002858801808  steps 799  memory 182
------guess 0 0 think-------
reward -1 done False action 0
------guess 1 0 etude-------
reward -1 done False action 0
------guess 2 0 facet-------
reward -1 done False action 0
------guess 3 1 slorm-------
reward -1 done False action 1
------guess 4 1 pygmy-------
reward -1 done False action 1
------guess 5 0 metal-------
reward 0 done True action 0
episode 171 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9820470350604972  steps 805  memory 188
------guess 0 0 boney-------
reward -1 done False action 0
------guess 1 1 trail-------

------guess 1 1 islet-------
reward -1 done False action 1
------guess 2 0 juicy-------
reward -1 done False action 0
------guess 3 1 whomp-------
reward -1 done False action 1
------guess 4 0 dizzy-------
reward 0 done True action 0
episode 188 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9880257886991963  steps 886  memory 270
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 1 budge-------
reward -1 done False action 1
------guess 4 0 magma-------
reward 0 done True action 0
episode 189 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9883214330296045  steps 891  memory 275
------guess 0 0 cramp-------
reward -1 done False action 0
------guess 1 1 teloi-------
reward -1 done False action 1
------guess 2 1 dunsh-------
reward -1 done False action 1
------guess 3 1 gawky-------
reward -1 done False action 1
------guess 4 0 miner-------


------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 junto-------
reward -1 done False action 0
------guess 2 0 quoth-------
reward 0 done True action 0
episode 208 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9920929459484066  steps 969  memory 355
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 laugh-------
reward -1 done False action 0
------guess 2 0 manly-------
reward -1 done False action 0
------guess 3 0 nasal-------
reward 0 done True action 0
episode 209 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9922495161088634  steps 973  memory 358
------guess 0 0 prowl-------
reward -1 done False action 0
------guess 1 1 entia-------
reward -1 done False action 1
------guess 2 1 cushy-------
reward -1 done False action 1
------guess 3 0 saint-------
reward 0 done True action 0
episode 210 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9924029859724224  steps 977  memory 362
------guess 0 1 oater-------
reward -1 done F

------guess 1 0 brawn-------
reward -1 done False action 0
------guess 2 0 grown-------
reward -1 done False action 0
------guess 3 0 frown-------
reward -1 done False action 0
------guess 4 0 crown-------
reward -1 done False action 0
------guess 5 0 drown-------
reward 0 done True action 0
episode 230 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9949329678118393  steps 1058  memory 441
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 0 fjord-------
reward -1 done False action 0
------guess 2 0 curio-------
reward -1 done False action 0
------guess 3 0 occur-------
reward 0 done True action 0
episode 231 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9950333017715621  steps 1062  memory 447
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 1 gibed-------
reward -1 done False action 1
------guess 4 0 friar------

------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 1 chump-------
reward -1 done False action 1
------guess 3 0 cheap-------
reward 0 done True action 0
episode 251 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9967202890976564  steps 1145  memory 530
------guess 0 0 ninja-------
reward -1 done False action 0
------guess 1 1 estro-------
reward -1 done False action 1
------guess 2 0 tonic-------
reward -1 done False action 0
------guess 3 1 duply-------
reward -1 done False action 1
------guess 4 0 ingot-------
reward 0 done True action 0
episode 252 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9968012654483569  steps 1150  memory 534
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 libel-------
reward 0 done True action 0
episode 253 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9968488884015556  steps 

------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 groom-------
reward -1 done False action 0
------guess 2 1 lysin-------
reward -1 done False action 1
------guess 3 0 rowdy-------
reward 0 done True action 0
episode 268 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9977232356757451  steps 1218  memory 50
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 seedy-------
reward 0 done True action 0
episode 269 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9977571322805142  steps 1221  memory 54
------guess 0 0 haven-------
reward -1 done False action 0
------guess 1 1 lirot-------
reward -1 done False action 1
------guess 2 0 paint-------
reward 0 done True action 0
episode 270 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9977905242305842  steps 1224  memory 57
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done F

------guess 1 0 psalm-------
reward -1 done False action 0
------guess 2 0 khaki-------
reward -1 done False action 0
------guess 3 0 aback-------
reward 0 done True action 0
episode 289 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9984661893206755  steps 1297  memory 129
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 0 barge-------
reward -1 done False action 0
------guess 2 1 lysin-------
reward -1 done False action 1
------guess 3 1 chump-------
reward -1 done False action 1
------guess 4 1 fiked-------
reward -1 done False action 1
------guess 5 0 arise-------
reward 0 done True action 0
episode 290 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9985115202769406  steps 1303  memory 133
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 0 place-------
reward -1 done False action 0
------guess 2 0 cease-------
reward 0 done True action 0
episode 291 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9985336808529571  steps 

------guess 2 1 lotsa-------
reward -1 done False action 1
------guess 3 0 resin-------
reward 0 done True action 0
episode 309 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9990022422035156  steps 1383  memory 215
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 ficus-------
reward -1 done False action 0
------guess 3 0 skimp-------
reward -1 done False action 0
------guess 4 0 whisk-------
reward 0 done True action 0
episode 310 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9990268769312853  steps 1388  memory 219
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 vogue-------
reward -1 done False action 0
------guess 2 1 lysin-------
reward -1 done False action 1
------guess 3 0 choke-------
reward 0 done True action 0
episode 311 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9990461460590817  steps 1392  memory 224
------guess 0 0 media-------
reward -1 don

------guess 0 0 olive-------
reward -1 done False action 0
------guess 1 1 trans-------
reward -1 done False action 1
------guess 2 1 duchy-------
reward -1 done False action 1
------guess 3 1 gimpy-------
reward -1 done False action 1
------guess 4 1 bawks-------
reward -1 done False action 1
------guess 5 0 chide-------
reward 0 done True action 0
episode 329 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9993574076396444  steps 1471  memory 301
------guess 0 0 boxer-------
reward -1 done False action 0
------guess 1 0 mossy-------
reward -1 done False action 0
------guess 2 1 cital-------
reward -1 done False action 1
------guess 3 0 vouch-------
reward -1 done False action 0
------guess 4 0 pooch-------
reward 0 done True action 0
episode 330 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9993732733015516  steps 1476  memory 307
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 twist------

------guess 1 0 voila-------
reward -1 done False action 0
------guess 2 0 modal-------
reward -1 done False action 0
------guess 3 0 loyal-------
reward -1 done False action 0
------guess 4 0 local-------
reward 0 done True action 0
episode 349 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.999577786733191  steps 1555  memory 386
------guess 0 0 spark-------
reward -1 done False action 0
------guess 1 0 excel-------
reward -1 done False action 0
------guess 2 0 needy-------
reward -1 done False action 0
------guess 3 0 teeth-------
reward 0 done True action 0
episode 350 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9995861471160212  steps 1559  memory 391
------guess 0 0 wooer-------
reward -1 done False action 0
------guess 1 0 scout-------
reward -1 done False action 0
------guess 2 1 inlay-------
reward -1 done False action 1
------guess 3 0 quota-------
reward 0 done True action 0
episode 351 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9995943419521799  steps 1

------guess 1 0 style-------
reward -1 done False action 0
------guess 2 0 bused-------
reward 0 done True action 0
episode 370 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9997225861157594  steps 1639  memory 472
------guess 0 0 gooey-------
reward -1 done False action 0
------guess 1 0 mecca-------
reward -1 done False action 0
------guess 2 1 tirls-------
reward -1 done False action 1
------guess 3 0 lithe-------
reward 0 done True action 0
episode 371 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9997280792787104  steps 1643  memory 475
------guess 0 0 visor-------
reward -1 done False action 0
------guess 1 0 lasso-------
reward -1 done False action 0
------guess 2 0 mossy-------
reward -1 done False action 0
------guess 3 1 centu-------
reward -1 done False action 1
------guess 4 0 bossy-------
reward 0 done True action 0
episode 372 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9997347930252404  steps 1648  memory 479
------guess 0 0 hunch-------
reward -1 don

------guess 3 0 vigil-------
reward 0 done True action 0
episode 388 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.999806454900442  steps 1711  memory 15
------guess 0 0 visor-------
reward -1 done False action 0
------guess 1 1 laten-------
reward -1 done False action 1
------guess 2 0 mucky-------
reward -1 done False action 0
------guess 3 0 jumpy-------
reward -1 done False action 0
------guess 4 0 dummy-------
reward -1 done False action 0
------guess 5 1 baghs-------
reward -1 done True action 1
episode 389 finished.  reward -6  eps [0.9, 0.05, 200]  gamma 0.9998121750225937  steps 1717  memory 19
------guess 0 0 album-------
reward -1 done False action 0
------guess 1 1 torse-------
reward -1 done False action 1
------guess 2 1 pinch-------
reward -1 done False action 1
------guess 3 0 retro-------
reward -1 done False action 0
------guess 4 0 overt-------
reward 0 done True action 0
episode 390 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9998168124378091  steps 17

------guess 3 1 chump-------
reward -1 done False action 1
------guess 4 0 patio-------
reward 0 done True action 0
episode 408 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.999877205704879  steps 1802  memory 105
------guess 0 0 slack-------
reward -1 done False action 0
------guess 1 0 rogue-------
reward -1 done False action 0
------guess 2 0 oxide-------
reward -1 done False action 0
------guess 3 0 phone-------
reward 0 done True action 0
episode 409 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9998796371948327  steps 1806  memory 110
------guess 0 0 steep-------
reward -1 done False action 0
------guess 1 1 loran-------
reward -1 done False action 1
------guess 2 0 cycle-------
reward 0 done True action 0
episode 410 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9998814291635456  steps 1809  memory 114
------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 asset-------
reward 0 done 

------guess 4 1 bawks-------
reward -1 done False action 1
------guess 5 0 mourn-------
reward 0 done True action 0
episode 427 finished.  reward -5  eps [0.9, 0.05, 200]  gamma 0.9999205195914495  steps 1889  memory 191
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 smear-------
reward -1 done False action 0
------guess 2 0 cedar-------
reward -1 done False action 0
------guess 3 0 debar-------
reward 0 done True action 0
episode 428 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999220934089849  steps 1893  memory 197
------guess 0 1 orate-------
reward -1 done False action 1
------guess 1 0 ozone-------
reward -1 done False action 0
------guess 2 1 sulci-------
reward -1 done False action 1
------guess 3 0 oxide-------
reward 0 done True action 0
episode 429 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999236360628451  steps 1897  memory 201
------guess 0 0 quark-------
reward -1 done False action 0
------guess 1 1 teloi-------
reward -1 don

------guess 3 0 salon-------
reward 0 done True action 0
episode 448 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999472528069845  steps 1971  memory 275
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 woven-------
reward -1 done False action 0
------guess 3 0 women-------
reward 0 done True action 0
episode 449 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999482972713856  steps 1975  memory 279
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 1 lysin-------
reward -1 done False action 1
------guess 2 0 earth-------
reward 0 done True action 0
episode 450 finished.  reward -2  eps [0.9, 0.05, 200]  gamma 0.9999490670247319  steps 1978  memory 283
------guess 0 1 roate-------
reward -1 done False action 1
------guess 1 0 cyber-------
reward -1 done False action 0
------guess 2 0 nicer-------
reward -1 done False action 0
------guess 3 1 plush-------
reward -1 don

------guess 0 1 oater-------
reward -1 done False action 1
------guess 1 0 hunky-------
reward -1 done False action 0
------guess 2 1 clips-------
reward -1 done False action 1
------guess 3 1 gamed-------
reward -1 done False action 1
------guess 4 0 civil-------
reward 0 done True action 0
episode 470 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9999667015597029  steps 2063  memory 366
------guess 0 0 scary-------
reward -1 done False action 0
------guess 1 1 toile-------
reward -1 done False action 1
------guess 2 1 bundh-------
reward -1 done False action 1
------guess 3 1 grump-------
reward -1 done False action 1
------guess 4 0 chump-------
reward 0 done True action 0
episode 471 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9999675237011232  steps 2068  memory 371
------guess 0 0 delay-------
reward -1 done False action 0
------guess 1 0 mafia-------
reward -1 done False action 0
------guess 2 0 actor-------
reward -1 done False action 0
------guess 3 1 pungs------

------guess 2 1 cumin-------
reward -1 done False action 1
------guess 3 0 color-------
reward 0 done True action 0
episode 491 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999779014817677  steps 2145  memory 449
------guess 0 0 axial-------
reward -1 done False action 0
------guess 1 0 mince-------
reward -1 done False action 0
------guess 2 0 virus-------
reward -1 done False action 0
------guess 3 1 gothy-------
reward -1 done False action 1
------guess 4 0 tipsy-------
reward 0 done True action 0
episode 492 finished.  reward -4  eps [0.9, 0.05, 200]  gamma 0.9999784470961269  steps 2150  memory 453
------guess 0 0 debut-------
reward -1 done False action 0
------guess 1 0 steal-------
reward -1 done False action 0
------guess 2 0 thief-------
reward -1 done False action 0
------guess 3 0 totem-------
reward 0 done True action 0
episode 493 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999788738722176  steps 2154  memory 458
------guess 0 1 oater-------
reward -1 don

------guess 3 0 masse-------
reward 0 done True action 0
episode 511 finished.  reward -3  eps [0.9, 0.05, 200]  gamma 0.9999854802388728  steps 2229  memory 533
optimize_model_batch -1 537
avg_reward weights
tensor([[-3.4524, -3.4192],
        [-2.5413, -2.3407],
        [-1.2701, -1.7744],
        [-0.5059, -1.5714],
        [-0.2222, -1.4394],
        [-0.1509, -1.0000]], dtype=torch.float64)
tensor([[0.4917, 0.5083],
        [0.4500, 0.5500],
        [0.6235, 0.3765],
        [0.7437, 0.2563],
        [0.7716, 0.2284],
        [0.7004, 0.2996]], dtype=torch.float64)
{'count': 252, 'total': -870, 'avg': -3.4523809523809526} {'count': 260, 'total': -889, 'avg': -3.419230769230769}
{'count': 242, 'total': -615, 'avg': -2.541322314049587} {'count': 270, 'total': -632, 'avg': -2.3407407407407406}
{'count': 311, 'total': -395, 'avg': -1.270096463022508} {'count': 195, 'total': -346, 'avg': -1.7743589743589743}
{'count': 253, 'total': -128, 'avg': -0.5059288537549407} {'count': 140, 'tota

begun 3
azure 3
weave 5
genie 3
sound 4
glove 6
braid 3
scope 0
wryly 3
rover 0
assay 5
ocean 3
bloom 5
irate 0
later 3
woken 6
silky 5
wreck 5
dwelt 5
slate 5
smack 4
solid 3
amaze 3
hazel 3
wrist 5
jolly 4
globe 5
flint 3
rouse 5
civil 6
vista 4
relax 6
cover 0
alive 5
beech 5
jetty 5
bliss 5
vocal 3
often 4
dolly 6
eight 5
joker 0
since 4
event 0
ensue 3
shunt 3
diver 0
poser 0
worst 5
sweep 6
alley 2
creed 5
anime 4
leafy 5
bosom 3
dunce 4
stare 6
pudgy 5
waive 0
choir 4
stood 5
spoke 3
outgo 4
delay 3
bilge 4
ideal 3
clasp 0
seize 5
hotly 3
laugh 5
sieve 4
block 5
meant 0
grape 4
noose 3
hardy 5
shied 5
drawl 4
daisy 5
putty 2
strut 6
burnt 4
tulip 5
crick 5
idyll 0
vixen 6
furor 4
geeky 6
cough 5
naive 5
shoal 3
stork 3
bathe 3
aunty 5
check 3
prime 6
brass 0
outer 2
furry 0
razor 5
elect 4
evict 0
imply 4
demur 6
quota 0
haven 3
cavil 3
swear 3
crump 4
dough 5
gavel 4
wagon 6
salon 4
nudge 0
harem 5
pitch 5
sworn 3
pupil 6
excel 4
stony 3
cabin 6
unzip 5
queen 5
trout 0
polyp 5


In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='monte'),
    num_episodes=512,
    training=TrainConfig(optimizer='adam', lr=0.01, batch_size=-1, train_interval=128, clear_memory=True),
    seed=1,
    run_test=True
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    num_episodes=512,
    training=TrainConfig(optimizer='rmprop', lr=0.01, batch_size=-1, train_interval=128, clear_memory=True),
    seed=1,
    run_test=True
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='nn'),
    num_episodes=512,
    training=TrainConfig(optimizer='rmsprop', lr=0.01, batch_size=-1, train_interval=128, clear_memory=True),
    seed=1,
    run_test=True
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='hybrid'),
    num_episodes=512,
    training=TrainConfig(optimizer='rmsprop', lr=0.01, batch_size=-1, train_interval=128, clear_memory=True),
    seed=1,
    run_test=True
    ))