In [1]:
import json, pathlib, random, time
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing as mp

from environment import Env, validate_against_hint, load_word_lists, construct_word_df


In [2]:
# Build the module-level word table once; run_experiment reads this global `df`
# when it constructs Env(df).
df = construct_word_df(*load_word_lists())

In [3]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count


# set up matplotlib
# True when the name of the active matplotlib backend contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # In the cells shown here, `display` is only referenced from commented-out code.
    from IPython import display

#plt.ion()


In [4]:
#Get number of actions from gym action space
#n_actions = env.action_space.n
# Feature counts; presumably the input sizes of a learned model, but nothing in
# the visible cells reads them -- TODO confirm they are still needed.
n_action_features = 5
n_state_features = 9
# Combined width: action features plus state features.
n_input_features = n_action_features + n_state_features


def plot_values(vals, axes=('duration', 'episode'), window_width=20):
    """Plot a series and, once it is long enough, its moving average.

    Draws on matplotlib figure number 2 and clears it first, so successive
    calls reuse the same figure.

    Args:
        vals: Sequence of numbers to plot, one point per index.
        axes: Two labels given as ``(ylabel, xlabel)`` -- the y label comes first.
        window_width: Number of points in each moving-average window. The
            average is only drawn when ``len(vals) >= window_width``.

    Raises:
        ValueError: If ``window_width`` is smaller than 1.
    """
    if window_width < 1:
        raise ValueError(f'window_width must be >= 1, got {window_width}')

    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel(axes[1])
    plt.ylabel(axes[0])
    plt.plot(np.array(vals))

    if len(vals) >= window_width:
        # Prefix sums with a leading 0 turn every window sum into one subtraction.
        cumsum_vec = np.cumsum(np.insert(vals, 0, 0))
        ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
        # Left-pad with half a window of NaN (not drawn) so each average sits
        # near the middle of the window it summarises instead of at its start.
        padding = np.full(window_width // 2, np.nan)
        plt.plot(np.concatenate((padding, ma_vec)))

    plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    
def plot_all(episode_durations, episode_rewards, losses, epsilons, gammas):
    """Plot every series returned by an experiment, then show the result.

    The two per-episode series are always handed to plot_values, even when
    empty. The three per-step series are skipped when they are empty.
    """
    per_episode = (
        (episode_durations, 'duration'),
        (episode_rewards, 'reward'),
    )
    for series, ylabel in per_episode:
        plot_values(series, axes=[ylabel, 'episode'])

    per_step = (
        (losses, 'loss'),
        (epsilons, 'epsilon'),
        (gammas, 'gamma'),
    )
    for series, ylabel in per_step:
        if not series:
            continue
        plot_values(series, axes=[ylabel, 'step'])

    #plt.ioff()
    plt.show()

In [5]:

class TrainConfig:
    """Plain holder for training settings; every argument is stored unchanged.

    Attributes mirror the constructor arguments: ``train_interval``,
    ``batch_size``, ``clear_memory`` and ``lr`` (presumably the learning
    rate -- confirm against the code that consumes this config).
    """

    def __init__(self, train_interval=128, batch_size=128, clear_memory=False, lr=0.01):
        self.train_interval, self.batch_size = train_interval, batch_size
        self.clear_memory, self.lr = clear_memory, lr
        
class ValueConfig():
    """Plain holder for value-function settings.

    Args:
        name: Identifier of the value function (default ``'reward'``).
        gamma: Three-element schedule; when omitted (or ``None``) a fresh
            ``[0.9, 0.05, 200]`` list is created for this instance.
            Presumably ``[start, end, decay]`` -- TODO confirm with the
            consumer of this config. A list passed by the caller is stored
            as-is, not copied.
    """

    def __init__(self, name='reward', gamma=None):
        self.name = name
        # Build the default per instance: a list literal in the signature would
        # be one shared object, so mutating one config's gamma would silently
        # change every other default-constructed config.
        self.gamma = [0.9, 0.05, 200] if gamma is None else gamma
        
class ModelConfig:
    """Plain holder for model settings; every argument is stored unchanged.

    ``startword``, when truthy, is the word run_experiment plays as the first
    guess of each game. ``name`` and ``target_list_only`` are stored only.
    """

    def __init__(self, name='naive', startword=None, target_list_only=None):
        self.name, self.startword = name, startword
        self.target_list_only = target_list_only

In [13]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
def _choose_word(model, env, turn):
    """Return the next guess: the configured opener on turn 0, else a sampled word."""
    if turn == 0 and model.startword:
        return model.startword
    return env.sample_word_matching_current_history(env.df)


def run_experiment(model=ModelConfig(name='naive', startword=None, target_list_only=False),
                   num_episodes=128,
                   eps=[0.9, 0.05, 200],
                   value_function=ValueConfig(name='reward',gamma=[0.0, 1.0, 200]),
                   training=TrainConfig(clear_memory=False, batch_size=128, train_interval=128),
                   seed=0,
                   run_test=False):
    """Play games with the naive sampling policy and optionally evaluate it.

    Each guess is ``model.startword`` on the first turn (when set) and
    otherwise a word sampled by ``sample_word_matching_current_history``.
    Progress is printed for every guess and every finished episode.

    Args:
        model: ModelConfig; only ``startword`` is read here.
        num_episodes: Number of games to play in the first phase (0 skips it).
        eps: Accepted for interface compatibility; not used by this function.
        value_function: Accepted for interface compatibility; not used.
        training: Accepted for interface compatibility; not used.
        seed: Seed applied to both ``random`` and ``numpy.random``.
        run_test: When true, additionally play one game for every item yielded
            by ``env.foreach_target_word()`` and print a histogram of results.

    Returns:
        Tuple ``(episode_durations, episode_rewards, losses, epsilons, gammas)``.
        The first two hold one entry per episode. The last three are always
        empty lists here because nothing is trained.
    """
    random.seed(seed)
    np.random.seed(seed)
    env = Env(df)  # relies on the module-level word table `df`

    steps_done = 0
    losses = []
    episode_rewards = []
    episode_durations = []
    epsilons = []
    gammas = []

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        print(f'=========================episode {i_episode} {env.target}======================')

        episode_memory = []
        guesses = []
        for t in count():
            steps_done += 1
            chosen_word = _choose_word(model, env, t)

            guesses.append(chosen_word)
            print(f'------guess {t} {guesses[-1]} -------')
            _, reward, done = env.step(chosen_word)
            episode_memory.append(reward)

            print(f'reward {reward} done {done} ')

            if done:
                episode_durations.append(t + 1)
                episode_reward = sum(episode_memory)
                print(f'episode {i_episode} finished.  reward {t + 1} {episode_reward} {steps_done}')
                episode_rewards.append(episode_reward)
                break

    print('Complete')

    if run_test:
        # Capture the guess limit once so the histogram size and the failure
        # check below can never disagree (the check used a literal 6 before).
        max_guesses = env.num_guesses
        # Slot 0 counts failed games; slot k counts games finished in k guesses.
        performance_hist = [0] * (1 + max_guesses)
        for e in env.foreach_target_word():
            done = False
            reward = 0
            num_guesses = 0
            while not done:
                # Sample from the same object that is being stepped. The
                # original sampled from `env`, which is only right if the
                # iterator yields `env` itself.
                chosen_word = _choose_word(model, e, num_guesses)
                _, reward, done = e.step(chosen_word)
                num_guesses += 1

            # A game that used every guess and ended on reward -1 is recorded
            # as a failure (assumes Env signals a loss with reward -1).
            if num_guesses == max_guesses and reward == -1:
                num_guesses = 0
            print(f'{e.target} {num_guesses}')
            performance_hist[num_guesses] += 1

        for i,p in enumerate(performance_hist):
            print(f'{i}: {p}')

    return episode_durations, episode_rewards, losses, epsilons, gammas

#env.render()
#env.close()
#plt.ioff()
#plt.show()

In [15]:
# Evaluation only: zero training episodes, then the run_test sweep over every
# target word with 'oater' as the fixed first guess. The recorded output below
# was cut short by a KeyboardInterrupt.
plot_all(*run_experiment(
    model=ModelConfig(name='naive', startword='oater', target_list_only=True),
    num_episodes=0,
    run_test=True
    ))


Complete
cigar 3
rebut 3
sissy 3
humph 4
awake 4
blush 3
focal 4
evade 4
naval 5
serve 4
heath 4
dwarf 4
model 6
karma 4
stink 4
grade 0
quiet 5
bench 5
abate 0
feign 5
major 3
death 0
fresh 5
crust 4
stool 4
colon 4
abase 6
marry 5
react 5
batty 4
pride 4
floss 6
helix 5
croak 5
staff 3
paper 0
unfed 0
whelp 4
trawl 0
outdo 4
adobe 5
crazy 0
sower 0
repay 3
digit 6
crate 6
cluck 4
spike 5
mimic 5
pound 5
maxim 6
linen 3
unmet 0
flesh 4
booby 0


KeyboardInterrupt: 

In [None]:
# 64 episodes with no fixed opener (startword=None), so every guess is sampled.
plot_all(*run_experiment(
    model=ModelConfig(name='naive', startword=None, target_list_only=True),
    num_episodes=64
    ))

In [None]:
# 150 episodes that always open with 'roate'.
plot_all(*run_experiment(
    model=ModelConfig(name='naive', startword='roate', target_list_only=True),
    num_episodes=150
    ))


In [None]:
# 150 episodes that always open with 'oater', for comparison with the 'roate' run.
plot_all(*run_experiment(
    model=ModelConfig(name='naive', startword='oater', target_list_only=True),
    num_episodes=150
    ))
