In [1]:
import json, pathlib, random, time
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing as mp


In [2]:
with open('lists.json') as f:
    j = json.load(f)

target_list = j['target']
guess_list = j["guess"]


In [3]:
def char_freq(lst):
    hist = defaultdict(int)
    for word in lst:
        for char in word:
            hist[char] += 1
    mx = max(hist.values())
    for char in hist:
        hist[char] /= mx
    return hist

def print_char_freq(cf):
    for char in sorted(list(cf.keys())):
        print(f'{char}: {cf[char]}')
        
def freq_score(word, cf):
    return sum(cf[x] for x in word) / len(word) 

def uniq_score(word):
    return (len(word) - len(set(word))) / (len(word) - 2)

In [4]:

cf = char_freq(target_list)
#print_char_freq(cf)

In [5]:
print(random.choice(guess_list))
print(random.choice(target_list))

stime
vouch


In [6]:
dfg = pd.DataFrame([[w, freq_score(w, cf), uniq_score(w), 1.0] for w in guess_list], columns=['word', 'freq_score', 'uniq_score', 'is_guess_word'])
dft = pd.DataFrame([[w, freq_score(w, cf), uniq_score(w), 0.0] for w in target_list], columns=['word', 'freq_score', 'uniq_score', 'is_guess_word'])
df = dfg.append(dft)
df.set_index('word', inplace=True)

In [7]:
df.iloc[2801]

freq_score       0.725547
uniq_score       1.000000
is_guess_word    1.000000
Name: esses, dtype: float64

In [105]:

def word_to_action(word, guesses, history):
    return dfword_to_action((word, df.loc[word]), guesses, history)
    
def dfword_to_action(dfword, guesses, history):
    #the action is going to be a word that we will submit next
    #for the purposes of feeding into the model, we will represent the action word as:
    #  how many of the entries in the hint history this word conforms to
    #  how many untried letters it gives us
    #  the number of uniq letters in the word
    #  the frequency of the letters in the word
    #  whether or not the word is in the guess list (as opposed to the target list)
    word = dfword[0]
    dfword = dfword[1]
    
    if guesses:
        conforms_to_history = sum([int(validate_against_hint(word,g,history[i])) for i,g in enumerate(guesses)]) / len(guesses)
    else: # we haven't made any guess yet, so this must conform
        conforms_to_history = 1.0
    num_untried_letters = len(set(word) - set(''.join(guesses))) / 5 #normalise to 1
    action = np.array([conforms_to_history, num_untried_letters, dfword['freq_score'], dfword['uniq_score'], dfword['is_guess_word']])
    
    #if word == 'aargh':
    #    print(f'recons', action, history, guesses)
    return action
        
def update_action(action, word, history, guesses):
    
    #just call validate_against_hint() for the most recent hint
    #print(guesses.__class__, guesses)  
    #print(word.__class__, word)

    num_untried_letters = len(set(word) - set(''.join(guesses))) / 5
    conforms_to_history = (action[0] * (len(guesses)-1) + int(validate_against_hint(word,guesses[-1],history[-1])) ) / len(guesses)
    #if word == 'aargh':
    #    print(f'updaten', action, history, guesses)
    return np.array([conforms_to_history, num_untried_letters, action[2], action[3], action[4]])

    

In [124]:

def construct_actions_global(arg): #guesses, history, start_idx, end_idx):
    st = time.time()
    guesses, history, start_idx, end_idx = arg
    #print(guesses, history, start_idx, end_idx)
    ret = np.array([dfword_to_action(dfword, guesses, history) for dfword in df.iloc[start_idx:end_idx].iterrows()])
    #print(f'construct_actions_global took {time.time() - st}')
    return ret
    
def update_actions_global(arg): 
    st = time.time()
    start_idx, end_idx = arg
    #print(guesses, history, start_idx, end_idx)
    ret = np.array([update_action(env_global.actions[i], df.iloc[i].name,  env_global.history, env_global.guesses) for i in range(start_idx, end_idx, 1)])
    #print(f'update_actions_global took {time.time() - st}')
    return ret
    

class ActionSpace:
    def __init__(self, n):
        self.n = n
    
    
class Env:
    def __init__(self, df, target_word=None):
        self.df = df
        self.specified_target_word = False
        if target_word:
            self.specified_target_word = True
            self.target = target_word            
            
        self.reset()     
        self.num_letters = len(self.target)
        self.num_guesses = 6
        
        self.num_processes = mp.cpu_count() - 1
        self.action_space = ActionSpace(len(self.df))
       
        
    def index_from_word(self, word):
        return self.df.index.get_loc(word)
    
    def word_from_index(self, idx):
        return self.df.iloc[idx].name
    
    def submit_guess(self, guess):
        wrongplace = [0] * len(self.target)
        hints = np.zeros(len(self.target))
        rightplace = [guess[n] == chrt for n,chrt in enumerate(self.target)]
        #print(f'comparing {guess} against {self.target}.  rightplace {rightplace}')
        
        for n,chrt in enumerate(self.target):
            if rightplace[n] == 1: continue #this character has already been scored, skip it
            for m,chrg in enumerate(guess):
                if n == m: continue # we've already checked rightplace matches above
                if chrt != chrg: continue
                if wrongplace[m] == 1: continue
                if rightplace[m] == 1: continue
                
                wrongplace[m] = 1
                break

        for i in range(len(self.target)):
            hints[i] = 2 if rightplace[i] == 1 else wrongplace[i]
        
        return hints
    
    def reset(self):
        self.history = np.array([[]])
        self.guesses = []
        if not self.specified_target_word:
            self.target = df[df['is_guess_word'] == 0.0].sample().iloc[0].name
        self.actions = None
            
    def construct_actions(self):
        return np.array([dfword_to_action(dfword, self.guesses, self.history) for dfword in self.df.iterrows()])
    
    def update_actions(self):
        return np.array([update_action(self.actions[i], df.iloc[i].name, self.history, self.guesses) for i in range(len(self.actions))])
    
    def construct_actions_mp(self):
        
        grp_lst_args = []
        grp_guesses = [self.guesses] * self.num_processes
        grp_history = [self.history] * self.num_processes
        
        chunk_size = int(len(self.df) / self.num_processes) + 1
        start_offsets = list(range(0, len(self.df), chunk_size))
        end_offsets = start_offsets[1:] + [len(self.df)]
        grp_lst_args = list(zip(grp_guesses, grp_history, start_offsets, end_offsets))
        
        #print(grp_lst_args)
        self.pool = mp.Pool(processes=self.num_processes)
        results = self.pool.map(construct_actions_global, grp_lst_args)
        self.pool.close()
        self.pool.join()
        return np.concatenate(results)
    
    def update_actions_mp(self):
        global env_global
        env_global = self
        
        grp_lst_args = []
        
        chunk_size = int(len(self.df) / self.num_processes) + 1
        start_offsets = list(range(0, len(self.df), chunk_size))
        end_offsets = start_offsets[1:] + [len(self.df)]
        grp_lst_args = list(zip(start_offsets, end_offsets))
        
        #print(grp_lst_args)
        self.pool = mp.Pool(processes=self.num_processes)
        results = self.pool.map(update_actions_global, grp_lst_args)
        self.pool.close()
        self.pool.join()
        return np.concatenate(results)
    
    def construct_state(self):
        #print(history)
        #so the state is going to be:
            #  The number of green locations we know
            #  The number of other letters we know to be in the word
            #  The sequence number of the guess (1st guess, 2nd guess etc.)

        #the number of locations which were green at some point in the history
        num_green_locs = np.count_nonzero(self.history.max(axis=0) == 2)

        green_chars = [self.guesses[x][y] for x,y in np.argwhere(self.history == 2) ]
        orange_chars = [self.guesses[x][y] for x,y in np.argwhere(self.history == 1) ]
        black_chars = [self.guesses[x][y] for x,y in np.argwhere(self.history == 0) ]
        num_other_letters = len(set(orange_chars) - set(green_chars))
        num_black_letters = len(set(black_chars))

        sequence_number = int(self.history.size / 5)
        #print(f'construct_state() with seqno {sequence_number}')

        sequence_number_onehot = np.zeros(self.num_guesses)
        sequence_number_onehot[sequence_number] = 1.0
        return np.concatenate((np.array([num_green_locs, num_other_letters, num_black_letters])/5, sequence_number_onehot))

    def step_by_index(self, guess_idx):
        return self.step(self.word_from_index(guess_idx))
    
    
    def step(self, guess, reconstruct=False): #returns state, reward, done, actions
        #print(actions)
        hints = self.submit_guess(guess)

        #print(list(zip(self.guesses,self.history)))
        if self.history.size == 0:
            self.history = np.expand_dims(hints,0)
            best_hints = 0
        else:
            best_hints = np.apply_along_axis(np.sum, 1, self.history).max()
            self.history = np.row_stack([self.history, hints])
            
        #print(f'======={guess} ({self.target}) => {hints}= {best_hints} =======')
        
        self.guesses.append(guess)
        reward = max(0, hints.sum() - best_hints)
        done = (hints.sum() == self.num_letters * 2 or len(self.guesses) == self.num_guesses)
    
        if not done:
            state = self.construct_state()
            if self.actions is None or reconstruct:
                st = time.time()
                self.actions = self.construct_actions_mp() #slower
                #print(f'construct actions took {time.time() - st}')
                #print(f'after recons {self.actions[2]}')
            else:
                st = time.time()
                self.actions = self.update_actions_mp() #faster
                #print(f'update actions took {time.time() - st}')
                #print(f'after update {self.actions[2]}')
        else:
            state = None
            self.actions = None
        return state, reward, done, self.actions

    
def hint_to_hinty(hint):
    #hint takes form [0,1,2,1,0]
    #hinty takes form {2:[2], 1:[1,3], 0:[0,4]}
    hinty = {}
    for n in [0,1,2]:
        hinty[n] = [i for i, x in enumerate(hint) if x == n]
    #print(f'hint_to_hinty() {hint}, {hinty}')
    return hinty
    
def validate_against_hint(word, guess, hint):
    return validate_against_hinty(word, guess, hint_to_hinty(hint))

def validate_against_hinty(word, guess, hinty):
    #hinty takes form {2:[idx,..], 1:[idx,..], 0:[idx,..]}
    #print(hinty)
    for idx in hinty[2]: # check the fixed letters first
        if word[idx] != guess[idx]:
            return False
      
    for idx in hinty[0]:
        #get the number of times char appears in target word (minus the times it appears in the correct location)
        indices = [i for i,x in enumerate(word) if x == guess[idx] and i not in hinty[2]]
        #get number of times char appears in guess word in the wrong location
        indices_g = [n for n,x in enumerate(guess) if x == guess[idx] and n in hinty[1]]
        #we already know that there is one not-exist hint for this char, so
        #if there are more fewer wrong location hints for this letter than there are actual occurrences of the letter
        #then the hint does not validate against this word
        if len(indices) > len(indices_g):
            return False
    for idx in hinty[1]:
        if word[idx] == guess[idx]:
            return False
        #get all the indices of the character in the target word
        #print(word.__class__, word)
        indices = [i for i,x in enumerate(word) if x == guess[idx] and i not in hinty[2]]
        #remove all the indices where there is already a fixed position hint
        
        #now count all the occurences of the char in guess where the location is wrong
        indices_g = [i for i,x in enumerate(guess) if x == guess[idx] and i in hinty[1]]
        #if there are more wrong loc hints for this char than there are actual occurrences, then it must be bogus
        if len(indices) < len(indices_g):
            return False
    return True            
    

In [126]:
e = Env(df, target_word='crapy')
e.reset()
print(e.num_processes)
#e.num_processes = 1
words = ['beast', 'treat', 'pzazz', 'jobby', 'bobby', 'jimbo']
actions_update = []
actions_recons = []

st = time.time()
for i, word in enumerate(words):
    actions_update.append(e.step(word, reconstruct=False)[3])
print(f'update {time.time() - st}')
    
print('==============================')
e.reset()
st =time.time()
for i, word in enumerate(words):
    actions_recons.append(e.step(word, reconstruct=True)[3])
    
    
print(f'recons {time.time() - st}')    
for n in range(len(words)):
    print((actions_update[n] == actions_recons[n]).all())
    


11
update 1.2987322807312012
recons 1.2577588558197021
True
True
True
True
True


AttributeError: 'bool' object has no attribute 'all'

In [None]:
e = Env(df)
e.reset()
st = time.time()
rmp = e.construct_actions_mp()
print(time.time() - st)
e.reset()
st = time.time()
r = e.construct_actions()
print(time.time() - st)

print(r.__class__)
print(r.shape)

print(rmp.__class__)
print(rmp.shape)

print((r == rmp).all())


In [None]:
e_simple = Env(target_list, target_word='abcde')
tests_simple = {'abcde': [2,2,2,2,2],
         'acbde': [2,1,1,2,2],
         'azcde': [2,0,2,2,2],
         'aacde': [2,0,2,2,2],
         'zacde': [0,1,2,2,2],
         'zzdzz': [0,0,1,0,0],
         'zzddz': [0,0,0,2,0],
         'zdddz': [0,0,0,2,0],
         'ddddd': [0,0,0,2,0],
         'zzzdd': [0,0,0,2,0],
         'zzdez': [0,0,1,1,0]}

e_repeat = Env(target_list, target_word='abcae')
tests_repeat = {'abcde': [2,2,2,0,2],
         'acbde': [2,1,1,0,2],
         'azcde': [2,0,2,0,2],
         'aacde': [2,1,2,0,2],
         'zacde': [0,1,2,0,2],
         'zzdzz': [0,0,0,0,0],
         'zzddz': [0,0,0,0,0],
         'zdddz': [0,0,0,0,0],
         'ddddd': [0,0,0,0,0],
         'zzzdd': [0,0,0,0,0],
         'zzdez': [0,0,0,1,0],
         'aaaaa': [2,0,0,2,0],
         'aaaza': [2,1,0,0,0],
         'zaazz': [0,1,1,0,0],
         'zaaza': [0,1,1,0,0]}

for e,tests in [(e_simple, tests_simple),(e_repeat, tests_repeat)]:
    for guess,expected in tests.items():
        #guess = random.choice(guess_list + target_list)
        actual = e.submit_guess(guess)
        hinty = hint_to_hinty(expected)
        hinty_valid = validate_against_hinty(e.target, guess, hinty)
        print(e.target, guess, actual, expected, expected == actual, hinty_valid)

In [None]:
e=Env(df)
for _ in range(10):
    n = random.randint(0, len(e.df))
    w = e.word_from_index(n)
    n_ = e.index_from_word(w)
    print(f'{n}, {w}, {n_}')
    assert(n == n_)
    

In [None]:
def random_guess(guess_list, target_list):
    guess_idx = random.randint(0, len(guess_list) + len(target_list))
    is_guess = guess_idx < len(guess_list)
    if is_guess:
        word = guess_list[guess_idx]
    else:
        word = target_list[guess_idx - len(guess_list)]
    return word, is_guess

In [None]:
#'beast'
e = Env(df, target_word='beast')
e.step('treat')
#e.guesses = ['treat']
#e.history = np.array([[0.0, 0.0, 1.0, 1.0, 2.0]])
#Env(target_list, target_word='beast').submit_guess('treat')
print(e.guesses, e.history)
actual = e.construct_state()
expected = [0.2, 0.4, 0.2]
print(expected, actual, expected == actual)

actual = word_to_action('feast', ['treat'], np.array([[0.0, 0.0, 1.0, 1.0, 2.0]]))
expected = [1.0, 0.4, 0.62287105, 0.0, 0.0]
print(expected, actual, expected == actual)


In [None]:
num_guesses = 6
e = Env(df)

print(e.target)
num_letters = len(e.target)
history = np.array([[]])
guesses = []
rewards = []
for i in range(num_guesses):
    #guess, is_guess_list = random_guess(guess_list, target_list)
    actions = e.construct_actions_mp()
    state = e.construct_state()
    #here feed it into a model to choose the word
    #guess, value = np.argmax(model(state)) # but do this epsilon greedy
    
    #print(actions)
    hints = e.submit_guess(guess)
    
    print(f'======={guess}========')
    print(list(zip(guesses,history)))
    if history.size == 0:
        history = np.expand_dims(hints,0)
    else:
        history = np.row_stack([history, hints])
    guesses.append(guess)
    if hints.sum() == num_letters * 2 or i == num_guesses - 1:
        reward = hints.sum()
        done = True
    else:
        reward = -1
        done = False
    
    

    
#so the state is going to be:
#  The number of green locations we know
#  The number of other letters we know to be in the word
#  The sequence number of the guess (1st guess, 2nd guess etc.)

#the action is going to be a word that we will submit next
#for the purposes of feeding into the model, we will represent the action word as:
#  whether or not it conforms to the hint history
#  how many new letters it gives us
#  the number of uniq letters in the word
#  the frequency of the letters in the word

#the reward is going to be:
#  -1 on all states except the last one
#  on the last state (which can either be after guess 6 or on guessing the correct word):
#    the sum of the last hint (ie. 2 for a correct letter/position combo, 1 for a letter in the wrong place)

In [None]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

#plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def clear(self):
        self.memory.clear()
        
    def __len__(self):
        return len(self.memory)
    

In [None]:
#Get number of actions from gym action space
#n_actions = env.action_space.n
n_action_features = 5
n_state_features = 9
n_input_features = n_action_features + n_state_features


def select_action(policy_net, state, actions, eps_threshold):
    sample = random.random()
    if sample > eps_threshold:
        with torch.no_grad():
            #now combine the state (shape 3,) and action (shape 5, n) into one input array (shape 8,n)
            #first expand the state so that it is shape 3,1
            #then repeat it to 3,n
            states = np.repeat(np.expand_dims(state, 0), actions.shape[0], axis=0)
            #print(f'states shape {states.shape} actions shape {actions.shape}')
            #then concatenate to 8,n
            state_actions = np.concatenate((states, actions), axis=1)
            # policy_net(state_action) will return a single value estimate for each state/action row
            # so, probably shape (1,n)
            # Then return the index which has the max value
            
            estimate = policy_net(torch.tensor(state_actions, device=device, dtype=torch.float))
            #print(f'ESTIMATE>>>{estimate.__class__} {estimate.shape} {estimate} {estimate.max(0).indices.item()}<<<')
            return estimate.max(0).indices.item()
    else:
        randindex = random.randrange(len(actions))
        print(f'returning random index {randindex}')
        return randindex #torch.tensor([[randindex]], device=device, dtype=torch.long)



def plot_values(vals, axes=['duration', 'episode']):
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel(axes[1])
    plt.ylabel(axes[0])
    plt.plot(np.array(vals))
    # Take 20 episode averages and plot them too
    window_width = 20
    if len(vals) >= window_width:
        cumsum_vec = np.cumsum(np.insert(vals, 0, 0)) 
        ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
        plt.plot(np.insert(ma_vec, 0, [None]*int(window_width/2)))

    plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    
def plot_all(episode_durations, episode_rewards, losses, epsilons, gammas):
    plot_values(episode_durations, axes=['duration', 'episode'])
    plot_values(episode_rewards, axes=['reward', 'episode'])
    if losses: plot_values(losses, axes=['loss', 'step'])
    if epsilons: plot_values(epsilons, axes=['epsilon', 'step'])
    if gammas: plot_values(gammas, axes=['gamma', 'step'])
    #plt.ioff()
    plt.show()

In [None]:
class DQN(nn.Module):

    def __init__(self, inputs):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(inputs, 20)
        self.fc2 = nn.Linear(20, 16)
        self.fc3 = nn.Linear(16, 20)
        self.head = nn.Linear(20, 1)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.head(x)

In [None]:
class LinearQ(nn.Module):

    def __init__(self, inputs):
        super(LinearQ, self).__init__()
        self.head = nn.Linear(inputs, 1)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        return self.head(x)

In [None]:
env = Env(df)
class NaiveModel():
    def __init__(self, startword_idx=None, target_list_only=False):
        self.startword_idx = startword_idx
        self.target_list_only = target_list_only #when guessing, only choose from the target list (not the full guess list)
        
    def guess(self, idx, state_action):
        if state_action[2] == 0: #if this is the first guess
            return 
    
    def conforms_to_history(state_action):
        return state_action[n_state_features] == 1.0
    
    def is_in_target_list(state_action):
        return state_action[n_state_features+4] == 0.0
                
    def __call__(self, x): # we must return the value of each action
        #x will be a batch
        # if this is the first guess, we must return 1 for the chosen startword
        print(f'startword analysis {x[0]} {x[0].__class__}')
        if x[0][3] == 1.0: #if this is the first guess
            if self.startword_idx: # if we have specified a starting word
                choice = self.startword_idx
                print(f'choosing fixed startword at {choice}')
            else:  #chose a random startword
                choice = random.randint(0,len(x))
                print(f'choosing random word at {choice}')
        else:
            #choose a random startword from the words that conform to the history
            
            conformant_indices = [idx for idx, state_action in enumerate(x) if NaiveModel.conforms_to_history(state_action)]
            if self.target_list_only:
                conformant_indices = [idx for idx in conformant_indices if NaiveModel.is_in_target_list(x[idx])]
            choice = random.sample(conformant_indices, 1)[0]
            print(f'choosing conformant word at {choice} from {len(conformant_indices)} conformant words')

        print(f'word at {choice} is {env.word_from_index(choice)}')
        ret = torch.zeros(len(x))
        ret[choice] = 1.0
        return ret
    
    def parameters(self):
        return []
    
    def to(self, device):
        return self
        

In [None]:
def optimize_model(model, optimizer, memory, batch_size=128):

    transitions = memory.sample(batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    state_batch = np.stack([tr.state for tr in transitions])
    action_batch = np.stack([tr.action for tr in transitions])
      
    reward_batch = np.stack([tr.reward for tr in transitions])
    state_action_batch = np.concatenate((state_batch, action_batch), axis=1)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_value_estimates = model(torch.tensor(state_action_batch, device=device, dtype=torch.float))
    #print(f'ESTIMATE>>>{estimate.__class__} {estimate.shape} {estimate} {estimate.max(0).indices.item()}<<<')
       
    expected_state_action_values = torch.tensor(reward_batch, device=device, dtype=torch.float)
    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_value_estimates, expected_state_action_values)
    
    print(f'loss {loss}')

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

    return loss

class TrainConfig():
    def __init__(self, train_interval=128, batch_size=128, clear_memory=False, lr=0.01):
        self.train_interval = train_interval
        self.batch_size = batch_size
        self.clear_memory = clear_memory
        self.lr = lr
        
class ValueConfig():
    def __init__(self, name='reward', gamma=[0.9, 0.05, 200]):
        self.name = name
        self.gamma = gamma
        
class ModelConfig():
    def __init__(self, name='naive', startword=None, target_list_only=None):
        self.name = name
        self.startword = startword
        self.target_list_only = target_list_only

In [None]:
#https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
def run_experiment(model=ModelConfig(name='naive', startword=None, target_list_only=False),
                   num_episodes=128,
                   eps=[0.9, 0.05, 200],
                   value_function=ValueConfig(name='reward',gamma=[0.0, 1.0, 200]),
                   training=TrainConfig(clear_memory=False, batch_size=128, train_interval=128)):
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    EPS_START = eps[0]
    EPS_END = eps[1]
    EPS_DECAY = eps[2]
    GAMMA_START, GAMMA_END, GAMMA_DECAY = value_function.gamma
    env = Env(df)
    memory = ReplayMemory(10000)
    starting_actions = env.construct_actions()
    starting_state = env.construct_state()

    steps_done = 0
    last_training = 0
    losses = []
    episode_rewards = []
    episode_durations = []
    epsilons = []
    gammas = []
    
    if model.name =='naive':
        startword_idx = env.index_from_word(model.startword) if model.startword else None
        
        policy_net = NaiveModel(startword_idx=startword_idx, target_list_only=model.target_list_only)
        optimizer = None
        EPS_START = 0
        EPS_END = 0
    elif model.name == 'linear':
        policy_net = LinearQ(n_input_features).to(device)
        optimizer = optim.RMSprop(policy_net.parameters(), lr=training.lr)
    else:
        policy_net = DQN(n_input_features).to(device)
        optimizer = optim.RMSprop(policy_net.parameters(), lr=training.lr)

    print(f'pn params {list(policy_net.parameters())}')
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        print(f'=========================episode {i_episode} {env.target}======================')

        episode_memory = []
        state = starting_state
        actions = starting_actions
        for t in count():
            eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
            GAMMA = GAMMA_END + (GAMMA_START - GAMMA_END) * math.exp(-1. * steps_done / GAMMA_DECAY)
            epsilons.append(eps)
            gammas.append(GAMMA)
            steps_done += 1
            # Select and perform an action
            #print(state, actions)
            action_idx = select_action(policy_net, state, actions, eps)
            selected_action = actions[action_idx]
            print(f'------guess {t} {action_idx} {env.word_from_index(action_idx)} {selected_action}-------')
            next_state, reward, done, actions = env.step_by_index(action_idx)
            print(f'reward {reward} done {done} ')
            reward = np.array([reward])

            # Store the transition in memory
            #memory.push(state, selected_action, reward)
            episode_memory.append([state, selected_action, reward])
            # Move to the next state
            state = next_state

            if done:
                episode_durations.append(t + 1)
                episode_reward = sum([tr[2] for tr in episode_memory])
                print(f'episode {i_episode} finished.  reward {episode_reward}  eps {eps}  gamma {GAMMA}  steps {steps_done}  memory {len(memory)}')
                episode_rewards.append(episode_reward)
                if value_function.name == 'reward':
                    for tr in episode_memory:
                        memory.push(*tr)
                elif value_function.name == 'hybrid':
                    #use the returned reward,
                    #but reduce it if we didn't get the target word by the end of the episode
                    #and increase it if we took less than 6 guesses
                    if episode_reward == 10.0: # if we got the target word
                        #apply a positive factor to all guess values
                        bonus = (env.num_guesses - (t+1)) * GAMMA
                        # after 3 guesses offset = 3* GAMMA
                        # after 6 guesses, offset = 0 * GAMMA 
                    else:
                        bonus = -2*GAMMA
                    
                    print(f'original rewards {[tr[2] for tr in episode_memory]}')
                    for tr in episode_memory:
                        q = max(0.0, tr[2][0] + bonus)
                        tr[2][0] = q
                        memory.push(*tr)
                    print(f'hybrid rewards {[tr[2] for tr in episode_memory]}')    
                elif value_function.name == 'discounted':
                    #q is the actual value of the state_action value function
                    # which is the discounted reward.
                    #on the last guess q(n) is equal to the total episode reward 
                    #and q(n-1) is equal to -1 + episode_reward * GAMMA.
                    #and q(n-2) = -1 + (n-1) * GAMMA
                    #min(q) = -6 (for GAMMA = 1)
                    #max(q) = 10
                    q = episode_reward
                    qs = []
                    if episode_reward < 10: q = q * 0.7 #reduce the reward if we didn't get the correct answer
                    for idx,tr in enumerate(reversed(episode_memory)):
                        if idx > 0:
                            q = -1 + GAMMA * q
                        memory.push(tr[0], tr[1], (q + 6)) # add 6 on to the value so that it is never < 0 
                        qs.append(q+6)
                    print(f'discounted rewards {list(reversed(qs))} vs. {[tr[2] for tr in episode_memory]}')
                else:
                    raise Exception(f'bad value function {value_function.name}')

                # If we have gathered enough data, Perform one step of the optimization (on the policy network)
                if model.name != 'naive' \
                    and len(memory) >= training.batch_size \
                    and steps_done - last_training > training.train_interval:
                    loss = optimize_model(policy_net, optimizer, memory, batch_size=training.batch_size)
                    losses.append(loss)
                    if training.clear_memory: memory.clear()
                    last_training = steps_done
                #plot_durations()
                break

    print('Complete')
    
    return episode_durations, episode_rewards, losses, epsilons, gammas

#env.render()
#env.close()
#plt.ioff()
#plt.show()

In [None]:
plot_all(*run_experiment(
    model={'name': 'naive', 'startword': None, 'target_list_only':False},
    num_episodes=64
    ))


In [None]:
plot_all(*run_experiment(
    model={'name': 'naive', 'startword': None, 'target_list_only':True},
    num_episodes=64
    ))

In [None]:
plot_all(*run_experiment(
    model={'name': 'naive', 'startword': 'roate', 'target_list_only':True},
    num_episodes=64
    ))


In [None]:
plot_all(*run_experiment(
    model={'name': 'naive', 'startword': 'roate', 'target_list_only':False},
    num_episodes=64
    ))



In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.03)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.0, 0.0, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.1)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.3, 0.3, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))

In [None]:
plot_all(*run_experiment(
    model=ModelConfig(name='linear'),
    value_function=ValueConfig(name='hybrid', gamma=[0.6, 0.6, 200]),
    eps=[0.0, 0.0, 400],
    num_episodes=150,
    training=TrainConfig(train_interval=16, batch_size=64, clear_memory=False, lr=0.07)
    ))