In [1]:
import gym
from griddy_env import GriddyEnvOneHot

import numpy as np
import pickle
from copy import deepcopy
import time

In [2]:
def value_table_viz(value_table):
    values = np.zeros((4, 4))
    base_st = np.zeros((3, 4, 4), dtype=np.int64)
    base_st[0, 3, 3]=1
    for i in range(4):
        for j in range(4):
            test_st = deepcopy(base_st)
            test_st[2, i, j] = 1
            #print(test_st)
            key = pickle.dumps(test_st)
            if key in value_table:
                val = value_table[key]
            else:
                val=0
            values[i, j] = val
    return values

In [3]:
def pick_best_action(state):
    action_values=[]
    for test_action in range(4): #for each action
        new_state = transition(state, test_action)
        key = pickle.dumps(new_state)
        if key not in value_table: value_table[key] = 0
        action_values.append(value_table[key])
    policy_action = np.argmax(action_values)
    return policy_action

In [4]:
def calculate_Gs(episode_mem, discount_factor=0.95):
    for i, mem in reversed(list(enumerate(episode_mem))):
        if i==len(episode_mem)-1:
            episode_mem[i]['G']= mem['reward']
        else:
            G = mem['reward']+discount_factor*episode_mem[i+1]['G']
            episode_mem[i]['G'] = G   
    return episode_mem

In [22]:
def update_value_table(value_table, episode_mem):
    for mem in episode_mem:
        key = pickle.dumps(mem['new_observation'])
        if key not in value_table:
            value_table[key]=0
        #value_table[key] = max(value_table[key], mem['G'])
        new_val = 0.9*value_table[key] + 0.1*mem['G']
        diff = abs(value_table[key]
        value_table[key] = newval
    return value_table

In [23]:
def transition(state, action):
    state = deepcopy(state)
    agent_pos = list(zip(*np.where(state[2] == 1)))[0]
    new_agent_pos = np.array(agent_pos)
    if action==0:
        new_agent_pos[1]-=1
    elif action==1:
        new_agent_pos[1]+=1
    elif action==2:
        new_agent_pos[0]-=1
    elif action==3:
        new_agent_pos[0]+=1    
    new_agent_pos = np.clip(new_agent_pos, 0, 3)

    state[2, agent_pos[0], agent_pos[1]] = 0 #moved from this position so it is empty
    state[2, new_agent_pos[0], new_agent_pos[1]] = 1 #moved to this position
    return state

In [24]:
env = GriddyEnvOneHot()
epsilon = 1
value_table = {}

In [27]:
try:
    for i_episode in range(100):
        old_observation = env.reset()
        done=False
        episode_mem = []
        t=0
        while not done:
            env.render()
            
            policy_action = pick_best_action(old_observation)
            action = env.action_space.sample() if np.random.rand()<epsilon else policy_action
            #action = env.action_space.sample()
            #print(action)
            new_observation, reward, done, info = env.step(action)
            episode_mem.append({'old_observation':deepcopy(old_observation),
                                'action':action,
                                'reward':reward,
                                'new_observation':deepcopy(new_observation),
                                'done':done})
            old_observation=deepcopy(new_observation)
            t+=1
            epsilon*=0.999
            #time.sleep(0.5)
        print("Episode finished after {} timesteps. Eplislon={}".format(t+1, epsilon))
        env.render()
        #time.sleep(0.5)
        episode_mem = calculate_Gs(episode_mem)
        value_table = update_value_table(value_table, episode_mem)
    env.close()
except KeyboardInterrupt:
    env.close()

Episode finished after 7 timesteps. Eplislon=0.2900740880362934
Episode finished after 5 timesteps. Eplislon=0.28891553096867023
Episode finished after 7 timesteps. Eplislon=0.2871863657418441
Episode finished after 9 timesteps. Eplislon=0.2848968999718006
Episode finished after 5 timesteps. Eplislon=0.28375902061401054
Episode finished after 3 timesteps. Eplislon=0.28319178633180314
Episode finished after 7 timesteps. Eplislon=0.28149687783101773
Episode finished after 8 timesteps. Eplislon=0.2795323012780908
Episode finished after 13 timesteps. Eplislon=0.2761963014356792
Episode finished after 5 timesteps. Eplislon=0.2750931723032361
Episode finished after 5 timesteps. Eplislon=0.2739944490729594
Episode finished after 4 timesteps. Eplislon=0.27317328743509334
Episode finished after 2 timesteps. Eplislon=0.27290011414765825
Episode finished after 5 timesteps. Eplislon=0.2718101500004249
Episode finished after 3 timesteps. Eplislon=0.27126680151057403
Episode finished after 5 timeste

In [26]:
value_table_viz(value_table)

array([[0.3060108 , 0.52833382, 0.64338609, 0.66168489],
       [0.33759043, 0.43687646, 0.78736672, 0.81200628],
       [0.66962984, 0.7985848 , 0.86683305, 0.9336722 ],
       [0.3415757 , 0.46164763, 0.7553073 , 0.99997344]])