In [31]:
import time
import pickle
import numpy as np

import gym
from griddy_env import GriddyEnv

In [32]:
def key_from_state(state):
    """Return a hashable dictionary key for ``state``.

    The key is the pickled byte string of ``state``. As a side effect, the
    module-level ``value_table`` receives an entry of 0 for that key when it
    has none yet, so callers can index the table with the returned key
    without checking for membership first.
    """
    state_key = pickle.dumps(state)
    value_table.setdefault(state_key, 0)  # unseen states start with value 0
    return state_key

In [33]:
def update_value_table(episode_mem, value_table, discount_factor=0.95, alpha=0.5):
    """Update state values from one recorded episode, last transition first.

    For every transition the value stored for its ``'new_observation'`` is
    moved a fraction ``alpha`` towards a target:

    * last transition of the episode: target = its ``'reward'``;
    * any other transition: target = ``'reward'`` + ``discount_factor`` times
      the largest successor value reported by ``greedy_policy``.

    Note: ``key_from_state`` and ``greedy_policy`` read and initialise the
    module-level ``value_table``, so the ``value_table`` passed in here is
    expected to be that same object.

    Parameters
    ----------
    episode_mem : list of dict
        Transitions in the order they occurred; each needs the keys
        ``'reward'`` and ``'new_observation'``.
    value_table : dict
        Mapping from state key to value; updated in place.
    discount_factor : float
        Weight applied to the best successor value.
    alpha : float
        Step size of the update.

    Returns
    -------
    (value_table, mean_abs_change)
        The same table object and the mean absolute difference between the
        old values and their targets. An empty episode changes nothing and
        reports 0.0 (previously ``np.mean([])`` produced NaN plus a
        RuntimeWarning).
    """
    if len(episode_mem) == 0:
        return value_table, 0.0

    last_index = len(episode_mem) - 1
    abs_errors = []
    # Walk backwards so values updated for later states are already visible
    # when earlier states look up their successors.
    for index in range(last_index, -1, -1):
        mem = episode_mem[index]
        if index == last_index:
            target = mem['reward']
        else:
            successor_values = greedy_policy(mem['new_observation'], return_action_vals=True)
            target = mem['reward'] + (discount_factor * np.max(successor_values))
        key = key_from_state(mem['new_observation'])
        abs_errors.append(abs(value_table[key] - target))
        value_table[key] = value_table[key] + alpha * (target - value_table[key])
    return value_table, np.mean(abs_errors)

In [34]:
def transition(state, action):
    """Predict the state reached by applying ``action`` to ``state``.

    ``state`` is indexed as ``state[2, row, col]`` with a 1 marking the
    agent's cell; only that layer is modified. The input is not mutated.

    Actions: 0 decrements the column, 1 increments the column, 2 decrements
    the row, 3 increments the row. Any other value leaves the agent in place.
    Moves that would leave the grid are clamped to the border, using the
    grid's own shape (the previous hard-coded bound of 3 was only right for a
    4x4 grid).

    Raises
    ------
    IndexError
        If no cell of ``state[2]`` equals 1.
    """
    next_state = np.copy(state)
    agent_layer = next_state[2]

    # First agent cell in row-major order (there should be exactly one).
    old_row, old_col = (int(v) for v in np.argwhere(agent_layer == 1)[0])
    new_row, new_col = old_row, old_col

    if action == 0:
        new_col -= 1
    elif action == 1:
        new_col += 1
    elif action == 2:
        new_row -= 1
    elif action == 3:
        new_row += 1

    # Clamp to the grid so walking into a wall keeps the agent on the border.
    n_rows, n_cols = agent_layer.shape[0], agent_layer.shape[1]
    new_row = min(max(new_row, 0), n_rows - 1)
    new_col = min(max(new_col, 0), n_cols - 1)

    next_state[2, old_row, old_col] = 0  # vacate the old cell first ...
    next_state[2, new_row, new_col] = 1  # ... so a blocked move still leaves the agent marked
    return next_state

In [35]:
def greedy_policy(state, return_action_vals=False):
    """Choose the action whose predicted successor state has the highest value.

    Each of the four actions is simulated with ``transition``; the value of
    the resulting state is read from the module-level ``value_table``
    (``key_from_state`` initialises unseen successors to 0).

    Returns the list of the four successor values when ``return_action_vals``
    is true, otherwise the index of the largest one (first index on ties).
    """
    successor_values = [
        value_table[key_from_state(transition(state, candidate))]
        for candidate in range(4)
    ]
    if return_action_vals:
        return successor_values
    return np.argmax(successor_values)

In [36]:
def epsilon_greedy_policy(state, epsilon):
    """Explore with probability ``epsilon``, otherwise act greedily.

    A uniform draw from ``np.random.rand()`` below ``epsilon`` selects an
    action sampled from ``env.action_space``; otherwise the action comes from
    ``greedy_policy(state)``.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()
    return greedy_policy(state)

In [37]:
def random_policy(state):
    """Return one of the action indices 0-3 uniformly at random; ``state`` is ignored."""
    n_actions = 4
    return np.random.randint(low=0, high=n_actions)

In [38]:
def value_table_viz(value_table, grid_shape=(4, 4), goal_pos=(3, 3)):
    """Lay the stored state values out as a 2-D grid, one cell per agent position.

    For every cell a state array of shape ``(3, rows, cols)`` is rebuilt with
    a 1 at ``[0, goal_row, goal_col]`` and a 1 at ``[2, row, col]`` (the agent
    layer used by ``transition``); its pickled bytes are looked up in
    ``value_table``. Missing states show as 0. The table is never modified.

    NOTE(review): a lookup only hits when the rebuilt array pickles to exactly
    the same bytes as the stored observation (same dtype, layout and layer-0
    contents). If the environment's observations differ, e.g. in dtype, every
    lookup misses and the grid is all zeros. Confirm against GriddyEnv.

    Parameters
    ----------
    value_table : dict
        Mapping from pickled state to value.
    grid_shape : tuple of int
        ``(rows, cols)`` of the grid; defaults to the original 4x4.
    goal_pos : tuple of int
        ``(row, col)`` marked in layer 0; defaults to the original (3, 3).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``grid_shape``.
    """
    n_rows, n_cols = grid_shape
    values = np.zeros((n_rows, n_cols))

    template = np.zeros((3, n_rows, n_cols), dtype=np.int64)
    template[0, goal_pos[0], goal_pos[1]] = 1

    for row in range(n_rows):
        for col in range(n_cols):
            probe = np.copy(template)
            probe[2, row, col] = 1
            # .get keeps the lookup read-only (key_from_state would insert).
            values[row, col] = value_table.get(pickle.dumps(probe), 0)
    return values

In [39]:
def visualise_agent(policy, value_table=None, n=5, step_delay=0.5, end_delay=1.5):
    """Render ``n`` episodes of ``policy`` acting in the module-level ``env``.

    Parameters
    ----------
    policy : callable
        Called as ``policy(observation)``; returns the action to take.
    value_table : dict, optional
        When non-empty, each step is rendered with ``value_table_viz`` of it.
    n : int
        Number of episodes to show.
    step_delay : float
        Seconds to pause after every step (default matches the original 0.5).
    end_delay : float
        Seconds to pause on the final frame of an episode (original 1.5).

    A KeyboardInterrupt stops the visualisation quietly. ``env.close()`` now
    runs on every exit path, so an exception raised by ``policy`` or the
    environment no longer leaves the render window open.
    """
    try:
        for trial_i in range(n):
            observation = env.reset()
            done = False
            t = 0
            while not done:
                if value_table:
                    env.render(value_table_viz(value_table))
                else:
                    env.render()
                policy_action = policy(observation)
                observation, reward, done, info = env.step(policy_action)
                time.sleep(step_delay)
                t += 1
            env.render()  # show the final state before moving on
            time.sleep(end_delay)
            print("Episode {} finished after {} timesteps".format(trial_i, t))
    except KeyboardInterrupt:
        pass  # manual stop from the notebook: just fall through to the cleanup
    finally:
        env.close()

In [40]:
# Shared state used by the policy, training and visualisation functions.
env = GriddyEnv(4, 4)  # presumably a 4x4 grid world; TODO confirm against griddy_env
epsilon = 1  # exploration probability for epsilon_greedy_policy; train() multiplies it by 0.999 every step
i_episode=0  # count of training episodes completed so far (incremented by train())
discount_factor=0.8  # discount for future rewards (update_value_table has its own default of 0.95)
value_table = {}  # maps pickled state -> estimated value; filled lazily by key_from_state

In [41]:
def train(n_episodes=100, step_delay=0.05, episode_delay=2):
    """Run epsilon-greedy episodes in ``env`` and learn state values from them.

    Every episode is played to completion while being rendered, its
    transitions are recorded, and ``update_value_table`` is applied to the
    recording. The module-level ``epsilon`` is multiplied by 0.999 after every
    step, and ``i_episode`` counts episodes across calls.

    Fixes over the previous version:

    * the module-level ``discount_factor`` is now passed to
      ``update_value_table``; before, that setting was ignored and the
      function's default of 0.95 was used;
    * ``env.close()`` runs on every exit path, not only on normal completion
      or KeyboardInterrupt.

    Parameters
    ----------
    n_episodes : int
        Number of episodes to run.
    step_delay : float
        Seconds to pause after rendering each step (original value 0.05).
    episode_delay : float
        Seconds to pause on the value-table rendering after each episode
        (original value 2).
    """
    global epsilon
    global value_table
    global i_episode
    try:
        for _ in range(n_episodes):
            observation = env.reset()
            episode_mem = []
            done = False
            t = 0
            while not done:
                env.render()
                time.sleep(step_delay)
                action = epsilon_greedy_policy(observation, epsilon)
                new_observation, reward, done, info = env.step(action)
                episode_mem.append({'observation': observation,
                                    'action': action,
                                    'reward': reward,
                                    'new_observation': new_observation,
                                    'done': done})
                observation = new_observation
                t += 1
                epsilon *= 0.999  # decay exploration once per step
            value_table, v_delta = update_value_table(
                episode_mem, value_table, discount_factor=discount_factor)
            i_episode += 1
            print("Episode {} finished after {} timesteps. Eplislon={}. V_Delta={}".format(i_episode, t, epsilon, v_delta))
            env.render(value_table_viz(value_table))
            time.sleep(episode_delay)
    except KeyboardInterrupt:
        pass  # manual stop from the notebook: keep what was learned so far
    finally:
        env.close()

In [42]:
# Train with the default number of episodes (100); one progress line is printed per episode.
train()

Episode 1 finished after 149 timesteps. Eplislon=0.8615048875706075. V_Delta=0.05205898401515101
Episode 2 finished after 7 timesteps. Eplislon=0.8554924148377159. V_Delta=0.16937896406312042
Episode 3 finished after 14 timesteps. Eplislon=0.8435930602951368. V_Delta=0.18900946939319924
Episode 4 finished after 16 timesteps. Eplislon=0.8301963316171974. V_Delta=0.17800375121021716
Episode 5 finished after 38 timesteps. Eplislon=0.7992255563671304. V_Delta=0.15410292256508373
Episode 6 finished after 7 timesteps. Eplislon=0.7936477332643059. V_Delta=0.036209333390592624
Episode 7 finished after 3 timesteps. Eplislon=0.7912691702140651. V_Delta=0.03289009094238279
Episode 8 finished after 29 timesteps. Eplislon=0.7686407469632577. V_Delta=0.09666403144638631
Episode 9 finished after 13 timesteps. Eplislon=0.7587081519483351. V_Delta=0.08409900818192216
Episode 10 finished after 18 timesteps. Eplislon=0.7451668707698216. V_Delta=0.03653499062805766
Episode 11 finished after 36 timesteps. 

In [60]:
# Show the learned values as a grid, one entry per agent position.
# NOTE(review): the recorded output is all zeros. Either the table held no
# matching entries when this cell ran (execution counts are out of order), or
# the keys rebuilt by value_table_viz do not match the stored observations.
# Confirm on a fresh Restart & Run All.
value_table_viz(value_table)

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [14]:
# Watch the greedy policy act in the environment, rendering the value table at each step.
visualise_agent(greedy_policy, value_table)
# v(s_{t+1}) from v(s)
# showing a trajectory on the GUI

Episode 0 finished after 1 timesteps
Episode 1 finished after 3 timesteps
Episode 2 finished after 3 timesteps
Episode 3 finished after 1 timesteps
Episode 4 finished after 3 timesteps


In [None]:
- v table visualisation
- breaking it up into small codable chunks

In [None]:
# Disabled earlier implementation, kept as a string literal so it never runs.
# calculate_Gs walks the episode backwards and stores a discounted return 'G'
# on every transition; this update_value_table variant then keeps, per state,
# the larger of the stored value and that return. If re-enabled it would
# redefine update_value_table with a different argument order.
'''def calculate_Gs(episode_mem, discount_factor=0.95):
    for i, mem in reversed(list(enumerate(episode_mem))): #start from terminal state
        if i==len(episode_mem)-1: #if terminal state, G=reward
            episode_mem[i]['G']= mem['reward'] 
        else:
            G = mem['reward']+discount_factor*episode_mem[i+1]['G']
            episode_mem[i]['G'] = G 
    return episode_mem
def update_value_table(value_table, episode_mem):
    all_diffs=[]
    for mem in episode_mem:
        key = pickle.dumps(mem['new_observation'])
        if key not in value_table:
            value_table[key]=0 #initialize
        new_val = max(value_table[key], mem['G'])
        diff = abs(value_table[key]-new_val)
        all_diffs.append(diff)
        value_table[key] = new_val
    return value_table, np.mean(all_diffs)'''