In [1]:
import time
import pickle
import numpy as np

import gym
from griddy_env import GriddyEnv

In [2]:
def key_from_state(state):
    """Turn a state into a dictionary key and make sure the value table knows it.

    The key is the pickled byte string of ``state``. If that key is not yet
    present in the module-level ``value_table`` it is inserted with value 0.

    Args:
        state: any picklable object describing the environment state.

    Returns:
        The ``bytes`` key under which the state's value is stored.
    """
    state_key = pickle.dumps(state)
    value_table.setdefault(state_key, 0)  # unseen states start with value 0
    return state_key

In [17]:
def update_value_table(episode_mem, value_table, discount_factor=0.95, learning_rate=0.1):
    """Run one backward sweep of value updates over a finished episode.

    Transitions are processed from the last one to the first. For each one a
    target value is computed:

    * last transition of the episode: ``target = reward``
    * otherwise: ``target = reward + discount_factor * max_a V(transition(new_observation, a))``
      over the four actions.

    The stored value of ``observation`` is then moved towards the target by
    ``learning_rate``.

    Every lookup and write goes through the ``value_table`` argument (states are
    keyed by ``pickle.dumps``; unseen states start at 0), so the function works
    for any table passed in rather than only for the module-level one.

    Args:
        episode_mem: list of dicts with at least the keys ``'observation'``,
            ``'reward'`` and ``'new_observation'``, in the order they occurred.
        value_table: dict mapping pickled states to values; updated in place.
        discount_factor: weight applied to the bootstrapped next-state value.
        learning_rate: step size of the update.

    Returns:
        ``(value_table, mean_abs_diff)`` where ``mean_abs_diff`` is the mean
        absolute gap between the old values and their targets, or ``0.0`` for
        an empty episode.
    """
    if not episode_mem:
        # np.mean([]) would give nan plus a RuntimeWarning; nothing to update.
        return value_table, 0.0

    n_actions = 4  # same action range that greedy_policy enumerates
    last_index = len(episode_mem) - 1
    all_diffs = []
    for i, mem in reversed(list(enumerate(episode_mem))):  # start from the terminal step
        if i == last_index:
            # Nothing follows the final transition, so the target is just the reward.
            calculated_new_v = mem['reward']
        else:
            successor_values = [
                value_table.setdefault(pickle.dumps(transition(mem['new_observation'], a)), 0)
                for a in range(n_actions)
            ]
            calculated_new_v = mem['reward'] + (discount_factor * np.max(successor_values))
        key = pickle.dumps(mem['observation'])
        current_v = value_table.setdefault(key, 0)
        all_diffs.append(abs(current_v - calculated_new_v))
        value_table[key] = current_v + learning_rate * (calculated_new_v - current_v)
    return value_table, np.mean(all_diffs)

In [18]:
#This is the transition model aka our model of the environment. Given state and action it predicts next state
def transition(state, action):
    """Predict the state that follows from taking ``action`` in ``state``.

    This is the transition model, i.e. our model of the environment. The agent
    is located as the first cell equal to 1 in ``state[2]`` and moved by one
    cell:

    * ``0``: column - 1
    * ``1``: column + 1
    * ``2``: row - 1
    * ``3``: row + 1

    Any other action leaves the agent where it is. Moves that would leave the
    board are clamped to its edge; the board size is taken from the shape of
    ``state[2]``, so grids other than 4x4 are handled too.

    Args:
        state: array indexed as ``[channel, row, column]`` whose channel 2 marks
            the agent with a 1. It is not modified.
        action: integer action id.

    Returns:
        A copy of ``state`` with the agent marker moved in channel 2.

    Raises:
        IndexError: if channel 2 contains no cell equal to 1.
    """
    state = np.copy(state)  # never mutate the caller's observation
    agent_layer = state[2]
    row, col = np.argwhere(agent_layer == 1)[0]
    new_row, new_col = row, col
    if action == 0:
        new_col -= 1
    elif action == 1:
        new_col += 1
    elif action == 2:
        new_row -= 1
    elif action == 3:
        new_row += 1

    # Clamp to the actual board instead of a hard-coded 0..3 range.
    n_rows, n_cols = agent_layer.shape
    new_row = min(max(new_row, 0), n_rows - 1)
    new_col = min(max(new_col, 0), n_cols - 1)

    state[2, row, col] = 0  # moved from this position so it is empty
    state[2, new_row, new_col] = 1  # moved to this position
    return state

In [19]:

def greedy_policy(state, return_action_vals=False):
    """Pick the action whose predicted successor state has the highest value.

    Each of the four actions is run through ``transition`` and the resulting
    state's value is read from the module-level ``value_table`` (via
    ``key_from_state``, which registers unseen states with value 0).

    Args:
        state: current environment state.
        return_action_vals: when true, return the list of the four successor
            values instead of the chosen action.

    Returns:
        The index of the best action (first one wins on ties), or the list of
        per-action values when ``return_action_vals`` is set.
    """
    action_values = [
        value_table[key_from_state(transition(state, candidate))]
        for candidate in range(4)
    ]
    if return_action_vals:
        return action_values
    return np.argmax(action_values)

In [20]:
def epsilon_greedy_policy(state, epsilon):
    """Explore with probability ``epsilon``, otherwise act greedily.

    A uniform random number is drawn first; if it is below ``epsilon`` an
    action is sampled from ``env.action_space``, else ``greedy_policy(state)``
    decides.

    Args:
        state: current environment state.
        epsilon: exploration probability.

    Returns:
        The selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()
    return greedy_policy(state)

In [21]:
def random_policy(state):
    """Ignore ``state`` and return a uniformly random action id from 0 to 3."""
    n_actions = 4
    return np.random.randint(0, n_actions)

In [22]:
def value_table_viz(value_table):
    """Lay the learned state values out as a 4x4 grid of agent positions.

    For every cell a probe state is built: an int64 array of shape (3, 4, 4)
    with a 1 at ``[0, 3, 3]`` and a 1 in channel 2 at the cell. The probe is
    pickled and looked up in ``value_table``; states that were never stored
    show up as 0.

    Args:
        value_table: dict mapping pickled states to values.

    Returns:
        A (4, 4) float array where entry ``[i, j]`` is the value stored for the
        agent standing at row ``i``, column ``j``.
    """
    grid = np.zeros((4, 4))
    template = np.zeros((3, 4, 4), dtype=np.int64)
    template[0, 3, 3] = 1  # channel-0 marker shared by every probe state
    for row, col in np.ndindex(4, 4):
        probe = np.copy(template)
        probe[2, row, col] = 1
        grid[row, col] = value_table.get(pickle.dumps(probe), 0)
    return grid

In [23]:
def visualise_agent(policy, value_table=None, n=5, step_delay=0.5, end_delay=1.5):
    """Render ``n`` episodes of ``policy`` acting in the module-level ``env``.

    Each step renders the environment (passing the value grid from
    ``value_table_viz`` when a non-empty ``value_table`` is given), asks the
    policy for an action and steps the environment. After every episode the
    final frame is rendered and the episode length is printed.

    The environment is closed when the function ends for any reason: normal
    completion, a ``KeyboardInterrupt`` (which is swallowed so the run can be
    stopped from the notebook), or any other exception (which still propagates).

    Args:
        policy: callable mapping an observation to an action.
        value_table: optional dict of state values to overlay while rendering.
        n: number of episodes to play.
        step_delay: seconds to pause after every step.
        end_delay: seconds to pause on the final frame of every episode.
    """
    try:
        for trial_i in range(n):
            observation = env.reset()
            done = False
            t = 0
            while not done:
                if value_table:
                    env.render(value_table_viz(value_table))
                else:
                    env.render()
                policy_action = policy(observation)
                observation, _, done, _ = env.step(policy_action)
                time.sleep(step_delay)
                t += 1
            env.render()
            time.sleep(end_delay)
            print("Episode {} finished after {} timesteps".format(trial_i, t))
    except KeyboardInterrupt:
        pass  # interrupting is the normal way to stop the viewer early
    finally:
        env.close()  # release the window even if rendering or stepping failed

In [37]:
# Module-level state shared by the helper functions and by train().
env = GriddyEnv(4, 4)  # environment used by train(), visualise_agent() and epsilon_greedy_policy(); presumably a 4x4 grid - confirm against griddy_env
epsilon = 1  # exploration probability; 1 means every action is random at first, train() decays it after each step
i_episode=0  # running episode counter; train() increments it, so it carries over between train() calls
discount_factor=0.9  # passed by train() to update_value_table()
learning_rate=0.7  # passed by train() to update_value_table()
value_table = {}  # pickled state -> estimated value; entries are created lazily by key_from_state()

In [38]:
def train(n_episodes=100, step_delay=0.05, episode_delay=2, epsilon_decay=0.999):
    """Play ``n_episodes`` epsilon-greedy episodes and learn from each one.

    Every step is rendered, chosen with ``epsilon_greedy_policy`` and recorded;
    ``epsilon`` is multiplied by ``epsilon_decay`` after each step. When an
    episode ends, ``update_value_table`` is applied to the recorded transitions
    using the module-level ``discount_factor`` and ``learning_rate``, a summary
    line is printed and the value grid is rendered.

    The module-level ``epsilon``, ``value_table`` and ``i_episode`` are
    updated, so repeated calls continue the same training run.

    The environment is closed when the function ends for any reason: normal
    completion, a ``KeyboardInterrupt`` (swallowed so training can be stopped
    from the notebook), or any other exception (which still propagates).

    Args:
        n_episodes: number of episodes to run.
        step_delay: seconds to pause after rendering each step.
        episode_delay: seconds to pause on the value grid after each episode.
        epsilon_decay: factor applied to ``epsilon`` after every step.
    """
    global epsilon
    global value_table
    global i_episode
    try:
        for _ in range(n_episodes):
            observation = env.reset()
            episode_mem = []
            done = False
            t = 0
            while not done:
                env.render()
                time.sleep(step_delay)
                action = epsilon_greedy_policy(observation, epsilon)
                new_observation, reward, done, _info = env.step(action)
                episode_mem.append({'observation': observation,
                                    'action': action,
                                    'reward': reward,
                                    'new_observation': new_observation,
                                    'done': done})
                observation = new_observation
                t += 1
                epsilon *= epsilon_decay
            value_table, v_delta = update_value_table(episode_mem, value_table, discount_factor, learning_rate)
            i_episode += 1
            print("Episode {} finished after {} timesteps. Eplislon={}. V_Delta={}".format(i_episode, t, epsilon, v_delta))
            env.render(value_table_viz(value_table))
            time.sleep(episode_delay)
    except KeyboardInterrupt:
        pass  # interrupting is the normal way to stop training early
    finally:
        env.close()  # release the window even if a step or update failed

In [39]:
# Train with the default n_episodes=100; this updates the module-level epsilon, value_table and i_episode.
train()

Episode 1 finished after 62 timesteps. Eplislon=0.9398537314349842. V_Delta=0.15534477365273675
Episode 2 finished after 38 timesteps. Eplislon=0.9047921471137096. V_Delta=0.134447094081185
Episode 3 finished after 47 timesteps. Eplislon=0.8632304853107438. V_Delta=0.09260665745514408
Episode 4 finished after 13 timesteps. Eplislon=0.8520755747117399. V_Delta=0.09564585837028985
Episode 5 finished after 9 timesteps. Eplislon=0.8444374977929298. V_Delta=0.12485087912878333
Episode 6 finished after 4 timesteps. Eplislon=0.8410648110498392. V_Delta=0.08385507333119002
Episode 7 finished after 2 timesteps. Eplislon=0.8393835224925505. V_Delta=0.13101619589809926
Episode 8 finished after 26 timesteps. Eplislon=0.8178301806491574. V_Delta=0.081366620947906
Episode 9 finished after 46 timesteps. Eplislon=0.7810441642583167. V_Delta=0.05356607614263002
Episode 10 finished after 83 timesteps. Eplislon=0.7188050416738131. V_Delta=0.04012056051988172
Episode 11 finished after 41 timesteps. Eplisl

In [40]:
# Show the learned values as a 4x4 array indexed by the agent's (row, column); never-stored states show as 0.
value_table_viz(value_table)

array([[0.60164009, 0.64239867, 0.70853401, 0.77720177],
       [0.66372537, 0.68540333, 0.63874238, 0.78140171],
       [0.58804101, 0.59956988, 0.72253787, 0.86137832],
       [0.53163608, 0.56978453, 0.6353672 , 0.        ]])

In [14]:
# Watch the greedy policy act (5 episodes by default) with the learned value grid passed to env.render().
visualise_agent(greedy_policy, value_table)
# Author notes, meaning not evident from the code - TODO confirm:
#   - v(s t+1) from v(s)
#   - showing a trajectory on gui

Episode 0 finished after 1 timesteps
Episode 1 finished after 3 timesteps
Episode 2 finished after 3 timesteps
Episode 3 finished after 1 timesteps
Episode 4 finished after 3 timesteps


In [None]:
- v table visualisation
- breaking it up into small codable chunks

In [None]:
# Disabled earlier draft, kept only as a string literal so none of it is executed.
# calculate_Gs walks the episode backwards and stores a discounted return 'G' on every step;
# the old update_value_table then keeps max(current value, G) for each pickled new_observation.
'''def calculate_Gs(episode_mem, discount_factor=0.95):
    for i, mem in reversed(list(enumerate(episode_mem))): #start from terminal state
        if i==len(episode_mem)-1: #if terminal state, G=reward
            episode_mem[i]['G']= mem['reward'] 
        else:
            G = mem['reward']+discount_factor*episode_mem[i+1]['G']
            episode_mem[i]['G'] = G 
    return episode_mem
def update_value_table(value_table, episode_mem):
    all_diffs=[]
    for mem in episode_mem:
        key = pickle.dumps(mem['new_observation'])
        if key not in value_table:
            value_table[key]=0 #initialize
        new_val = max(value_table[key], mem['G'])
        diff = abs(value_table[key]-new_val)
        all_diffs.append(diff)
        value_table[key] = new_val
    return value_table, np.mean(all_diffs)'''