In [1]:
import numpy as np
from goldboxworld.simplified_goldbox_world import SimplifiedGoldboxWorld

In [2]:
# Create the environment instance that the cells below reset, step and query.
env = SimplifiedGoldboxWorld()

In [3]:
# Table dimensions are read from the `.n` attribute of the environment's
# observation and action spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# One row per state, one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [4]:
# Training budget: number of episodes to run, and the cap on steps taken
# within any single episode before it is cut off.
num_episodes = 10000
max_steps_per_episode = 100

# Q-update coefficients: `learning_rate` is the weight given to the new
# target versus the old estimate; `discount_rate` scales the best Q-value of
# the next state inside that target.
learning_rate = 0.1
discount_rate = 0.99

# Exploration schedule. `exploration_rate` is the current chance of taking a
# random action; the training loop overwrites it after every episode with
#   min + (max - min) * exp(-exploration_decay_rate * episode).
# NOTE(review): no RNG seed is set anywhere in the visible cells, so runs are
# not reproducible — consider seeding here.
exploration_rate = 1.0
max_exploration_rate = 1.0
min_exploration_rate = 0.1
exploration_decay_rate = 0.001

In [5]:
# Train the Q-table: each episode resets the environment, acts with a mix of
# random and greedy actions, and updates the table after every step.
# The total reward of each episode is collected in `rewards_all_episodes`.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # A single uniform draw per step decides between a random action
        # (draw at or below the exploration rate) and the greedy action.
        explore_roll = np.random.uniform(0, 1)
        if explore_roll <= exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done = env.step(action)

        # Blend the previous estimate with the bootstrapped target:
        # reward plus the discounted best value available from the new state.
        old_value = q_table[state, action]
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = old_value * (1 - learning_rate) + learning_rate * td_target

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    # Exponentially decay exploration from the max rate towards the min rate
    # as the episode index grows.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * decay_factor

    rewards_all_episodes.append(rewards_current_episode)

In [6]:
# Report the mean episode reward for each consecutive bucket of episodes,
# then dump the learned Q-table.
#
# The bucket boundaries are explicit integer indices derived from the number
# of rewards actually recorded. This avoids handing `np.split` a float section
# count and keeps the cell working when the episode count is not an exact
# multiple of the bucket size (the final bucket is then simply shorter) or
# when no episodes were recorded at all.
episodes_per_bucket = 1000
rewards_array = np.array(rewards_all_episodes)

bucket_edges = np.arange(episodes_per_bucket, len(rewards_array), episodes_per_bucket)
rewards_per_thousand_episodes = np.split(rewards_array, bucket_edges)

print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
count = 0
for r in rewards_per_thousand_episodes:
    if len(r) == 0:
        # Nothing recorded: skip rather than divide by zero.
        continue
    # Label each row with the index of the last episode it covers and divide
    # by the real bucket length so a short trailing bucket is averaged correctly.
    count += len(r)
    print(f"{count: <5}: {np.sum(r / len(r))}")

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : 3735.753
2000 : 7488.276
3000 : 8570.083
4000 : 8903.582
5000 : 9166.223000000002
6000 : 9055.228
7000 : 9156.199
8000 : 9236.938
9000 : 9136.021
10000: 9105.705000000002

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[ 9504.99950399  9408.94950895  9408.94950895  9504.99950399]
 [ 9602.019701    9504.94273867  9408.91707476  9601.66758219]
 [ 9700.01972001  9571.69943987  9447.83655256  9360.89930476]
 [ 8759.73685445  7888.17124822  9582.12684563  7561.57525872]
 [ 9602.019701    9408.94950895  9504.99950399  9602.019701  ]
 [ 9700.0199      9504.99950399  9504.99950399  9700.0199    ]
 [ 9799.01        9601.92575184  9601.78190077  9597.03107319]
 [  -99.62428979  8724.79540652  9698.97872846  8105.46505314]
 [ 9504.99950395  9504.99950399  9602.019701    9700.0199    ]
 [ -100.          9602.019701    9602.019701    9799.01      ]
 [ 9899.          9700.0199      9700.0199      -100.        ]
 [    0.             0.  

In [7]:
# Roll out one episode using only the greedy action from the learned Q-table,
# printing (action, agentX, agentY) after every move and the total reward at
# the end.
simages = []

# Coordinates mirrored locally for logging only; the environment is never
# asked for them. Both are kept inside the 0..3 range.
agentX, agentY = 0, 0

# (dx, dy) applied for each action index. Any action not listed here moves
# +1 along x.
move_deltas = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}

state = env.reset()
rewards_current_episode = 0

for step in range(max_steps_per_episode):
    action = np.argmax(q_table[state, :])

    dx, dy = move_deltas.get(action, (1, 0))
    agentX = min(3, max(0, agentX + dx))
    agentY = min(3, max(0, agentY + dy))

    print((action, agentX, agentY))

    new_state, reward, done = env.step(action)

    rewards_current_episode += reward
    state = new_state

    if done:
        break

print(f"Reward: {rewards_current_episode}")


(0, 0, 1)
(0, 0, 2)
(3, 1, 2)
(3, 2, 2)
(0, 2, 3)
(3, 3, 3)
Reward: 9995
