In [37]:
import numpy as np
from goldboxworld.simplified_goldbox_world import SimplifiedGoldboxWorld

In [38]:
# Build the environment with its default constructor arguments; every later
# cell talks to it through reset(), step(), action_space and observation_space.
env = SimplifiedGoldboxWorld()

In [39]:
# Table dimensions come straight from the environment's discrete spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# One row per state, one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [40]:
# --- Training budget ---
num_episodes = 1_000          # number of episodes run by the training loop
max_steps_per_episode = 20    # hard cap on steps taken within one episode

# --- Q-update coefficients ---
learning_rate = 0.1    # weight given to the new target when blending with the old estimate
discount_rate = 0.99   # multiplier applied to the best next-state value

# --- Epsilon-greedy schedule ---
# After each episode the exploration rate is recomputed as
#   min + (max - min) * exp(-decay * episode)
max_exploration_rate = 1.0
min_exploration_rate = 0.1
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate   # training starts fully exploratory

In [41]:
# Tabular Q-learning driven by an epsilon-greedy behaviour policy.
# The summed reward of every episode is collected in rewards_all_episodes.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # One uniform draw per step: a draw at or below the current
        # exploration rate means "explore", anything above means "exploit".
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done = env.step(action)

        # Blend the previous estimate with the bootstrapped target
        # reward + discount * max_a Q(new_state, a).
        best_next_value = np.max(q_table[new_state, :])
        td_target = reward + discount_rate * best_next_value
        old_value = q_table[state, action]
        q_table[state, action] = old_value * (1 - learning_rate) + learning_rate * td_target

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    # Exponentially relax exploration from the max rate towards the min rate.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * decay_factor

    rewards_all_episodes.append(rewards_current_episode)

GOAL REACHED!!! 18671
GOAL REACHED!!! 18575
GOAL REACHED!!! 18574
GOAL REACHED!!! 19070
GOAL REACHED!!! 17872
GOAL REACHED!!! 18982
GOAL REACHED!!! 18279
GOAL REACHED!!! 18972
GOAL REACHED!!! 19377
GOAL REACHED!!! 18877
GOAL REACHED!!! 18575
GOAL REACHED!!! 19079
GOAL REACHED!!! 18671
GOAL REACHED!!! 19074
GOAL REACHED!!! 18779
GOAL REACHED!!! 18772
GOAL REACHED!!! 19286
GOAL REACHED!!! 19183
GOAL REACHED!!! 18675
GOAL REACHED!!! 18872
GOAL REACHED!!! 18672
GOAL REACHED!!! 19474
GOAL REACHED!!! 19482
GOAL REACHED!!! 18971
GOAL REACHED!!! 18773
GOAL REACHED!!! 18784
GOAL REACHED!!! 18681
GOAL REACHED!!! 18576
GOAL REACHED!!! 18982
GOAL REACHED!!! 19188
GOAL REACHED!!! 18973
GOAL REACHED!!! 18580
GOAL REACHED!!! 17875
GOAL REACHED!!! 17573
GOAL REACHED!!! 17972
GOAL REACHED!!! 19677
GOAL REACHED!!! 18370
GOAL REACHED!!! 18078
GOAL REACHED!!! 18075
GOAL REACHED!!! 18581
GOAL REACHED!!! 18477
GOAL REACHED!!! 18679
GOAL REACHED!!! 18377
GOAL REACHED!!! 18479
GOAL REACHED!!! 18272
GOAL REACH

In [42]:
# Report the mean episode reward for each block of up to 1000 episodes,
# followed by the learned Q-table.
episodes_per_block = 1000
rewards_array = np.array(rewards_all_episodes)

# Split at explicit integer indices rather than passing a section count.
# The previous float count (num_episodes/1000) raised for fewer than 1000
# episodes and, for counts such as 1500, silently produced one oversized
# block whose sum was still divided by 1000.
split_points = np.arange(episodes_per_block, len(rewards_array), episodes_per_block)
rewards_per_thousand_episodes = np.split(rewards_array, split_points)

count = 0
print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
for r in rewards_per_thousand_episodes:
    if len(r) == 0:
        # Nothing was recorded (e.g. training has not run); skip instead of dividing by zero.
        continue
    # Label each line with the cumulative number of episodes covered so far;
    # a trailing partial block is averaged over its real length.
    count += len(r)
    print(f"{count: <5}: {np.sum(r/len(r))}")

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : 3707.7789999999995

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[-7.29097796e-01 -6.96927311e-01 -7.64128128e-01 -7.49049261e-01]
 [-8.71754811e-01 -8.53783719e-01 -9.13044148e-01 -1.12207022e+00]
 [-1.21627300e+00 -1.42934648e+00 -1.20772034e+00 -1.30765442e+00]
 [-1.76424966e+00 -2.05596620e+00 -1.77895260e+00 -2.26457101e+00]
 [-2.62147862e+00 -2.90904562e+00 -2.37482412e+00  1.22674569e+01]
 [-6.95954368e-01 -2.63561133e+00 -2.96943488e+00  2.01333251e+02]
 [ 5.22893685e+02  9.07027568e+01  2.41243560e+01  2.93285349e+03]
 [ 2.66329133e+03  3.11360519e+03  5.94076049e+02  8.12275484e+03]
 [ 8.45193476e+03  7.60227246e+03  6.78651705e+03  8.31669905e+03]
 [ 8.63933511e+03  8.55190061e+03  8.36616249e+03  8.55188872e+03]
 [-6.57664594e-01 -5.99001357e-01 -8.50746789e-01 -6.14046469e-01]
 [-4.13979250e+01 -8.69315879e-01 -9.12370437e-01 -8.30116319e-01]
 [-8.05538197e+01 -1.40112460e+00 -1.01765327e+00 -1.49943840e+00]

In [43]:
from PIL import Image
# Background image; each animation frame starts from a copy of it.
canvas = Image.open("images/canvas.png")
# Sprite that is pasted onto the canvas to mark the agent's position.
agent = Image.open("images/rebort-scaled.png")
# Set the sprite's alpha channel to 255 everywhere. The sprite is also passed
# as the mask argument of paste() further down, so this presumably makes the
# whole sprite opaque when drawn -- confirm against the Pillow docs.
agent.putalpha(255)

In [44]:
# Lookup from a grid cell (x, y) on the 10x10 board to the position tuple
# handed to paste(). Cells are 200 apart with a 100 offset; note that the
# first tuple element is driven by y and the second by x.
locations = dict(
    ((x, y), (200 * y + 100, 200 * x + 100))
    for y in range(10)
    for x in range(10)
)

In [45]:
# Replay one episode with the purely greedy policy read from q_table,
# drawing a frame per step and exporting the frames as an animated GIF.


def _render_frame(grid_x, grid_y):
    """Return a copy of the canvas with the agent sprite pasted at grid cell (grid_x, grid_y)."""
    frame = canvas.copy()
    frame.paste(agent, locations[(grid_x, grid_y)], agent)
    return frame


# Grid offsets (dx, dy) for actions 0, 1 and 2; any other action moves +1 in x.
_action_offsets = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}

images = []

# Starting cell used for the drawing; it is tracked here from the chosen
# actions rather than read back from the environment.
agentX = 9
agentY = 0

b = _render_frame(agentX, agentY)
images.append(b)

state = env.reset()

rewards_current_episode = 0

for step in range(max_steps_per_episode):
    action = np.argmax(q_table[state, :])

    # Apply the move and keep the drawn position inside the 0..9 board.
    dx, dy = _action_offsets.get(action, (1, 0))
    agentX = min(9, max(0, agentX + dx))
    agentY = min(9, max(0, agentY + dy))

    b = _render_frame(agentX, agentY)
    images.append(b)

    print(q_table[state, :])
    print(locations[(agentX, agentY)])
    print(action,(agentX, agentY))

    new_state, reward, done = env.step(action)

    rewards_current_episode += reward
    state = new_state

    if done:
        break

print(f"Reward: {rewards_current_episode}")

# First frame carries the file; the remaining frames are appended to it.
# 500 per frame as the duration, loop=0 as in the Pillow GIF writer options.
images[0].save(
    'images/anitest.gif',
    save_all=True,
    append_images=images[1:],
    duration=500,
    loop=0,
)

[8639.33511431 8551.90060682 8366.16249018 8551.88871528]
(300, 1900)
0 (9, 1)
[8828.62132758 8551.94035277 8451.89392492 8639.32509108]
(500, 1900)
0 (9, 2)
[9019.81952281 8639.27349751 8692.57539885 8828.59081581]
(700, 1900)
0 (9, 3)
[9212.94901294 8828.60325802 8896.78856485 9019.80700628]
(900, 1900)
0 (9, 4)
[9307.01920499 9019.78416443 9119.71711368 9212.94572782]
(1100, 1900)
0 (9, 5)
[9402.039601   9212.94501326 9212.94732874 9307.01902571]
(1300, 1900)
0 (9, 6)
[9599.0299     9307.0139078  9206.88676936 9402.03911546]
(1500, 1900)
0 (9, 7)
[9798.01       9402.02671822 9401.87159577 9599.02235379]
(1700, 1900)
0 (9, 8)
[9999.         9598.90047579 9591.57292535 9797.92350391]
(1900, 1900)
0 (9, 9)
GOAL REACHED!!! 19391
Reward: 9391
