In [55]:
import numpy as np
from environment.discover_goldbox import DiscoverGoldbox

In [56]:
# Create the environment instance shared by every cell below; the notebook
# relies on its `reset()`, `step(action)`, `action_space` and
# `observation_space` members.
env = DiscoverGoldbox()

In [57]:
# Table dimensions come straight from the environment's discrete spaces:
# one row per state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state/action value starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [58]:
# --- Training schedule ---------------------------------------------------
num_episodes = 10_000        # number of training episodes
max_steps_per_episode = 50   # upper bound on steps taken inside one episode

# --- Q-learning update ---------------------------------------------------
learning_rate = 0.1          # weight given to the new estimate in each update
discount_rate = 0.99         # multiplier applied to the best next-state value

# --- Epsilon-greedy exploration ------------------------------------------
max_exploration_rate = 1.0       # exploration rate at episode 0
min_exploration_rate = 0.1       # floor the exploration rate decays toward
exploration_decay_rate = 0.001   # per-episode exponential decay constant
exploration_rate = max_exploration_rate  # current rate; starts at the maximum

In [59]:
# Tabular Q-learning driven by an epsilon-greedy behaviour policy.
# The summed reward of every episode is collected in `rewards_all_episodes`.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Draw a uniform number first, then branch: explore with a random
        # action when the draw is at or below the current exploration rate,
        # otherwise exploit the best known action for this state.
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done = env.step(action)

        # Blend the previous estimate with the bootstrapped target
        # (reward plus discounted best value of the state just reached).
        best_next_value = np.max(q_table[new_state, :])
        td_target = reward + discount_rate * best_next_value
        q_table[state, action] = (
            q_table[state, action] * (1 - learning_rate)
            + learning_rate * td_target
        )

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    # Anneal exploration exponentially from the maximum toward the minimum
    # as the episode index grows.
    exploration_span = max_exploration_rate - min_exploration_rate
    exploration_rate = min_exploration_rate + exploration_span * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

dbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19287
Agent discovered goldbox and it has total battery points:  19289
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19290
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19289
Agent discovered goldbox and it has total battery points:  19186
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19391
Agent discovered goldbox and it has total battery points:  19390
Agent discovered goldbox and it has total battery points:  19287
Agent discovered goldbox and it has total bat

In [60]:
# Report the mean episode reward for each consecutive bucket of
# `episodes_per_bucket` episodes, then print the learned Q-table.
episodes_per_bucket = 1000
rewards_array = np.asarray(rewards_all_episodes)

# Slice into buckets instead of calling np.split with a float section count:
# np.split raises ValueError when the number of recorded episodes is not an
# exact multiple of the bucket size (e.g. after an interrupted training run),
# whereas slicing simply yields a shorter final bucket.
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_bucket]
    for start in range(0, len(rewards_array), episodes_per_bucket)
]

count = episodes_per_bucket
print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
for r in rewards_per_thousand_episodes:
    # Label each row with the number of episodes covered so far; the last
    # row is capped at the real episode count when its bucket is partial.
    episodes_so_far = min(count, len(rewards_array))
    # Average over the bucket's actual length rather than a hard-coded 1000.
    print(f"{episodes_so_far: <5}: {np.mean(r)}")
    count += episodes_per_bucket

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : 5147.610999999999
2000 : 9116.222
3000 : 9263.14
4000 : 9307.871
5000 : 9314.845000000001
6000 : 9322.665
7000 : 9324.510000000002
8000 : 9323.985
9000 : 9325.227
10000: 9329.08

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[-3.01763993e+00 -2.78819941e+00 -3.01684631e+00 -3.17477653e+00]
 [-3.33672575e+00 -3.24144898e+00 -3.23039701e+00 -3.35095596e+00]
 [-3.76039844e+00 -3.69690083e+00 -3.60721060e+00 -3.51650032e+00]
 [-4.22044199e+00 -3.91720412e+00 -3.84389159e+00 -3.80128429e+00]
 [ 7.38162265e+00 -4.27387567e+00 -4.29373262e+00 -2.16570263e+00]
 [ 9.31361192e+00 -1.71607448e+00 -4.64400062e+00  2.60127621e+02]
 [ 3.19254317e+03  8.36090358e+01  3.17096102e+01  5.80462722e+02]
 [ 8.73416190e+03  5.62130824e+03  8.17863646e+02  6.17779227e+03]
 [ 8.49351723e+03  8.44516005e+03  8.53046872e+03  8.55194176e+03]
 [ 8.63933511e+03  8.55194176e+03  8.46542235e+03  8.55194176e+03]
 [-2.65317918e+00 -2.96919994e+00 -2.6304

In [61]:
from PIL import Image
# Background picture of the environment; each animation frame below starts
# from a copy of it.
canvas = Image.open("images/discover_goldbox_environment.png")
# Sprite that is pasted onto the canvas and also passed as the paste mask.
agent = Image.open("images/robot.png")
# Give the sprite a fully opaque alpha band so it can serve as its own mask.
# NOTE(review): putalpha(255) replaces any transparency already stored in
# robot.png; if the sprite has a transparent background that should be kept,
# converting to "RGBA" instead would preserve it — confirm what is intended.
agent.putalpha(255)

In [62]:
# Pixel offset at which the sprite is pasted for every grid cell, keyed by
# the same (x, y) pair used for the agent position. `y` drives the first
# (horizontal) pixel coordinate and `x` the second (vertical) one; cells are
# spaced 200 px apart starting at 100 px.
locations = {
    (x, y): (200 * y + 100, 200 * x + 100)
    for y in range(10)
    for x in range(10)
}

In [63]:
# Replay a single episode with the greedy policy read from `q_table`,
# drawing one frame per move, and export the frames as an animated GIF.
images = []

# Position of the sprite on the drawing. It is tracked here from the chosen
# actions, independently of the state reported by the environment.
agentX = 9
agentY = 0


def _draw_agent(x, y):
    """Return a fresh copy of the canvas with the sprite pasted at cell (x, y)."""
    frame = canvas.copy()
    frame.paste(agent, locations[(x, y)], agent)
    return frame


# Opening frame: the sprite at its starting cell.
b = _draw_agent(agentX, agentY)
images.append(b)

state = env.reset()
rewards_current_episode = 0

# (dx, dy) applied to the drawn position for each action id; any id that is
# not listed falls back to (+1, 0).
moves = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}

for step in range(max_steps_per_episode):
    action = np.argmax(q_table[state, :])

    # Shift the drawn position and keep it inside the 0..9 grid bounds.
    dx, dy = moves.get(int(action), (1, 0))
    agentX = min(9, max(0, agentX + dx))
    agentY = min(9, max(0, agentY + dy))

    # The frame is recorded before the environment is stepped.
    b = _draw_agent(agentX, agentY)
    images.append(b)

    print("Coordinate ", (agentX, agentY))

    new_state, reward, done = env.step(action)
    rewards_current_episode += reward
    state = new_state

    if done:
        break

# First frame is the base image; the remaining frames are appended to it.
first_frame, later_frames = images[0], images[1:]
first_frame.save(
    'images/discoveredgoldbox.gif',
    save_all=True,
    append_images=later_frames,
    duration=500,
    loop=0,
)

Coordinate  (9, 1)
Coordinate  (9, 2)
Coordinate  (9, 3)
Coordinate  (9, 4)
Coordinate  (9, 5)
Coordinate  (9, 6)
Coordinate  (9, 7)
Coordinate  (9, 8)
Coordinate  (9, 9)
Agent discovered goldbox and it has total battery points:  19391


<img src="images/discoveredgoldbox.gif" width="50%" />