In [37]:
import numpy as np
from environment.discover_goldbox import DiscoverGoldbox

In [38]:
# Create the project-local DiscoverGoldbox environment. Later cells drive it
# through reset(), step() and its action_space / observation_space attributes.
env = DiscoverGoldbox()

In [39]:
# Table dimensions come straight from the environment's discrete spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Q-table: one row per state, one column per action, every estimate starts at 0.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [40]:
# --- Training schedule ---
num_episodes = 1_000          # number of episodes the training loop runs
max_steps_per_episode = 40    # hard cap on steps inside a single episode

# --- Q-learning update ---
learning_rate = 0.1           # weight given to the new estimate in each update
discount_rate = 0.99          # weight of the best next-state value in the target

# --- Epsilon-greedy exploration ---
# The exploration rate starts at the maximum and is decayed exponentially
# towards the minimum once per episode by the training loop.
max_exploration_rate = 1.0
min_exploration_rate = 0.1
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [41]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# rewards_all_episodes collects the summed reward of every episode, in order.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # A single uniform draw per step decides explore vs. exploit.
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            # Explore: let the environment's action space pick an action.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest current estimate.
            action = np.argmax(q_table[state, :])

        new_state, reward, done = env.step(action)

        # Blend the previous estimate with the bootstrapped one-step target.
        old_value = q_table[state, action]
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = old_value * (1 - learning_rate) + learning_rate * td_target

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exponential decay of the exploration rate, indexed by the episode number.
    exploration_span = max_exploration_rate - min_exploration_rate
    exploration_rate = min_exploration_rate + exploration_span * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

Agent discovered goldbox and it has total battery points:  18377
Agent discovered goldbox and it has total battery points:  18985
Agent discovered goldbox and it has total battery points:  19289
Agent discovered goldbox and it has total battery points:  18376
Agent discovered goldbox and it has total battery points:  18167
Agent discovered goldbox and it has total battery points:  18776
Agent discovered goldbox and it has total battery points:  18987
Agent discovered goldbox and it has total battery points:  19087
Agent discovered goldbox and it has total battery points:  19180
Agent discovered goldbox and it has total battery points:  18280
Agent discovered goldbox and it has total battery points:  18777
Agent discovered goldbox and it has total battery points:  19079
Agent discovered goldbox and it has total battery points:  18472
Agent discovered goldbox and it has total battery points:  19380
Agent discovered goldbox and it has total battery points:  18172
Agent discovered goldbox and it has total bat

In [42]:
# Report the average episode reward for each consecutive window of up to
# 1000 episodes, then print the learned Q-table.
episodes_per_window = 1000
rewards_array = np.array(rewards_all_episodes)

# Split on explicit integer boundaries (1000, 2000, ...) instead of passing
# the float `num_episodes/1000` as a section count: that form raises when
# fewer than 1000 episodes were run and silently produces wrongly sized
# windows when the episode count is not a multiple of 1000.
window_boundaries = np.arange(episodes_per_window, len(rewards_array), episodes_per_window)
rewards_per_thousand_episodes = np.split(rewards_array, window_boundaries)

count = 0
print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
for r in rewards_per_thousand_episodes:
    if len(r) == 0:
        # Nothing was recorded (no episodes run); there is no average to show.
        continue
    # Label each row with the number of episodes completed so far and divide
    # by the real window length so a shorter final window is averaged correctly.
    count += len(r)
    print(f"{count: <5}: {np.sum(r / len(r))}")

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : 4423.875

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[-2.56341476e+00 -2.45050475e+00 -2.65863526e+00 -2.50489356e+00]
 [-2.71867950e+00 -2.83307007e+00 -2.76283858e+00 -3.04077438e+00]
 [-3.27444586e+00 -3.40843845e+00 -3.16585420e+00 -3.85744322e+00]
 [-4.03372779e+00 -3.73961789e+00 -3.74153541e+00  2.40211269e+01]
 [-4.38232033e+00 -4.57942035e+00 -4.39746823e+00  3.95738514e+02]
 [ 1.10224065e+03  8.41520382e+01  4.34608634e+01  1.90180515e+03]
 [ 3.30339794e+03  2.95390997e+03  7.38744615e+02  7.53239268e+03]
 [ 8.73950721e+03  7.79283078e+03  5.88107535e+03  7.60154745e+03]
 [ 8.13804945e+03  6.30713075e+03  8.61740297e+03  8.02282962e+03]
 [ 8.63933511e+03  8.55176391e+03  8.47815977e+03  8.55176621e+03]
 [-2.26889436e+00 -2.38170854e+00 -2.42784845e+00 -2.34377439e+00]
 [-8.29547198e+01 -2.76414808e+00 -2.69590547e+00 -3.16818265e+00]
 [-9.96321372e+01 -3.25180494e+00 -3.29328912e+00 -3.84103956e+00]
 [-9.6967

In [43]:
from PIL import Image
# Background picture of the environment; each animation frame starts as a copy of it.
canvas = Image.open("images/discover_goldbox_environment.png")
# Robot sprite that is pasted onto the canvas at the agent's position.
agent = Image.open("images/robot.png")
# Set the sprite's alpha channel to 255 in place; the sprite itself is later
# passed as the mask argument of paste().
agent.putalpha(255)

In [44]:
# Maps an (x, y) agent coordinate to the pixel offset at which the sprite is
# pasted. The first pixel component is driven by y and the second by x; each
# step along an axis is 200 px, starting 100 px in from the edge.
locations = {
    (row, col): (100 + 200 * col, 100 + 200 * row)
    for col in range(10)
    for row in range(10)
}

In [45]:
# Replay one episode with the greedy policy from q_table, drawing a frame per
# step, and export the frames as an animated GIF.
images = []

# Drawn position of the robot. It is tracked by hand in this cell rather than
# read back from the environment.
# NOTE(review): assumes env.reset() places the agent at (9, 0) -- confirm.
agentX, agentY = 9, 0

# First frame: the robot at its starting cell.
b = canvas.copy()
b.paste(agent, locations[(agentX, agentY)], agent)
images.append(b)

state = env.reset()
rewards_current_episode = 0

# (dX, dY) applied to the drawn position for actions 0, 1 and 2; every other
# action moves +1 along X.
moves = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}

for step in range(max_steps_per_episode):
    # Always exploit: pick the highest-valued action for the current state.
    action = np.argmax(q_table[state, :])

    # Shift the drawn position and keep it inside the 0..9 range on both axes.
    dX, dY = moves.get(int(action), (1, 0))
    agentX = min(9, max(0, agentX + dX))
    agentY = min(9, max(0, agentY + dY))

    b = canvas.copy()
    b.paste(agent, locations[(agentX, agentY)], agent)
    images.append(b)

    print("Coordinate ", (agentX, agentY))

    new_state, reward, done = env.step(action)

    rewards_current_episode += reward
    state = new_state

    if done:
        break

# Write all frames to one looping GIF, 500 ms per frame.
images[0].save(
    'images/discoveredgoldbox.gif',
    save_all=True,
    append_images=images[1:],
    duration=500,
    loop=0,
)

Coordinate  (9, 1)
Coordinate  (9, 2)
Coordinate  (9, 3)
Coordinate  (9, 4)
Coordinate  (9, 5)
Coordinate  (9, 6)
Coordinate  (9, 7)
Coordinate  (9, 8)
Coordinate  (9, 9)
Agent discovered goldbox and it has total battery points:  19391


<img src="images/discoveredgoldbox.gif" width="50%" />