In [2]:
%load_ext autoreload
%autoreload 2
import os  # noqa
import sys  # noqa
import numpy as np  # noqa
import copy  # noqa
import pygame  # noqa
import gymnasium as gym  # noqa
from gymnasium import spaces  # noqa
from gymnasium.envs.registration import register  # noqa
import matplotlib.pyplot as plt  # noqa
from pprint import pprint  # noqa

module_path = os.path.abspath(os.path.join(".."))
sys.path.insert(0, module_path)
from dyna import TabularDynaQ  # noqa
from q_learning import QLearning  # noqa
from env_helper import (
    check_banana_on_floor,
    check_reach_banana_with_chair,
    check_climb_to_reach_banana,
    check_climb_down,
    check_full_model,
)

from environment import (
    MonkeyBananaEnv,
    BananaOnFloorEnv,
    ReachBananaWithChairEnv,
    ClimbToReachBananaEnv,
    ClimbDownEnv,
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
pygame 2.6.1 (SDL 2.28.4, Python 3.11.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [None]:
# If we tweak the planning in dyna to be like prioritised sweeping, we can
# make it much more efficient

In [3]:
# Sanity check for the first sub-task: train a fresh TabularDynaQ agent on
# BananaOnFloorEnv in isolation, then inspect the model it has learned.
env = BananaOnFloorEnv(size=5)
agent = TabularDynaQ(env)
agent.train(100)  # presumably 100 training episodes — confirm in dyna.py

# The checker returns a summary dict (state coverage and model accuracy).
info = check_banana_on_floor(env, agent)
pprint(info)


{'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 125,
 'Unique states in model': 125}


In [4]:
# Sanity check for the second sub-task: train a fresh TabularDynaQ agent on
# ReachBananaWithChairEnv in isolation, then inspect the learned model.
env = ReachBananaWithChairEnv(size=5)
agent = TabularDynaQ(env)
agent.train(100)  # presumably 100 training episodes — confirm in dyna.py

# The checker returns a summary dict (coverage, accuracy, list of errors).
info = check_reach_banana_with_chair(env, agent)
pprint(info)


{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 125,
 'Unique states in model': 125}


In [5]:
# Sanity check for the third sub-task: train a fresh TabularDynaQ agent on
# ClimbToReachBananaEnv in isolation, then inspect the learned model.
env = ClimbToReachBananaEnv(size=5)
agent = TabularDynaQ(env)
agent.train(100)  # presumably 100 training episodes — confirm in dyna.py

# The checker returns a summary dict (coverage, accuracy, list of errors).
info = check_climb_to_reach_banana(env, agent)
pprint(info)


{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 10,
 'Unique states in model': 10}


In [6]:
# Sanity check for the fourth sub-task: train a fresh TabularDynaQ agent on
# ClimbDownEnv in isolation, then inspect the learned model.
env = ClimbDownEnv(size=5)
agent = TabularDynaQ(env)
agent.train(100)  # presumably 100 training episodes — confirm in dyna.py

# The checker returns a summary dict (coverage, accuracy, list of errors).
info = check_climb_down(env, agent)
pprint(info)


{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 10,
 'Unique states in model': 10}


In [7]:
# Curriculum training: a single agent is carried through the four sub-tasks
# in order, so the model it learns accumulates across stages.
# Each entry is (printed title, environment class, model checker).
CURRICULUM_STAGES = [
    ("Banana on floor", BananaOnFloorEnv, check_banana_on_floor),
    ("Reach banana with chair", ReachBananaWithChairEnv, check_reach_banana_with_chair),
    ("Climb to reach banana", ClimbToReachBananaEnv, check_climb_to_reach_banana),
    ("Climb down", ClimbDownEnv, check_climb_down),
]

for stage_idx, (stage_title, env_cls, check_stage) in enumerate(CURRICULUM_STAGES):
    print(stage_title)
    env = env_cls(size=5)
    if stage_idx == 0:
        # The first stage creates the agent; later stages reuse it.
        agent = TabularDynaQ(env)
    else:
        agent.switch_env(env)
    agent.train(100)
    # Presumably drops stage-specific rewards from the learned model so they
    # do not leak into the next stage — confirm in dyna.py.
    agent.clear_rewards_in_model()

    pprint(check_stage(env, agent))


Banana on floor
{'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 125,
 'Unique states in model': 125}
Reach banana with chair
{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 125,
 'Unique states in model': 125}
Climb to reach banana
{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 10,
 'Unique states in model': 130}
Climb down
{'Errors': [],
 'Missing states': 0,
 'Model accuracy': 1.0,
 'Total states': 10,
 'Unique states in model': 130}


In [8]:
# Compare the curriculum-trained model against the full task's state space.
check_full_model(env, agent)

# The 20 states missing from the model might be a problem.
# Or will they? During the final model-based training the agent should be
# able to learn those transitions on the go as well — to be verified.

{'Total states': 150,
 'Unique states in model': 130,
 'Missing states': {121122,
  121132,
  121142,
  121152,
  222112,
  222132,
  222142,
  222152,
  323112,
  323122,
  323142,
  323152,
  424112,
  424122,
  424132,
  424152,
  525112,
  525122,
  525132,
  525142},
 'Model accuracy': 1.0,
 'Errors': []}

### Training with model


In [9]:
def train_final_with_model(agent, iterations=100, n_planning_steps=200, size=5):
    """Fine-tune a copy of a pre-trained agent on the full MonkeyBananaEnv task.

    The input agent is deep-copied first, so the caller's agent (and the model
    it learned during the curriculum) is left untouched.

    Args:
        agent: A trained agent exposing ``switch_env``, ``n_planning_steps``
            and ``train_for_final`` (e.g. the curriculum-trained TabularDynaQ).
        iterations: Currently unused; kept so existing callers keep working.
            NOTE(review): ``train_for_final`` is called without arguments, so
            this value never reaches the training loop — confirm whether it
            should be forwarded.
        n_planning_steps: Value assigned to the copy's ``n_planning_steps``
            before training (default 200, the previously hard-coded value).
        size: ``size`` argument passed to ``MonkeyBananaEnv`` (default 5, the
            previously hard-coded value).

    Returns:
        The fine-tuned copy of ``agent``.
    """
    agent_copy = copy.deepcopy(agent)
    agent_copy.switch_env(MonkeyBananaEnv(size=size))
    agent_copy.n_planning_steps = n_planning_steps
    # The return value of train_for_final was never used, so it is not kept.
    agent_copy.train_for_final()
    return agent_copy

In [10]:
# Fine-tune on the full task starting from the curriculum-trained agent.
# `agent` itself is not modified: the function trains a deep copy of it.
final_agent = train_final_with_model(agent)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124


### Training from scratch


In [16]:
def train_final_without_model(iterations=100):
    """Train a brand-new TabularDynaQ agent directly on the full task.

    This is the baseline for the curriculum approach: the agent starts with
    no pre-trained model and only ever sees ``MonkeyBananaEnv(size=5)``.

    Args:
        iterations: Value passed straight to ``TabularDynaQ.train``.

    Returns:
        The trained agent.
    """
    scratch_agent = TabularDynaQ(MonkeyBananaEnv(size=5))
    scratch_agent.train(iterations)
    return scratch_agent


# Baseline agent: trained on the full task only, with the default 100 iterations.
final_agent_without_model = train_final_without_model()

In [None]:
# NOTE(review): test_final_agent is defined in a later cell of this notebook.
# On a fresh kernel this cell raises NameError unless that cell is run first;
# consider moving the definition above this point.
test_final_agent(final_agent_without_model)

In [14]:
import time  # noqa

# Watch one greedy rollout of the fine-tuned agent in a rendered window.
# The step cap mirrors the one used in test_final_agent: a greedy policy can
# cycle between states forever, which would otherwise hang this cell.
MAX_RENDER_STEPS = 20

env = MonkeyBananaEnv(size=5, render_mode="human")
try:
    state, _ = env.reset()
    done = False
    steps = 0
    while not done and steps < MAX_RENDER_STEPS:
        q_values = final_agent.Q[state]
        # Greedy action: the key with the highest Q-value for this state.
        action = max(q_values, key=q_values.get)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
        steps += 1
finally:
    # Always release the render window, even if the rollout raises.
    # Assumes MonkeyBananaEnv provides the standard Gymnasium close().
    env.close()

In [11]:
# Test on many situations and get the accuracy
def test_final_agent(agent, n_episodes=100, max_steps=20, size=5):
    """Estimate how often the greedy policy of ``agent`` finishes an episode.

    Runs ``n_episodes`` rollouts on a fresh ``MonkeyBananaEnv(size=size)``,
    always picking the action with the highest Q-value, and counts the
    episodes that end within ``max_steps`` steps.

    Args:
        agent: Object with a ``Q`` mapping of state -> {action: value}.
        n_episodes: Number of evaluation episodes (default 100).
        max_steps: Step budget per episode; protects against greedy policies
            that loop forever (default 20).
        size: ``size`` argument passed to ``MonkeyBananaEnv`` (default 5).

    Returns:
        Fraction of episodes (between 0.0 and 1.0) that finished in time.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        raise ValueError("n_episodes must be at least 1")

    env = MonkeyBananaEnv(size=size)
    num_correct = 0
    for _ in range(n_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps:
            q_values = agent.Q[state]
            # Greedy action: the key with the highest Q-value for this state.
            action = max(q_values, key=q_values.get)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state
            steps += 1
        # NOTE(review): a truncated episode also counts as a success here;
        # confirm the environment never truncates before the step budget.
        if done:
            num_correct += 1

    return num_correct / n_episodes

In [12]:
# Fraction of greedy evaluation episodes the fine-tuned agent finishes in time.
test_final_agent(final_agent)

0.97