# MuZero CartPole VPython

This notebook runs DeepMind's MuZero reinforcement-learning algorithm in the CartPole environment and visualizes the games with VPython.


In [None]:
from vpython import canvas, box, cylinder, vector, color, rate
from helper import SharedStorage, MuZeroConfig, ReplayBuffer, make_CartPole_config, CartPole, CartPoleNetwork
from self_play import run_selfplay, run_eval
from training import train_network

class CartPoleVPython(CartPole):
    """CartPole game that mirrors each environment state in a VPython scene.

    The cart is drawn as a green box and the pole as a cylinder. Both shapes
    and the pole's current drawing angle are kept in the module-level globals
    ``b``, ``c`` and ``a`` so that a new game can hide the shapes left behind
    by the previous one.

    NOTE(review): the drawing code assumes the usual Gym CartPole observation
    layout (index 0 = cart position, index 2 = pole angle) -- confirm against
    ``helper.CartPole``.
    """

    def __init__(self, discount: float):
        """Start a new game and draw its first observation.

        Args:
            discount: Forwarded unchanged to ``CartPole.__init__``.
        """
        global b, c, a
        super().__init__(discount)
        observation = self.observations[0]

        # Hide the shapes of the previous game, if any. Looking them up through
        # globals() keeps the very first construction from raising NameError
        # when no shapes have been created yet.
        for stale in (globals().get('b'), globals().get('c')):
            if stale is not None:
                stale.visible = False

        b = box(pos=vector(0, -0.5, 0), color=color.green)
        c = cylinder(pos=vector(0, 0, 0), axis=vector(0, 4, 0), radius=0.1)
        b.pos.x = c.pos.x = observation[0]

        # Use the same sign convention as step(): the drawn angle is the
        # negated observation angle. The original drew the first frame with
        # the opposite sign, so the pole started out leaning the wrong way.
        a = -observation[2]
        c.rotate(angle=a, axis=vector(0, 0, 1))

    def step(self, action) -> int:
        """Apply ``action`` to the environment and redraw the cart and pole.

        Args:
            action: Object whose ``index`` attribute is passed to
                ``self.env.step``.

        Returns:
            The reward returned by the environment for this step.
        """
        global b, c, a
        observation, reward, done, _ = self.env.step(action.index)
        self.observations += [observation]
        self.done = done

        # Cap the animation loop at 1000 iterations per second.
        rate(1000)
        b.pos.x = c.pos.x = observation[0]

        # Rotate by the difference between the new and the currently drawn
        # angle instead of undoing the old rotation and applying a new one.
        target = -observation[2]
        c.rotate(angle=target - a, axis=vector(0, 0, 1))
        a = target

        return reward

def make_CartPoleVPython_config() -> MuZeroConfig:
    """Build the MuZero configuration used for the VPython CartPole game.

    Returns:
        A ``MuZeroConfig`` that plays ``CartPoleVPython`` with a
        ``CartPoleNetwork`` and the hyper-parameters listed below.
    """

    def visit_softmax_temperature(num_moves, training_steps):
        """Return a constant temperature of 1.0, ignoring both arguments."""
        return 1.0

    network_settings = {
        'action_size': 2,
        'state_size': 4,
        'representation_size': 4,
        'max_value': 500,
    }

    settings = dict(
        game=CartPoleVPython,
        nb_training_loop=20,
        nb_episodes=20,
        nb_epochs=20,
        network_args=network_settings,
        network=CartPoleNetwork,
        action_space_size=2,
        max_moves=1000,
        discount=0.99,
        dirichlet_alpha=0.25,
        # An odd number of simulations performs better in eval mode.
        num_simulations=11,
        batch_size=512,
        td_steps=10,
        visit_softmax_temperature_fn=visit_softmax_temperature,
        lr=0.05,
    )
    return MuZeroConfig(**settings)


# MuZero training is split into two independent parts: Network training and
# self-play data generation.
# These two parts only communicate by transferring the latest network checkpoint
# from the training to the self-play, and the finished games from the self-play
# to the training.
# Per-loop score history, filled in by muzero() and read by the plotting cell.
train_scores = []
eval_scores = []


def muzero(config: MuZeroConfig):
    """Alternate self-play, network training and evaluation.

    Each of the ``config.nb_training_loop`` iterations plays
    ``config.nb_episodes`` self-play episodes, trains for ``config.nb_epochs``
    epochs, runs 20 evaluation episodes, prints both scores and appends them
    to the module-level ``train_scores`` / ``eval_scores`` lists.

    Args:
        config: Configuration providing the networks, the optimizer and the
            loop sizes.

    Returns:
        The value of ``storage.latest_network()`` after the last iteration.
    """
    eval_episodes = 20

    network = config.new_network()
    uniform_network = config.uniform_network()
    optimizer = config.new_optimizer()
    storage = SharedStorage(network, uniform_network, optimizer)
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)

        selfplay_score = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)
        evaluation_score = run_eval(config, storage, eval_episodes)

        # Record the history first; the progress report follows.
        train_scores.append(selfplay_score)
        eval_scores.append(evaluation_score)

        print("Train score:", selfplay_score)
        print("Eval score:", evaluation_score)
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    return storage.latest_network()

# Create the canvas that the shapes below are drawn on.
scene = canvas(range=5)

# Initial cart (green box) and pole (cylinder); CartPoleVPython hides these
# and draws fresh ones whenever a new game starts.
b = box(
    pos=vector(0, -0.5, 0),
    color=color.green,
)
c = cylinder(
    pos=vector(0, 0, 0),
    axis=vector(0, 4, 0),
    radius=0.1,
)
a = 0  # angle the pole is currently drawn at

# Thin white cylinder spanning x = -10 .. 10 at the height of the cart.
wire = cylinder(
    pos=vector(-10, -0.5, 0),
    axis=vector(20, 0, 0),
    radius=0.02,
    color=color.white,
)
display(scene)

# Train MuZero; the trailing semicolon is kept from the original cell.
config = make_CartPoleVPython_config()
latest_network = muzero(config);

In [None]:
import matplotlib.pyplot as plt

# Plot the per-loop self-play score (red) and evaluation score (blue).
plt.plot(train_scores, color="red")
plt.plot(eval_scores, color="blue")
# One tick per recorded loop. The tick count was hard-coded to 20, which is
# only correct while exactly 20 loops have been recorded.
plt.xticks(range(max(len(train_scores), len(eval_scores))))
plt.xlabel('Loops')
plt.ylabel('Score')
plt.show()

In [None]:
from vpython import canvas, box, cylinder, vector, color
# play_game was called without being imported or defined anywhere in this
# notebook. NOTE(review): presumably it lives in self_play next to
# run_selfplay / run_eval -- confirm the module before relying on this import.
from self_play import play_game

# Replay one game with the trained network on a fresh canvas. This cell relies
# on `config`, `latest_network` and the globals `c` / `a` from the training
# cell, so that cell has to be run first.
scene = canvas(range=5)
b = box(pos=vector(0,-0.5,0),color=color.green)
display(scene)
play_game(config, latest_network);