# 1. Import dependencies


In [None]:
# Execute this first: installs Stable-Baselines3 straight from its GitHub
# repository (the in-development version rather than a tagged release).
%pip install git+https://github.com/DLR-RM/stable-baselines3
# Then install the package with extras (gymnasium, atari, etc)
# NOTE(review): neither install is version-pinned, so results may differ
# between runs of this notebook — consider pinning.
%pip install stable-baselines3[extra]

In [None]:
import os
import gymnasium as gym
import time
# Algorithm
from stable_baselines3 import PPO
# Lets us vectorize the environment for parallel training
from stable_baselines3.common.vec_env import DummyVecEnv
# Makes it easier to evaluate how our model is performing
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment


In [None]:
# Identifier of the Gymnasium task used throughout this notebook.
environment_name = 'CartPole-v1'
# render_mode='human' asks Gymnasium to draw the simulation on screen.
env = gym.make(
    environment_name,
    render_mode='human',
)
# The environment API used below: reset(), render(), step(), close()

In [None]:
# Reset the environment; reset() returns a tuple of
# (initial observation, info dict).
(state, _) = env.reset()
# The observation holds:
# cart position, cart velocity
# pole angle, pole angular velocity

# # ENVIRONMENT INFORMATION
# # observation space limits
# env.observation_space

# # upper limit
# env.observation_space.high

# # lower limit
# env.observation_space.low

# # action space
# env.action_space

# # all the specs
# env.spec

# # maximum number of steps per episode
# env.spec.max_episode_steps

# # reward threshold per episode
# env.spec.reward_threshold

# SIMULATE ENVIRONMENT
episodeNumber = 5
# Upper bound on the number of steps simulated per episode in this demo.
timeSteps = 100

# An episode is a full run of the game within the environment.
# Some environments have a fixed episode length (e.g. CartPole), others are
# continuous (e.g. Breakout: play until you run out of lives).
for episodeIndex in range(episodeNumber):
    # Start every episode from a fresh initial state. Four key components are
    # involved: agent, action, environment and observations + rewards.
    env.reset()
    print(episodeIndex)
    # render the environment
    env.render()
    appendedObservations = []
    score = 0

    for timeIndex in range(timeSteps):
        random_action = env.action_space.sample()
        # Apply an action to the environment. step() returns, in order:
        #   observation -> next set of observations, e.g.
        #                  array([ 0.1288,  0.5601, -0.2217, -1.1977], dtype=float32)
        #   reward      -> reward for this step, e.g. 1.0
        #   terminated  -> whether the episode reached a terminal state
        #   truncated   -> whether the episode was cut short (e.g. time limit)
        #   info        -> dict with auxiliary information
        observation, reward, terminated, truncated, info = env.step(
            random_action)

        appendedObservations.append(observation)
        score += reward

        # Slow the loop down so the rendering can be followed by eye.
        time.sleep(0.1)

        # The episode is over when it either terminates or is truncated;
        # stepping past that point without a reset is not valid.
        if terminated or truncated:
            break

    # Report the score for every episode, including those that reached the
    # timeSteps cap without terminating.
    print('Episode: {} Score: {}'.format(episodeIndex, score))
    time.sleep(1)
# Close render frame
env.close()

# 3. Train an RL Model


In [None]:
# Directory where TensorBoard event files are written.
log_path = os.path.join('Training', 'Logs')

# DummyVecEnv expects a *list* of zero-argument factories, one per environment
# (the original passed a set literal, which only worked by accident).
# Building the environment inside the factory also avoids closing over the
# `env` name that is rebound on this very line.
env = DummyVecEnv([lambda: gym.make(environment_name)])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train the agent for 20k environment steps.
model.learn(
    total_timesteps=20_000,
)

# 4. Save and reload model


In [None]:
# Location (relative to the working directory) of the saved PPO checkpoint.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Model_Cartpole',
)

In [None]:
# Save the current model to the location held in PPO_Path
model.save(PPO_Path)

In [None]:
# Remove the in-memory model so the reload in the next cell has to read the
# saved copy from disk.
del model

In [None]:
# Reload the saved model from PPO_Path and attach the existing environment
model = PPO.load(PPO_Path, env=env)

In [None]:
# Continue training the reloaded agent for another 10k environment steps.
model.learn(
    total_timesteps=10_000,
)

For some algorithms the training log also reports:

-   ep_len_mean: the average number of steps an episode lasted before it ended
-   ep_rew_mean: the average total reward the agent accumulated per episode


# 5. Evaluation


In [None]:
# The training environment was created without a render mode, so asking
# evaluate_policy to render it draws nothing. Evaluate on a dedicated
# environment created with render_mode='human' instead, and always close it
# afterwards so the render window is released.
eval_env = gym.make(environment_name, render_mode='human')
try:
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=10, render=True)
finally:
    eval_env.close()
# Display (mean episode reward, standard deviation) as the cell output.
mean_reward, std_reward

In [None]:
# Close the environment to release its resources
env.close()

# 6. Testing the agent


In [None]:
# Test model in five episodes
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')
environment_name = 'CartPole-v1'
# DummyVecEnv expects a *list* of zero-argument environment factories (the
# original passed a set literal); the environment is built inside the factory
# so the lambda does not depend on the `env` name being rebound here.
env = DummyVecEnv(
    [lambda: gym.make(environment_name, render_mode='human')])
model = PPO.load(PPO_Path, env=env)
episodes = 5

for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0
    while not done:
        action, _states = model.predict(obs)  # Use our model here
        # A vectorized env returns one entry per wrapped environment; there is
        # exactly one here, so read index 0 to get plain scalars.
        obs, rewards, dones, infos = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
        env.render()
    print('Episode: {} Score: {}'.format(episode, score))

In [None]:
# Close the rendering environment used for the test episodes
env.close()

# 7. Viewing Logs in Tensorboard


In [None]:
# Root folder holding every TensorBoard run written by this notebook.
log_path = os.path.join('Training', 'Logs')
# Folder of the specific run to inspect (run name is hard-coded).
training_log_path = os.path.join('Training', 'Logs', 'PPO_2')

In [None]:
# Display the resolved log directory as the cell output
training_log_path

In [None]:
# Launch TensorBoard on the selected run directory via a shell command.
# NOTE(review): this presumably keeps the cell busy until interrupted — confirm.
!tensorboard --logdir={training_log_path}
# Go to http://localhost:6006/

# 8. Adding a callback to the training stage


In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [None]:
# Folder that receives the best model found during evaluation.
save_path = os.path.join(
    'Training',
    'Saved Models',
)

In [None]:
# Stop training as soon as an evaluation reaches a mean reward of 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluate on a fresh, separate environment: the `env` left over from the
# test cell has already been closed and was created for on-screen rendering,
# so it is not suitable for periodic background evaluation.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])
# Every `eval_freq` steps the callback evaluates the agent, saves a new best
# model under `best_model_save_path`, and then triggers `stop_callback`.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [None]:
# Build a fresh, headless training environment: the `env` left over from the
# test cell was created with render_mode='human' and has already been closed,
# so training on it would either fail or render every single step.
env = DummyVecEnv([lambda: gym.make(environment_name)])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train for up to 20k steps; the evaluation callback may end training early
# once its reward threshold is reached.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

# 9. Changing Policies


In [None]:
# Custom network architecture: separate lists of hidden-layer sizes under the
# 'pi' and 'vf' keys, each with four layers of 128 units.
net_arch = {
    'pi': [128] * 4,
    'vf': [128] * 4,
}

In [None]:
# Same PPO setup as before, but with the custom layer sizes forwarded to the
# policy through policy_kwargs.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

In [None]:
# Train the custom-architecture agent with the same evaluation callback.
model.learn(
    total_timesteps=20_000,
    callback=eval_callback,
)

# 10. Using an Alternate Algorithm

In [None]:
from stable_baselines3 import DQN

In [None]:
# Fresh environment (no rendering) for the DQN agent.
env = gym.make(environment_name)
# Make DQN model, logging to the same TensorBoard directory.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Train the DQN agent for 200k environment steps.
model.learn(
    total_timesteps=200_000,
)

In [None]:
# Write the trained DQN agent to disk.
dqn_path = os.path.join(
    'Training',
    'Saved Models',
    'DQN_model',
)
model.save(dqn_path)

In [None]:
# Load the model saved at dqn_path and attach the current environment to it
model = DQN.load(dqn_path, env=env)

In [None]:
# Train again the same DQN model.
# The model was loaded with its environment already attached (env=env), so no
# new environment is created here: a freshly made one would never be used by
# model.learn() and would only be left open.
model.learn(total_timesteps=200000)

In [None]:
# Train best DQN model
env = gym.make(environment_name)
# Evaluate on a separate environment instance so that evaluation episodes do
# not reset the environment the agent is currently training on.
eval_env = gym.make(environment_name)
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             verbose=1)
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_best')
# Nothing earlier in this notebook writes 'DQN_best', so on a fresh run it
# does not exist yet. Resume from it when present, otherwise start from the
# 'DQN_model' checkpoint saved above.
# NOTE(review): assumes SB3 appends '.zip' to extension-less save paths — confirm.
previous_dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')
if os.path.exists(dqn_path + '.zip'):
    resume_path = dqn_path
else:
    resume_path = previous_dqn_path
model = DQN.load(resume_path, env=env)
model.learn(total_timesteps=200000, callback=eval_callback)
model.save(dqn_path)
# Release the evaluation environment once training is finished.
eval_env.close()

In [None]:
# Test agent
# Test model in five episodes
# dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_best')
environment_name = 'CartPole-v1'
# DummyVecEnv expects a *list* of zero-argument environment factories (the
# original passed a set literal); the environment is built inside the factory
# so the lambda does not depend on the `env` name being rebound here.
env = DummyVecEnv(
    [lambda: gym.make(environment_name, render_mode='human')])
model = DQN.load(dqn_path, env=env)
episodes = 5

for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0.0
    while not done:
        action, _states = model.predict(obs)  # Use our model here
        # A vectorized env returns one entry per wrapped environment; there is
        # exactly one here, so read index 0 to get plain scalars.
        obs, rewards, dones, infos = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
        env.render()
    print('Episode: {} Score: {}'.format(episode, score))

In [None]:
# Close the rendering environment from the previous cell before replacing it,
# so its window is released instead of being left open.
env.close()
env = gym.make(environment_name, render_mode='human')
# Returns (mean episode reward, standard deviation) over 10 episodes.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the evaluation environment
env.close()