## Import Dependencies

In [1]:
# %pip install "stable-baselines3[extra]"

In [2]:
import os
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## Load environment

In [14]:
# Gym environment ID; passed to gym.make() wherever an environment is created.
enviroment_name = 'CartPole-v0'

In [15]:
# Create the environment; render_mode='human' requests an on-screen display while it runs.
env = gym.make(enviroment_name, render_mode='human')

In [19]:
# Play a few episodes with random actions to see how the untrained
# environment behaves and what scores a random policy achieves.
episodes = 5
for episode in range(1, episodes+1):
    # reset() returns an (observation, info) tuple.
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        # step() returns five values: the episode is over when it either
        # terminates (pole fell / cart left the track) or is truncated
        # (time limit reached), so both flags must be checked.
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:23.0
Episode:2 Score:10.0
Episode:3 Score:13.0
Episode:4 Score:13.0
Episode:5 Score:43.0


In [20]:
env.reset() # demonstration: returns an (observation, info) tuple
# env.close()

(array([-0.01018651, -0.00068204, -0.00457492, -0.01299267], dtype=float32),
 {})

In [21]:
env.step(1) # demonstration: every step() returns (observation, reward, terminated, truncated, info)

(array([-0.01020015,  0.19450521, -0.00483478, -0.30711553], dtype=float32),
 1.0,
 False,
 False,
 {})

## Understanding the environment

In [22]:
env.action_space # two discrete actions, 0 and 1, i.e. left and right

Discrete(2)

In [23]:
env.observation_space.sample() # [cart position (-4.8 to 4.8), cart velocity, pole angle, pole angular velocity]

array([-3.5071158e+00,  3.1625246e+38,  3.4877753e-01, -6.9282992e+37],
      dtype=float32)

## Training the model
#### Model-free RL

In [28]:
# Directory where the TensorBoard logs are saved (make the directories first).
log_path = os.path.join('Training', 'Logs') # saving tf board logs

In [29]:
log_path # display the joined log directory path

'Training\\Logs'

In [30]:
# Build the vectorized training environment and the PPO agent.
# The factory creates its own environment instead of closing over the `env`
# name, which is rebound to the vectorized wrapper by this very assignment.
# Add render_mode='human' inside gym.make(...) to watch training; it is left
# out here because of system resource limitations.
env = DummyVecEnv([lambda: gym.make(enviroment_name)])  # wrapped
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
# defining agent
# policy : multilayer perceptron policy NN
# env : environment (dummy vectorized)
# verbose : = 1 to log results
# tensorboard_log : where to save logs

Using cpu device


In [None]:
# Train the agent for 20,000 timesteps.
model.learn(total_timesteps = 20000)

## Saving the model

In [32]:
# Location under Training/Saved_Models where the trained PPO model is saved.
PPO_Path = os.path.join('Training', 'Saved_Models', 'PPO_Model_CartPole')

In [33]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)

In [34]:
# For demonstration only (not necessary): drop the in-memory model so the
# next cell shows that it can be restored from disk.
del model

In [35]:
# Reload the saved model from PPO_Path and attach it to the existing environment.
model = PPO.load(PPO_Path, env = env) 

In [36]:
PPO_Path # display the model save location

'Training\\Saved_Models\\PPO_Model_CartPole'

## Testing & Evaluation

In [38]:
# Evaluate the model over 10 rendered episodes; the result is a
# (mean reward, standard deviation of reward) pair.
# An average score of 200 is treated as solved.
evaluate_policy(model, env, n_eval_episodes = 10, render = True)
# recorded run: avg = 200, sd = 0

(200.0, 0.0)

In [39]:
#env.close()

In [40]:
# Test model

In [41]:
# Roll out the trained policy for a few rendered episodes.
# `env` is the vectorized wrapper here, so step() returns four values whose
# reward and done entries are length-1 arrays (one entry per wrapped
# environment); index 0 is read explicitly instead of relying on the truth
# value of an array.
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        action, _ = model.predict(obs) # using model here
        obs, rewards, dones, info = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
    print("Episode:{} Score:{}".format(episode, score))

Episode:1 Score[200.]
Episode:2 Score[200.]
Episode:3 Score[200.]
Episode:4 Score[200.]
Episode:5 Score[200.]


In [42]:
#env.close()

In [43]:
#model.predict??

In [44]:
model.predict(obs) # returns a pair: the chosen action(s) and a second value (None in the recorded run)

(array([1], dtype=int64), None)

In [45]:
env.action_space.sample() # draw one random action for comparison

1

In [46]:
env.step(action) # checking our metrics: the vectorized env returns (observations, rewards, dones, infos)

(array([[-0.02771483,  0.1484123 ,  0.03494873, -0.3075838 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

## Viewing Logs in Tensorboard

In [47]:
# Log folder of a single training run.
# NOTE(review): 'PPO_3' looks specific to the author's run history — confirm
# the folder name that exists under log_path before using it.
training_log_path = os.path.join(log_path, 'PPO_3')
training_log_path

'Training\\Logs\\PPO_3'

In [48]:
# !tensorboard --logdir={training_log_path} # use either

In [49]:
# !tensorboard --logdir={training_log_path} --port=8892
# or run in cmd without !
# Reinforcement_learning\Training\Logs> tensorboard --logdir=.
# http://localhost:6006/

## Adding callbacks to training stage

In [50]:
# our training will stop once it hits a benchmark

In [51]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [52]:
# Directory in which the evaluation callback stores the best model it finds.
save_path = os.path.join('Training', 'Saved_Models')

In [53]:
# Stop training as soon as an evaluation reaches the target mean reward.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment rather than the training one, so the
# periodic evaluation episodes do not reset or step the environment the agent
# is currently collecting experience from.
eval_env = DummyVecEnv([lambda: gym.make(enviroment_name)])

# Every `eval_freq` steps the agent is evaluated; whenever a new best mean
# reward is found the model is saved to `save_path` and `stop_callback`
# decides whether training should end.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)


In [54]:
# Fresh PPO agent, to be trained with the evaluation / early-stopping callback.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device


In [None]:
# Train for up to 20,000 timesteps; eval_callback can end training early once
# the reward threshold is reached.
model.learn(total_timesteps = 20000, callback = eval_callback)

## Alternate DQN algorithm

In [56]:
from stable_baselines3 import DQN

In [57]:
# Same setup as before, but using the DQN algorithm instead of PPO.
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device


In [None]:
# Use a fresh callback for the DQN run. The callback used for PPO keeps its
# best-reward record and call counter from that run, so reusing it would
# compare DQN against PPO's best score, and it would also write its best
# model into the same directory as PPO's. A separate sub-directory keeps the
# two algorithms' best models apart.
dqn_eval_callback = EvalCallback(
    DummyVecEnv([lambda: gym.make(enviroment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'DQN'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=dqn_eval_callback)

In [59]:
# Location under Training/Saved_Models where the trained DQN model is saved.
dqn_path = os.path.join('Training', 'Saved_Models', 'DQN_model')

In [60]:
# Persist the trained DQN model to dqn_path.
model.save(dqn_path)
# It can be loaded and used in a similar way to the PPO model.