# Simple Example

In [16]:
import os
import gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


In [24]:
environment_name='CartPole-v1'
env=gym.make(environment_name,new_step_api=True)

In [18]:
episodes=5
for episode in range(episodes+1):
    state=env.reset()
    done=False
    score=0

    while not done:
        gym.make(environment_name,render_mode='human',new_step_api='True')
        env.render()
        action=env.action_space.sample()
        n_state,reward,done,info,_=env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Episode:0 Score:13.0
Episode:1 Score:36.0
Episode:2 Score:20.0
Episode:3 Score:19.0
Episode:4 Score:13.0
Episode:5 Score:10.0


In [19]:
print(type(action))

<class 'int'>


## Training

In [30]:
log_path=os.path.join('Training','Logs')

In [31]:
env=gym.make(environment_name)
env=DummyVecEnv([lambda:env]) #vectorized wrapper
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device


In [None]:


model.learn(total_timesteps=1000)

In [None]:
PPO_Path=os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [None]:
model.save(PPO_Path)

In [None]:
del model

In [None]:
model=PPO.load(PPO_Path,env=env)

## Testing and Evaluation

In [None]:
evaluate_policy(model,env,n_eval_episodes=10,render=True)

In [None]:
episodes=5
for episode in range(episodes+1):
    obs=env.reset()
    done=False
    score=0

    while not done:
        gym.make(environment_name,render_mode='human',new_step_api='True')
        env.render()
        action,_=model.predict(obs)
        obs,reward,done,info,_=env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

In [None]:
#training_log_path=os.path.join(log_path,'PPO_1')
%tensorboard --logdir={Training_log_path}

## Adding callback

In [None]:
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold

In [None]:
save_path=os.path.join('Training','Saved Model')

In [None]:
stop_callback=StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1)
eval_callback=EvalCallback(env,
                           callback_on_new_best=stop_callback,
                           eval_freq=10000,
                           best_model_save_path=save_path,
                           verbose=1)

In [None]:
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=20000,callback=eval_callback)

## Alternative Algorithm

In [None]:
from stable_baselines3 import DQN

In [None]:
model=DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=10000)