# CartPole AI 

### Installing Packages

In [1]:
!pip install stable-baselines3[extra]
!pip install gym





### Importing Packages

In [2]:
import os
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [3]:
# Identifier of the gym environment used throughout the notebook.
env_id = "CartPole-v0"
# Plain (non-vectorized) environment; used for the random-action test below
# and passed to EvalCallback as the evaluation environment.
env = gym.make(env_id)

### Testing the Environment

In [4]:
# Sanity-check the environment: play a few episodes with random actions and
# print the total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # No policy yet: take a random action from the action space.
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted
    # (e.g. KeyboardInterrupt) or an environment call raises.
    env.close()

Episode:1 Score:32.0
Episode:2 Score:13.0
Episode:3 Score:30.0
Episode:4 Score:16.0
Episode:5 Score:27.0


In [5]:
# Draw one random sample from the observation space to inspect its layout.
# The displayed result is a 4-element float32 array; the very large
# magnitudes in two components presumably reflect unbounded dimensions of
# the space rather than values reached in real episodes - TODO confirm.
env.observation_space.sample()

array([ 3.7286401e+00, -2.8970889e+38, -1.8258455e-01,  4.9848321e+37],
      dtype=float32)

### Setting Up the Evaluation Callback

In [7]:
# Output locations, relative to the notebook's working directory.
MODEL_DIR = './models'  # root directory for saved models / checkpoints
LOG_DIR = './logs'      # passed to PPO as tensorboard_log

In [8]:
# Shared save location: used as EvalCallback's best_model_save_path and as
# the target of model.save() further down.
save_path = os.path.join(MODEL_DIR, 'PPO_model')

In [9]:
# Ends training early once the evaluated mean reward exceeds the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)

# Evaluates the agent on `env` every `eval_freq` steps; on each new best
# mean reward it writes the model under `save_path` and triggers
# `stop_callback`.
eval_callback = EvalCallback(
    env,
    eval_freq=10000,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    verbose=1,
)

### Training The Model (PPO)

In [11]:
# Training environment: a single CartPole instance wrapped in a vectorized
# env. The factory creates its own environment instead of closing over the
# global `env`, which this very statement rebinds - the lambda therefore no
# longer depends on when it is called.
env = DummyVecEnv([lambda: gym.make(env_id)])

# PPO agent with an MLP policy; training metrics are written under LOG_DIR
# for TensorBoard.
model = PPO('MlpPolicy', env, verbose=0, tensorboard_log=LOG_DIR)

In [12]:
# Train for at most 30,000 timesteps; eval_callback may end the run earlier
# once its reward threshold is exceeded.
model.learn(
    total_timesteps=30000,
    callback=eval_callback,
)



Eval num_timesteps=10000, episode_reward=181.80 +/- 36.40
Episode length: 181.80 +/- 36.40
New best mean reward!
Eval num_timesteps=20000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
New best mean reward!
Stopping training because the mean reward 200.00  is above the threshold 190


<stable_baselines3.ppo.ppo.PPO at 0x11fabfb0cd0>

In [None]:
# Persist the final trained model at save_path.
# NOTE(review): this cell has no recorded execution count (In [None]), and
# the loading cell below reads the EvalCallback "best_model" checkpoint, not
# the file written here - confirm whether this save is still needed.
model.save(save_path)

In [13]:
# Drop the in-memory model so the next section demonstrates a reload from disk.
del model

### Loading and Evaluation

In [16]:
# Reload the best checkpoint written by EvalCallback. The path is derived
# from save_path (the callback's best_model_save_path) instead of being
# hard-coded, so it cannot drift if MODEL_DIR or save_path changes.
model = PPO.load(os.path.join(save_path, 'best_model'), env=env)

In [17]:
# Evaluate the reloaded model over 10 rendered episodes.
# The displayed tuple is presumably (mean episode reward, std of the reward)
# - verify against the evaluate_policy documentation.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(200.0, 0.0)

In [18]:
# Close the environment after the rendered evaluation above.
# NOTE(review): `env` is reset and stepped again in the next cell after this
# close - presumably close() only tears down the render window here; confirm.
env.close()

In [19]:
# Roll out a single rendered episode with the trained policy.
obs = env.reset()
try:
    while True:
        # deterministic=True matches evaluate_policy's default behaviour;
        # predict() otherwise samples actions stochastically, so the demo
        # could fail even though the evaluation above scored perfectly.
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = env.step(action)
        env.render()
        # `env` is vectorized, so `done` holds one flag per sub-environment;
        # test it explicitly instead of relying on array truthiness.
        if done.any():
            print('info', info)
            break
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()

info [{'terminal_observation': array([-0.47279328, -0.35963345, -0.21927232, -0.5159051 ], dtype=float32)}]


### TensorBoard Logs

In [25]:
# Directory of the training run to inspect in TensorBoard.
# NOTE(review): 'PPO_1' is hard-coded - presumably the sub-directory created
# under LOG_DIR for the first PPO run; later runs may use a different name.
train_log = os.path.join(LOG_DIR, 'PPO_1')

In [26]:
# Launch TensorBoard on the selected run directory. Started through `!`, the
# command occupies this cell until it is interrupted (the saved output
# shows ^C).
!tensorboard --logdir={train_log}

^C
