In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
envs = {}
for e in list(gym.envs.registry.all()):
    envs[e.id] = e

print('Total envs available:', len(envs))

list(envs.keys())[:5]

Total envs available: 103


['CartPole-v0',
 'CartPole-v1',
 'MountainCar-v0',
 'MountainCarContinuous-v0',
 'Pendulum-v1']

In [11]:
action_space = 5
action_space = gym.spaces.Box(
    low=np.array([-action_space, -10]),
    high=np.array([+action_space, 10]), dtype=np.float32
)

Box([ -5. -10.], [ 5. 10.], (2,), float32)

## Stable Baselines

In [47]:
from stable_baselines3 import PPO, DDPG, A2C
import numpy as np
import gym

In [48]:
env = gym.make("CartPole-v1")

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    # VecEnv resets automatically
    # if done:
    #   obs = vec_env.reset()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.1     |
|    ep_rew_mean        | 41.1     |
| time/                 |          |
|    fps                | 931      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.637   |
|    explained_variance | -0.0324  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.53     |
|    value_loss         | 7.01     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 43.4     |
|    ep_rew_mean        | 43.4     |
| time/                 |          |
|    fps                | 1080     |
|    iterations         | 200      |
|    time_elapsed 

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 67.5      |
|    ep_rew_mean        | 67.5      |
| time/                 |           |
|    fps                | 1119      |
|    iterations         | 1400      |
|    time_elapsed       | 6         |
|    total_timesteps    | 7000      |
| train/                |           |
|    entropy_loss       | -0.517    |
|    explained_variance | -6.83e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1399      |
|    policy_loss        | 0.337     |
|    value_loss         | 1.1       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 71.5      |
|    ep_rew_mean        | 71.5      |
| time/                 |           |
|    fps                | 1129      |
|    iterations         | 1500      |
|    time_elapsed       | 6         |
|    total_timesteps    | 7500      |
| train/    

ImportError: 
    Cannot import pyglet.
    HINT: you can install pyglet directly via 'pip install pyglet'.
    But if you really just want to install all Gym dependencies and not have to think about it,
    'pip install -e .[all]' or 'pip install gym[all]' will do it.
    

# Learning: Cart Pole

$\text{Action Space: } \{0: \text{ left, } 1: \text{ right } \} $

$\text{Observation Space: } [\text{Cart Position, Cart Velocity, Pole Angle, Angular Velocity}], \text{ in range } [(-4.8:4.8), (-\infty : \infty), (-0.418:0.418), (-\infty : \infty)]$

$\text{All environments are randomly initialized to values between -0.05 to 0.05.}$

In [6]:
import numpy as np
import gym
import time

In [4]:
env_name = "CartPole-v1"
env = gym.make(env_name)
env.reset() # 

array([-0.00682458, -0.02802546, -0.04673157, -0.0345641 ], dtype=float32)

In [7]:
print("Observation space (state): ", env.observation_space)
print()
print("Action space", env.action_space)

Observation space (state):  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

Action space Discrete(2)


In [7]:
s = time.time()
env.step(env.action_space.sample())
time.time() - s

0.002866983413696289

#### Take Random Actions

In [52]:
env.reset()
for i in range(40):
    action = env.action_space.sample()
    observation, reward, done, truncated, info = env.step(action)
    print(reward)
    env.render()

1.0


If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0


  logger.warn(


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [53]:
observation

array([-0.45570993, -0.75292265,  1.522458  ,  5.461001  ], dtype=float32)

### Train Agent using Q Table

In [11]:
state_space = 4 # number of states
action_space = 2 # number of possible actions

def Qtable(state_space,action_space,bin_size = 30):
    
    bins = [np.linspace(-4.8,4.8,bin_size),
            np.linspace(-4,4,bin_size),
            np.linspace(-0.418,0.418,bin_size),
            np.linspace(-4,4,bin_size)]
    
    q_table = np.random.uniform(low=-1,high=1,size=([bin_size] * state_space + [action_space])) # size = [30, 30, 30, 30, 2]
    return q_table, bins

def Discrete(state, bins):
    index = []
    for i in range(len(state)): index.append(np.digitize(state[i],bins[i]) - 1)
    return tuple(index)

In [9]:
np.random.uniform(size=[2,2])

array([[0.00144776, 0.54035932],
       [0.4512143 , 0.35349524]])

In [13]:
q_table, bins = Qtable(state_space, action_space, 4)

In [15]:
def Q_learning(q_table, bins, episodes = 5000, gamma = 0.95, lr = 0.1, timestep = 100, epsilon = 0.2):
    
    rewards = 0
    steps = 0
    for episode in range(1,episodes+1):
        steps += 1 
        # env.reset() => initial observation
        current_state = Discrete(env.reset(),bins)
      
        score = 0
        done = False
        while not done: 
            if episode%timestep==0: 
                env.render()
            if np.random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[current_state])
            observation, reward, done, truncated, info = env.step(action)
            next_state = Discrete(observation,bins)
            score += reward
            if not done:
                max_future_q = np.max(q_table[next_state])
                current_q = q_table[current_state+(action,)]
                new_q = (1-lr)*current_q + lr*(reward + gamma*max_future_q)
                q_table[current_state+(action,)] = new_q
            current_state = next_state
            
        # End of the loop update
        else:
            rewards += score
            if score > 195 and steps >= 100: print('Solved')
        if episode % timestep == 0: print(reward / timestep)

In [16]:
Q_learning(q_table, bins)

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
