In [21]:
import time
import gym
import numpy as np
from gym import Wrapper
from rl.core import Env

ENV_NAME = "CartPole-v0"
env = gym.make(ENV_NAME)

def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

def try_policy(policy, n_episodes=5):
    totals = []
    for episode in range(n_episodes):
        episode_rewards = 0
        obs = env.reset()
        for step in range(1000):
            action = policy(obs)
            obs, reward, done, info = env.step(action)
            episode_rewards += reward
            env.render()
            time.sleep(1 / 60)
            if done:
                break
        totals.append(episode_rewards)
        time.sleep(1)
    env.close()
    print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))

# try_policy(basic_policy)

In [22]:
from keras import Sequential
from keras.layers import Dense, Softmax, Flatten
from keras.activations import relu
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy
from rl.agents import DQNAgent

n_inputs = 4
n_hidden = 4
n_outputs = 2
n_actions = env.action_space.n

nn = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16, activation=relu),
    Dense(16, activation=relu),
    Dense(16, activation=relu),
    Dense(n_actions),
    Softmax()
])

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=nn,
    nb_actions=n_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy
)
dqn.compile(Adam(), metrics=["mae"])

# dqn.fit(env, nb_steps=500000, visualize=False, verbose=0)
# 
# dqn.save_weights("dqn_{}_weights.h5f".format(ENV_NAME), overwrite=True)

# dqn.test(env, nb_episodes=4, visualize=True)
# 
# env.close()

In [33]:
class MyEnv(Wrapper):
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        super().__init__(self.env)
    
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        # Penalize large values in the vector (pole should be upright, cart in the center)
        reward = 2 * reward - pow(obs[0], 3) - pow(obs[2], 2)
        return obs, reward, done, info

env = MyEnv()

dqn.load_weights("dqn_{}_weights_4.h5f".format(ENV_NAME))

dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)

dqn.save_weights("dqn_{}_weights_4.h5f".format(ENV_NAME), overwrite=True)

dqn.test(env, nb_episodes=20, visualize=True)

env.close()


Training for 100000 steps ...
Interval 1 (0 steps performed)
180 episodes - episode_reward: 108.881 [17.824, 332.176] - loss: 0.756 - mean_absolute_error: 0.575 - mean_q: 0.886

Interval 2 (10000 steps performed)
171 episodes - episode_reward: 117.637 [21.835, 311.637] - loss: 1.221 - mean_absolute_error: 0.741 - mean_q: 0.902

Interval 3 (20000 steps performed)
191 episodes - episode_reward: 104.162 [23.803, 345.409] - loss: 1.675 - mean_absolute_error: 0.901 - mean_q: 0.919

Interval 4 (30000 steps performed)
183 episodes - episode_reward: 108.862 [23.781, 342.677] - loss: 2.120 - mean_absolute_error: 1.058 - mean_q: 0.930

Interval 5 (40000 steps performed)
180 episodes - episode_reward: 110.980 [21.807, 355.943] - loss: 2.521 - mean_absolute_error: 1.214 - mean_q: 0.942

Interval 6 (50000 steps performed)
180 episodes - episode_reward: 111.138 [17.834, 313.398] - loss: 2.607 - mean_absolute_error: 1.271 - mean_q: 0.952

Interval 7 (60000 steps performed)
176 episodes - episode_rewa