In [1]:
import random

import gym

In [2]:
# Create the CartPole-v0 environment that every later cell interacts with.
environment = gym.make("CartPole-v0")

In [5]:
# Baseline: play a few episodes with uniformly random actions so there is a
# reference score to compare the trained agent against.
num_episodes = 5
for epoch in range(1, num_episodes + 1):
    done = False
    score = 0
    # Start a fresh episode; the returned observation is not needed because
    # the random policy never looks at the state.
    environment.reset()

    while not done:
        environment.render()
        # Pick among all valid action indices rather than a hard-coded [0, 1],
        # so the loop stays correct for any discrete action space.
        action = random.randrange(environment.action_space.n)
        # step() is unpacked as a 4-tuple; only the reward and the done flag
        # are used here.
        _, reward, done, _ = environment.step(action)
        score += reward

    print(f'Epoch {epoch} finshed with a total score of {score}...')

Epoch 1 finshed with a total score of 13.0...
Epoch 2 finshed with a total score of 26.0...
Epoch 3 finshed with a total score of 18.0...
Epoch 4 finshed with a total score of 21.0...
Epoch 5 finshed with a total score of 29.0...


In [8]:
# Length of the observation vector produced by the environment; the network's
# input layer has to accept exactly this many values.
observation_shape = environment.observation_space.shape
state_size = observation_shape[0]
print('State size', state_size)

State size 4


In [10]:
# How many distinct actions the environment accepts; the network gets one
# output node per action.
action_space = environment.action_space
num_actions = action_space.n
print('Total Number of Actions', num_actions)

Total Number of Actions 2


In [78]:
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential

def build_model(input_shape, num_outputs, hidden_units=(6,)):
    """Build a small fully connected network.

    The network flattens its input, applies one ReLU ``Dense`` layer per entry
    in ``hidden_units``, and ends with a linear ``Dense`` layer.

    Args:
        input_shape: Shape of a single input sample, forwarded to ``Flatten``
            as ``input_shape``.
        num_outputs: Number of units in the final linear layer.
        hidden_units: Width of each hidden layer, in order. A single int is
            treated as one hidden layer. The default ``(6,)`` gives the
            original single 6-unit hidden layer.

    Returns:
        The assembled (not yet compiled) ``Sequential`` model.
    """
    # Accept a bare int as shorthand for a single hidden layer.
    if isinstance(hidden_units, int):
        hidden_units = (hidden_units,)

    model = Sequential()
    model.add(Flatten(input_shape=input_shape))

    for units in hidden_units:
        model.add(Dense(units, activation='relu'))

    # Linear activation on the output layer: values are emitted unsquashed.
    model.add(Dense(num_outputs, activation='linear'))

    return model

In [79]:
# The leading 1 in the input shape is an extra axis that Flatten collapses
# (presumably it mirrors the agent memory's window_length of 1 - confirm
# against keras-rl).
model = build_model(input_shape=(1, state_size), num_outputs=num_actions)
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_9 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_23 (Dense)             (None, 6)                 30        
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 14        
Total params: 44
Trainable params: 44
Non-trainable params: 0
_________________________________________________________________


In [59]:
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

def build_agent(model, num_actions, num_warmup_steps=10):
    """Assemble a ``DQNAgent`` around the given network.

    Args:
        model: Network handed to the agent as ``model``.
        num_actions: Number of actions, passed as ``nb_actions``.
        num_warmup_steps: Passed as ``nb_steps_warmup`` (default 10).

    Returns:
        A ``DQNAgent`` using a ``SequentialMemory`` (limit 5000, window
        length 1), a ``BoltzmannQPolicy`` and ``target_model_update=1e-2``.
        The agent is returned without being compiled.
    """
    replay_memory = SequentialMemory(limit=5000, window_length=1)
    exploration_policy = BoltzmannQPolicy()

    agent = DQNAgent(
        model=model,
        nb_actions=num_actions,
        memory=replay_memory,
        policy=exploration_policy,
        nb_steps_warmup=num_warmup_steps,
        target_model_update=1e-2,
    )
    return agent

In [80]:
# Wrap the network in a DQN agent and compile it for training.
dqn = build_agent(model, num_actions)
# Use `learning_rate`: `lr` is a deprecated alias in tf.keras optimizers and
# is not accepted by newer releases.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [97]:
# Train the agent on the environment without rendering.
dqn.fit(
    environment,
    nb_steps=10_000,
    visualize=False,
    verbose=1,
)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 59.416 seconds


<tensorflow.python.keras.callbacks.History at 0x7f4ee6f3dd00>

In [98]:
# Evaluate the trained agent for ten episodes with rendering switched on.
dqn.test(
    environment,
    nb_episodes=10,
    visualize=True,
)

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200


<tensorflow.python.keras.callbacks.History at 0x7f4ee6f3daf0>