In [1]:
import random

import gym
import numpy as np
import tensorflow as tf
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

In [2]:
# Number of points on the capital grid. The same value sizes the environment's
# discrete observation/action spaces and the Q-network's output layer.
n_capital = 1000

class Planner(gym.Env):
    """Deterministic capital-accumulation problem on a discrete capital grid.

    Both the observation and the action are integer indices into ``self.k``,
    an evenly spaced grid of capital levels on [0.01, 1.0]. Choosing action
    ``a`` in state ``s`` means "hold capital ``k[a]`` next period"; the
    implied consumption is ``k[s]**alpha - k[a]`` and the reward is its
    natural log.

    A choice that leaves no strictly positive consumption is infeasible: it
    yields ``infeasible_reward`` and ends the episode. Otherwise the episode
    ends after ``decision_max`` decisions.
    """

    def __init__(self):
        # Capital levels addressed by the discrete state/action indices.
        self.k = np.linspace(0.01, 1.0, n_capital)
        self.action_space = gym.spaces.Discrete(n_capital)
        self.observation_space = gym.spaces.Discrete(n_capital)
        self.decision_count = 0
        self.decision_max = 100
        # Start index shared by __init__ and reset(); clamped so that a grid
        # with fewer than 501 points does not index past the end of self.k.
        self.initial_observation = min(500, n_capital - 1)
        self.observation = self.initial_observation
        self.alpha = 0.33
        # Reward handed out for an infeasible (non-positive consumption) move.
        self.infeasible_reward = -1000

    def step(self, action):
        """Apply one decision.

        Parameters
        ----------
        action : int
            Grid index of next period's capital.

        Returns
        -------
        tuple
            ``(observation, reward, done, info)`` where ``observation`` is the
            new capital index (equal to ``action``), ``reward`` is the log of
            consumption or ``infeasible_reward``, ``done`` is a bool and
            ``info`` carries the running decision count under ``"decisions"``.

        Raises
        ------
        AssertionError
            If ``action`` is not a member of ``action_space``.
        """
        assert self.action_space.contains(action)
        self.decision_count += 1

        # Feasibility must be judged on capital *levels*, i.e. on the very
        # quantity whose log is taken below. Comparing the raw indices
        # (observation**alpha - action) rejected almost every action and
        # could let a zero-consumption move through to log(0).
        consumption = self.k[self.observation] ** self.alpha - self.k[action]
        feasible = bool(consumption > 0)

        reward = np.log(consumption) if feasible else self.infeasible_reward

        self.observation = action
        # Explicit flag instead of testing the reward against a sentinel.
        done = (not feasible) or (self.decision_count >= self.decision_max)
        return self.observation, reward, done, {"decisions": self.decision_count}

    def reset(self):
        """Start a new episode and return the initial observation index."""
        self.decision_count = 0
        self.observation = self.initial_observation
        return self.observation

In [3]:
# Seed the Python, NumPy and TensorFlow generators before anything random is
# created, so that Restart & Run All reproduces the same weight initialisation
# (and, to the extent the agent draws from these generators, the same
# exploration and replay sampling).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = Planner()

# Q-network: one input (the current capital index, window length 1), a single
# hidden layer, and one linear output per action (= per grid point).
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(1,) + env.observation_space.shape),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(n_capital, activation="linear")
])

In [4]:
# Replay buffer. window_length=1 matches the leading 1 in the model's
# input_shape, i.e. a single observation per network input.
memory = SequentialMemory(limit=10000, window_length=1)

# Epsilon-greedy exploration with epsilon = 0.30.
policy = EpsGreedyQPolicy(eps=0.30)

dqn = DQNAgent(
    model=model,
    nb_actions=n_capital,
    memory=memory,
    policy=policy,
    nb_steps_warmup=100,
    gamma=0.95,
    # NOTE(review): a value below 1 is presumably interpreted by keras-rl as a
    # soft target-network update rate -- confirm against the library docs.
    target_model_update=1e-2,
)

dqn.compile(optimizer=tf.keras.optimizers.Adam(0.005), metrics=["mse"])

# Train on the planner environment and keep the returned history object.
history = dqn.fit(env, nb_steps=10000)

Training for 10000 steps ...
Interval 1 (0 steps performed)




done, took 105.647 seconds
