In [1]:
import gym
import random
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from collections import deque
print("Gym:", gym.__version__)
print("Tensorflow:", tf.__version__)

Instructions for updating:
non-resource variables are not supported in the long term
Gym: 0.17.2
Tensorflow: 2.2.0


In [2]:
# Build the environment once and report the two spaces the agent has to
# handle (what it observes and which actions it may choose from).
env_name = "CartPole-v0"
env = gym.make(env_name)
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)

Observation space: Box(4,)
Action space: Discrete(2)


In [3]:

class QNetwork:
    """Single-hidden-layer Q-value estimator built with the TF1 graph API.

    Constructing an instance creates the placeholders, two dense layers, a
    mean-squared-error loss and an Adam training op. Running them is left to
    the caller, who supplies the session to `update_model` / `get_q_state`.
    """

    def __init__(self, state_dim, action_size):
        """Create the graph nodes.

        Args:
            state_dim: Iterable of ints giving the shape of one state.
            action_size: Number of discrete actions (width of the Q output).
        """
        # Inputs: a batch of states, the action taken in each of them, and
        # the target value the taken action's Q estimate is regressed onto.
        self.state_in = tf.placeholder(tf.float32, shape=[None] + list(state_dim))
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        self.q_target_in = tf.placeholder(tf.float32, shape=[None])
        chosen_mask = tf.one_hot(self.action_in, depth=action_size)

        # state -> 100 ReLU units -> one linear output per action.
        self.hidden1 = tf.layers.dense(self.state_in, 100, activation=tf.nn.relu)
        self.q_state = tf.layers.dense(self.hidden1, action_size, activation=None)

        # The one-hot mask zeroes every column except the taken action, so
        # the row sum is the Q value of that action alone.
        masked_q = tf.multiply(self.q_state, chosen_mask)
        self.q_state_action = tf.reduce_sum(masked_q, axis=1)

        td_error = self.q_state_action - self.q_target_in
        self.loss = tf.reduce_mean(tf.square(td_error))
        # Despite its name, `optimizer` holds the op returned by minimize(),
        # i.e. the thing to run for one gradient step.
        adam = tf.train.AdamOptimizer(learning_rate=0.001)
        self.optimizer = adam.minimize(self.loss)

    def update_model(self, session, state, action, q_target):
        """Run one optimisation step on a batch.

        Args:
            session: Session used to run the training op.
            state: Batch of states fed to `state_in`.
            action: Batch of action indices fed to `action_in`.
            q_target: Batch of target values fed to `q_target_in`.
        """
        feed = {
            self.state_in: state,
            self.action_in: action,
            self.q_target_in: q_target,
        }
        session.run(self.optimizer, feed_dict=feed)

    def get_q_state(self, session, state):
        """Return whatever the session yields for `q_state` on `state`."""
        return session.run(self.q_state, feed_dict={self.state_in: state})



In [4]:
class ReplayBuffer():
    """Bounded FIFO store of experience tuples for minibatch sampling."""

    def __init__(self, maxlen):
        """Create an empty buffer.

        Args:
            maxlen: Maximum number of experiences kept; once full, appending
                a new one discards the oldest.
        """
        self.buffer = deque(maxlen=maxlen)

    def add(self, experience):
        """Append one experience tuple to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw a minibatch of distinct experiences, grouped by field.

        Args:
            batch_size: Upper bound on the number of experiences drawn; the
                actual count is clamped to the current buffer length.

        Returns:
            A list with one inner list per experience field (for example
            ``[states, actions, ...]``), all of equal length. The result is
            an empty list when the buffer is empty, so callers that unpack
            it must add at least one experience first.
        """
        sample_size = min(len(self.buffer), batch_size)
        # Sample WITHOUT replacement: random.choices() could return the same
        # transition several times in one batch, which over-weights it in the
        # gradient step (and made the min() clamp above pointless).
        # The deque is copied to a list so random.sample gets a plain sequence.
        samples = random.sample(list(self.buffer), sample_size)
        # Transpose [(s, a, ...), ...] into [[s, ...], [a, ...], ...]. A real
        # list (not a one-shot map iterator) lets callers index or re-read it.
        return [list(column) for column in zip(*samples)]

In [5]:
class DQNAgent():
    """Epsilon-greedy agent that fits a Q-network from replayed experience.

    The agent owns a TensorFlow session; release it with `close()` (also
    attempted from `__del__`).
    """

    def __init__(self, env, gamma=0.97, eps=1.0, eps_min=0.1, eps_decay=0.99,
                 batch_size=50, buffer_size=10000):
        """Build the network, the replay buffer and the session.

        Args:
            env: Environment exposing `observation_space.shape` and
                `action_space.n`.
            gamma: Discount factor applied to the next state's best Q value.
            eps: Initial probability of taking a random action.
            eps_min: Lower bound that `eps` is never decayed below.
            eps_decay: Factor `eps` is multiplied by at the end of an episode.
            batch_size: Maximum number of experiences sampled per train step.
            buffer_size: Capacity of the replay buffer.
        """
        # Defined first so close()/__del__ stay safe if anything below raises.
        self.sess = None

        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.replay_buffer = ReplayBuffer(maxlen=buffer_size)
        self.gamma = gamma
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.batch_size = batch_size

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        """Pick an action for `state` using the epsilon-greedy rule.

        With probability `eps` a uniformly random action index is returned;
        otherwise the index of the largest predicted Q value.
        """
        # Decide first: the forward pass is only needed on the greedy branch.
        if random.random() < self.eps:
            return np.random.randint(self.action_size)
        q_state = self.q_network.get_q_state(self.sess, [state])
        return np.argmax(q_state)

    def train(self, state, action, next_state, reward, done):
        """Store one transition and run one minibatch update.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            next_state: State observed after the action.
            reward: Reward received for the transition.
            done: Whether the transition ended the episode.
        """
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = (
            self.replay_buffer.sample(self.batch_size))

        q_next_states = self.q_network.get_q_state(self.sess, next_states)
        # A terminal transition has no future return, so only its reward
        # contributes to the target. An explicit boolean array avoids relying
        # on how NumPy interprets a Python list of bools as an index.
        terminal = np.asarray(dones, dtype=bool)
        best_next = np.where(terminal, 0.0, np.max(q_next_states, axis=1))
        q_targets = np.asarray(rewards) + self.gamma * best_next
        self.q_network.update_model(self.sess, states, actions, q_targets)

        # Explore less after every finished episode, down to the floor.
        if done:
            self.eps = max(self.eps_min, self.eps_decay * self.eps)

    def close(self):
        """Close the session if one was created; safe to call repeatedly."""
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()
            self.sess = None

    def __del__(self):
        self.close()

In [7]:
agent = DQNAgent(env)
num_episodes = 30

# try/finally guarantees the environment is released even when an episode
# raises part-way through (previously env.close() was skipped on any error).
try:
    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            # One replay-buffer insert and one minibatch update per step.
            agent.train(state, action, next_state, reward, done)
            # Call env.render() here to watch the agent while it trains.
            total_reward += reward
            state = next_state

        print("Episode: {}, total_reward: {:.2f}".format(ep, total_reward))
finally:
    env.close()

Episode: 0, total_reward: 15.00
Episode: 1, total_reward: 20.00
Episode: 2, total_reward: 42.00
Episode: 3, total_reward: 10.00
Episode: 4, total_reward: 11.00
Episode: 5, total_reward: 18.00
Episode: 6, total_reward: 21.00
Episode: 7, total_reward: 14.00
Episode: 8, total_reward: 14.00
Episode: 9, total_reward: 11.00
Episode: 10, total_reward: 23.00
Episode: 11, total_reward: 17.00
Episode: 12, total_reward: 55.00
Episode: 13, total_reward: 22.00
Episode: 14, total_reward: 12.00
Episode: 15, total_reward: 10.00
Episode: 16, total_reward: 12.00
Episode: 17, total_reward: 16.00
Episode: 18, total_reward: 35.00
Episode: 19, total_reward: 25.00
Episode: 20, total_reward: 21.00
Episode: 21, total_reward: 20.00
Episode: 22, total_reward: 50.00
Episode: 23, total_reward: 20.00
Episode: 24, total_reward: 12.00
Episode: 25, total_reward: 33.00
Episode: 26, total_reward: 15.00
Episode: 27, total_reward: 18.00
Episode: 28, total_reward: 18.00
Episode: 29, total_reward: 18.00


In [12]:
!pip install gym[all]

Collecting box2d-py~=2.3.5; extra == "all"
  Downloading box2d-py-2.3.8.tar.gz (374 kB)
[K     |████████████████████████████████| 374 kB 100 kB/s eta 0:00:01
[?25hCollecting atari-py~=0.2.0; extra == "all"
  Using cached atari-py-0.2.6.tar.gz (790 kB)
Collecting Pillow; extra == "all"
  Using cached Pillow-7.2.0-cp38-cp38-manylinux1_x86_64.whl (2.2 MB)
Collecting mujoco-py<2.0,>=1.50; extra == "all"
  Downloading mujoco-py-1.50.1.68.tar.gz (120 kB)
[K     |████████████████████████████████| 120 kB 199 kB/s eta 0:00:01
[?25hCollecting imageio; extra == "all"
  Downloading imageio-2.9.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 169 kB/s eta 0:00:01
Collecting glfw>=1.4.0
  Downloading glfw-1.12.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (203 kB)
[K     |████████████████████████████████| 203 kB 184 kB/s eta 0:00:01
[?25hCollecting Cython>=0.27.2
  Downloading Cython-0.29.21-cp38-cp38-manylinux1_x86_64.wh

  Building wheel for mujoco-py (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /home/halcyoona/miniconda3/envs/gym/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-da3npivq/mujoco-py/setup.py'"'"'; __file__='"'"'/tmp/pip-install-da3npivq/mujoco-py/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-dqg1__bh
       cwd: /tmp/pip-install-da3npivq/mujoco-py/
  Complete output (27 lines):
  running bdist_wheel
  running build
  Traceback (most recent call last):
    File "<string>", line 1, in <module>
    File "/tmp/pip-install-da3npivq/mujoco-py/setup.py", line 32, in <module>
      setup(
    File "/home/halcyoona/miniconda3/envs/gym/lib/python3.8/site-packages/setuptools/__init__.py", line 165, in setup
      return distutils.core.setup(**attrs)
    Fil