In [None]:
# SETUP
import numpy as np
import tensorflow as tf
import random
from collections import deque
import matplotlib.pyplot as plt
import math
import os

import gym
# CartPole environment shared by every cell below.
env = gym.make('CartPole-v0')
# Overrides the private step cap on the gym wrapper.
# NOTE(review): presumably meant to let episodes run past the default limit — confirm.
env._max_episode_steps = 10001
# Network dimensions are taken from the environment's observation/action spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the bootstrapped next-state value in replay_train.
dis = 0.9
# Maximum number of transitions kept in the replay buffer by main().
REPLAY_MEMORY = 5000
# Number of episodes main() plays.
max_episodes = 500
# Per-episode step counts, appended to by main() and plotted in the later cells.
step_count_list = []

In [None]:
# NETWORK
class DQN:
    """Small fully connected Q-network built with the TensorFlow 1.x graph API.

    The graph is two ReLU hidden layers followed by a linear output layer with
    one Q-value per action. All weights are created inside a variable scope
    named after the network so they can be looked up by scope name later.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        """Store the configuration and build the graph.

        Args:
            session: TensorFlow session used by `predict` and `update`.
            input_size: Length of one state vector.
            output_size: Number of actions (width of the Q-value output).
            name: Variable scope name for this network's weights.
        """
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        self._build_network()

    def _build_network(self, hiddenSize=10, l_rate=1e-1):
        """Create placeholders, weights, loss and training op.

        Args:
            hiddenSize: Number of units in each of the two hidden layers.
            l_rate: Learning rate passed to the Adam optimizer.
        """
        # Size the layers from this instance's own configuration rather than
        # from module-level globals, so the constructor arguments are honoured.
        n_in = self.input_size
        n_out = self.output_size

        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, n_in])

            W1 = tf.Variable(tf.truncated_normal(
                [n_in, hiddenSize], stddev=1.0 / math.sqrt(float(n_in))))
            b1 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
            input_layer = tf.nn.relu(tf.matmul(self._X, W1) + b1)

            W2 = tf.Variable(tf.truncated_normal(
                [hiddenSize, hiddenSize], stddev=1.0 / math.sqrt(float(hiddenSize))))
            b2 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
            hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)

            W3 = tf.Variable(tf.truncated_normal(
                [hiddenSize, n_out], stddev=1.0 / math.sqrt(float(hiddenSize))))
            b3 = tf.Variable(tf.truncated_normal([n_out], stddev=0.01))
            output_layer = tf.matmul(hidden_layer, W3) + b3

            # Q prediction: one value per action, no activation on the output.
            self._Qpred = output_layer

        # Parts of the graph needed for learning a policy: the target Q-values
        # are fed in from outside through this placeholder.
        self._Y = tf.placeholder(
            shape=[None, n_out], dtype=tf.float32)

        # Loss function: mean squared error between target and predicted Q.
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        # Learning
        self._train = tf.train.AdamOptimizer(
            learning_rate=l_rate).minimize(self._loss)

    def predict(self, state):
        """Return the Q-values for a single state.

        Args:
            state: One observation; reshaped to `[1, input_size]` before feeding.

        Returns:
            Whatever the session returns for the Q-prediction tensor.
        """
        x = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X: x})

    def update(self, x_stack, y_stack):
        """Run one optimizer step on a batch.

        Args:
            x_stack: Batch of states fed to the input placeholder.
            y_stack: Batch of target Q-values fed to the target placeholder.

        Returns:
            The session result for `[loss, train_op]`.
        """
        return self.session.run([self._loss, self._train], feed_dict={
            self._X: x_stack, self._Y: y_stack})

In [None]:
# Train from replay buffer
def replay_train(mainDQN, targetDQN, train_batch, discount=None):
    """Fit the main network on one batch of stored transitions.

    For every transition the main network's current Q-values are used as the
    target, except for the taken action, whose entry is replaced by the reward
    (terminal step) or by reward plus the discounted maximum Q-value that the
    target network predicts for the next state.

    Args:
        mainDQN: Network being trained; must provide `predict`, `update`,
            `input_size` and `output_size`.
        targetDQN: Network used to evaluate the next state; must provide `predict`.
        train_batch: Iterable of `(state, action, reward, next_state, done)`.
        discount: Discount factor. When omitted, the module-level `dis` is used.

    Returns:
        The result of `mainDQN.update` on the assembled batch.
    """
    if discount is None:
        discount = dis

    transitions = list(train_batch)
    # Preallocate once instead of re-stacking the arrays for every sample.
    x_stack = np.empty((len(transitions), mainDQN.input_size))
    y_stack = np.empty((len(transitions), mainDQN.output_size))

    for row, (state, action, reward, next_state, done) in enumerate(transitions):
        Q = mainDQN.predict(state)

        if done:
            # Terminal step: there is no future value to bootstrap from.
            Q[0, action] = reward
        else:
            # Bootstrap from the target network's estimate of the next state.
            Q[0, action] = reward + discount * np.max(targetDQN.predict(next_state))

        x_stack[row] = np.reshape(state, -1)
        y_stack[row] = np.reshape(Q, -1)

    # Train the main network on the targets built above.
    return mainDQN.update(x_stack, y_stack)

In [None]:
def copy_network(*, dest_scope_name="target", src_scope_name="main"):
    """Build the assign ops that copy one scope's trainable variables to another.

    Variables are paired in the order the two collections return them; pairing
    stops at the shorter of the two lists.

    Args:
        dest_scope_name: Scope whose variables are overwritten.
        src_scope_name: Scope whose variable values are read.

    Returns:
        A list of assign ops, one per variable pair. Nothing is copied until
        the ops are run in a session.
    """
    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_variables = tf.get_collection(trainable_key, scope=src_scope_name)
    target_variables = tf.get_collection(trainable_key, scope=dest_scope_name)

    return [
        target.assign(source.value())
        for source, target in zip(source_variables, target_variables)
    ]

In [None]:
# main
def main():
    """Play CartPole episodes with an epsilon-greedy policy and train the DQN.

    Every transition is stored in a bounded replay buffer. Every 20th episode
    (when `episode % 20 == 1`) the main network is trained on random
    minibatches from the buffer and its weights are then copied to the target
    network. Episode lengths are appended to the module-level `step_count_list`.
    """
    max_steps_per_episode = 30000
    train_interval = 20
    train_rounds = 50
    batch_size = 50

    # A bounded deque drops the oldest transition by itself once it is full.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # Build the copy ops once and reuse them for every synchronisation.
        copy_ops = copy_network(dest_scope_name="target",
                                src_scope_name="main")
        # initial copy q_net --> target_net
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate decays as episodes progress.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get the state and reward from the environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps_per_episode:
                    break

            print("Episode: {} steps: {}".format(episode, step_count))
            step_count_list.append(step_count)

            if episode % train_interval == 1:  # train every 20 episode
                # Snapshot the buffer so sampling indexes a list, not a deque.
                transitions = list(replay_buffer)
                # Early on the buffer can hold fewer transitions than one batch.
                sample_size = min(batch_size, len(transitions))
                for _ in range(train_rounds):
                    # Get a random batch of experiences.
                    minibatch = random.sample(transitions, sample_size)
                    loss, _train_result = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                # copy Q_net --> target_net
                sess.run(copy_ops)

In [None]:
# Entry point: start the training loop only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Raw learning curve: number of steps survived in each episode.
plt.plot(step_count_list)
plt.xlabel("episode")
plt.ylabel("steps")
plt.title("CartPole_DQN(2015)")
plt.show()

In [None]:
# Mean episode length over consecutive, non-overlapping groups of 50 episodes.
# A trailing group with fewer than 50 episodes is left out.
episodes_per_point = 50
score_list_50 = [
    sum(step_count_list[start:start + episodes_per_point]) / episodes_per_point
    for start in range(
        0, len(step_count_list) - episodes_per_point + 1, episodes_per_point)
]
plt.plot(score_list_50)
plt.xlabel("epoch: 1epoch = 50 episodes")
plt.ylabel("score")
plt.title("CartPole_DQN(2015)")
plt.show()

In [None]:
# Mean episode length over consecutive, non-overlapping groups of 125 episodes.
# A trailing group with fewer than 125 episodes is left out.
episodes_per_point = 125
score_list_125 = [
    sum(step_count_list[start:start + episodes_per_point]) / episodes_per_point
    for start in range(
        0, len(step_count_list) - episodes_per_point + 1, episodes_per_point)
]
plt.plot(score_list_125)
plt.xlabel("epoch: 1epoch = 125 episodes")
plt.ylabel("score")
plt.title("CartPole_DQN(2015)")
plt.show()