In [1]:
from keras.models import Model
from keras.layers import Input,Dense,Lambda
from keras.optimizers import Adam
from keras import backend as K
import gym
import tensorflow as tf
import numpy as np
import os
from utils import *
from tqdm import trange


from keras.backend.tensorflow_backend import set_session

MAXSTEP=500
convergence_reward = 475
VERSION='nips dueling'


np.random.seed(1024)
tf.set_random_seed(1024)

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [2]:
class duel_double_dqn:
    def __init__(self, env, hidden_units=20, maxlen=10000, batch_size=32, 
                 explore_init=0.5, explore_end=0.01, explore_steps=100000,
                 update_fre=20, gamma=0.99, train_times=1, version='double'):
        self.env=env
        self.batch_size=batch_size
        self.explore=explore_init
        self.explore_init=explore_init
        self.explore_end=explore_end
        self.explore_steps=explore_steps
        self.update_fre=update_fre
        self.gamma=gamma
        self.train_times=train_times
        self.version=version
        
        self.model=self.create_model(hidden_units)
        if self.version.find('nips')<0:
            self.target_model=self.create_model(hidden_units)
        self.memory=Memory(maxlen)
        self.time_stamp=0
        
    def create_model(self, hidden_units):
        x=Input(self.env.observation_space.shape)
        h=Dense(hidden_units, activation='tanh')(x)
        if self.version.find('duel')>-1:
            a=Dense(self.env.action_space.n,)(h)
            v=Dense(1,)(h)
            z=Lambda(lambda a:a[0]+a[1]-K.mean(a[1],keepdims=True))([v,a])
        else:
            z=Dense(self.env.action_space.n,)(h)
        model=Model(inputs=x, outputs=z)
        model.compile(optimizer='adam', loss='mse')
        return model
    
    def update(self):
        self.time_stamp+=1
        if len(self.memory.buffer)>=self.batch_size:
            for _ in range(self.train_times):
                #data=random.sample(self.memory,self.batch_size)
                data=self.memory.sample(self.batch_size)
                s=[d[0] for d in data]
                a=[d[1] for d in data]
                r=[d[2] for d in data]
                s_=[d[3] for d in data]
                done=[d[4] for d in data]
                q1=self.model.predict(np.array(s))
                if self.version.find('nips')>-1:
                    q2=self.model.predict(np.array(s_))
                else:
                    q2=self.target_model.predict(np.array(s_))
                for i in range(self.batch_size):
                    if done[i]:
                        q1[i,a[i]]=r[i]
                    else:
                        if self.version.find('double')>-1:
                            q1[i,a[i]]=r[i]+self.gamma*q2[i, np.argmax(q1[i])]
                        else:
                            q1[i,a[i]]=r[i]+self.gamma*np.max(q2[i])
                self.model.train_on_batch(np.array(s),q1)
        if self.version.find('nips')<0 and self.time_stamp%self.update_fre==0:
            self.target_model.set_weights(self.model.get_weights())
        
    def store(self, s, a, r, s_, done):
        #self.memory.append([s,a,r,s_,done])
        self.memory.add((s, a, r, s_, done))
        
    def get_action(self, s, flag=True):
        if flag:
            if self.explore>self.explore_end:
                self.explore-=(self.explore_init-self.explore_end)/self.explore_steps
            if np.random.rand()<self.explore:
                return self.env.action_space.sample()
        return np.argmax(self.model.predict(s[np.newaxis,:],))

In [3]:
if __name__ == "__main__":
    model_name = "nips_dqn"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.1
    set_session(tf.Session(config=config))

    train_episodes = 5000  # 1000          # max number of episodes to learn from
    max_steps = MAXSTEP  # 200                # max steps in an episode
    gamma = 0.99  # future reward discount

    # agent parameters
    state_size = 4
    action_size = 2
    # training process
    rewards_list = []
    test_rewards_list = []
    show_every_steps = 100

    # Exploration parameters
    explore_start = 0.5  # exploration probability at start
    explore_stop = 0.01  # minimum S probability
    decay_rate = 0.0001  # expotentional decay rate for exploration prob

    # Network parameters
    hidden_size = 20  # number of units in each Q-network hidden layer


    # Memory parameters
    memory_size = 10000  # memory capacity
    batch_size = 32  # experience mini-batch size
    pretrain_length = batch_size  # number experiences to pretrain the memory



    # Initialize the simulation
    env = gym.make('CartPole-v1').env
    env.reset()
    # Take one random step to get the pole and cart moving
    state, reward, done, _ = env.step(env.action_space.sample())
    #TODO 指定网络参数和名字
    agent = duel_double_dqn(env,maxlen=10000,version=VERSION)
    model_name = "nature_dqn_C5_dueling"
    #memory = Memory(max_size=memorSy_size)

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        # Uncomment the line below to watch the simulation
        # env.render()

        # Make a random action
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            # The simulation fails so no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            agent.store(state, action, reward, next_state, done)

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
        else:
            # Add experience to memory
            agent.store(state, action, reward, next_state, done)
            state = next_state



    step = 0
    for ep in range(1, train_episodes):
        total_reward = 0
        t = 0
        #episode_total
        while t < max_steps:
            step += 1
            # Uncomment this next line to watch the training
            #env.render()
            action = agent.get_action(state)

            # Take action, get new state and reward
            next_state, reward, done, _ = env.step(action)

            total_reward += reward

            if done:
                # the episode ends so no next state
                next_state = np.zeros(state.shape)
                t = max_steps
                rewards_list.append((ep, total_reward))
                # Add experience to memory
                agent.store(state, action, reward, next_state, done)

                # Start new episode
                state=env.reset()
            else:
                # Add experience to memory
                agent.store(state, action, reward, next_state, done)
                state = next_state
                t += 1

            agent.update()

        test_rewards_list.extend(test_agent(agent, env,test_max_steps=MAXSTEP))
        cur_compute_len = min(100, len(test_rewards_list))
        mean_reward = np.mean(test_rewards_list[len(test_rewards_list) - cur_compute_len:])
        print('Episode: {}'.format(ep),
              'Mean test reward: {:.1f}'.format(mean_reward), )
        if mean_reward > convergence_reward:
            print(ep, "收敛")
            break


[2018-01-28 17:17:56,097] Making new env: CartPole-v1


Episode: 1 Mean test reward: 9.0
Episode: 2 Mean test reward: 8.5
Episode: 3 Mean test reward: 8.3
Episode: 4 Mean test reward: 8.2
Episode: 5 Mean test reward: 8.2
Episode: 6 Mean test reward: 8.3
Episode: 7 Mean test reward: 8.4
Episode: 8 Mean test reward: 8.5
Episode: 9 Mean test reward: 8.4
Episode: 10 Mean test reward: 8.5
Episode: 11 Mean test reward: 8.5
Episode: 12 Mean test reward: 8.4
Episode: 13 Mean test reward: 8.4
Episode: 14 Mean test reward: 8.4
Episode: 15 Mean test reward: 8.3
Episode: 16 Mean test reward: 8.2
Episode: 17 Mean test reward: 8.3
Episode: 18 Mean test reward: 8.3
Episode: 19 Mean test reward: 8.3
Episode: 20 Mean test reward: 8.3
Episode: 21 Mean test reward: 8.3
Episode: 22 Mean test reward: 8.4
Episode: 23 Mean test reward: 8.4
Episode: 24 Mean test reward: 8.4
Episode: 25 Mean test reward: 8.4
Episode: 26 Mean test reward: 8.4
Episode: 27 Mean test reward: 8.4
Episode: 28 Mean test reward: 8.4
Episode: 29 Mean test reward: 8.4
Episode: 30 Mean test r

In [4]:
mean_reward

478.79000000000002

In [5]:
test_rewards_list

[9,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 8,
 9,
 8,
 8,
 8,
 8,
 7,
 8,
 9,
 8,
 9,
 9,
 8,
 9,
 9,
 8,
 9,
 9,
 8,
 9,
 9,
 8,
 8,
 7,
 11,
 9,
 8,
 9,
 9,
 9,
 11,
 13,
 13,
 15,
 17,
 13,
 18,
 19,
 29,
 19,
 31,
 38,
 31,
 51,
 36,
 51,
 76,
 65,
 284,
 217,
 116,
 217,
 97,
 42,
 37,
 47,
 40,
 43,
 88,
 27,
 34,
 27,
 34,
 26,
 21,
 18,
 23,
 29,
 37,
 22,
 16,
 17,
 22,
 29,
 21,
 52,
 30,
 49,
 70,
 59,
 78,
 65,
 41,
 64,
 79,
 96,
 500,
 105,
 146,
 86,
 134,
 122,
 113,
 375,
 160,
 160,
 221,
 469,
 252,
 367,
 372,
 393,
 289,
 280,
 387,
 393,
 228,
 500,
 428,
 500,
 500,
 439,
 295,
 300,
 268,
 500,
 394,
 494,
 450,
 412,
 500,
 267,
 253,
 318,
 339,
 500,
 352,
 444,
 500,
 357,
 279,
 500,
 500,
 277,
 345,
 391,
 250,
 249,
 489,
 235,
 266,
 251,
 270,
 215,
 472,
 229,
 447,
 299,
 206,
 266,
 273,
 333,
 500,
 239,
 262,
 500,
 500,
 420,
 500,
 414,
 458,
 243,
 224,
 373,
 269,
 428,
 500,
 273,
 278,
 500,
 334,
 407,
 500,
 373,
 362,
 291,
 328,
 500,
 500,
 229,


In [6]:
reward_list = []
test_max_steps = convergence_reward + 5

state = env.reset()
t = 0
while True:
    env.render()
    action = agent.get_action(state,False)
    next_state, reward, done, _ = env.step(action)
    if done:
        break
    else:
        state = next_state
        t += 1
            
            
print(t)

923


In [7]:
#nips 372 553
#nips dueling 414 558
#nature 359 328
#nature dueling 300 330
#double 338 394
#double dueling  306 366 350