In [1]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

Using TensorFlow backend.


In [12]:
class DQNAgent:
    def __init__(self,state_size,action_size):
        self.action_size=action_size
        self.state_size=state_size
        
        self.discount_factor=0.99
        self.learning_rate=0.001
        self.epsilon=1.0
        self.epsilon_decay=0.999
        self.epsilon_min=0.01
        self.batch_size=64
        self.train_start=1000
        
        self.memory=deque(maxlen=2000)
        self.model=self.build_model()
        self.target_model=self.build_model()
        
        self.update_target_model()
        
    def build_model(self):
        model=Sequential()
        model.add(Dense(32,input_dim=self.state_size,activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(32,activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size,activation='linear',
                       kernel_initializer='he_uniform'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
    
    def get_action(self,state):
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value=self.model.predict(state)
            return np.argmax(q_value[0])
    
    def append_sample(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        
    def train_model(self):
        if self.epsilon>self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        mini_batch=random.sample(self.memory,self.batch_size)
        states=np.zeros((self.batch_size,self.state_size))
        next_states=np.zeros((self.batch_size,self.state_size))
        actions,rewards,dones=[],[],[]
        for i in range(self.batch_size):
            states[i]=mini_batch[i][0]
            actions.append(mini_batch[i][1])
            rewards.append(mini_batch[i][2])
            next_states[i]=mini_batch[i][3]
            dones.append(mini_batch[i][4])
            
        q_value=self.model.predict(states)
        next_q_value=self.target_model.predict(next_states)
        
        for i in range(self.batch_size):
            if dones[i]:
                q_value[i][actions[i]]=rewards[i]
            else:
                q_value[i][actions[i]]=rewards[i]+self.discount_factor*(
                np.amax(next_q_value[i]))
            self.model.fit(states,q_value,batch_size=self.batch_size,
                          epochs=1,verbose=0)

In [13]:
env=gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)
scores,episodes=[],[]

EPISODES=500



In [None]:
for e in range(EPISODES):
    done=False
    score=0
    
    state=env.reset()
    state=np.reshape(state,[1,state_size])
    
    while not done:
        action=agent.get_action(state)
        next_state,reward,done,_=env.step(action)
        next_state=np.reshape(next_state,[1,state_size])
        
        agent.append_sample(state,action,reward,next_state,done)
        
        if len(agent.memory)>=agent.train_start:
            agent.train_model()
        score+=reward
        state=next_state
        
        if done:
            agent.update_target_model()
            
            scores.append(score)
            episodes.append(e)
            pylab.plot(episodes,scores,'b')
            print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)
            if np.mean(scores[-min(3,len(scores)):])>490:
                sys.exit()

episode: 0   score: 23.0   memory length: 23   epsilon: 1.0
episode: 1   score: 25.0   memory length: 48   epsilon: 1.0
episode: 2   score: 19.0   memory length: 67   epsilon: 1.0
episode: 3   score: 43.0   memory length: 110   epsilon: 1.0
episode: 4   score: 22.0   memory length: 132   epsilon: 1.0
episode: 5   score: 15.0   memory length: 147   epsilon: 1.0
episode: 6   score: 17.0   memory length: 164   epsilon: 1.0
episode: 7   score: 23.0   memory length: 187   epsilon: 1.0
episode: 8   score: 31.0   memory length: 218   epsilon: 1.0
episode: 9   score: 10.0   memory length: 228   epsilon: 1.0
episode: 10   score: 14.0   memory length: 242   epsilon: 1.0
episode: 11   score: 13.0   memory length: 255   epsilon: 1.0
episode: 12   score: 10.0   memory length: 265   epsilon: 1.0
episode: 13   score: 13.0   memory length: 278   epsilon: 1.0
episode: 14   score: 46.0   memory length: 324   epsilon: 1.0
episode: 15   score: 11.0   memory length: 335   epsilon: 1.0
episode: 16   score: 

episode: 115   score: 110.0   memory length: 2000   epsilon: 0.10145456026018454
episode: 116   score: 17.0   memory length: 2000   epsilon: 0.09974356180769112
episode: 117   score: 31.0   memory length: 2000   epsilon: 0.09669744692213977
episode: 118   score: 16.0   memory length: 2000   epsilon: 0.09516183749001367
episode: 119   score: 93.0   memory length: 2000   epsilon: 0.0867068131465305
episode: 120   score: 25.0   memory length: 2000   epsilon: 0.08456495652839072
episode: 121   score: 79.0   memory length: 2000   epsilon: 0.07813830745782524
episode: 122   score: 30.0   memory length: 2000   epsilon: 0.07582783328659772
episode: 123   score: 99.0   memory length: 2000   epsilon: 0.06867710519555668
episode: 124   score: 66.0   memory length: 2000   epsilon: 0.06428863486834849
episode: 125   score: 104.0   memory length: 2000   epsilon: 0.057935529344130744
episode: 126   score: 119.0   memory length: 2000   epsilon: 0.05143255252326238
episode: 127   score: 77.0   memory l