In [1]:
import gym
import random
import math
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from collections import deque
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
%matplotlib notebook

In [24]:
class DQNAgent:
    def __init__(self,state_size,action_size,epsilon,epsilon_decay):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = 200
        self.learning_rate = 0.001
        self.Epsilon = epsilon
        self.Gamma = 0.85
        self.Epsilon_decay = epsilon_decay
        self.Epsilon_min = 0.001
        self.memory = deque(maxlen = 10000)
        self.model = self.buildModel()
    
    def buildModel(self):
        model = Sequential()
        model.add(Dense(12,input_dim = self.state_size,activation = 'relu'))
        model.add(Dense(6,activation = 'relu'))
        model.add(Dense(self.action_size,activation = 'softmax'))
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model
    
    def chooseAction(self,state):
        if (np.random.uniform() <= self.Epsilon):
            return random.randrange(self.action_size)
        action = self.model.predict(state)
        return np.argmax(action)
    
    def store(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
    
    def replay(self):
        if (len(self.memory)<self.batch_size):
            batch = random.sample(self.memory,len(self.memory))
        else:
            batch = random.sample(self.memory,self.batch_size)
        cost = 0
        loss = np.array([])
        for state,action,reward,next_state,done in batch:
            if done:
                target  = reward
            else:
                target = reward + self.Gamma * np.amax(self.model.predict(next_state))
            current = self.model.predict(state)
            cost += abs(target - current[0][action])
            current[0][action] = target
            loss = np.append(loss,self.model.fit(state,current,epochs=1,verbose=0).history['loss'])
        if (self.Epsilon > self.Epsilon_min):
            self.Epsilon *= self.Epsilon_decay
        return loss
        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
        
    def change_epsilon(self,eps):
        self.Epsilon = eps

In [16]:
np.random.seed(2)
Episodes = 3000

In [17]:
env = gym.make('LunarLander-v2')
recorder = VideoRecorder(env, base_path='record')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
print (state_size,action_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
8 4


In [34]:
agent = DQNAgent(state_size,action_size,0.0,0.999)

In [35]:
agent.load("Lunar_lander-dqn.h5")

In [36]:
done = False
correct =0
for e in range(Episodes):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
    total_reward = 0
    for time in range(1000):
        #env.render()
        #recorder.capture_frame()
        action = agent.chooseAction(state)
        if action != 1:
            steps +=1
        next_state,reward,done,_ = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.store(state,action,total_reward,next_state,done)
        state = next_state
        if done:
            break
    agent.replay()
    if (e%10 == 0):
        correct = 0
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\n') 
        correct += 1
        if (correct >= 5):
            cont = input()
            if (cont=='no'):
                break;
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\r')                 
env.close()
#recorder.close()

Episode: 3/3000, e: 0.0, Time: 549, Reward: 118.17562002559654,Steps: 455:::
Episode: 7/3000, e: 0.0, Time: 687, Reward: 81.5522162006861,Steps: 510595::
Episode: 8/3000, e: 0.0, Time: 773, Reward: 34.00672453325457,Steps: 537
Episode: 10/3000, e: 0.0, Time: 999, Reward: 10.302298007086737,Steps: 727
Episode: 11/3000, e: 0.0, Time: 859, Reward: 33.69567878677526,Steps: 473
Episode: 12/3000, e: 0.0, Time: 595, Reward: 127.43028210250111,Steps: 428
Episode: 15/3000, e: 0.0, Time: 999, Reward: 7.152525559517258,Steps: 899:::
Episode: 19/3000, e: 0.0, Time: 979, Reward: 66.11764968820816,Steps: 89698::
no


In [37]:
done = False
for e in range(5):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
    total_reward = 0
    for time in range(1000):
        env.render()
        recorder.capture_frame()
        action = agent.chooseAction(state)
        if action != 1:
            steps +=1
        next_state,reward,done,_ = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state
        if done:
            break
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\n')
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\r')                 
env.close()
recorder.close()

Episode: 1/3000, e: 0.0, Time: 568, Reward: 171.84682548221292,Steps: 5249::
Episode: 2/3000, e: 0.0, Time: 652, Reward: 132.4929319176512,Steps: 605
Episode: 3/3000, e: 0.0, Time: 892, Reward: 66.58056660843235,Steps: 839
Episode: 4/3000, e: 0.0, Time: 587, Reward: 171.25028170361927,Steps: 539


In [22]:
agent.save("Lunar_lander-dqn.h5")