In [55]:
import gym
import random
import math
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
from threading import Thread

In [68]:
class DQNAgent:
    def __init__(self,state_size,action_size,epsilon,epsilon_decay):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = 100
        self.learning_rate = 0.001
        self.Epsilon = epsilon
        self.Gamma = 0.7
        self.Epsilon_decay = epsilon_decay
        self.Epsilon_min = 0.001
        self.memory = deque(maxlen = 20000)
        self.model = self.buildModel()
    
    def buildModel(self):
        model = Sequential()
        model.add(Dense(12,input_dim = self.state_size,activation = 'relu'))
        model.add(Dense(6,activation = 'relu'))
        model.add(Dense(self.action_size,activation = 'softmax'))
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model
    
    def chooseAction(self,state):
        if (np.random.uniform() <= self.Epsilon):
            return random.randrange(self.action_size)
        action = self.model.predict(state)
        return np.argmax(action)
    
    def store(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
    
    def replay(self):
        if (len(self.memory)<self.batch_size):
            batch = random.sample(self.memory,len(self.memory))
        else:
            batch = random.sample(self.memory,self.batch_size)
        cost = 0
        for state,action,reward,next_state,done in batch:
            if done:
                target  = reward
            else:
                target = reward + self.Gamma * np.amax(self.model.predict(next_state))
            current = self.model.predict(state)
            cost += abs(target - current[0][action])
            current[0][action] = target
            self.model.fit(state,current,epochs=1,verbose=0)
        if (self.Epsilon > self.Epsilon_min):
            self.Epsilon *= self.Epsilon_decay
        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
        

In [69]:
np.random.seed(2)
Episodes = 5000

In [70]:
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
print (state_size,action_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
8 4


In [71]:
agent = DQNAgent(state_size,action_size,1.0,0.995)

In [72]:
done = False
for e in range(Episodes):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
    total_reward = 0
    for time in range(1000):
        env.render()
        action = agent.chooseAction(state)
        if action != 1:
            steps +=1
        next_state,reward,done,_ = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.store(state,action,total_reward,next_state,done)
        state = next_state
        if done:
            break
    agent.replay()
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\n')            
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\r')                 
env.close()

Episode: 832/5000, e: 0.015, Time: 638, Reward: 141.56798235600345,Steps: 499:::
Episode: 843/5000, e: 0.015, Time: 174, Reward: 8.564117948573767,Steps: 134::::
Episode: 846/5000, e: 0.014, Time: 514, Reward: 132.11171099348587,Steps: 4254::
Episode: 850/5000, e: 0.014, Time: 999, Reward: 27.911520198310654,Steps: 8993::
Episode: 855/5000, e: 0.014, Time: 999, Reward: 20.086323228162996,Steps: 713:::
Episode: 872/5000, e: 0.013, Time: 899, Reward: 77.80860034159966,Steps: 805::::
Episode: 873/5000, e: 0.013, Time: 738, Reward: 129.0812065669326,Steps: 642
Episode: 874/5000, e: 0.012, Time: 450, Reward: 134.99193802678892,Steps: 407
Episode: 877/5000, e: 0.012, Time: 652, Reward: 102.66776384519538,Steps: 5543::
Episode: 878/5000, e: 0.012, Time: 875, Reward: 157.63716595671647,Steps: 709
Episode: 879/5000, e: 0.012, Time: 449, Reward: 158.1197482856443,Steps: 394
Episode: 882/5000, e: 0.012, Time: 511, Reward: 134.3029345082059,Steps: 44051::
Episode: 883/5000, e: 0.012, Time: 999, Re

In [67]:
env.close()

In [73]:
agent.save("Lunar_lander-dqn.h5")