In [1]:
import gym
import random
import math
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from collections import deque
import matplotlib.pyplot as plt
import pandas as pd

Using TensorFlow backend.


In [2]:
%matplotlib notebook

In [3]:
class DQNAgent:
    def __init__(self,state_size,action_size,epsilon,epsilon_decay):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = 200
        self.learning_rate = 0.000001
        self.Epsilon = epsilon
        self.Gamma = 0.95
        self.Epsilon_decay = epsilon_decay
        self.Epsilon_min = 0.0001
        self.memory = deque(maxlen = 10000)
        self.model = self.buildModel()
    
    def buildModel(self):
        model = Sequential()
        model.add(Dense(12,input_dim = self.state_size,activation = 'tanh'))
        model.add(Dense(6,activation = 'tanh'))
        model.add(Dense(self.action_size,activation = 'sigmoid'))
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model
    
    def chooseAction(self,state):
        if (np.random.uniform() <= self.Epsilon):
            return random.randrange(self.action_size)
        action = self.model.predict(state)
        return np.argmax(action)
    
    def store(self,state,action,reward,next_state,done):
        reward = reward/260
        self.memory.append((state,action,reward,next_state,done))
    
    def replay(self):
        if (len(self.memory)<self.batch_size):
            batch = random.sample(self.memory,len(self.memory))
        else:
            batch = random.sample(self.memory,self.batch_size)
        cost = 0
        loss = np.array([])
        for state,action,reward,next_state,done in batch:
            if done:
                target  = reward
            else:
                target = reward + self.Gamma * np.amax(self.model.predict(next_state))
            if (target > 1):
                target = 1
            elif (target < 0):
                target = 0
            current = self.model.predict(state)
            cost += abs(target - current[0][action])
            current[0][action] = target
            loss = np.append(loss,self.model.fit(state,current,epochs=1,verbose=0).history['loss'])
        if (self.Epsilon > self.Epsilon_min):
            self.Epsilon *= self.Epsilon_decay
        return loss
        
    def load(self, name):
        self.model.load_weights(name)
        return self.model

    def save(self, name):
        self.model.save_weights(name)
        
    def change_epsilon(self,eps):
        self.Epsilon = eps

In [4]:
np.random.seed(2)
Episodes = 1000

In [5]:
env = gym.make('LunarLander-v2')
recorder = VideoRecorder(env, base_path='record')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
print (state_size,action_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
8 4


In [6]:
agent = DQNAgent(state_size,action_size,1,0.995)

In [7]:
model1 = agent.load("Lunar_lander-dqn.h5")

In [8]:
agent.change_epsilon(0.041)

In [None]:
done = False
correct =0
# states = np.array([])
for e in range(Episodes):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
#     if (states.size == 0):
#         states = np.append(states,state)
#     else:
#         states = np.vstack((states,state))
    total_reward = 0
    for time in range(1000):
        #env.render()
        #recorder.capture_frame()
        action = agent.chooseAction(state)
        if action != 1:
            steps +=1
        next_state,reward,done,_ = env.step(action)
        total_reward += reward
#         states = np.vstack((states,next_state))
        next_state = np.reshape(next_state, [1, state_size])
        agent.store(state,action,total_reward,next_state,done)
        state = next_state
        if done:
            break
    agent.replay()
    if (e%10 == 0):
        correct = 0
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\n') 
        correct += 1
        if (correct >= 5):
            cont = input()
            if (cont=='no'):
                break;
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\r')                 
env.close()
#recorder.close()

In [None]:
print (states.shape)
for i in range(states.shape[1]):
    output = pd.Series(states[:,i])
    print (output.describe(), end = '\n \n')

In [55]:
env.close()
recorder.close()

In [None]:
done = False
for e in range(5):
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
    total_reward = 0
    for time in range(1000):
        env.render()
        recorder.capture_frame()
        action = agent.chooseAction(state)
        if action != 1:
            steps +=1
        next_state,reward,done,_ = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state
        if done:
            break
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\n')
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent.Epsilon,time,total_reward,steps),end = '\r')                 
env.close()
recorder.close()

In [92]:
agent.save("Lunar_lander-dqn.h5")

# Transfer Learning

In [9]:
env2 = gym.make('LunarLanderContinuous-v2')
state_size = env2.observation_space.shape[0]
action_size = 2
print (state_size,action_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
8 2


In [118]:
class DQNAgent2:
    def __init__(self,state_size,action_size,epsilon,epsilon_decay,model_init):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = 200
        self.learning_rate = 0.001
        self.Epsilon = epsilon
        self.Gamma = 0.85
        self.Epsilon_decay = epsilon_decay
        self.Epsilon_min = 0.001
        self.memory = deque(maxlen = 10000)
        self.model_old = model_init
        self.model = self.buildModel()
    
    def buildModel(self):
        model = Sequential()
        model.add(Dense(12,input_dim = self.state_size,activation = 'relu'))
        model.add(Dense(6,activation = 'relu'))
        model.add(Dense(4,activation = 'relu'))
        model.add(Dense(self.action_size,activation = 'tanh'))
        model.layers[0].set_weights(self.model_old.layers[0].get_weights())
        model.layers[0].trainable = False
        model.layers[1].set_weights(self.model_old.layers[1].get_weights())
        model.layers[1].trainable = False
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model
    
    def chooseAction(self,state):
        if (np.random.uniform() <= self.Epsilon):
            return np.random.randint(-9999,high = 9999, size=2)/10000
        action = self.model.predict(state)
        action = action[0]
        if (action[0]>1):
            action[0] = 1
        elif (action[0]<-1):
            action[0] = -1
        if (action[1]>1):
            action[1] = 1
        elif (action[1]<-1):
            action[1] = -1
        return action
    
    def store(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
    
    def replay(self):
        if (len(self.memory)<self.batch_size):
            batch = random.sample(self.memory,len(self.memory))
        else:
            batch = random.sample(self.memory,self.batch_size)
        cost = 0
        for state,action,reward,next_state,done in batch:
            if done:
                target  = np.array([[reward,reward]])
            else:
                target = reward + self.Gamma * self.model.predict(next_state)
            self.model.fit(state,target,epochs=1,verbose=0)
        if (self.Epsilon > self.Epsilon_min):
            self.Epsilon *= self.Epsilon_decay
        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
        
    def change_epsilon(self,eps):
        self.Epsilon = eps

In [121]:
agent2 = DQNAgent2(state_size,action_size,1,0.999,model1)

In [None]:
done = False
correct =0
for e in range(Episodes):
    state = env2.reset()
    state = np.reshape(state, [1,state_size])
    steps = 0
    total_reward = 0
    for time in range(1000):
        #env2.render()
        #recorder.capture_frame()
        action = agent2.chooseAction(state)
#         print (action.shape == (2,)  and (action >= -1).all() and (action <= 1).all(),action)
        next_state,reward,done,_ = env2.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent2.store(state,action,total_reward,next_state,done)
        state = next_state
        if done:
            break
    agent2.replay()
    if (e%10 == 0):
        correct = 0
    if total_reward >= 0:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}"
          .format(e, Episodes, agent2.Epsilon,time,total_reward,steps),end = '\n') 
        correct += 1
        if (correct >= 5):
            cont = input()
            if (cont=='no'):
                break;
    else:
        print("Episode: {}/{}, e: {:.2}, Time: {}, Reward: {},Steps: {}::"
          .format(e, Episodes, agent2.Epsilon,time,total_reward,steps),end = '\r')                 
env2.close()
#recorder.close()

Episode: 1895/3000, e: 0.15, Time: 80, Reward: -932.4545083896038,Steps: 0::::

In [91]:
print(np.random.randint(-10000,high = 10000, size=2)/10000)

[0.9472 0.7179]
