<a href="https://colab.research.google.com/github/kvenkman/my_rl_gym/blob/master/LunarLander-v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Box2D bindings into the user site-packages
# (presumably needed by the LunarLander-v2 environment created further down — confirm).
!pip install box2d-py --user



In [2]:
# Imports
from collections import deque
import random
import numpy as np
import gym
import os
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt


In [0]:
# DQN Agent Class
class Agent():
    """Deep Q-Network agent with an online Q-network and a target network.

    The agent acts epsilon-greedily, stores transitions in a bounded replay
    memory and fits the online network on random minibatches whose targets
    are bootstrapped from the target network.

    Attributes:
        env: Environment exposing ``observation_space.shape`` and a discrete
            ``action_space`` (``n`` and ``sample()``).
        epsilon: Current probability of taking a random action.
        epsilon_min: Lower bound that decay will not cross.
        epsilon_decay: Multiplicative factor applied by ``updateEpsilon``.
        gamma: Discount factor used for the bootstrapped target.
        batch_size: Number of transitions sampled per optimisation step.
        alpha: Learning rate handed to the Adam optimiser.
        memory: Replay memory, a deque holding at most 10000 transitions.
        train: When False, ``optimize`` does nothing.
        episodes, ntimesteps: Simulation length, see ``setSimParameters``.
    """

    def __init__(self, env):
        """Store the environment, set hyper-parameters and build both networks."""
        self.env = env

        # Epsilon-greedy exploration schedule
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.gamma = 0.99
        self.batch_size = 32
        # NOTE(review): 0.05 looks large for an Adam step size — confirm it is intended.
        self.alpha = 0.05
        self.memory = deque(maxlen=10000)

        self.train = True # Easy way switch train/test

        # Make sure episodes/ntimesteps exist even if the caller never sets them
        self.setSimParameters()

        # Qnetwork and target network
        self.qnetwork = self.defineNetwork()
        self.target = self.defineNetwork()
        self.alignTarget()


    def defineNetwork(self):
        """Build and compile a small fully connected Q-value network.

        Returns:
            A compiled Sequential model: two relu Dense layers of 32 units
            followed by a linear layer with one output per action, trained
            with MSE loss and Adam at learning rate ``self.alpha``.
        """
        model = Sequential()
        model.add(Dense(32, input_dim = self.env.observation_space.shape[0], activation="relu"))  # first layer, takes the observation
        model.add(Dense(32, activation="relu")) # hidden layer
        model.add(Dense(self.env.action_space.n, activation='linear')) # output layer
        # NOTE(review): newer Keras releases spell this argument `learning_rate`;
        # `lr` is kept to match the version this notebook was run with — confirm before upgrading.
        model.compile(loss='mse',optimizer=Adam(lr=self.alpha))
        return model

    def alignTarget(self):
        """Copy the online network's weights into the target network."""
        self.target.set_weights(self.qnetwork.get_weights())

    def setSimParameters(self, episodes=100, ntimesteps=1000):
        """Record how many episodes to run and the step cap per episode."""
        self.episodes = episodes
        self.ntimesteps = ntimesteps

    def loadModel(self, qnetwork_path, target_path):
        """Replace both networks with models loaded from the given paths."""
        self.qnetwork = load_model(qnetwork_path)
        self.target = load_model(target_path)

    def selectAction(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Observation, either a flat vector or a single-row batch.

        Returns:
            A random action sampled from the agent's own environment with
            probability ``epsilon``, otherwise the index of the largest
            Q-value predicted by the online network.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Use the agent's environment, not whatever global `env` happens to exist
            return self.env.action_space.sample()
        q_values = self.qnetwork.predict(np.atleast_2d(state), verbose=0)
        return int(np.argmax(q_values))

    def addToMemory(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def optimize(self): # , state, action, reward, next_state, done
        """Fit the online network on one random minibatch from replay memory.

        Does nothing unless ``self.train`` is set and the memory holds more
        than ``batch_size`` transitions. For each sampled transition the
        target for the taken action is ``reward`` when the episode ended,
        otherwise ``reward + gamma * max(target_network(next_state))``; the
        targets for all other actions are the online network's own outputs.
        """
        if not (self.train and len(self.memory) > self.batch_size):
            return

        # Keep the sample as a list of tuples: each tuple mixes arrays and
        # scalars, so wrapping it in np.array() would build a ragged array.
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack into (batch, obs_dim); accepts flat or single-row observations
        states = np.vstack([np.atleast_2d(s) for s in states])
        next_states = np.vstack([np.atleast_2d(s) for s in next_states])
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        # One forward pass per network instead of two per transition
        targets = np.array(self.qnetwork.predict(states, verbose=0))
        next_q = np.asarray(self.target.predict(next_states, verbose=0))
        bootstrapped = rewards + self.gamma * np.max(next_q, axis=1)

        # Only the entry of the action actually taken is moved towards the target
        targets[np.arange(len(batch)), actions] = np.where(dones, rewards, bootstrapped)

        self.qnetwork.fit(states, targets, epochs = 1, verbose = 0)

    def updateEpsilon(self):
        """Decay epsilon by ``epsilon_decay`` without dropping below ``epsilon_min``.

        An epsilon already at or below the minimum (for example 0 during
        evaluation) is left untouched.
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


In [4]:
# Create the Gym environment and hand it to a fresh DQN agent
env_name = "LunarLander-v2"
env = gym.make(env_name)
myLander = Agent(env)

# Simulation length: how many episodes to run and the step cap for each one
myLander.setSimParameters(
    episodes=1000,
    ntimesteps=2000,
)



Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# To evaluate a previously trained lander instead of training, uncomment the lines below:
# myLander.train = False
# myLander.epsilon = 0
# myLander.loadModel("pickled_lander_qnetwork.h5", "pickled_lander_target.h5")

In [0]:
# Run simulation
# One entry per episode: the total reward collected in that episode.
# Kept across episodes so the "Rewards vs Episode #" plot has one point per episode.
current_reward = []
for i in range(myLander.episodes): # myLander.episodes
    state = env.reset()
    print("Current episode number: ", i, myLander.ntimesteps)
    episode_reward = 0.0
    for _ in range(myLander.ntimesteps): # myLander.ntimesteps
        if not myLander.train:
            env.render()

        # Single-row batch; -1 avoids hard-coding the observation size
        state_row = state.reshape(1, -1)
        action = myLander.selectAction(state_row)
        new_state, reward, done, _info = env.step(action)

        episode_reward += reward

        myLander.addToMemory(state_row, action, reward, new_state.reshape(1, -1), done)
        myLander.optimize()

        state = new_state

        if done:
            break

    current_reward.append(episode_reward)

    # Once per episode: decay exploration and refresh the target network
    myLander.updateEpsilon()
    myLander.alignTarget()

    # Checkpoint both networks every 50 episodes
    if (i % 50) == 0:
        myLander.qnetwork.save("pickled_lander_qnetwork.h5")
        myLander.target.save("pickled_lander_target.h5")
        # os.system("git add pickled_lander_*.h5")
        # os.system("git commit -m \"autoupdate saved lunar lander network weights\"")

Current episode number:  0 2000
Current episode number:  1 2000
Current episode number:  2 2000
Current episode number:  3 2000
Current episode number:  4 2000
Current episode number:  5 2000
Current episode number:  6 2000
Current episode number:  7 2000
Current episode number:  8 2000
Current episode number:  9 2000
Current episode number:  10 2000
Current episode number:  11 2000
Current episode number:  12 2000
Current episode number:  13 2000
Current episode number:  14 2000
Current episode number:  15 2000
Current episode number:  16 2000
Current episode number:  17 2000
Current episode number:  18 2000
Current episode number:  19 2000
Current episode number:  20 2000
Current episode number:  21 2000
Current episode number:  22 2000
Current episode number:  23 2000
Current episode number:  24 2000
Current episode number:  25 2000
Current episode number:  26 2000
Current episode number:  27 2000
Current episode number:  28 2000
Current episode number:  29 2000
Current episode numb

In [0]:
# Draw the recorded rewards on an explicit figure/axes pair and save the image
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(current_reward)
ax.set_title("Rewards vs Episode #", size=15)
ax.set_xlabel("Episode #", size=12)
ax.set_ylabel("Rewards", size=12)
ax.grid()
fig.savefig("rewards.png")

In [0]:
# Write the final online and target networks to disk
for network, path in (
    (myLander.qnetwork, "pickled_lander_qnetwork.h5"),
    (myLander.target, "pickled_lander_target.h5"),
):
    network.save(path)