<div style="font-size: 40px;">
    Lunar Landing Deep Q learning Project
</div>

In [121]:
import numpy as np
import random
import tensorflow as tf
import gymnasium as gym
from collections import deque
from bayes_opt import BayesianOptimization


<div style="font-size: 20px;">
    Creating the agent and methods for training
</div>

Below is the class for the agent. It also includes important methods such as `replay`, which samples a batch from the memory and uses it to fit the neural network. `replay` is called inside another method (`run`) that also stores experiences and selects actions according to the epsilon-greedy policy.

In [127]:
class agent():
    """Deep Q-learning agent for the Gymnasium ``LunarLander-v2`` environment.

    The agent owns the environment, a fully connected Q-network that maps the
    8-value observation to 4 action values, and a replay memory of
    ``(state, action, reward, next_state, done)`` transitions.
    """

    #longest episode allowed during training, in environment steps
    max_training_steps = 500
    #episode score at which training stops early and the model is saved
    target_score = 300

    #creates the neural network
    def create_model(self):
        """Build and compile the Q-network.

        The network has ``self.num_layers`` dense layers. Every layer except
        the last is a ReLU layer whose width is read from ``self.num_neurons``;
        the last layer is a 4-unit linear layer (one Q-value per action). With
        a single layer the network is a linear map from observation to
        Q-values.

        Returns:
            The compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE
            loss).
        """
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        layers = []
        last = self.num_layers - 1

        #appends the layers according to the hyperparameter self.num_layers
        for index in range(self.num_layers):
            #only the first layer declares the 8-value input
            extra = {"input_shape": (8,)} if index == 0 else {}
            if index == last:
                layers.append(tf.keras.layers.Dense(4, activation="linear", **extra))
            else:
                layers.append(
                    tf.keras.layers.Dense(self.num_neurons[index], activation="relu", **extra)
                )

        model = tf.keras.Sequential(layers)
        model.compile(optimizer=optimizer, loss="mse", metrics=["accuracy"])

        return model

    def __init__(self, discount_rate, epsilon_decay, eps, batch_size, learning_rate, num_layers, num_neurons):
        """Create the environment, the Q-network and an empty replay memory.

        Args:
            discount_rate: Discount factor used in the Bellman target.
            epsilon_decay: Factor epsilon is multiplied by after every action.
            eps: Number of training episodes performed by ``run``.
            batch_size: Number of transitions sampled per ``replay`` call.
            learning_rate: Learning rate of the Adam optimizer.
            num_layers: Total number of dense layers, output layer included.
            num_neurons: Widths of the hidden layers; only the first
                ``num_layers - 1`` entries are read.
        """
        self.env = gym.make("LunarLander-v2")
        #the three attributes below must be set before create_model() is called
        self.learning_rate = float(learning_rate)
        self.num_layers = int(num_layers)
        self.num_neurons = [int(n) for n in num_neurons]
        self.model = self.create_model()
        self.discount_rate = float(discount_rate)
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = float(epsilon_decay)
        #replay() does nothing until the memory holds this many transitions
        self.training_start = 1000
        self.eps = int(eps)
        self.batch_size = int(batch_size)
        self.memory = deque(maxlen=1000000)

    #picks an action using the epsilon greedy policy
    def act(self, state):
        """Return an action index for ``state`` and decay epsilon.

        With probability ``self.epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        predicted Q-value is chosen.

        Args:
            state: Observation shaped ``(1, 8)``, as produced by ``untuple``.

        Returns:
            The chosen action as a Python ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(self.model.predict(state, verbose=0))

        if self.epsilon > self.epsilon_min:
            #clamp so that the last decay step cannot push epsilon below its floor
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return int(action)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def untuple(self, state):
        """Return the observation as an array of shape ``(1, 8)``.

        ``env.reset()`` returns an ``(observation, info)`` tuple while
        ``env.step()`` yields the bare observation; both forms are accepted.
        """
        if isinstance(state, tuple):
            state = state[0]
        return np.array(state).reshape((1, 8))

    #samples a batch from the memory and uses it to
    #compute the target from the Bellman equation
    #in order to train the model
    def replay(self):
        """Fit the Q-network on one random batch drawn from the memory.

        Nothing happens until the memory holds ``self.training_start``
        transitions. For every sampled transition the target of the action
        that was taken is ``reward`` when the transition was terminal and
        ``reward + discount_rate * max(Q(next_state))`` otherwise; the targets
        of the other actions are left at the network's current predictions.
        """
        if len(self.memory) < self.training_start:
            return

        batch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        #size everything from the sample actually drawn: it is smaller than
        #batch_size whenever the memory holds fewer transitions than that
        sample_size = len(batch)
        states = np.zeros((sample_size, 8))
        next_states = np.zeros((sample_size, 8))
        for row, transition in enumerate(batch):
            states[row] = transition[0]
            next_states[row] = transition[3]
        actions = np.array([int(transition[1]) for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        dones = np.array([bool(transition[4]) for transition in batch])

        targets = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)

        #Bellman target, written only into the column of the action taken
        bootstrap = self.discount_rate * np.max(next_q, axis=1)
        targets[np.arange(sample_size), actions] = np.where(dones, rewards, rewards + bootstrap)

        self.model.fit(states, targets, batch_size=self.batch_size, verbose=0)

    def save(self, name):
        """Save the Q-network to the file ``name``."""
        self.model.save(name)

    #trains the model as well as appends each experience the memory
    #and replays at every step
    def run(self):
        """Train for up to ``self.eps`` episodes.

        An episode ends when the environment terminates or truncates it, or
        after ``max_training_steps`` steps. Training stops early, and the model
        is saved to ``model.keras``, as soon as an episode scores at least
        ``target_score``; the file is not written otherwise.

        Returns:
            The average episode score truncated to an ``int`` (0 when no
            episode was played).
        """
        scores = []
        solved = False
        for episode in range(self.eps):
            state = self.untuple(self.env.reset())
            score = 0
            steps = 0
            while True:
                steps += 1
                action = self.act(state)
                #gymnasium returns (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_state = self.untuple(next_state)
                #only a real terminal state stops bootstrapping in replay();
                #a time-limit truncation still has a meaningful next state
                self.remember(state, action, reward, next_state, terminated)
                state = next_state
                score += reward

                if terminated or truncated or steps >= self.max_training_steps:
                    print("episode: {}/{}, score:{}".format(episode+1, self.eps, score))
                    scores.append(score)
                    if score >= self.target_score:
                        solved = True
                        self.save("model.keras")
                    break

                self.replay()

            if solved:
                break

        #guard against eps == 0, which would otherwise divide by zero
        average = sum(scores) / len(scores) if scores else 0.0
        print("Average score:{}".format(average))

        return int(average)

    def test(self):
        """Play 10 rendered episodes with the model stored in ``model.keras``.

        The file must already exist in the working directory. An episode ends
        when the environment terminates or truncates it, or when observation
        entries 6 and 7 (the two leg-contact flags of LunarLander) are both 1.
        The score of every episode is printed.
        """
        env = gym.make("LunarLander-v2", render_mode="human")
        try:
            model = tf.keras.models.load_model("model.keras")
            for episode in range(10):
                state = self.untuple(env.reset())
                score = 0
                while True:
                    action = int(np.argmax(model.predict(state, verbose=0)))
                    next_state, reward, terminated, truncated, _ = env.step(action)
                    state = self.untuple(next_state)
                    score += reward

                    landed = state[0][6] == 1 and state[0][7] == 1
                    if terminated or truncated or landed:
                        print("episode: {}/{}, score:{}".format(episode+1, 10, score))
                        break
        finally:
            #always release the render window, even if loading or playing fails
            env.close()


<div style="font-size: 20px;">
    Hyperparameter Tuning Using Bayesian Optimization
</div>

The code below uses Bayesian optimization, which searches for the combination of hyperparameters (including the neural network architecture and the learning rate) that maximizes the average reward over the first 100 episodes.

In [None]:
#(Please do not run this code as it is extremely long to run and not necessary)

#dictionary that sets the (lower, upper) bounds for the hyperparameters;
#every key is the name of a keyword argument of `optimize`
parameter_bounds = {
    'learning_rate': (1e-4, 1e-2),
    #the sampled value is truncated with int() before use
    'num_layers': (1, 5),
    'num_neuron1': (1, 600),
    'num_neuron2': (1, 600),
    'num_neuron3': (1, 600),
    'num_neuron4': (1, 600),
    'num_neuron5': (1, 600),
    'discount_rate': (0.7, 0.99),
    #agent.act multiplies epsilon by this factor after every action, so it has
    #to stay close to 1: with a factor of 0.1 or less epsilon would fall from
    #1.0 to its floor of 0.01 within two actions and exploration would be lost
    'epsilon_decay': (0.9, 0.999),
    'batch_size': (8, 512),
}

#objective function evaluated by the optimizer
def optimize(learning_rate,num_neuron1,num_neuron2,num_neuron3,num_neuron4,num_neuron5, num_layers, discount_rate, epsilon_decay, batch_size):
    """Train a fresh agent for 100 episodes and return its average score.

    The optimizer supplies every hyperparameter as a float; the values are
    converted here to the types the agent expects (ints are truncated).
    """
    layer_widths = [
        int(width)
        for width in (num_neuron1, num_neuron2, num_neuron3, num_neuron4, num_neuron5)
    ]

    candidate = agent(
        discount_rate=float(discount_rate),
        epsilon_decay=float(epsilon_decay),
        eps=100,
        batch_size=int(batch_size),
        learning_rate=float(learning_rate),
        num_layers=int(num_layers),
        num_neurons=layer_widths,
    )
    return candidate.run()


#the optimizer calls `optimize` with one keyword argument per key of
#`parameter_bounds`, each drawn from that key's (lower, upper) range
optimizer = BayesianOptimization(
    f=optimize,
    pbounds=parameter_bounds
)

#runs the search with the library's default settings; every evaluation
#trains a new agent, which is why this cell is so slow
optimizer.maximize()

#prints the optimizer's `max` attribute
#(presumably the best score and its hyperparameters; see the bayes_opt docs)
print(optimizer.max)



<div style="font-size: 20px;">
    Training the Agent
</div>

Using the hyperparameters from the Bayesian optimization, we will train the agent over a series of 200 episodes.

In [None]:
#agent built with the tuned hyperparameters; with num_layers=3 only the
#first two entries of num_neurons are read (two hidden layers plus the output)
space_ship = agent(
    discount_rate=0.99,
    epsilon_decay=0.995,
    eps=200,
    batch_size=179,
    learning_rate=0.001346,
    num_layers=3,
    num_neurons=[128, 128, 0, 0, 0],
)

In [None]:
#(Please do not run this code as it is extremely long to run and not necessary)
#trains the agent for the configured number of episodes; the call returns
#the average episode score truncated to an int
space_ship.run()

<div style="font-size: 20px;">
    Testing
</div>

In [None]:
#loads the saved "model.keras" and plays 10 rendered episodes,
#printing the score of each one
space_ship.test()