In [None]:
import random
from collections import deque

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
#test
# Build the environment once and read off the two sizes the agent needs:
# the number of discrete actions and the shape of a single observation.
env = gym.make("LunarLander-v2", render_mode="human")
num_actions, input_shape = env.action_space.n, env.observation_space.shape

# Optional manual checks (uncomment to inspect the values / take one step):
# print(f"num_actions: {num_actions}, input_shape: {input_shape}")
# env.reset()
# print(env.step(0))

In [None]:
#DQN-AGENT
class DQN:
    """Deep Q-Network agent with experience replay and a separate target network.

    `model` is the online network: it is trained in `replay` and used to pick
    actions in `act`. `target_model` has the same architecture, supplies the
    bootstrap values in `replay`, and is refreshed by `update_target_model`.
    """

    def __init__(self, input_shape, num_actions):
        """Create the replay buffer and both networks.

        Args:
            input_shape: shape of a single observation (fed to the first layer).
            num_actions: number of discrete actions, i.e. the output layer size.
        """
        self.input_shape = input_shape
        self.num_actions = num_actions
        #a large replay buffer helps against catastrophic forgetting;
        #lower maxlen (e.g. to 20000) if RAM is tight
        self.replay_buffer = deque(maxlen=100000)
        self.learning_rate = 0.0001
        self.discount_factor = 0.99

        self.model = self.build_model()
        self.target_model = self.build_model()
        #both networks are initialised with independent random weights; start
        #the target network as an exact copy of the online network
        self.update_target_model()

    # build model
    def build_model(self):
        """Build and compile the Q-network.

        Three hidden Dense layers of 64 ReLU units followed by a linear output
        layer with one unit per action, compiled with MSE loss and Adam.

        Returns:
            The compiled Keras model.
        """
        model = tf.keras.models.Sequential([
            layers.Dense(64, activation="relu", input_shape=self.input_shape),
            layers.Dense(64, activation="relu"),
            layers.Dense(64, activation="relu"),
            layers.Dense(self.num_actions, activation="linear")
        ])
        #compiles model with mean squared error loss and adam optimizer
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    #agent takes action based on epsilon-greedy policy
    def act(self, state, epsilon = 0.0):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: a single observation as a NumPy array (no batch dimension).
            epsilon: probability of picking a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() < epsilon:
            return np.random.choice(self.num_actions)
        #call the model directly instead of predict(): for one small batch this
        #skips the per-call setup cost of predict(), which matters because act()
        #runs once per environment step
        q_values = np.asarray(self.model(state[np.newaxis], training=False))
        return np.argmax(q_values[0])

    #agent stores experience in replay buffer
    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        `done` is used in `replay` to switch off the bootstrap term, so it
        should be True only when `next_state` has no future value.
        """
        self.replay_buffer.append((state, action, reward, next_state, done))

    #agent samples from replay buffer and updates Q-values
    def replay(self, batch_size):
        """Sample a minibatch from the replay buffer and train the online model.

        Does nothing until the buffer holds at least `batch_size` transitions.

        Args:
            batch_size: number of transitions to sample and train on.
        """
        if len(self.replay_buffer) < batch_size:
            return

        samples = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*samples))

        #the online model's own predictions are the regression baseline: only
        #the entry of the action actually taken is overwritten below, so every
        #other output has zero error. The target model is used solely for the
        #bootstrap term.
        #verbose=0 keeps these calls silent even when interactive logging has
        #not been disabled globally.
        q_values = np.array(self.model.predict(states, verbose=0))
        next_q_values = self.target_model.predict(next_states, verbose=0)
        max_next_q_values = np.max(next_q_values, axis=1)
        #0.0 for transitions flagged done, 1.0 otherwise
        not_done = 1.0 - dones.astype(np.float32)
        target_q_values = rewards + not_done * self.discount_factor * max_next_q_values
        q_values[np.arange(batch_size), actions] = target_q_values
        #pass batch_size explicitly so one replay() call is exactly one gradient
        #step on the sampled minibatch (fit() would otherwise split anything
        #larger than its default of 32 into several steps)
        self.model.fit(states, q_values, batch_size=batch_size, verbose=0)

    #updates target model weights with model weights
    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())


In [None]:
#Preparation
#fixed seed so that exploration, replay sampling, weight initialisation and
#the environment's initial states can be repeated from run to run
SEED = 42

keras.utils.disable_interactive_logging()
#seeds Python's `random`, NumPy and TensorFlow in one call
keras.utils.set_random_seed(SEED)

env = gym.make("LunarLander-v2", render_mode="human")
#seed the environment's own random generator once; later reset() calls
#without a seed keep drawing from this generator
env.reset(seed=SEED)
num_actions = env.action_space.n
input_shape = env.observation_space.shape

#Agent
dqn = DQN(input_shape, num_actions)

In [None]:
#training loop

running_reward = []
batch_size = 32 
epsilon = 1.0
epsilon_decay = 0.99941 #0.995
min_epsilon = 0.01
target_update_counter = 0
#the target model is synced every `target_update_freq` episodes
#(the counter below is incremented once per episode, not per step)
target_update_freq = 1
num_episodes = 5000 
#an episode is aborted early once its accumulated reward drops below this
min_reward = -250

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        #no explicit env.render() here: with render_mode="human" the
        #environment already renders inside step(), so an extra call would
        #only draw every frame a second time
        action = dqn.act(state, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        #store `terminated`, not `done`: replay() drops the bootstrap term for
        #flagged transitions, which is only correct for a real terminal state.
        #A time-limit truncation just stops the episode; next_state still has
        #future value.
        dqn.remember(state, action, reward, next_state, terminated)
        state = next_state
        episode_reward += reward
        dqn.replay(batch_size)

        #give up on hopeless episodes early
        if episode_reward < min_reward:
            done = True

    target_update_counter += 1
    if target_update_counter % target_update_freq == 0:
        dqn.update_target_model()
        
    running_reward.append(episode_reward)
    #epsilon is decayed once per episode and never falls below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    print(f"Episode: {episode} episode reward: {episode_reward} Epsilon: {epsilon}")

In [None]:
# load previously saved weights into the online network
NAME_TO_LOAD = "models/initial/weights.h5"
dqn.model.load_weights(NAME_TO_LOAD)
# load_weights only touches dqn.model; copy the restored weights into the
# target network too, so that resumed training does not bootstrap from
# weights unrelated to the loaded model
dqn.update_target_model()

In [None]:
#testing model 
#runs 20 evaluation episodes with the greedy policy and prints each return
for episode in range(20):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        #no explicit env.render(): with render_mode="human" the environment
        #already renders inside step(), an extra call would draw each frame twice
        #with epsilon = 0.0 so the agent exploits the learned policy 
        action = dqn.act(state, 0.0)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = next_state
        episode_reward += reward
    print(f"Test Episode: {episode} episode reward: {episode_reward}")

In [None]:
#persist the trained model, the per-episode rewards and a summary of the run
NAME = "initial"
save_dir = f"models/{NAME}"

dqn.model.save(f"{save_dir}/weights.h5")
np.save(f"{save_dir}/rewards.npy", running_reward)

#test rewards could be stored the same way once they are collected:
# if test_rewards:
#     np.save(f"models/{NAME}/test_rewards.npy", test_rewards)

#info.txt holds the model summary followed by one "label: value" line per setting
with open(f"{save_dir}/info.txt", "w") as f:
    dqn.model.summary(print_fn=lambda line: f.write(line + '\n'))
    for label, value in (
        ("Replay buffer length", len(dqn.replay_buffer)),
        ("Learning rate", dqn.learning_rate),
        ("Discount factor", dqn.discount_factor),
        ("Batch size", batch_size),
        ("Epsilon decay", epsilon_decay),
        ("Target update frequency", target_update_freq),
        ("Episodes", num_episodes),
    ):
        f.write(f"{label}: {value}\n")

In [None]:
#plot the reward collected in each training episode
#(matplotlib.pyplot is imported as plt in the import cell at the top)
plt.plot(running_reward)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.show()