In [1]:
import argparse
import os
import random
import time
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam



In [2]:
# Toggle TensorBoard logging here: True enables it, False disables it.
use_tensorboard = False

# The callback is only constructed when logging is requested; its log directory
# is named after the current timestamp. Otherwise `tensorboard` is None.
tensorboard = TensorBoard(log_dir=f"logs/{time.time()}") if use_tensorboard else None

# ... rest of your code


In [3]:
class DQNAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    Q-values are approximated by a small fully connected network. Transitions
    are stored in a bounded deque and the network is trained on random
    minibatches drawn from it.
    """

    def __init__(self, state_size, action_size):
        """Set hyper-parameters and build the Q-network.

        Args:
            state_size: Number of features in one observation (network input width).
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: two 24-unit ReLU layers, linear output.

        Returns:
            A compiled Keras ``Sequential`` model with one output per action.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(state, action, reward, next_state, done)`` to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation batch of one row, as accepted by ``model.predict``.

        Returns:
            int: A uniformly random action with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size, tensorboard_callback=None):
        """Train the Q-network on one random minibatch, then decay epsilon.

        The whole minibatch is evaluated with two ``predict`` calls and fitted
        with a single ``fit`` call (one gradient step on ``batch_size`` samples)
        instead of calling Keras once per transition, which dominated runtime.

        Args:
            batch_size: Number of transitions to sample from memory.
            tensorboard_callback: Optional Keras callback forwarded to ``fit``.

        Returns:
            None. Does nothing (epsilon included) while memory holds fewer than
            ``batch_size`` transitions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # Stack per-transition fields into arrays; stored states are rows of
        # shape (1, state_size), so vstack yields (batch_size, state_size).
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([bool(transition[4]) for transition in minibatch])

        q_targets = self.model.predict(states, verbose=0)
        q_next = self.model.predict(next_states, verbose=0)

        # Bellman target: terminal transitions get the bare reward, the rest
        # bootstrap from the best next-state Q-value.
        bootstrapped = rewards + self.gamma * np.amax(q_next, axis=1)
        q_targets[np.arange(batch_size), actions] = np.where(dones, rewards, bootstrapped)

        callbacks = [tensorboard_callback] if tensorboard_callback else []
        self.model.fit(states, q_targets, batch_size=batch_size, epochs=1,
                       verbose=0, callbacks=callbacks)

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)


In [None]:
def reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both reset conventions: the observation on its own, or an
    ``(observation, info)`` tuple.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def step_env(env, action):
    """Advance ``env`` by one step and normalise the result.

    Accepts both step conventions: ``(obs, reward, done, info)`` and
    ``(obs, reward, terminated, truncated, info)``.

    Returns:
        tuple: ``(next_state, reward, terminated, truncated)``; ``truncated``
        is ``False`` for the 4-tuple convention.

    Raises:
        ValueError: If ``env.step`` does not return a tuple of at least 4 items.
    """
    step_info = env.step(action)
    if not isinstance(step_info, tuple) or len(step_info) < 4:
        raise ValueError(f"Unexpected return format from env.step: {step_info}")
    if len(step_info) >= 5:
        next_state, reward, terminated, truncated = step_info[:4]
        return next_state, reward, bool(terminated), bool(truncated)
    next_state, reward, done = step_info[:3]
    return next_state, reward, bool(done), False


if __name__ == "__main__":
    max_steps = 500     # hard cap on steps per episode
    batch_size = 32     # transitions sampled per replay call
    render_every = 50   # render one episode out of this many
    save_every = 10     # checkpoint interval, in episodes
    save_dir = "./save"

    env = gym.make('LunarLander-v2')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    episodes = 1000

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(save_dir, exist_ok=True)

    for e in range(episodes):
        state = np.reshape(reset_env(env), [1, state_size])
        total_reward = 0

        # Render the environment every `render_every` episodes
        render = e % render_every == 0

        for _ in range(max_steps):
            if render:
                env.render()

            action = agent.act(state)
            next_state, reward, terminated, truncated = step_env(env, action)
            next_state = np.reshape(next_state, [1, state_size])

            # Only a true terminal state should stop bootstrapping in replay;
            # a time-limit truncation ends the episode but is not terminal.
            agent.remember(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward

            if len(agent.memory) > batch_size:
                agent.replay(batch_size, tensorboard_callback=tensorboard)

            if terminated or truncated:
                break

        # Report every episode, including those cut off by the step cap.
        print(f"Episode: {e}/{episodes}, Reward: {total_reward}, Epsilon: {agent.epsilon}", flush=True)

        if e % save_every == 0:
            agent.save(f"{save_dir}/lunarlander-dqn-{e}.h5")

    # Final weights under the fixed name that the evaluation cell loads.
    agent.save(f"{save_dir}/lunarlander-dqn.h5")


2023-12-18 00:43:39.892175: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-12-18 00:43:39.892300: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-18 00:43:39.894057: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-12-18 00:43:39.973101: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-12-18 00:43:39.973300: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2995200000 Hz


In [None]:
# Render the environment created in the training cell (`env = gym.make(...)`).
# NOTE(review): this cell fails on a fresh kernel unless the training cell has
# run first; how the frame is displayed depends on the installed gym version.
env.render()

In [None]:
# Evaluate a trained agent: load saved weights and play greedy episodes.
# Relies on `env`, `state_size` and `action_size` from the training cell.
trained_agent = DQNAgent(state_size, action_size)
# NOTE(review): this file must already exist; confirm the path matches what the
# training cell saved.
trained_agent.load("./save/lunarlander-dqn.h5")
# A fresh agent starts with epsilon = 1.0, which makes `act` pick random actions
# and ignore the loaded weights. Disable exploration for evaluation.
trained_agent.epsilon = 0.0

try:
    for e in range(100):
        # reset() returns either the observation or an (observation, info) tuple.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])

        # `_` instead of `time`, so the imported `time` module is not overwritten.
        for _ in range(500):
            env.render()
            action = trained_agent.act(state)

            # step() returns 4 items (done) or 5 items (terminated, truncated).
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _info = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _info = step_result

            state = np.reshape(next_state, [1, state_size])
            if done:
                break
finally:
    # Release the environment even if an episode raises.
    env.close()
