In [1]:
# dependencies
# !pip install gym box2d 

In [2]:
# import os 
# os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [3]:
import time
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
# from replay_buffer import ReplayBuffer

In [4]:
class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, new_state, done) transitions.

    Once ``size`` transitions have been written, new ones overwrite the oldest
    slots (write index is ``counter % size``).
    """

    def __init__(self, size, input_shape):
        """Allocate zero-filled storage for ``size`` transitions.

        Args:
            size: maximum number of transitions kept at once.
            input_shape: shape of one observation; either a single int
                (flat vector length) or a tuple/list of ints.
        """
        self.size = size
        self.counter = 0  # total number of transitions ever stored (not capped at size)
        # Accept both a scalar dimension and a full shape so that non-flat
        # observations can be stored as well.
        obs_shape = (input_shape,) if np.ndim(input_shape) == 0 else tuple(input_shape)
        self.state_buffer = np.zeros((self.size, *obs_shape), dtype=float)
        self.action_buffer = np.zeros(self.size, dtype=int)
        self.reward_buffer = np.zeros(self.size, dtype=float)
        self.new_state_buffer = np.zeros((self.size, *obs_shape), dtype=float)
        self.terminal_buffer = np.zeros(self.size, dtype=bool)

    def store_tuples(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        i = self.counter % self.size
        self.state_buffer[i] = state
        self.action_buffer[i] = action
        self.reward_buffer[i] = reward
        self.new_state_buffer[i] = new_state
        self.terminal_buffer[i] = done
        self.counter += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` of arrays
            whose first dimension is ``batch_size``.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        # Only slots that have actually been written are eligible.
        max_buffer = min(self.counter, self.size)
        if batch_size > max_buffer:
            raise ValueError(
                "Cannot sample {} transitions: only {} stored".format(batch_size, max_buffer))
        batch = np.random.choice(max_buffer, batch_size, replace=False)
        state_batch = self.state_buffer[batch]
        action_batch = self.action_buffer[batch]
        reward_batch = self.reward_buffer[batch]
        new_state_batch = self.new_state_buffer[batch]
        done_batch = self.terminal_buffer[batch]

        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

In [5]:
def DeepQNetwork(lr, num_actions, input_dims, fc1, fc2):
    """Build and compile a three-layer fully connected Q-value network.

    Args:
        lr: learning rate handed to the Adam optimizer.
        num_actions: width of the linear output layer (one value per action).
        input_dims: length of the input vector expected by the first layer.
        fc1: number of units in the first ReLU layer.
        fc2: number of units in the second ReLU layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, ``'mse'`` loss).
    """
    stack = [
        Dense(fc1, input_dim=input_dims, activation='relu'),
        Dense(fc2, activation='relu'),
        # No activation on the output: the layer emits raw values.
        Dense(num_actions, activation=None),
    ]
    network = Sequential(stack)
    network.compile(loss='mse', optimizer=Adam(learning_rate=lr))
    return network

In [6]:
def plot_graph(episodes, scores, avg_scores, obj):
    """Plot per-episode score, average score and the solved threshold, then show the figure.

    Args:
        episodes: x-axis values (episode indices).
        scores: score per episode.
        avg_scores: average score per episode.
        obj: threshold value per episode (drawn as a dashed red line).
    """
    df = pd.DataFrame({'x': episodes, 'Score': scores, 'Average Score': avg_scores, 'Solved Requirement': obj})

    dashed = {'linestyle': 'dashed'}
    # (dataframe column, line colour, legend label, extra line options)
    curves = (
        ('Score', 'blue', 'Score', {}),
        ('Average Score', 'orange', 'AverageScore', dashed),
        ('Solved Requirement', 'red', 'Solved Requirement', dashed),
    )
    for column, colour, legend_label, extra in curves:
        plt.plot('x', column, data=df, marker='', color=colour, linewidth=2, label=legend_label, **extra)

    plt.legend()
    plt.show()

In [7]:
class Agent:
    """DQN agent with an online network, a periodically synced target network and a replay buffer."""

    # Score threshold drawn on the plots and used for the early-stopping check.
    SOLVED_SCORE = 200

    def __init__(self, lr, discount_factor, num_actions, epsilon, epsilon_decay, batch_size, input_dims, fc1, fc2,
                 buffer_size=500000, update_rate=120):
        """Create the networks and the replay buffer.

        Args:
            lr: learning rate for both networks' optimizers.
            discount_factor: multiplier applied to the bootstrapped next-state value.
            num_actions: number of discrete actions.
            epsilon: initial probability of picking a random action.
            epsilon_decay: factor epsilon is multiplied by after every training episode.
            batch_size: number of transitions sampled per training step.
            input_dims: length of one observation vector.
            fc1: units in the first hidden layer.
            fc2: units in the second hidden layer.
            buffer_size: capacity of the replay buffer.
            update_rate: number of training steps between target-network syncs.
        """
        self.action_space = list(range(num_actions))
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.epsilon_decay = epsilon_decay
        self.update_rate = update_rate
        self.step_counter = 0
        self.buffer = ReplayBuffer(buffer_size, input_dims)
        self.q_net = DeepQNetwork(lr, num_actions, input_dims, fc1, fc2)
        self.q_target_net = DeepQNetwork(lr, num_actions, input_dims, fc1, fc2)

    def store_tuple(self, state, action, reward, new_state, done):
        """Record one transition in the replay buffer."""
        self.buffer.store_tuples(state, action, reward, new_state, done)

    def policy(self, observation):
        """Pick an action: random with probability ``epsilon``, otherwise the online net's argmax."""
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        state = np.array([observation])  # add a leading batch dimension of 1
        q_values = np.asarray(self.q_net(state))
        return np.argmax(q_values, axis=1)[0]

    def train(self):
        """Run one training step on a sampled batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Every ``update_rate`` steps the online weights are copied into the
        target network before the batch is processed.
        """
        if self.buffer.counter < self.batch_size:
            return
        if self.step_counter % self.update_rate == 0:
            self.q_target_net.set_weights(self.q_net.get_weights())

        state_batch, action_batch, reward_batch, new_state_batch, done_batch = \
            self.buffer.sample_buffer(self.batch_size)

        q_predicted = self.q_net(state_batch)
        q_next = self.q_target_net(new_state_batch)
        q_max_next = np.max(np.asarray(q_next), axis=1)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        q_target = np.copy(q_predicted)

        # Terminal transitions get the bare reward; np.where (rather than a
        # multiply by a mask) keeps a non-finite next value out of them.
        bootstrap = np.where(done_batch, 0.0, q_max_next)
        td_target = reward_batch + self.discount_factor * bootstrap
        q_target[np.arange(len(action_batch)), action_batch] = td_target

        self.q_net.train_on_batch(state_batch, q_target)
        self.step_counter += 1

    def _save_checkpoint(self, log_file, episode, num_episodes, score, avg_score):
        """Save the online network (model + .h5 weights) and append a line to the log file."""
        model_dir = "saved_networks/dqn_model{0}".format(episode)
        self.q_net.save(model_dir)
        self.q_net.save_weights("{0}/net_weights{1}.h5".format(model_dir, episode))
        log_file.write("Save {0} - Episode {1}/{2}, Score: {3} ({4}), AVG Score: {5}\n".format(
            episode, episode, num_episodes, score, self.epsilon, avg_score))

    def train_model(self, env, num_episodes, graph):
        """Train for up to ``num_episodes`` episodes and return the per-episode scores.

        The network is checkpointed after the first and the last episode, and
        when the mean of the last 10 scores exceeds ``SOLVED_SCORE`` (training
        then stops early). Checkpoints are listed in ``saved_networks.txt``.

        Args:
            env: environment; this code uses ``env.reset()`` as returning the
                observation and ``env.step(action)`` as returning a 4-tuple
                ``(new_state, reward, done, info)``.
            num_episodes: maximum number of episodes to run.
            graph: when truthy, plot the score curves after training.

        Returns:
            List with the total reward of every episode played.
        """
        scores, episodes, avg_scores, obj = [], [], [], []
        goal = self.SOLVED_SCORE
        print_count = 100

        # The context manager closes the log even if training is interrupted.
        with open("saved_networks.txt", "w") as txt:
            t1 = time.perf_counter()

            for i in range(num_episodes):
                done = False
                score = 0.0
                state = env.reset()
                while not done:
                    action = self.policy(state)
                    new_state, reward, done, _ = env.step(action)
                    score += reward
                    self.store_tuple(state, action, reward, new_state, done)
                    state = new_state
                    self.train()
                scores.append(score)
                obj.append(goal)
                episodes.append(i)
                avg_score = np.mean(scores[-100:])
                avg_scores.append(avg_score)

                avg_score_10 = np.mean(scores[-10:])

                if (i % print_count == 0) and (i != 0):
                    print("Episode {0}/{1}, Score: {2} ({3}), AVG Score: {4}".format(
                        i, num_episodes, score, self.epsilon, avg_score))
                    t2 = time.perf_counter()
                    print("Finished {} episodes in {} seconds".format(print_count, t2 - t1))
                    t1 = time.perf_counter()

                self.epsilon *= self.epsilon_decay

                if avg_score_10 > goal:
                    print("The average rewards of the last 10 episodes > {}. Early stopping...".format(goal))
                    self._save_checkpoint(txt, i, num_episodes, score, avg_score)
                    break

                if (i == 0) or (i == num_episodes - 1):
                    self._save_checkpoint(txt, i, num_episodes, score, avg_score)
                    print("Network saved")

        if graph:
            plot_graph(episodes, scores, avg_scores, obj)

        return scores

    def test(self, env, num_episodes, file_type, file, graph):
        """Play ``num_episodes`` episodes with exploration disabled, using a saved network.

        Args:
            env: environment (same API expectations as in ``train_model``);
                it is closed before returning.
            num_episodes: number of evaluation episodes.
            file_type: ``'tf'`` to load a full saved model from ``file``,
                ``'h5'`` to load weights from ``file`` into the current
                network; any other value keeps the current network as is.
            file: path passed to the loader.
            graph: when truthy, save the score plot to ``LunarLander_Test.png``.

        Returns:
            List with the total reward of every evaluation episode.
        """
        if file_type == 'tf':
            self.q_net = tf.keras.models.load_model(file)
        elif file_type == 'h5':
            # Load straight into the existing network. A short training run
            # here would overwrite saved_networks/dqn_model0 and
            # saved_networks.txt before the weights are read.
            self.q_net.load_weights(file)
        self.epsilon = 0.0  # always take the network's preferred action

        scores, episodes, avg_scores, obj = [], [], [], []
        goal = self.SOLVED_SCORE
        for i in range(num_episodes):
            state = env.reset()
            done = False
            episode_score = 0.0
            while not done:
                action = self.policy(state)
                new_state, reward, done, _ = env.step(action)
                episode_score += reward
                state = new_state
            scores.append(episode_score)
            print(f"{i}th round - {episode_score}")
            obj.append(goal)
            episodes.append(i)
            avg_score = np.mean(scores[-100:])
            avg_scores.append(avg_score)

        if graph:
            df = pd.DataFrame({'x': episodes, 'Score': scores, 'Average Score': avg_scores, 'Solved Requirement': obj})

            plt.plot('x', 'Score', data=df, marker='', color='blue', linewidth=2, label='Score')
            plt.plot('x', 'Average Score', data=df, marker='', color='orange', linewidth=2, linestyle='dashed',
                     label='AverageScore')
            plt.plot('x', 'Solved Requirement', data=df, marker='', color='red', linewidth=2, linestyle='dashed',
                     label='Solved Requirement')
            plt.legend()
            plt.savefig('LunarLander_Test.png')

        env.close()

        return scores



In [8]:
# Agent hyper-parameters, one per line so each can be tuned at a glance.
dqn_agent = Agent(
    lr=0.001,
    discount_factor=0.99,
    num_actions=4,
    epsilon=1.0,
    epsilon_decay=0.995,
    batch_size=128,
    input_dims=8,
    fc1=512,
    fc2=256,
)
# Print the layer/parameter table of the online network.
dqn_agent.q_net.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               4608      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 1028      
Total params: 136,964
Trainable params: 136,964
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Environment instance and its registry spec.
env = gym.make("LunarLander-v2")
spec = gym.spec("LunarLander-v2")

# Run-mode switches: training runs only when `train` is truthy and `test` is falsy.
train, test = 1, 0

# Episode budget and whether to draw the score curves.
num_episodes, graph = 5000, True

# Saved network handed to Agent.test (loader kind, path).
file_type, file = 'h5', 'saved_networks/dqn_model0'

In [10]:
# Time the whole run (training or evaluation).
t_start = time.perf_counter()

if test or not train:
    # Evaluation is the fallback whenever training is not exclusively selected.
    dqn_agent.test(env, num_episodes, file_type, file, graph)
else:
    scores = dqn_agent.train_model(env, num_episodes, graph)

t_end = time.perf_counter()

print("Finished {} episodes in {} seconds".format(num_episodes, t_end - t_start))

INFO:tensorflow:Assets written to: saved_networks/dqn_model0/assets
Network saved


KeyboardInterrupt: 

In [None]:
# Persist the per-episode scores to scores.out.
np.savetxt('scores.out', np.asarray(scores), delimiter=',')

In [None]:
# Display the agent's current exploration rate (decayed once per training episode).
dqn_agent.epsilon