In [145]:
import numpy as np
import tensorflow as tf
from random import random, sample
from keras.layers import Dense
from keras.models import Sequential
from collections import deque
%run MazeEnv.ipynb

In [171]:
class Agent:
    def __init__(self):
        # hyperparameters
        self.episodes = 500
        self.time_allowed_in_game = 150
        self.epsilon = 1
        self.min_epsilon = 0.01
        self.epsilon_multiplier = 0.95
        self.discount_rate = 0.9

        # create 2 models here - one for use with the predictions and the other to train on
        # the target model will be set to the trained model after a specfic num of iterations
        self.model = self.create_model()
        self.target_model = self.create_model()
        # both models must start with the same weights
        self.target_model.set_weights(self.model.get_weights())
        
        # create the list for the replay memory
        self.replay_memory = deque(maxlen=1028)
        
        # keep track of how often the target_weights = normal_weights
        self.t_target_current = 0
        self.t_target_threshold = 256
        # keep track of how often the to get from minibatch and train
        self.t_train_current = 0
        self.t_train_threshold = 128
        # mini batch to train on
        self.mini_batch_size = 64
        
    
    # create the model structure to be used
    def create_model(self):
        model = Sequential()
        
        # inputs will be the x and y coordinates of the maze - 2 input
        model.add(Dense(3, input_dim = 2, activation = "relu"))
        model.add(Dense(4, activation = "linear"))
        
        # the mse loss works best in dnq
        model.compile(optimizer = "adam", loss = "mean_squared_error")
        return model

    
    # choose an action to perform in the game
    def select_action(self, state):
        r = random()
        # choose a random action
        if r < self.epsilon:
            # this is coming from the maze file
            return select_move_from_num(r)
        else:
            x = state[0]
            y = state[1]
            inputs = np.array([x, y]).reshape((1, 2))
            move_prediction = np.array(self.model.predict(inputs)[0])
            prediction = np.argmax(move_prediction)
            return prediction
    
    
    # train the model
    def train(self):
        print("TRAINING")
        # training the model based on the minibatch - check if enough to do the training
        if len(self.replay_memory) < self.mini_batch_size:
            return
        
        # get a sample of the replay memory with the minibatch
        mini_batch = sample(self.replay_memory, self.mini_batch_size)
        
        # use the target values for training the model
        target_y = []
        x = []
        
        # go through each of the transitions for gradient descent
        for i, (old_state, action_direction, reward, new_state, done) in enumerate(mini_batch):
            actual = reward
            if done == False:
                inputs = np.array(new_state).reshape((1, 2))
                move_prediction = np.array(self.target_model.predict(inputs)[0])
                next_state_q_val = max(move_prediction)
                actual = reward + (self.discount_rate * next_state_q_val)
            
            target_y.append(actual)
            x.append(list(old_state))
            
        target_y = np.array(target_y).reshape((len(target_y), 1))
        x = np.array(x).reshape((len(x), 2))
        self.model.fit(x, target_y)
        
        
    # run the game multiple times across the env
    def run_game(self):
        for i in range(self.episodes):
            print(f"EPISODE {i}")
            env = Environment()
            
            for j in range(self.time_allowed_in_game):
                # get the values for the transition tuple to add to replay memory
                old_state = (env.current_point_x, env.current_point_y)
                action_direction = self.select_action(old_state)
                env.move(action_direction)
                new_state = (env.current_point_x, env.current_point_y)
                done = env.is_done()
                reward = env.get_reward()
#                 print(old_state, action_direction, new_state, reward)
                self.replay_memory.append((old_state, action_direction, reward, new_state, done))
                
                self.t_train_current += 1
                self.t_target_current += 1
                
                if self.t_train_current % self.t_train_threshold == 0:
                    self.train()
                
                # update the target model with the weight of the trained model
                if self.t_target_current % self.t_target_threshold == 0:
                    self.target_model.set_weights(self.model.get_weights())
                
                # break out of loop since it is completed - no longer wait on timesteps
                if done:
                    break
            
            # modify the epsilon value after each episode
            # encourage exploration at the beginning then exploitation
            self.epsilon = max(self.epsilon * self.epsilon_multiplier, self.min_epsilon)

In [None]:
agent = Agent()
agent.run_game()

EPISODE 0
TRAINING
EPISODE 1
TRAINING
EPISODE 2
TRAINING
EPISODE 3
TRAINING
EPISODE 4
TRAINING
EPISODE 5
TRAINING
TRAINING
EPISODE 6
TRAINING
EPISODE 7
TRAINING
EPISODE 8
TRAINING
EPISODE 9
TRAINING
EPISODE 10
TRAINING
EPISODE 11
TRAINING
TRAINING
EPISODE 12
TRAINING
EPISODE 13
TRAINING
EPISODE 14
TRAINING
EPISODE 15
TRAINING
EPISODE 16
TRAINING
EPISODE 17
TRAINING
TRAINING
EPISODE 18
TRAINING
EPISODE 19
TRAINING
EPISODE 20
TRAINING
EPISODE 21
TRAINING
EPISODE 22
TRAINING
EPISODE 23
TRAINING
TRAINING
EPISODE 24
TRAINING
EPISODE 25
TRAINING
EPISODE 26
TRAINING
EPISODE 27
TRAINING
EPISODE 28
TRAINING
EPISODE 29
TRAINING
TRAINING
EPISODE 30
TRAINING
EPISODE 31
TRAINING
EPISODE 32
TRAINING
EPISODE 33
TRAINING
EPISODE 34
TRAINING
TRAINING
EPISODE 35
TRAINING
EPISODE 36
TRAINING
EPISODE 37
TRAINING
EPISODE 38
TRAINING
EPISODE 39
TRAINING
EPISODE 40
TRAINING
TRAINING
EPISODE 41
TRAINING
EPISODE 42
TRAINING
EPISODE 43
TRAINING
EPISODE 44
TRAINING
EPISODE 45
TRAINING
EPISODE 46
TRAINING
TRAININ

EPISODE 82
TRAINING
EPISODE 83
TRAINING
EPISODE 84
TRAINING
EPISODE 85
TRAINING
EPISODE 86
TRAINING
EPISODE 87
TRAINING
TRAINING
EPISODE 88
TRAINING
EPISODE 89
TRAINING
EPISODE 90
TRAINING
EPISODE 91
TRAINING
EPISODE 92
TRAINING
EPISODE 93
TRAINING
TRAINING
EPISODE 94
TRAINING
EPISODE 95
TRAINING
EPISODE 96
TRAINING
EPISODE 97
TRAINING


In [156]:
agent.model.get_weights()

[array([[-0.9470717 ,  1.0848715 ,  0.90001225],
        [-0.355273  ,  0.97696346, -0.00558327]], dtype=float32),
 array([ 0.        ,  0.12024263, -0.10834593], dtype=float32),
 array([[-0.05456865,  0.6267977 ,  0.15340126, -0.0262621 ],
        [ 0.7338221 ,  0.4332879 ,  0.70204073,  0.3178726 ],
        [-0.11937756,  0.7750663 , -0.4944009 ,  0.12508443]],
       dtype=float32),
 array([ 0.10908177, -0.09804757,  0.11643344,  0.11921539], dtype=float32)]

In [170]:
env_test = Environment()
env_test.current_point_x = 1
env_test.current_point_y = 9
state = (env_test.current_point_x, env_test.current_point_y)
            
for j in range(100):
    x = state[0]
    y = state[1]
    inputs = np.array([x, y]).reshape((1, 2))
    move_prediction = np.array(agent.model.predict(inputs)[0])
    print(move_prediction)
    prediction_action = np.argmax(move_prediction)
    env_test.move(prediction_action)
    state = (env_test.current_point_x, env_test.current_point_y)
    print(state, prediction_action)
    env_test.maze[env_test.current_point_y, env_test.current_point_x] = 1
    done = env_test.is_done()
    if done:
        print("DONE")
        break
print(env_test.maze)

[7.357169  4.808519  6.768729  3.3899775]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 2.9523869]
(0, 9) 0
[6.649575  3.7638104 6.3736625 