In [61]:
import numpy as np
import tensorflow as tf
from random import random, sample
from keras.layers import Dense
from keras.models import Sequential
from collections import deque
%run MazeEnv.ipynb

In [62]:
class Agent:
    def __init__(self):
        # hyperparameters
        self.episodes = 1028
        self.time_allowed_in_game = 500
        self.epsilon = 1
        self.min_epsilon = 0.01
        self.epsilon_multiplier = 0.99
        self.discount_rate = 0.9

        # create 2 models here - one for use with the predictions and the other to train on
        # the target model will be set to the trained model after a specfic num of iterations
        self.model = self.create_model()
        self.target_model = self.create_model()
        # both models must start with the same weights
        self.target_model.set_weights(self.model.get_weights())
        
        # create the list for the replay memory
        self.replay_memory = deque(maxlen=1028)
        
        # keep track of how often the target_weights = normal_weights
        self.t_target_current = 0
        self.t_target_threshold = 512
        # keep track of how often the to get from minibatch and train
        self.t_train_current = 0
        self.t_train_threshold = 256
        # mini batch to train on
        self.mini_batch_size = 64
        
    
    # create the model structure to be used
    def create_model(self):
        model = Sequential()
        
        # inputs will be the x and y coordinates of the maze - 2 input
        model.add(Dense(3, input_dim = 2, activation = "relu"))
        model.add(Dense(4, activation = "linear"))
        
        # the mse loss works best in dnq
        model.compile(optimizer = "adam", loss = "mean_squared_error")
        return model

    
    # choose an action to perform in the game
    def select_action(self, state):
        r = random()
        # choose a random action
        if r < self.epsilon:
            # this is coming from the maze file
            return select_move_from_num(r)
        else:
            x = state[0]
            y = state[1]
            inputs = np.array([x, y]).reshape((1, 2))
            move_prediction = np.array(self.model.predict(inputs)[0])
            prediction = np.argmax(move_prediction)
            return prediction
    
    # reward + bestaction(for next state) == q(state)
    # train the model
    def train(self):
        print("TRAINING")
        # training the model based on the minibatch - check if enough to do the training
        if len(self.replay_memory) < self.mini_batch_size:
            return
        
        # get a sample of the replay memory with the minibatch
        mini_batch = sample(self.replay_memory, self.mini_batch_size)
        
        # use the target values for training the model
        target_y = []
        x = []
        
        # go through each of the transitions for gradient descent
        for i, (old_state, action_direction, reward, new_state, done) in enumerate(mini_batch):
            actual = reward
            if done == False:
                inputs = np.array(new_state).reshape((1, 2))
                move_prediction = np.array(self.target_model.predict(inputs)[0])
                next_state_q_val = max(move_prediction)
                actual = reward + (self.discount_rate * next_state_q_val)
            
            target_y.append(actual)
            x.append(list(old_state))
            
        target_y = np.array(target_y).reshape((len(target_y), 1))
        x = np.array(x).reshape((len(x), 2))
        self.model.fit(x, target_y)
        
        
    # run the game multiple times across the env
    def run_game(self):
        for i in range(self.episodes):
            print(f"EPISODE {i}")
            env = Environment()
            
            for j in range(self.time_allowed_in_game):
                # get the values for the transition tuple to add to replay memory
                old_state = (env.current_point_x, env.current_point_y)
                action_direction = self.select_action(old_state)
                reward = env.move(action_direction)
                new_state = (env.current_point_x, env.current_point_y)
                done = env.is_done()
                self.replay_memory.append((old_state, action_direction, reward, new_state, done))
    
                # increase the train and target values
                self.t_train_current += 1
                self.t_target_current += 1
                
                # update the weights of the train model
                if self.t_train_current  % self.t_train_threshold == 0:
                    self.train()
                
                # update the target model with the weight of the trained model
                if self.t_target_current % self.t_target_threshold == 0:
                    self.target_model.set_weights(self.model.get_weights())
                
                # break out of loop since it is completed - no longer wait on timesteps
                if done:
                    break
            
            # modify the epsilon value after each episode
            # encourage exploration at the beginning then exploitation
            self.epsilon = max(self.epsilon * self.epsilon_multiplier, self.min_epsilon)

In [63]:
agent = Agent()
agent.run_game()

EPISODE 0
TRAINING
EPISODE 1
TRAINING
EPISODE 2
EPISODE 3
EPISODE 4
TRAINING
EPISODE 5
EPISODE 6
TRAINING
TRAINING
EPISODE 7
TRAINING
EPISODE 8
EPISODE 9
TRAINING
EPISODE 10
TRAINING
EPISODE 11
EPISODE 12
TRAINING
EPISODE 13
TRAINING
EPISODE 14
TRAINING
EPISODE 15
EPISODE 16
TRAINING
TRAINING
EPISODE 17
TRAINING
TRAINING
EPISODE 18
TRAINING
TRAINING
EPISODE 19
TRAINING
TRAINING
EPISODE 20
TRAINING
TRAINING
EPISODE 21
TRAINING
TRAINING
EPISODE 22
TRAINING
EPISODE 23
TRAINING
TRAINING
EPISODE 24
TRAINING
TRAINING
EPISODE 25
TRAINING
TRAINING
EPISODE 26
TRAINING
TRAINING
EPISODE 27
TRAINING
TRAINING
EPISODE 28
TRAINING
TRAINING
EPISODE 29
TRAINING
TRAINING
EPISODE 30
TRAINING
TRAINING
EPISODE 31
TRAINING
TRAINING
EPISODE 32
TRAINING
TRAINING
EPISODE 33
TRAINING
TRAINING
EPISODE 34
TRAINING
TRAINING
EPISODE 35
TRAINING
TRAINING
EPISODE 36
TRAINING
TRAINING
EPISODE 37
TRAINING
TRAINING
EPISODE 38
TRAINING
TRAINING
EPISODE 39
TRAINING
TRAINING
EPISODE 40
TRAINING
TRAINING
EPISODE 41
TRAINING

EPISODE 61
TRAINING
TRAINING
EPISODE 62
TRAINING
TRAINING
EPISODE 63
TRAINING
EPISODE 64
TRAINING
TRAINING
EPISODE 65
TRAINING
TRAINING
EPISODE 66
TRAINING
TRAINING
EPISODE 67
TRAINING
TRAINING
EPISODE 68
TRAINING
TRAINING
EPISODE 69
TRAINING
TRAINING
EPISODE 70
TRAINING
TRAINING
EPISODE 71
TRAINING
TRAINING
EPISODE 72
TRAINING
TRAINING
EPISODE 73
TRAINING
TRAINING
EPISODE 74
TRAINING
TRAINING
EPISODE 75
TRAINING
TRAINING
EPISODE 76
TRAINING
TRAINING
EPISODE 77
TRAINING
TRAINING
EPISODE 78
TRAINING
TRAINING
EPISODE 79
TRAINING
TRAINING
EPISODE 80
TRAINING
TRAINING
EPISODE 81
TRAINING
TRAINING
EPISODE 82
TRAINING
TRAINING
EPISODE 83
TRAINING
TRAINING
EPISODE 84
TRAINING
EPISODE 85
TRAINING
TRAINING
EPISODE 86
TRAINING
TRAINING
EPISODE 87
TRAINING
TRAINING
EPISODE 88
TRAINING
TRAINING
EPISODE 89
TRAINING
TRAINING
EPISODE 90
TRAINING
TRAINING
EPISODE 91
TRAINING
TRAINING
EPISODE 92
TRAINING
TRAINING
EPISODE 93
TRAINING
TRAINING
EPISODE 94
TRAINING
TRAINING
EPISODE 95
TRAINING
TRAINING
EPI

EPISODE 112
TRAINING
TRAINING
EPISODE 113
TRAINING
TRAINING
EPISODE 114
TRAINING
TRAINING
EPISODE 115
TRAINING
TRAINING
EPISODE 116
TRAINING
TRAINING
EPISODE 117
TRAINING
TRAINING
EPISODE 118
TRAINING
TRAINING
EPISODE 119
TRAINING
TRAINING
EPISODE 120
TRAINING
TRAINING
EPISODE 121
TRAINING
TRAINING
EPISODE 122
TRAINING
TRAINING
EPISODE 123
TRAINING
TRAINING
EPISODE 124
TRAINING
TRAINING
EPISODE 125
TRAINING
TRAINING
EPISODE 126
TRAINING
TRAINING
EPISODE 127
TRAINING
EPISODE 128
TRAINING
TRAINING
EPISODE 129
TRAINING
TRAINING
EPISODE 130
TRAINING
TRAINING
EPISODE 131
TRAINING
TRAINING
EPISODE 132
TRAINING
TRAINING
EPISODE 133
TRAINING
TRAINING
EPISODE 134
TRAINING
TRAINING
EPISODE 135
TRAINING
TRAINING
EPISODE 136
TRAINING
TRAINING
EPISODE 137
TRAINING
TRAINING
EPISODE 138
TRAINING
TRAINING
EPISODE 139
TRAINING
TRAINING
EPISODE 140
TRAINING
TRAINING
EPISODE 141
TRAINING
TRAINING
EPISODE 142
TRAINING
TRAINING
EPISODE 143
TRAINING
TRAINING
EPISODE 144
TRAINING
TRAINING
EPISODE 145
TRAININ

EPISODE 204
EPISODE 205
EPISODE 206
EPISODE 207
EPISODE 208
EPISODE 209
EPISODE 210
EPISODE 211
TRAINING
EPISODE 212
EPISODE 213
EPISODE 214
EPISODE 215
EPISODE 216
EPISODE 217
EPISODE 218
EPISODE 219
EPISODE 220
EPISODE 221
EPISODE 222
EPISODE 223
EPISODE 224
EPISODE 225
EPISODE 226
EPISODE 227
EPISODE 228
EPISODE 229
EPISODE 230
EPISODE 231
EPISODE 232
EPISODE 233
EPISODE 234
EPISODE 235
EPISODE 236
EPISODE 237
TRAINING
EPISODE 238
EPISODE 239
EPISODE 240
EPISODE 241
EPISODE 242
EPISODE 243
EPISODE 244
EPISODE 245
EPISODE 246
EPISODE 247
EPISODE 248
EPISODE 249
EPISODE 250
EPISODE 251
EPISODE 252
EPISODE 253
EPISODE 254
EPISODE 255
EPISODE 256
EPISODE 257
EPISODE 258
EPISODE 259
EPISODE 260
EPISODE 261
EPISODE 262
EPISODE 263
EPISODE 264
EPISODE 265
TRAINING
TRAINING
EPISODE 266
TRAINING
TRAINING
EPISODE 267
TRAINING
TRAINING
EPISODE 268
TRAINING
TRAINING
EPISODE 269
TRAINING
TRAINING
EPISODE 270
TRAINING
TRAINING
EPISODE 271
TRAINING
TRAINING
EPISODE 272
EPISODE 273
EPISODE 274
EPIS

EPISODE 717
EPISODE 718
EPISODE 719
EPISODE 720
EPISODE 721
EPISODE 722
EPISODE 723
EPISODE 724
EPISODE 725
EPISODE 726
EPISODE 727
EPISODE 728
EPISODE 729
EPISODE 730
EPISODE 731
EPISODE 732
EPISODE 733
EPISODE 734
TRAINING
EPISODE 735
EPISODE 736
EPISODE 737
EPISODE 738
EPISODE 739
EPISODE 740
EPISODE 741
EPISODE 742
EPISODE 743
EPISODE 744
EPISODE 745
EPISODE 746
EPISODE 747
EPISODE 748
EPISODE 749
EPISODE 750
EPISODE 751
EPISODE 752
EPISODE 753
EPISODE 754
EPISODE 755
EPISODE 756
EPISODE 757
EPISODE 758
EPISODE 759
EPISODE 760
EPISODE 761
EPISODE 762
EPISODE 763
EPISODE 764
EPISODE 765
EPISODE 766
EPISODE 767
EPISODE 768
EPISODE 769
EPISODE 770
EPISODE 771
EPISODE 772
EPISODE 773
EPISODE 774
EPISODE 775
TRAINING
EPISODE 776
EPISODE 777
EPISODE 778
EPISODE 779
EPISODE 780
EPISODE 781
EPISODE 782
EPISODE 783
EPISODE 784
EPISODE 785
EPISODE 786
EPISODE 787
EPISODE 788
EPISODE 789
EPISODE 790
EPISODE 791
EPISODE 792
EPISODE 793
EPISODE 794
EPISODE 795
EPISODE 796
EPISODE 797
EPISODE 79

In [64]:
agent.model.get_weights()

[array([[ 0.4368444 , -0.9581646 ,  0.01781208],
        [-0.01402762, -0.3584761 ,  0.86739975]], dtype=float32),
 array([-0.18262763,  0.        , -0.36357975], dtype=float32),
 array([[ 0.03510503,  0.12043034, -0.840428  ,  0.7559384 ],
        [ 0.7003944 , -0.7513062 ,  0.09019983, -0.4290819 ],
        [ 0.23032059,  0.05712557, -0.7452618 , -0.5685754 ]],
       dtype=float32),
 array([-0.71301055, -0.23562774,  0.32984608,  0.34440985], dtype=float32)]

In [65]:
env_test = Environment()
env_test.current_point_x = 1
env_test.current_point_y = 1
env_test.maze[env_test.current_point_y, env_test.current_point_x] = -1

#env_test.end_point_y = 9
state = (env_test.current_point_x, env_test.current_point_y)
#state = (env_test.end_point_x, env_test.end_point_y)
            
for j in range(100):
    x = state[0]
    y = state[1]
    inputs = np.array([x, y]).reshape((1, 2))
    move_prediction = np.array(agent.model.predict(inputs)[0])
    print(move_prediction)
    prediction_action = np.argmax(move_prediction)
    env_test.move(prediction_action)
    state = (env_test.current_point_x, env_test.current_point_y)
    print(state, prediction_action)
    env_test.maze[env_test.current_point_y, env_test.current_point_x] = 1
    done = env_test.is_done()
    if done:
        print("DONE")
        break
        
print(env_test.maze)

[-0.58443606 -0.17690316 -0.26076806  0.22939089]
(1, 2) 3
[-0.38514853 -0.12904179 -0.8954187  -0.27439532]
(2, 2) 1
[-0.3657106  -0.07541496 -1.2758298   0.04570469]
(2, 3) 3
[-0.16642296 -0.02755359 -1.9104805  -0.4580815 ]
(3, 3) 1
[-0.14698505  0.02607325 -2.2908914  -0.13798162]
(4, 3) 1
[-0.12754714  0.07970008 -2.671302    0.18211833]
(4, 4) 3
DONE
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [47]:
env_test = Environment()

for row in range(10):
    arr = []
    for col in range(10):
        inputs = np.array([col, row]).reshape((1, 2))
#         print(row, col)
        move_prediction = np.array(agent.model.predict(inputs)[0])
        prediction_action = np.argmax(move_prediction)
        if prediction_action == 0:
            arr.append("<")
        elif prediction_action == 1:
            arr.append(">")
        elif prediction_action == 2:
            arr.append("^")
        else:
            arr.append("v")
    print(arr)

['v', '<', '<', '<', '<', '<', '<', '<', '<', '<']
['v', '<', '<', '<', '<', '<', '<', '<', '<', '<']
['v', '<', '<', '<', '<', '<', '<', '<', '<', '<']
['v', '<', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', '<', '<', '<', '<', '<', '<', '<', '<']
['v', 'v', 'v', '<', '<', '<', '<', '<', '<', '<']
