In [20]:
import snake, queue, random, threading, math, time, pickle
import tensorflow as tf
import numpy as np
import tkinter as tkinter

physical_devices = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(physical_devices[0], True)
# tf.debugging.set_log_device_placement(True)

save_path = "./save"
print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [21]:
stack_size = 8
epsilon = 0.1
discount = 0.95
learning_rate = 0.1
memory_size = 100000
batch_size = 1000

update_index = 0
filled_memory = 0

In [22]:
try:
    with open(save_path + "/optimizer.dat", "rb") as openfile:
        optimizer = pickle.load(openfile)
except:
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

try:
    q = tf.keras.models.load_model(save_path + "/model")
except:
    # Q-Network
    q = tf.keras.Sequential()

    input_size = (15, 15, stack_size)

    q.add(tf.keras.layers.Conv2D(15, 3,
                                activation="relu", input_shape=input_size))
    q.add(tf.keras.layers.Conv2D(30, 3,
                                activation="relu"))
    q.add(tf.keras.layers.Conv2D(30, 3,
                                activation="relu"))
    q.add(tf.keras.layers.Flatten())
    q.add(tf.keras.layers.Dense(60, activation="relu"))
    q.add(tf.keras.layers.Dense(28, activation="relu"))
    q.add(tf.keras.layers.Dense(4))
    q.compile(optimizer=optimizer, loss="mse")

q.summary()

# Replay Memory
states_memory = np.ndarray((memory_size, 15, 15, stack_size))
action_memory = np.ndarray((memory_size))
reward_memory = np.ndarray((memory_size))
transitions_memory = np.ndarray((memory_size, 15, 15, stack_size))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 12, 12, 15)        1935      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 10, 10, 12)        1632      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 10)          490       
_________________________________________________________________
flatten (Flatten)            (None, 810)               0         
_________________________________________________________________
dense (Dense)                (None, 108)               87588     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                5232      
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 1

In [23]:
# class experience:
#     states = None
#     action = None
#     reward = None
#     transitions = None
    
#     def __init__(self, states, action, reward, transitions):
#         self.states = states
#         self.action = action
#         self.reward = reward
#         self.transitions = transitions

In [24]:
class agent:
    directions = ["UP", "DOWN", "LEFT", "RIGHT"]
    phi = queue.deque()

    def __init__(self, game, return_queue):
        self.game = game
        self.return_queue = return_queue

    def update_memory(self, states, action, reward, transitions):
        global update_index, filled_memory
        if update_index >= memory_size:
            update_index = 0

        states_memory[update_index] = states
        action_memory[update_index] = action
        reward_memory[update_index] = reward
        transitions_memory[update_index] = transitions

        update_index += 1
        if filled_memory < batch_size:
            filled_memory += 1

    def stack(self, frames):
        fstack = frames[0]
        for x in range(1, len(frames)):
            fstack = np.dstack((fstack, frames[x]))
            
        return fstack

    def epsilon_action(self):
        if random.uniform(0, 1) <= epsilon or len(self.phi) < stack_size:
            action = self.directions[random.randint(0, 3)]
        else:
            action = self.directions[
                np.argmax(
                    q.predict(
                        np.expand_dims(self.stack(self.phi), axis=0)))]
        return action

    def step(self):
        action = self.epsilon_action()
        self.game.step(action)

        while self.return_queue.empty():
            if not self.game.running:
                break

        state_reward = self.return_queue.get()

        phi_last = list(self.phi)
        self.phi.appendleft(state_reward[0])

        if len(self.phi) > stack_size:
            phi_last = self.stack(phi_last)
            self.phi.pop()
            phi_current = self.stack(self.phi)

            self.update_memory(phi_last, self.directions.index(action), state_reward[1], phi_current)

    def get_batch_indices(self, memory):
        indices = []
        for x in range(batch_size):
            indices.append(random.randint(0, filled_memory - 1))

        return indices

    def losses(self):
        loss_tensor = np.ndarray((batch_size))

        indices = self.get_batch_indices(states_memory)
        states = states_memory[indices]
        action = action_memory[indices]
        reward = reward_memory[indices]
        transitions = transitions_memory[indices]

        print("before")
        q_phi = q.predict(states)
        q_phi_next = q.predict(transitions)
        print("after")

        for t in range(batch_size):
            yj = reward[t] + discount * np.amax(q_phi_next[t])
            loss_tensor[t] = math.pow(yj - q_phi[t][int(action[t])], 2)

        return loss_tensor

    def learn(self):
        if np.count_nonzero(states_memory) != 0:
            losses = self.losses()

            gradient = optimizer.get_gradients(losses, ())
            optimizer.apply_gradients(gradient)


In [25]:
def train_agent(agent, epoch):
    time.sleep(0.01)
    print("Training episode " + str(epoch) + ".")

    while agent.game.running:
        agent.step()
        agent.learn()

    print("Training ended, agent scored " + str(agent.game.score) + " points.")

In [53]:
game = snake.game(queue.Queue(1))
for x in range(1):
    return_queue = queue.Queue(1)

    dqn = agent(game, return_queue)

    training_thread = threading.Thread(target=train_agent, args=(dqn, x)) 
    training_thread.start()

    game.start(return_queue)
    training_thread.join()

    q.save(save_path + "/model", overwrite=True, include_optimizer=True)
    with open(save_path + "/optimizer.dat", "wb") as openfile:
        pickle.dump(optimizer, openfile)

game.w.destroy()

Training episode 0.
before
after
before
after
before
after
before
after
before
after
before
after
before
after
before
after
before
after
before
after
before
after
before
after
Training ended, agent scored 0 points.
INFO:tensorflow:Assets written to: ./save/model\assets


In [52]:
indices = []
for x in range(batch_size):
    indices.append(random.randint(0, filled_memory - 1)) 
for x in range(2):
    q.predict(states_memory[indices])