# Notebook cell In [154]:
import tensorflow as tf
import numpy as np
import random
import math
import os

# Hyper-parameters

# Exploration: probability of a random action, and the floor it decays to.
epsilon = 1
epsilon_minimum_value = 0.001

# Environment: square grid flattened into one input vector; three moves.
grid_size = 10
nb_states = grid_size ** 2
nb_actions = 3

# Network and optimisation.
hidden_size = 100
learning_rate = 0.2
discount = 0.9

# Training schedule and replay buffer.
epoch = 1001
max_memory = 500
batch_size = 50

# Create the base model.
#
# main() evaluates and trains this network through a tf.compat.v1.Session
# using feed_dict, so the graph is built with TF1-style (compat.v1) pieces:
# placeholders that can be fed, and a graph-mode optimizer whose minimize()
# accepts a loss tensor. Eager execution must be off for any of that to work.
tf.compat.v1.disable_eager_execution()

glorot_normal = tf.compat.v1.glorot_normal_initializer()

# Flattened game canvas, one row per sample.
X = tf.compat.v1.placeholder(tf.float32, shape=[None, nb_states])
W1 = tf.Variable(glorot_normal(shape=(nb_states, hidden_size)))
b1 = tf.Variable(tf.random.normal([hidden_size], stddev=0.01))
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)

W2 = tf.Variable(tf.random.normal([hidden_size, hidden_size], stddev=0.01))
b2 = tf.Variable(tf.random.normal([hidden_size], stddev=0.01))
hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)

# Linear output: one Q-value per action.
W3 = tf.Variable(glorot_normal(shape=(hidden_size, nb_actions)))
b3 = tf.Variable(tf.random.normal([nb_actions], stddev=0.01))
output_layer = tf.matmul(hidden_layer, W3) + b3

# True labels (target Q-values), one row per sample.
Y = tf.compat.v1.placeholder(tf.float32, shape=[None, nb_actions])

# Mean squared error cost function.
# NOTE: normalised by the configured batch_size, not by the number of rows
# actually fed.
cost = tf.reduce_sum(tf.square(Y - output_layer)) / (2 * batch_size)

# Stochastic Gradient Descent Optimizer
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(cost, var_list=[W1, b1, W2, b2, W3, b3])



# Helper function: Chooses a random value between the two boundaries.
def randf(s, e):
    """Return a random float drawn uniformly between ``s`` and ``e``.

    Uses ``random.uniform`` rather than a hand-rolled quantised draw, so
    narrow or empty ranges (including ``s == e``) work instead of raising
    ``ValueError``.

    Args:
        s: Lower boundary of the range.
        e: Upper boundary of the range.

    Returns:
        A float ``x`` with ``s <= x <= e``.
    """
    return random.uniform(s, e)


# The environment: Handles interactions and contains the state of the environment
class CatchEnvironment:
    """Falling-fruit "catch" game played on a square grid.

    ``state`` holds three 1-based values: the fruit's row, the fruit's
    column and the column of the basket's centre. The basket is three cells
    wide and is drawn on the bottom row of the canvas.
    """

    def __init__(self, grid_size):
        """Store the grid dimensions; call ``reset`` before playing."""
        self.grid_size = grid_size
        self.nb_states = self.grid_size * self.grid_size
        # Placeholder only: the contents are uninitialised until reset().
        self.state = np.empty(3, dtype=np.uint8)

    def observe(self):
        """Return the drawn canvas flattened to shape (1, nb_states)."""
        return self.draw_state().reshape((-1, self.nb_states))

    def draw_state(self):
        """Render the fruit and the basket onto a fresh zero canvas."""
        bottom_row = self.grid_size - 1
        basket_centre = self.state[2] - 1
        board = np.zeros((self.grid_size, self.grid_size))
        # Fruit occupies a single cell.
        board[self.state[0] - 1, self.state[1] - 1] = 1
        # Basket spans the centre cell and one neighbour on each side.
        for column in (basket_centre - 1, basket_centre, basket_centre + 1):
            board[bottom_row, column] = 1
        return board

    def reset(self):
        """Start a new game with a random fruit column and basket position."""
        fruit_col = random.randrange(1, self.grid_size + 1)
        # The basket centre keeps one cell of margin from either wall.
        basket = random.randrange(2, self.grid_size)
        self.state = np.array([1, fruit_col, basket])
        return self.get_state()

    def get_state(self):
        """Return the state as a (fruit_row, fruit_col, basket) tuple."""
        row, col, basket_pos = self.state
        return row, col, basket_pos

    def get_reward(self):
        """Return 0 mid-game; on the final row, 1 for a catch and -1 for a miss."""
        row, col, basket_pos = self.get_state()
        if row != self.grid_size - 1:
            return 0
        caught = abs(col - basket_pos) <= 1
        return 1 if caught else -1

    def is_game_over(self):
        """Tell whether the fruit has reached row ``grid_size - 1``."""
        final_row = self.grid_size - 1
        return self.state[0] == final_row

    def update_state(self, action):
        """Move the basket according to ``action`` and drop the fruit one row.

        Action 1 moves left, action 2 stays put and any other value moves
        right.
        """
        step = -1 if action == 1 else (0 if action == 2 else 1)
        row, col, basket_pos = self.get_state()
        # Clamp so the three-cell basket stays inside the grid.
        moved = max(2, basket_pos + step)
        moved = min(moved, self.grid_size - 1)
        self.state = np.array([row + 1, col, moved])

    def act(self, action):
        """Apply ``action`` and return (observation, reward, game_over, state)."""
        self.update_state(action)
        reward, finished = self.get_reward(), self.is_game_over()
        return self.observe(), reward, finished, self.get_state()


# The memory: Handles the internal memory that we add experiences that occur based on agent's actions,
# and creates batches of experiences based on the mini-batch size for training.
class ReplayMemory:
    """Fixed-size circular buffer of (state, action, reward, next state, done).

    Once ``max_memory`` transitions have been stored, new ones overwrite the
    oldest. ``get_batch`` samples stored transitions and builds Q-learning
    training targets from them.
    """

    def __init__(self, grid_size, max_memory, discount):
        """Allocate the buffers.

        Args:
            grid_size: Side length of the square game grid; each stored
                state has ``grid_size * grid_size`` values.
            max_memory: Maximum number of transitions kept.
            discount: Discount factor applied to the next state's best
                Q-value when building targets.
        """
        self.max_memory = max_memory
        self.grid_size = grid_size
        self.nb_states = self.grid_size * self.grid_size
        self.discount = discount
        # Buffers are sized from nb_states so any grid size works.
        self.input_state = np.empty((self.max_memory, self.nb_states), dtype=np.float32)
        self.actions = np.zeros(self.max_memory, dtype=np.uint8)
        self.next_state = np.empty((self.max_memory, self.nb_states), dtype=np.float32)
        # Built-in bool: the np.bool alias is gone from current NumPy releases.
        self.game_over = np.empty(self.max_memory, dtype=bool)
        self.rewards = np.empty(self.max_memory, dtype=np.int8)
        self.count = 0  # Number of valid entries stored so far.
        self.current = 0  # Slot the next transition will be written to.

    def remember(self, current_state, action, reward, next_state, game_over):
        """Store one transition, overwriting the oldest when the buffer is full."""
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.input_state[self.current, ...] = current_state
        self.next_state[self.current, ...] = next_state
        self.game_over[self.current] = game_over
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.max_memory

    def get_batch(self, model, batch_size, nb_actions, nb_states, sess, X):
        """Sample stored transitions and build Q-learning training targets.

        Args:
            model: Object passed as the fetch to ``sess.run``; its result
                must hold one row of ``nb_actions`` Q-values per input row.
            batch_size: Maximum number of samples to draw.
            nb_actions: Number of actions (columns of the targets).
            nb_states: Length of one flattened state (columns of the inputs).
            sess: Object exposing ``run(fetch, feed_dict=...)``.
            X: Key used in ``feed_dict`` to feed the states.

        Returns:
            ``(inputs, targets)`` of shapes ``(n, nb_states)`` and
            ``(n, nb_actions)``, where ``n = min(batch_size, stored count)``.
            Each target row is the current prediction with the taken
            action's entry replaced by the reward, plus the discounted best
            next-state Q-value unless the transition ended the game.
        """
        memory_length = self.count
        chosen_batch_size = min(batch_size, memory_length)
        if chosen_batch_size == 0:
            return np.zeros((0, nb_states)), np.zeros((0, nb_actions))

        # Sample with replacement over every valid slot, slot 0 included.
        indices = [random.randrange(memory_length) for _ in range(chosen_batch_size)]

        current_states = self.input_state[indices].reshape(chosen_batch_size, nb_states)
        following_states = self.next_state[indices].reshape(chosen_batch_size, nb_states)

        # Two batched evaluations instead of two per sample.
        predictions = sess.run(model, feed_dict={X: current_states})
        next_outputs = sess.run(model, feed_dict={X: following_states})

        targets = np.array(predictions, dtype=np.float64).reshape(
            chosen_batch_size, nb_actions
        )
        next_state_max_q = np.asarray(next_outputs, dtype=np.float64).reshape(
            chosen_batch_size, -1
        ).max(axis=1)

        rewards = self.rewards[indices].astype(np.float64)
        still_running = ~self.game_over[indices]
        # Actions are stored 1-based; convert to 0-based column indices.
        action_columns = self.actions[indices].astype(np.intp) - 1

        # Terminal transitions get the bare reward.
        targets[np.arange(chosen_batch_size), action_columns] = (
            rewards + self.discount * next_state_max_q * still_running
        )

        inputs = current_states.astype(np.float64)
        return inputs, targets


def main(_):
    """Train the Q-network on the catch game and save its variables to disk.

    Plays ``epoch`` games with an epsilon-greedy policy. After every move the
    transition is stored in replay memory and one gradient step is taken on
    a batch sampled from it. When training ends the variables are written to
    ``model.ckpt`` in the current working directory.

    Args:
        _: Unused; receives the argument list passed by ``tf.compat.v1.app.run``.
    """
    print("Training new model")

    # Define Environment
    env = CatchEnvironment(grid_size)

    # Define Replay Memory
    memory = ReplayMemory(grid_size, max_memory, discount)

    # Add ops to save and restore all the variables. TF2 only provides the
    # graph-mode Saver under compat.v1.
    saver = tf.compat.v1.train.Saver()

    # Decay a local copy: assigning to ``epsilon`` here would make the name
    # local to this function and its first read would raise UnboundLocalError.
    exploration_rate = epsilon

    win_count = 0
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(epoch):
            err = 0
            env.reset()
            is_game_over = False
            current_state = env.observe()

            while not is_game_over:
                if randf(0, 1) <= exploration_rate:
                    # Explore: pick one of the 1-based actions at random.
                    action = random.randrange(1, nb_actions + 1)
                else:
                    # Exploit: take the action with the highest predicted Q-value.
                    q = sess.run(output_layer, feed_dict={X: current_state})
                    action = q.argmax() + 1

                if exploration_rate > epsilon_minimum_value:
                    exploration_rate = exploration_rate * 0.999

                next_state, reward, is_game_over, _state_info = env.act(action)

                if reward == 1:
                    win_count += 1

                memory.remember(current_state, action, reward, next_state, is_game_over)

                current_state = next_state

                inputs, targets = memory.get_batch(output_layer, batch_size, nb_actions, nb_states, sess, X)
                _, loss = sess.run([train_step, cost], feed_dict={X: inputs, Y: targets})
                err += loss

            print(
                "Epoch {}: err = {}: Win count = {} Win ratio = {}".format(
                    i, err, win_count, float(win_count) / float(i + 1) * 100
                )
            )

        # Save the variables to disk.
        save_path = saver.save(sess, os.path.join(os.getcwd(), "model.ckpt"))
        print("Model saved in file: %s" % save_path)


# Script entry point: hand control to TensorFlow's app runner, which calls
# main() with the command-line argument list.
if __name__ == "__main__":
    tf.compat.v1.app.run(main=main)


# Error reported when this cell was originally run:
# TypeError: _BaseOptimizer.minimize() missing 1 required positional argument: 'var_list'