In [None]:
import numpy as np
import tensorflow as tf
from keras import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
import copy

## Parameters

In [None]:
# ---- General ----

n_agents = 2  # number of agents; also the batch dimension of the Q-learning state array

# ---- Model ----

time_steps = 3
grid_size_w, grid_size_h = 20, 20  # spatial dimensions of the network input
n_actions = 5  # size of the network's output layer

# Input channels: two groups of `time_steps` channels plus one extra channel.
n_channels = 2 * time_steps + 1

# ---- Q-Learning ----

n_episodes = 10  # outer training iterations
n_steps = 100  # steps per episode (Q-learning) / per run (policy gradient)
epsilon, epsilon_decay = 0.75, 0.999  # random-action probability and its per-episode multiplier
gamma = 0.9  # discount factor applied to future value / rewards

# ---- Policy Gradient ----

n_runs = 5  # runs collected per episode before each fit

## Model

In [None]:
# Convolutional Q-network: takes a (grid_size_w, grid_size_h, n_channels) input
# and emits one linear (unbounded) output per action.
model = Sequential(
    [
        Conv2D(
            filters=16,
            kernel_size=(4, 4),
            activation="relu",
            input_shape=(grid_size_w, grid_size_h, n_channels),
        ),
        Conv2D(filters=32, kernel_size=(2, 2), activation="relu"),
        MaxPooling2D(),
        Dropout(rate=0.25),
        Flatten(),
        Dense(units=64, activation="relu"),
        Dropout(rate=0.25),
        Dense(units=n_actions, activation="linear"),
    ]
)
# Regress the outputs onto the targets supplied to fit() with mean squared error.
model.compile(loss="mse", optimizer="adam")
model.summary()

## Q-Learning

In [None]:
# Deep Q-learning loop. States and rewards are random placeholders until an
# environment is wired in (see the todo markers).

# One placeholder observation per agent; axis 0 is the agent index.
X = np.random.rand(n_agents, grid_size_w, grid_size_h, n_channels)

# Row selector used to pair each agent with its own chosen action.
agent_rows = np.arange(n_agents)

# todo: initialize environment

for i in range(n_episodes):
    if i % 10 == 0:
        print("Episode {} of {}".format(i + 1, n_episodes))
    # NOTE: this decays the notebook-level `epsilon`, so re-running the cell
    # continues from the already-decayed value.
    epsilon *= epsilon_decay
    r_sum = 0  # reset every episode; nothing accumulates into it yet
    for step in range(n_steps):
        # todo: get current known state as X
        # Predict once per step: the same Q-values drive both the greedy
        # action choice and the regression target below.
        q_values = model.predict(X, verbose=0)

        if np.random.random() < epsilon:
            # Explore: an independent random action for every agent, so that
            # `actions` has shape (n_agents,) in both branches.
            actions = np.random.randint(0, n_actions, size=n_agents)
        else:
            # Exploit: each agent takes its highest-valued action.
            actions = np.argmax(q_values, axis=1)
        # todo: do action, get r and new state as X_new

        # after all agents move
        rewards = np.floor(np.random.rand(n_agents) * 10)
        X_new = np.random.rand(n_agents, grid_size_w, grid_size_h, n_channels)

        # Bellman target: only the entry of the action each agent actually took
        # is replaced; all other entries keep the current prediction.
        # Indexing with (agent_rows, actions) selects exactly one column per
        # row. `target[:, actions]` would instead select an
        # (n_agents, n_agents) block and overwrite other agents' entries.
        next_best = np.max(model.predict(X_new, verbose=0), axis=1)
        target = np.array(q_values, copy=True)
        target[agent_rows, actions] = rewards + gamma * next_best

        model.fit(X, target, epochs=1, verbose=0)

## Policy Gradient
### Note: some variables reuse names from the DQN segment but have a different shape.

In [None]:
import keras.backend as K

def policy_loss(reward_sums, yPred):
    """Policy-gradient loss: negative reward-weighted log-probability.

    Keras invokes a loss positionally as ``loss(y_true, y_pred)``. The targets
    given to ``fit`` therefore arrive as the first argument and the network
    output as the second, which is why ``reward_sums`` comes first here.

    Args:
        reward_sums: tensor of shape (batch, n_actions) holding the reward
            weight for each action. In this notebook it is the ``R_fit``
            array passed to ``model.fit``.
        yPred: tensor of shape (batch, n_actions) with the action
            probabilities produced by the softmax output layer.

    Returns:
        Tensor of shape (batch,): ``-sum_a reward_sums[a] * log(yPred[a])``
        for each sample.
    """
    # Targets may arrive in a different float type than the network output.
    reward_sums = K.cast(reward_sums, K.dtype(yPred))
    # Clip instead of adding a constant: log(p + 1) is not a log-probability,
    # and clipping only guards against log(0).
    log_probs = K.log(K.clip(yPred, K.epsilon(), 1.0))
    # Element-wise product summed over actions; a matrix product of two
    # (batch, n_actions) tensors is not defined.
    return -K.sum(reward_sums * log_probs, axis=-1)

In [None]:
# Policy network: same convolutional body as the Q-network, but the output
# layer is a softmax over the actions and training uses `policy_loss`.
# This rebinds the notebook-level name `model`.
model = Sequential(
    [
        Conv2D(
            filters=16,
            kernel_size=(4, 4),
            activation="relu",
            input_shape=(grid_size_w, grid_size_h, n_channels),
        ),
        Conv2D(filters=32, kernel_size=(2, 2), activation="relu"),
        MaxPooling2D(),
        Dropout(rate=0.25),
        Flatten(),
        Dense(units=64, activation="relu"),
        Dropout(rate=0.25),
        Dense(units=n_actions, activation="softmax"),
    ]
)
model.compile(loss=policy_loss, optimizer="adam")
model.summary()

In [None]:

# Policy-gradient training loop. States and rewards are random placeholders
# until an environment is wired in (see the todo markers).

def _discounted_returns(rewards, discount):
    """Compute the discounted reward-to-go along the step axis (axis 1).

    Args:
        rewards: array of shape (n_runs, n_steps, ...) where ``rewards[:, t]``
            is the reward received at step ``t``.
        discount: per-step discount factor.

    Returns:
        Float array of the same shape with
        ``returns[:, t] = sum over k >= t of discount**(k - t) * rewards[:, k]``.
    """
    returns = np.zeros(rewards.shape, dtype=float)
    # Running tail sum, carried backwards from the last step.
    running = np.zeros(rewards.shape[:1] + rewards.shape[2:], dtype=float)
    for t in reversed(range(rewards.shape[1])):
        running = rewards[:, t] + discount * running
        returns[:, t] = running
    return returns


X_runs = np.random.rand(n_runs, n_steps, n_agents, grid_size_w, grid_size_h, n_channels)
# todo: initialize environment
# todo: check if net is predicting roughly uniformly between actions? (else fix)

for i in range(n_episodes):
    if i % 10 == 0:
        print("Episode {} of {}".format(i + 1, n_episodes))
    # NOTE: epsilon is decayed here but not otherwise used in this loop;
    # actions are sampled from the policy's own probabilities.
    epsilon *= epsilon_decay

    # Per-episode records, indexed [run, step, agent].
    step_rewards = np.zeros((n_runs, n_steps, n_agents))
    actions = np.zeros((n_runs, n_steps, n_agents), dtype=int)

    for run in range(n_runs):
        for step in range(n_steps):
            for agent in range(n_agents):
                # todo: get current known state as X
                X = np.random.rand(1, grid_size_w, grid_size_h, n_channels)
                X_runs[run, step, agent] = X[0]

                # Sample an action from the predicted distribution. The
                # float32 softmax output is renormalised in float64 so the
                # probabilities sum to 1 for np.random.choice.
                action_prob = model.predict(X, verbose=0)[0].astype(np.float64)
                action_prob /= action_prob.sum()
                action = np.random.choice(n_actions, p=action_prob)
                actions[run, step, agent] = action
                # todo: do action, get r and new state as X_new

                # after agent moves
                step_rewards[run, step, agent] = np.random.rand()

    # Credit each step with its own reward plus the discounted future rewards
    # (gamma**1 for the next step, gamma**2 for the one after, ...).
    returns = _discounted_returns(step_rewards, gamma)

    # R holds that return in the column of the action actually taken at that
    # step and zero elsewhere, matching the (sample, n_actions) layout that
    # the loss receives.
    R = np.zeros((n_runs, n_steps, n_agents, n_actions))
    np.put_along_axis(R, actions[..., np.newaxis], returns[..., np.newaxis], axis=-1)

    #epochs may be changed
    X_fit = X_runs.reshape((n_runs * n_steps * n_agents, grid_size_w, grid_size_h, n_channels))
    R_fit = R.reshape((n_runs * n_steps * n_agents, n_actions))
    model.fit(X_fit, R_fit, epochs=1, verbose=0)