In [3]:
import numpy as np
import keras.backend as K
from keras import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
import copy

# import Environment as E

ModuleNotFoundError: No module named 'aenum'

## Parameters

In [9]:
# --- General ---------------------------------------------------------------
n_agents = 1  # how many agents are added to the environment / rows per batch

# --- Model -----------------------------------------------------------------
time_steps = 3
grid_size_w = grid_size_h = 20  # spatial size of the network input
n_actions = 5                   # width of the network's output layer

# Two groups of `time_steps` channels plus one extra channel.
# NOTE(review): presumably two observation histories + the agent-position map — confirm.
n_channels = 2 * time_steps + 1

# --- Q-Learning ------------------------------------------------------------
n_episodes, n_steps = 10, 100
epsilon = 0.75         # probability of picking random actions
epsilon_decay = 0.999  # epsilon is multiplied by this once per episode
gamma = 0.9            # discount factor applied to future value / reward

# --- Policy Gradient -------------------------------------------------------
n_runs = 5  # runs collected per episode before each fit

## Model

In [5]:
# Q-network: two conv layers -> max-pool -> dense head with one linear output
# per action, trained with mean-squared error.
model = Sequential([
    Conv2D(16, (4, 4), activation="relu",
           input_shape=(grid_size_w, grid_size_h, n_channels)),
    Conv2D(32, (2, 2), activation="relu"),
    MaxPooling2D(),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.25),
    Dense(n_actions, activation="linear"),
])
model.compile(optimizer="adam", loss="mse")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 17, 17, 16)        1808      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 32)        2080      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 32)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 8, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                131136    
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
__________

## Q-Learning

In [5]:
def tuple_to_batch(known_data, axis=0):
    """Build one network input per agent from the environment's known data.

    For every entry of ``known_data[2]`` the two shared arrays
    ``known_data[0]`` and ``known_data[1]`` are concatenated with that entry
    (given a new length-1 axis) along ``axis``.

    Parameters
    ----------
    known_data : sequence
        ``known_data[0]`` and ``known_data[1]`` are arrays shared by all
        agents; ``known_data[2]`` is iterated to obtain one array per agent.
        NOTE(review): assumes each per-agent array has one dimension fewer
        than the shared arrays (e.g. a single 2-D map) — confirm against
        ``Environment.export_known_data``.
    axis : int, optional
        Channel axis used both for the inserted axis and the concatenation.
        The default ``0`` stacks channels first; pass ``-1`` for the
        channels-last layout.
        NOTE(review): the Keras models in this notebook declare a
        channels-last ``input_shape`` — verify which layout the environment
        exports.

    Returns
    -------
    numpy.ndarray
        Array whose first axis indexes the agents.
    """
    shared = [np.asarray(known_data[0]), np.asarray(known_data[1])]
    batch = [
        # np.concatenate needs ONE sequence of arrays; passing them as separate
        # positional arguments makes the 2nd one the `axis` and the 3rd `out`.
        np.concatenate(shared + [np.expand_dims(agent_pos, axis)], axis=axis)
        for agent_pos in known_data[2]
    ]
    return np.array(batch)

In [None]:
# One-step Q-learning: play `n_episodes` episodes of `n_steps` steps each and
# fit `model` after every step on the bootstrapped Q targets.
# NOTE(review): `E` must be the project's Environment module; its import is
# commented out in the first cell, so this cell raises NameError until restored.
for episode in range(n_episodes):
    # Fresh environment per episode, sized like the network input.
    env = E.Environment((grid_size_w, grid_size_h))
    # Distinct loop variable: reusing the episode index here used to clobber it
    # before the progress check below.
    for agent in range(n_agents):
        env.add_agent(coord=(agent, 0), capacity=100000)

    X = tuple_to_batch(env.export_known_data())
    Q_vals = model.predict(X)
    if episode % 10 == 0:
        print("Episode {} of {}".format(episode + 1, n_episodes))
    epsilon *= epsilon_decay
    r_sum = 0  # total reward collected in this episode

    for step in range(n_steps):
        # Epsilon-greedy: all agents act randomly with probability epsilon,
        # otherwise each agent takes its highest-valued action.
        if np.random.random() < epsilon:
            actions = np.random.randint(0, high=n_actions, size=n_agents).tolist()
        else:
            actions = np.argmax(Q_vals, axis=1).tolist()

        X_reward = env.move_agents(actions)  # new known state and rewards

        # after all agents move
        X_new = tuple_to_batch(X_reward[:3])
        # assumes X_reward[3] holds one numeric reward per agent — TODO confirm
        rewards = np.asarray(X_reward[3])
        r_sum += rewards.sum()
        Q_vals_new = model.predict(X_new)

        # Update only the (agent, chosen action) entries. `Q_vals[:, actions]`
        # would write every agent's target into every agent's row.
        agent_rows = np.arange(len(actions))
        Q_vals[agent_rows, actions] = rewards + gamma * np.max(Q_vals_new, axis=1)
        # The updated Q table is the regression target (`target` was undefined).
        model.fit(X, Q_vals, epochs=1, verbose=0)

        Q_vals = Q_vals_new
        X = X_new

## Policy Gradient
### !!Some variables may have the same name as in the DQN segment, but have a different shape!!

In [2]:
def policy_loss(reward_sums, yPred):
    """Policy-gradient loss: negative reward-weighted log-probability.

    Keras invokes a loss as ``loss(y_true, y_pred)``, so the targets passed to
    ``fit`` (the reward sums) arrive first and the network output second. The
    parameter names are unchanged; only their positional order was corrected
    to match that contract, so keyword callers are unaffected.

    Parameters
    ----------
    reward_sums : tensor, shape (batch, n_actions)
        Targets given to ``fit``.
        NOTE(review): expected to hold the discounted return in the column of
        the action taken and zero elsewhere — confirm against the training loop.
    yPred : tensor, shape (batch, n_actions)
        Action probabilities produced by the softmax output layer.

    Returns
    -------
    tensor, shape (batch,)
        Per-sample loss ``-sum_a log(yPred[a]) * reward_sums[a]``.
    """
    # Clip instead of adding 1: log(p + 1) is not a log-probability, and
    # clipping keeps log() finite when a probability underflows to 0.
    log_prob = K.log(K.clip(yPred, K.epsilon(), 1.0))
    # Element-wise product summed over actions; K.dot on two (batch, n_actions)
    # tensors would attempt a matrix product instead.
    return -K.sum(log_prob * reward_sums, axis=-1)

In [None]:
# Policy network: same convolutional trunk as the Q-network, but the head is a
# softmax over the actions and training uses the custom `policy_loss`.
model_PG = Sequential([
    Conv2D(16, (4, 4), activation="relu",
           input_shape=(grid_size_w, grid_size_h, n_channels)),
    Conv2D(32, (2, 2), activation="relu"),
    MaxPooling2D(),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.25),
    Dense(n_actions, activation="softmax"),
])
model_PG.compile(optimizer="adam", loss=policy_loss)
model_PG.summary()

In [None]:
######NEEDS FIXING
# Policy-gradient training loop for `model_PG`. States and rewards are still
# random placeholders until the environment is wired in (see the todos).

# Buffer for every state visited in an episode; each slot is overwritten
# before the fit, so it only needs the right shape.
X_runs = np.zeros((n_runs, n_steps, n_agents, grid_size_w, grid_size_h, n_channels))
# todo: initialize environment
# todo: check if net is predicting roughly uniformly between actions? (else fix)

for episode in range(n_episodes):
    if episode % 10 == 0:
        print("Episode {} of {}".format(episode + 1, n_episodes))
    # epsilon is not read by this loop; the decay is kept so the shared
    # variable evolves exactly as before.
    epsilon *= epsilon_decay

    # Action taken and immediate reward received at each (run, step, agent).
    actions = np.zeros((n_runs, n_steps, n_agents), dtype=int)
    rewards = np.zeros((n_runs, n_steps, n_agents))

    for run in range(n_runs):
        for step in range(n_steps):
            for agent in range(n_agents):
                # todo: get current known state as X
                X = np.random.rand(1, grid_size_w, grid_size_h, n_channels)
                X_runs[run, step, agent] = X[0]

                # Softmax output is float32 and may miss summing to 1 by more
                # than np.random.choice tolerates; renormalise in float64.
                action_prob = model_PG.predict(X)[0].astype(np.float64)
                action_prob /= action_prob.sum()
                action = np.random.choice(n_actions, p=action_prob)
                actions[run, step, agent] = action
                # todo: do action, get r and new state as X_new

                # after agent moves
                rewards[run, step, agent] = np.random.rand()

    # Discounted reward-to-go, G_t = r_t + gamma * G_{t+1}, computed in one
    # backward pass. The previous per-step back-fill was quadratic in n_steps
    # and applied gamma ** 0 to the immediately preceding step.
    returns = np.zeros_like(rewards)
    running = np.zeros((n_runs, n_agents))
    for step in reversed(range(n_steps)):
        running = rewards[:, step] + gamma * running
        returns[:, step] = running

    # Put each return in the column of the action actually taken at that
    # step (previously the *current* step's action column was used for all
    # earlier steps); every other column stays 0.
    R = np.zeros((n_runs, n_steps, n_agents, n_actions))
    run_idx, step_idx, agent_idx = np.indices(actions.shape)
    R[run_idx, step_idx, agent_idx, actions] = returns

    #epochs may be changed
    X_fit = X_runs.reshape((n_runs * n_steps * n_agents, grid_size_w, grid_size_h, n_channels))
    R_fit = R.reshape((n_runs * n_steps * n_agents, n_actions))
    model_PG.fit(X_fit, R_fit, epochs=1, verbose=0)