In [1]:
%matplotlib inline
import numpy as np
import keras.backend as K
from keras import Sequential, Model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
import copy

import Environment as E
import importlib
import matplotlib.pyplot as plt

Using TensorFlow backend.


## Parameters

In [2]:
# General

n_agents = 1

# Model

time_steps = 3
grid_size_w = 5
grid_size_h = 5
n_actions = 5

n_channels = time_steps + time_steps + 1

# Q-Learning

n_episodes = 10
n_steps = 1000000
epsilon = 0.75
epsilon_decay = 0.99
gamma = 0.9

# Policy Gradient

n_runs = 5

## Model

In [3]:
data_i = Input(shape=(time_steps, grid_size_w, grid_size_h))
conv_1 = Conv2D(16, (2, 2), activation="relu", data_format="channels_first")(data_i)
conv_2 = Conv2D(32, (2, 2), activation="relu", data_format="channels_first")(conv_1)
pool_1 = MaxPooling2D(data_format="channels_first")(conv_2)
drop_1 = Dropout(0.25)(pool_1)
flat_1 = Flatten()(drop_1)
feed_1 = Dense(64, activation="relu")(flat_1)
drop_2 = Dropout(0.25)(feed_1)
feed_2 = Dense(n_actions, activation="linear")(drop_2)
data_o = feed_2

model = Model(inputs=data_i, outputs=data_o)

model.compile(optimizer="adam", loss="mse")

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3, 5, 5)           0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 4, 4)          208       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 3, 3)          2080      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 1, 1)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 1, 1)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2112      
__________

## Q-Learning

In [4]:
from Motion import Motion

def tuple_to_batch(known_data):
    batch = []
    for i,agent_pos in enumerate(known_data[2]):
        batch.append(np.concatenate((known_data[0],known_data[1],known_data[2][i:i+1])))
    batch = np.array(batch)
    return batch

def valid_move_mask(batch):
    mask = np.ones(shape=(n_agents, n_actions))
    indices = np.array(np.nonzero(batch[:, n_channels, :, :])).T
    for idx in indices:
        agent, row, col = idx
        for action in range(n_actions):
            d_pos = Motion(action).value()
            row += d_pos[0]
            col += d_pos[1]
            if row == -1 or row == grid_size_h or col == -1 or col == grid_size_w:
                mask[agent, action] = 0
    return mask
    

In [6]:
importlib.reload(E)

for i in range(n_episodes):
    env = E.Environment(dim=(grid_size_h,grid_size_w),
                        REWARD_EAT_TRASH=10,
                        REWARD_INVALID_MOVE=0, 
                        REWARD_NOTHING_HAPPEND=0,
                        TRASH_APPEARENCE_PROB=0.1,
                        NUMBER_TRASH_SOURCES=1,
                        saved_timesteps=time_steps)
    d = []
    action_avg = np.zeros((400,5))
    list_avg_rwd = []
    list_steps = []

    for k in range(n_agents):
        env.add_agent(coord=(k,0),capacity=100000)
    
    print(env.export_known_data())
    
    X = tuple_to_batch(env.export_known_data())
    
    print(X.shape)
    
    Q_vals = model.predict(X)
    
    if i % 10 == 0:
        print("Episode {} of {}".format(i + 1, n_episodes))

    r_sum = 0
    
    for step in range(n_steps):
        
        mask = valid_move_mask(X)
        if np.random.random() < epsilon:
            Q_rand = np.random.randint(Q_vals.size()).reshape(Q_vals.shape)
            actions = np.apply_along_axis(np.argmax, 1, Q_rand * mask).tolist()
        else:
            actions = np.apply_along_axis(np.argmax, 1, Q_vals * mask).tolist()
            
        X_reward = env.move_agents(actions) #new known state and rewards
        # after all agents move
        X_new = tuple_to_batch(X_reward[:3])
        rewards = X_reward[3]
        #print("Rewards {}".format(rewards))
        action_avg[step%400, : ] = 0
        action_avg[step%400,actions[0]]= 1
        d.append(rewards[0])
        while(len(d)>400):
            del(d[0])
        if(step% 1000 == 0): 
            mean = sum(d) / 400.0
            epsilon *= epsilon_decay
            print(epsilon)
            print("In Step {} the average reward of 100 is {} ".format(step, mean))
            print("Actions: {}".format(np.sum(action_avg, axis = 0)))
            list_avg_rwd.append(mean)
            list_steps.append(step)
            plt.scatter(list_steps, list_avg_rwd)
            plt.show()
        Q_vals_new = model.predict(X_new)
        
        Q_vals[range(n_agents), actions] = rewards + gamma * np.max(Q_vals_new, axis=1)
        model.fit(X, Q_vals, epochs=1, verbose=0)
        
        Q_vals = Q_vals_new
        X = X_new

(array([[[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]]), array([[[ 1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1]]]), array([[[1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]]))
(1, 3, 5, 5)
Episode 1 of 10


IndexError: index 7 is out of bounds for axis 1 with size 3