In [None]:
%matplotlib inline
import copy
import importlib

import numpy as np
import matplotlib.pyplot as plt

import keras.backend as K
from keras import Sequential, Model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten


from Heatmap import Heatmap
from Environment import Environment
from GaussianTrashSource import GaussianTrashSource

## Parameters

In [None]:
class config:
    """Central collection of the hyper-parameters used by every cell of this notebook."""

    # ----- General -----
    n_agents = 4        # number of agents that run() adds to an environment
    number_print = 100  # run() prints and plots statistics every `number_print` steps

    # ----- Model -----
    time_steps = 3      # passed to Environment as `saved_timesteps`
    grid_size_w = 10
    grid_size_h = 10
    n_actions = 5       # width of the Q-network output layer
    # Channels of one network input: presumably `time_steps` trash maps,
    # `time_steps` agent maps and one position map of the acting agent
    # (this is how tuple_to_batch stacks them).
    n_channels = 2 * time_steps + 1

    # Trash sources with an all-zero covariance matrix, clipped to the grid.
    trash_source_1 = GaussianTrashSource(
        mean=[2, 3],
        cov=[[0, 0], [0, 0]],
        max_x=grid_size_w - 1,
        max_y=grid_size_h - 1,
    )
    trash_source_2 = GaussianTrashSource(
        mean=[1, 6],
        cov=[[0, 0], [0, 0]],
        max_x=grid_size_w - 1,
        max_y=grid_size_h - 1,
    )
    trash_source_3 = GaussianTrashSource(
        mean=[5, 1],
        cov=[[0, 0], [0, 0]],
        max_x=grid_size_w - 1,
        max_y=grid_size_h - 1,
    )
    trash_source_4 = GaussianTrashSource(
        mean=[5, 5],
        cov=[[0, 0], [0, 0]],
        max_x=grid_size_w - 1,
        max_y=grid_size_h - 1,
    )

    # ----- Q-Learning -----
    n_episodes = 1
    n_steps = 10000            # environment steps performed by one call to run()
    epsilon = 0.75             # probability of taking a random (exploratory) action
    epsilon_decay = 1 - 1e-6   # epsilon is multiplied by this factor after every step
    gamma = 0.9                # discount factor of the Q-learning target

## Model

In [None]:
# Q-network: maps one agent's stacked state maps to one Q-value per action.
#
# The spatial axes are declared as (rows, cols) = (grid_size_h, grid_size_w). This is the
# order in which the environments are created (dim=(grid_size_h, grid_size_w)) and in which
# valid_move_mask checks rows against grid_size_h and columns against grid_size_w. The
# previous (w, h) order only worked because the grid is square.
data_i = Input(shape=(config.n_channels, config.grid_size_h, config.grid_size_w))

# Two small convolutions followed by pooling; channels come first in the input tensor.
conv_1 = Conv2D(16, (2, 2), activation="relu", data_format="channels_first")(data_i)
conv_2 = Conv2D(32, (2, 2), activation="relu", data_format="channels_first")(conv_1)
pool_1 = MaxPooling2D(data_format="channels_first")(conv_2)
drop_1 = Dropout(0.25)(pool_1)

# Fully connected head; the output is linear because Q-values are unbounded regression targets.
flat_1 = Flatten()(drop_1)
feed_1 = Dense(64, activation="relu")(flat_1)
drop_2 = Dropout(0.25)(feed_1)
feed_2 = Dense(config.n_actions, activation="linear")(drop_2)
data_o = feed_2

model = Model(inputs=data_i, outputs=data_o)

# Mean squared error between predicted and target Q-values.
model.compile(optimizer="adam", loss="mse")

model.summary()

## Q-Learning

In [None]:
from Motion import Motion

def tuple_to_batch(known_data):
    """Stack the exported environment data into one network input per agent.

    For every entry ``k`` of ``known_data[2]`` the result contains the
    concatenation (along the first axis) of ``known_data[0]``,
    ``known_data[1]`` and the single-element slice ``known_data[2][k:k+1]``.

    Args:
        known_data: indexable whose first three items are array-likes that
            can be concatenated along axis 0.

    Returns:
        ``np.ndarray`` with one row per entry of ``known_data[2]``.
    """
    shared_a = known_data[0]
    shared_b = known_data[1]
    per_agent = known_data[2]
    samples = [
        np.concatenate((shared_a, shared_b, per_agent[k:k + 1]))
        for k in range(len(per_agent))
    ]
    return np.array(samples)

def valid_move_mask(batch):
    """Compute which actions keep each agent inside the grid.

    The agent positions are read from the non-zero cells of the last channel
    (index ``config.n_channels - 1``) of ``batch``.

    Args:
        batch: array indexed as (agent, channel, row, col), e.g. the output
            of ``tuple_to_batch``.

    Returns:
        ``np.ndarray`` of shape ``(config.n_agents, config.n_actions)`` that
        holds 1 where the action is allowed and 0 where it would move the
        agent off the grid.
    """
    mask = np.ones(shape=(config.n_agents, config.n_actions))
    # One (agent, row, col) triple per non-zero cell of the position channel.
    positions = np.argwhere(batch[:, config.n_channels - 1, :, :])
    for agent, row, col in positions:
        for action in range(config.n_actions):
            d_pos = Motion(action).value
            # Every action starts from the agent's *current* cell. The offsets
            # must not be accumulated across actions, otherwise all actions
            # after the first one are checked from a wrong position.
            new_row = row + d_pos[0]
            new_col = col + d_pos[1]
            inside = (0 <= new_row < config.grid_size_h
                      and 0 <= new_col < config.grid_size_w)
            if not inside:
                mask[agent, action] = 0
    return mask

In [None]:
def run(model, env, random_moves = True, do_train = True):
    """Run ``config.n_steps`` steps of epsilon-greedy Q-learning in ``env``.

    ``config.n_agents`` agents are added to ``env``. In every step each agent
    picks an action (random with probability ``max(config.epsilon, 0.01)``,
    otherwise the valid action with the highest predicted Q-value), the
    environment is advanced and, if requested, the model is fitted on the
    one-step Q-learning target. Every ``config.number_print`` steps the
    average rewards are printed and plotted together with the heatmaps.

    Args:
        model: compiled Keras model providing ``predict`` and ``fit``.
        env: environment providing ``debug_data_export``, ``add_agent``,
            ``export_known_data`` and ``move_agents``.
        random_moves: if False, always act greedily (no exploration).
        do_train: if False, the model is only evaluated, never fitted.

    Returns:
        None.

    Side effects:
        ``config.epsilon`` is decayed in place after every step, so the
        exploration rate carries over to subsequent calls of ``run``.
    """
    for idx, trash_source in enumerate(env.debug_data_export()[4]):
        print("Trash Source {} has its mean at {}".format(idx, trash_source.mean))

    # init statistics data structures
    all_reward_history = []   # per agent: reward of every step so far
    avg_reward_history = []   # per agent: windowed mean reward at every report
    # One-hot log of the most recent actions of agent 0 (recorded, not reported yet).
    all_action_history = np.zeros((config.number_print, config.n_actions))
    all_agents_heatmap = Heatmap(keep_track_of_steps=1000)
    per_agent_heatmaps = [Heatmap(keep_track_of_steps=1000) for i in range(config.n_agents)]
    for k in range(config.n_agents):
        env.add_agent(coord=(k, 0), capacity=100000)
        all_reward_history.append([])
        avg_reward_history.append([])

    X = tuple_to_batch(env.export_known_data())
    Q_vals = model.predict(X)
    for step in range(config.n_steps):

        # get actions
        allowed = valid_move_mask(X) > 0
        if random_moves and np.random.random() < max(config.epsilon, 0.01):
            scores = np.random.random((config.n_agents, config.n_actions))
        else:
            scores = Q_vals
        # Invalid actions are set to -inf instead of being multiplied by zero:
        # a zeroed entry would win the argmax whenever all valid Q-values are negative.
        actions = np.argmax(np.where(allowed, scores, -np.inf), axis=1).tolist()
        config.epsilon *= config.epsilon_decay

        # take actions
        h_trash, h_agents, p_agents, r_agents = env.move_agents(actions)
        X_new = tuple_to_batch([h_trash, h_agents, p_agents])

        # update statistics data structures
        all_agents_heatmap.add_map(h_agents[-1])
        all_action_history[step % config.number_print, :] = 0
        all_action_history[step % config.number_print, actions[0]] = 1
        for i in range(config.n_agents):
            all_reward_history[i].append(r_agents[i])
            per_agent_heatmaps[i].add_map(p_agents[i])

        if step % config.number_print == 0:

            # Mean reward per agent over the last `number_print` steps. The slice has to
            # be taken on each agent's own reward list, not on the list of agents.
            means = [np.mean(rewards[-config.number_print:]) for rewards in all_reward_history]
            print("steps {} - {}".format(max(step - config.number_print, 0), step))
            print("epsilon = {}".format(max(config.epsilon, 0.01)))
            for i_agent in range(len(means)):
                print("avg reward for agent", i_agent, ":", means[i_agent])

            # plot moving reward averages per agent
            for i_agent in range(config.n_agents):
                avg_reward_history[i_agent].append(means[i_agent])
                plt.plot(range(0, step+1, config.number_print), avg_reward_history[i_agent])
            plt.show()

            # heatmap
            all_agents_heatmap.show_heatmap()
            for heatmap in per_agent_heatmaps:
                heatmap.show_heatmap()

        Q_vals_new = model.predict(X_new)

        if do_train:
            # One-step Q-learning target, written only into the entries of the actions
            # that were actually taken; the other entries keep the model's own prediction.
            Q_vals[range(config.n_agents), actions] = r_agents + config.gamma * np.max(Q_vals_new, axis=1)
            model.fit(X, Q_vals, epochs=1, verbose=0)

        Q_vals = Q_vals_new
        X = X_new

### Run the model in some environment

In [None]:
# Train the model for `n_episodes` episodes; every episode starts in a freshly built
# environment that contains the first two trash sources.
for episode in range(config.n_episodes):
    env = Environment(
        dim=(config.grid_size_h, config.grid_size_w),
        reward_eat_trash=10,
        reward_invalid_move=0,
        reward_nothing_happend=0,
        trash_appearence_prob=0.1,
        number_trash_sources=0,
        saved_timesteps=config.time_steps,
    )
    # trash_source_1 has its mean at [2,3], trash_source_2 at [1,6]
    for trash_source in (config.trash_source_1, config.trash_source_2):
        env.trash_sources.append(trash_source)

    run(model, env)

#### Now reset the model, train it on a different environment, and then use this pretrained model to learn the original environment again. Will it learn faster?

In [None]:
###########################################################################################
## Reset the model: build a fresh, untrained Q-network with the same architecture

# The spatial axes are declared as (rows, cols) = (grid_size_h, grid_size_w). This is the
# order in which the environments are created (dim=(grid_size_h, grid_size_w)) and in which
# valid_move_mask checks rows against grid_size_h and columns against grid_size_w. The
# previous (w, h) order only worked because the grid is square.
data_i = Input(shape=(config.n_channels, config.grid_size_h, config.grid_size_w))
conv_1 = Conv2D(16, (2, 2), activation="relu", data_format="channels_first")(data_i)
conv_2 = Conv2D(32, (2, 2), activation="relu", data_format="channels_first")(conv_1)
pool_1 = MaxPooling2D(data_format="channels_first")(conv_2)
drop_1 = Dropout(0.25)(pool_1)
flat_1 = Flatten()(drop_1)
feed_1 = Dense(64, activation="relu")(flat_1)
drop_2 = Dropout(0.25)(feed_1)
# Linear output: one unbounded Q-value per action.
feed_2 = Dense(config.n_actions, activation="linear")(drop_2)
data_o = feed_2

model = Model(inputs=data_i, outputs=data_o)

model.compile(optimizer="adam", loss="mse")

model.summary()


############################################################################################
## Set up a second environment
env2 = Environment(dim=(config.grid_size_h, config.grid_size_w),
                   reward_eat_trash=10,
                   reward_invalid_move=0,
                   reward_nothing_happend=0,
                   trash_appearence_prob=0.1,
                   number_trash_sources=0,
                   saved_timesteps=config.time_steps)
env2.trash_sources.append(config.trash_source_3)# at [5,1]
env2.trash_sources.append(config.trash_source_4)# at [5,5]
############################################################################################
## Run the model in the second environment
run(model, env2)

In [None]:
###########################################################################################
## Run the pretrained model again on the old environment
# A new Environment instance is created, populated with the same two trash sources
# as in the very first run (means at [2,3] and [1,6]).
env = Environment(
    dim=(config.grid_size_h, config.grid_size_w),
    reward_eat_trash=10,
    reward_invalid_move=0,
    reward_nothing_happend=0,
    trash_appearence_prob=0.1,
    number_trash_sources=0,
    saved_timesteps=config.time_steps,
)
for trash_source in (config.trash_source_1, config.trash_source_2):
    env.trash_sources.append(trash_source)

run(model, env)

### We now set up a fresh instance of the second environment (trash sources 3 and 4) and see what the pretrained network does on it

In [None]:
# Fresh environment containing trash sources 3 and 4; the model keeps the weights it
# has learned in the previous cells.
env2 = Environment(
    dim=(config.grid_size_h, config.grid_size_w),
    reward_eat_trash=10,
    reward_invalid_move=0,
    reward_nothing_happend=0,
    trash_appearence_prob=0.1,
    number_trash_sources=0,
    saved_timesteps=config.time_steps,
)
for trash_source in (config.trash_source_3, config.trash_source_4):
    env2.trash_sources.append(trash_source)

run(model, env2)
