In [None]:
%matplotlib inline
import numpy as np
import keras.backend as K
from keras import Sequential, Model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
import copy

import Environment as E
import importlib
import matplotlib.pyplot as plt

## Parameters

In [None]:
# ---------------------------------------------------------------------------
# General
# ---------------------------------------------------------------------------

# Number of agents placed into each environment.
n_agents = 1

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

# Value handed to the environment as `saved_timesteps`.
time_steps = 3
# Grid dimensions (width / height).
grid_size_w, grid_size_h = 7, 7
# Size of the discrete action space (width of the network's output layer).
n_actions = 5

# Two groups of `time_steps` planes plus one additional plane.
n_channels = 2 * time_steps + 1

# ---------------------------------------------------------------------------
# Q-Learning
# ---------------------------------------------------------------------------

# Number of freshly created environments to train on.
n_episodes = 10
# Environment steps per episode.
n_steps = 10 ** 6
# Initial probability of choosing a random action.
epsilon = 0.75
# Multiplicative factor applied to `epsilon` at every reporting interval.
epsilon_decay = 0.999
# Discount factor for future rewards.
gamma = 0.9

# ---------------------------------------------------------------------------
# Policy Gradient
# ---------------------------------------------------------------------------

n_runs = 5

## Model

In [None]:
# Convolutional Q-network: maps a stack of grid planes to one Q-value per action.
#
# The input has `n_channels` planes, not `time_steps`: `tuple_to_batch`
# concatenates known_data[0], known_data[1] and a one-element slice of
# known_data[2] along the first axis, which is what
# `n_channels = time_steps + time_steps + 1` describes.
# NOTE(review): the spatial axes are declared as (grid_size_w, grid_size_h)
# while the environment is built with dim=(grid_size_h, grid_size_w); harmless
# while the grid is square, but confirm the order before using a non-square grid.
data_i = Input(shape=(n_channels, grid_size_w, grid_size_h))
conv_1 = Conv2D(16, (2, 2), activation="relu", data_format="channels_first")(data_i)
conv_2 = Conv2D(32, (2, 2), activation="relu", data_format="channels_first")(conv_1)
pool_1 = MaxPooling2D(data_format="channels_first")(conv_2)
drop_1 = Dropout(0.25)(pool_1)
flat_1 = Flatten()(drop_1)
feed_1 = Dense(64, activation="relu")(flat_1)
drop_2 = Dropout(0.25)(feed_1)
# Linear output: unbounded Q-value estimates, one per action.
feed_2 = Dense(n_actions, activation="linear")(drop_2)
data_o = feed_2

# NOTE(review): the tensor returned by print_tensor is discarded, so this call
# does not affect the model graph; kept as in the original for debugging.
K.print_tensor(data_o)

model = Model(inputs=data_i, outputs=data_o)

# Mean squared error between predicted and target Q-values.
model.compile(optimizer="adam", loss="mse")

model.summary()

## Q-Learning

In [None]:
def tuple_to_batch(known_data):
    """Build one stacked sample per entry of ``known_data[2]``.

    For every index ``i`` of ``known_data[2]`` the arrays ``known_data[0]``,
    ``known_data[1]`` and the one-element slice ``known_data[2][i:i+1]`` are
    concatenated along their first axis. The resulting samples are then
    stacked into a single array whose first axis indexes the samples.

    NOTE(review): presumably entries 0 and 1 are state shared by all agents
    and entry 2 holds one plane per agent -- confirm against
    ``Environment.export_known_data``.

    Args:
        known_data: indexable with at least three array-like entries.

    Returns:
        ``np.ndarray`` with one concatenated sample per entry of
        ``known_data[2]``.
    """
    shared_first, shared_second, per_entry = known_data[0], known_data[1], known_data[2]
    samples = [
        np.concatenate((shared_first, shared_second, per_entry[idx:idx + 1]))
        for idx, _ in enumerate(per_entry)
    ]
    return np.array(samples)

In [None]:
# Online Q-learning: one fresh environment per episode, one gradient step per
# environment step, epsilon-greedy exploration.
importlib.reload(E)  # pick up edits to Environment.py without restarting the kernel

# Length (in steps) of the sliding window used for the running statistics; the
# same value is the interval at which progress is reported and epsilon decays.
report_window = 10000
# Lower bound for the exploration probability.
epsilon_floor = 0.01

for i in range(n_episodes):
    env = E.Environment(dim=(grid_size_h, grid_size_w), reward_eat_trash=10,
                        reward_invalid_move=0, reward_nothing_happend=0,
                        trash_appearence_prob=0.1, number_trash_sources=1,
                        saved_timesteps=time_steps)

    # Ring buffers over the most recent `report_window` steps:
    # summed reward per step, and a one-hot record of the first agent's action.
    reward_window = np.zeros(report_window)
    action_avg = np.zeros((report_window, n_actions))

    list_avg_rwd = []
    list_steps = []
    all_time_reward = 0
    all_time_reward_avg_history = []
    all_time_reward_steps_history = []

    for k in range(n_agents):
        env.add_agent(coord=(k, 0), capacity=100000)

    X = tuple_to_batch(env.export_known_data())
    Q_vals = model.predict(X)
    if i % 10 == 0:
        print("Episode {} of {}".format(i + 1, n_episodes))

    for step in range(n_steps):
        # Epsilon-greedy: one random draw decides for all agents at once.
        if np.random.random() < max(epsilon, epsilon_floor):
            actions = np.random.randint(0, high=n_actions, size=n_agents).tolist()
        else:
            actions = np.argmax(Q_vals, axis=1).tolist()

        # move_agents returns the new known state (first three entries)
        # followed by the rewards (fourth entry).
        X_reward = env.move_agents(actions)
        X_new = tuple_to_batch(X_reward[:3])
        rewards = X_reward[3]

        step_reward = sum(rewards)
        all_time_reward += step_reward

        slot = step % report_window
        reward_window[slot] = step_reward
        action_avg[slot, :] = 0
        action_avg[slot, actions[0]] = 1

        if step % report_window == 0:
            # Average only over the slots written so far in this episode, so
            # the first report is not diluted by the still-empty window.
            filled = min(step + 1, report_window)
            mean = float(reward_window[:filled].sum()) / filled

            # NOTE: `epsilon` is the notebook-level variable; the decay
            # carries over between episodes and between re-runs of this cell.
            epsilon *= epsilon_decay
            print("Current Random level is {}".format(max(epsilon, epsilon_floor)))
            print("In Step {} the average reward of {} is {} ".format(step, report_window, mean))
            print("Actions: {}".format(np.sum(action_avg, axis=0)))
            list_avg_rwd.append(mean)
            list_steps.append(step)

            plt.scatter(list_steps, list_avg_rwd)
            plt.show()

            all_time_reward_avg = all_time_reward / (step + 1)
            all_time_reward_avg_history.append(all_time_reward_avg)
            all_time_reward_steps_history.append(step)
            plt.scatter(all_time_reward_steps_history, all_time_reward_avg_history)
            plt.show()

        Q_vals_new = model.predict(X_new)

        # Bellman target for the action each agent actually took; the other
        # actions keep their predicted values and so contribute zero error.
        Q_vals[range(n_agents), actions] = rewards + gamma * np.max(Q_vals_new, axis=1)
        model.fit(X, Q_vals, epochs=1, verbose=0)

        # Q_vals_new was predicted before the fit above; it is reused as the
        # current estimate for the next step, as in the original code.
        Q_vals = Q_vals_new
        X = X_new