In [None]:
%matplotlib inline
import numpy as np
import keras.backend as K
from keras import Sequential, Model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
import copy

import Environment as E
import importlib
import matplotlib.pyplot as plt
import Heatmap as H
import GaussianTrashSource

## Parameters

In [None]:
# --- General ---------------------------------------------------------------
n_agents = 1            # number of agents added to the environment

# --- Model -----------------------------------------------------------------
time_steps = 3          # history length handed to the environment as saved_timesteps
grid_size_w = grid_size_h = 7
n_actions = 5           # width of the network's output layer

# Two stacks of `time_steps` maps plus one extra map form the input channels.
n_channels = 2 * time_steps + 1
number_print = 25       # window length (in steps) used for progress logging

# --- Policy gradient -------------------------------------------------------
n_episodes = 250
n_steps = 30            # environment steps per run
n_runs = 10             # runs collected per episode before one fit() call
gamma = 0.9             # discount factor applied to rewards credited to earlier steps

# Intended interval (in episodes) between environment / trash-location resets.
# With 0 the environment is created once, at the first episode.
env_reset_freq = 0
# Shared Heatmap instance: the training loop passes it a map per step via
# add_map() and prints get_heatmap() while logging.
# NOTE(review): keep_track_of_steps presumably bounds how many recent steps are
# retained -- confirm in the Heatmap module.
heatmap = H.Heatmap(keep_track_of_steps=1000)

## Model

## Policy Gradient

In [None]:
from Motion import Motion

def tuple_to_batch(known_data):
    """Assemble one stacked input sample per agent.

    `known_data` is indexed positionally: items 0 and 1 are shared by every
    agent, item 2 is walked along its first axis with one entry per agent.
    (Presumably these are the trash history, the agent history and the
    per-agent position maps -- confirm against Environment.export_known_data.)

    Returns an np.ndarray whose first axis indexes the agents; each entry is
    item 0, item 1 and that agent's one-element slice of item 2 concatenated
    along axis 0.
    """
    shared_first, shared_second, per_agent = known_data[0], known_data[1], known_data[2]
    samples = [
        # Slice (k:k+1) instead of indexing so the agent's map keeps its leading axis.
        np.concatenate((shared_first, shared_second, per_agent[k:k + 1]))
        for k in range(len(per_agent))
    ]
    return np.array(samples)

In [None]:
# Policy network: a small channels-first CNN mapping the stacked grid maps to a
# probability distribution over the n_actions moves.
data_i = Input(shape=(n_channels, grid_size_w, grid_size_h))
conv_1 = Conv2D(16, (2, 2), activation="relu", data_format="channels_first")(data_i)
conv_2 = Conv2D(32, (2, 2), activation="relu", data_format="channels_first")(conv_1)
pool_1 = MaxPooling2D(data_format="channels_first")(conv_2)
drop_1 = Dropout(0.25)(pool_1)
flat_1 = Flatten()(drop_1)
feed_1 = Dense(64, activation="relu")(flat_1)
drop_2 = Dropout(0.25)(feed_1)
# Softmax (not linear): a policy-gradient loss needs log-probabilities, so the
# output must be a valid distribution over actions.
feed_2 = Dense(n_actions, activation="softmax")(drop_2)
data_o = feed_2

model = Model(inputs=data_i, outputs=data_o)


### Differences from the Q-Learning network: softmax output and loss=policy_loss
def policy_loss(reward_sums, yPred):
    """REINFORCE-style loss: -log(pi(a|s)) weighted by the credited reward.

    Keras always calls a custom loss as ``loss(y_true, y_pred)``. Here the
    targets given to ``model.fit`` are the per-action reward sums, so they
    arrive as the FIRST argument and the network output as the SECOND.

    Args:
        reward_sums: targets from ``fit`` -- reward credited to each action
            (zero for actions that were not taken).
        yPred: network output, action probabilities from the softmax layer.

    Returns:
        Element-wise loss tensor; Keras reduces it to a scalar.
    """
    # Clip away exact zeros so the log stays finite.
    action_probs = K.clip(yPred, K.epsilon(), 1.0)
    return -K.log(action_probs) * reward_sums


model.compile(optimizer="Adadelta", loss=policy_loss)

model.summary()

In [None]:
def _action_probabilities(scores):
    """Convert per-agent model outputs into valid sampling distributions.

    If every row is already non-negative and sums to ~1 (e.g. a softmax output
    layer) the values are only renormalised in float64; otherwise the rows are
    treated as raw scores and passed through a numerically stable softmax.
    Returns an array of the same shape whose rows sum to 1.
    """
    scores = np.asarray(scores, dtype=np.float64)
    already_distribution = np.all(scores >= 0) and np.allclose(scores.sum(axis=-1), 1.0, atol=1e-3)
    if not already_distribution:
        scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return scores / scores.sum(axis=-1, keepdims=True)


np.random.seed(0)
list_avg_rwd = []
credit_horizon = 15  # how many earlier steps receive discounted credit for a reward

for i in range(n_episodes):
    # (Re)build the environment at the first episode and, when env_reset_freq > 0,
    # every env_reset_freq episodes afterwards. The old `i == env_reset_freq`
    # test left `env` undefined at episode 0 for any non-zero setting.
    if i == 0 or (env_reset_freq > 0 and i % env_reset_freq == 0):
        # NOTE(review): dim is passed as (h, w) while the model input is declared
        # (channels, w, h); harmless while the grid is square -- confirm the
        # Environment's axis convention before using a non-square grid.
        env = E.Environment(dim = (grid_size_h,grid_size_w),reward_eat_trash=10, 
                            reward_invalid_move=0, reward_nothing_happend=0, 
                            trash_appearence_prob=1, number_trash_sources=3, saved_timesteps=time_steps )
        _, _,  _, _, trash_sources = env.debug_data_export()
        for idx,trash_source in enumerate(trash_sources): 
            print("Trash Source {} has its mean at {}".format(idx, trash_source.mean))
        d = []  # sliding window of the most recent per-step reward totals
        # One-hot record of agent 0's action for each slot of the logging window.
        action_avg = np.zeros((number_print, n_actions))
        list_steps = []
        all_time_reward = 0
        all_time_reward_avg_history = []
        all_time_reward_steps_history = []
        for k in range(n_agents):
            env.add_agent(coord=(k,0),capacity=100000)
        r_sum = 0
    
    if i % 10 == 0:
        print("Episode {} of {}".format(i + 1, n_episodes))
    
    X = tuple_to_batch(env.export_known_data()) 
    
    # Training targets: reward credited to the action taken at each (run, step, agent).
    R = np.zeros((n_runs, n_steps, n_agents, n_actions))
    # Channels-first, matching both the samples from tuple_to_batch and the model input.
    X_runs = np.zeros((n_runs, n_steps, n_agents, n_channels, grid_size_w, grid_size_h))

    for run in range(n_runs):
        action_history = np.zeros((n_steps, n_agents), dtype=int)
        for step in range(n_steps):
            # Sample one action per agent FROM the policy. The distribution must be
            # passed as `p=`; positionally it lands in `replace` and is ignored.
            P_vals = model.predict(X)
            probs = _action_probabilities(P_vals)
            actions = np.array([np.random.choice(n_actions, p=probs[agent_i])
                                for agent_i in range(n_agents)])

            #New known state and rewards
            X_reward = env.move_agents(actions)
            heatmap.add_map(X_reward[1][-1]) #Get the current position of all agents
            
            #After all agents move
            X_new = tuple_to_batch(X_reward[:3])
            rewards = X_reward[3]

            step_reward = sum(rewards)
            all_time_reward += step_reward
            slot = step % number_print
            action_avg[slot, :] = 0
            action_avg[slot, actions[0]] = 1
            d.append(step_reward)
            if len(d) > number_print:
                d.pop(0)
            if step % number_print == 0 and i % 10 == 0:
                mean = sum(d) / number_print
                
                print('Current episode is', i)
                print("In Step {} the average reward of {} is {} ".format(step, number_print, mean))
                print("Actions: {}".format(np.sum(action_avg, axis = 0)))
                list_avg_rwd.append(mean)
                list_steps.append(step)
                print(heatmap.get_heatmap()) #Heatmap

            for agent_i in range(n_agents):
                action_history[step, agent_i] = actions[agent_i]
                # Slice assignment copies the values, so later updates to X cannot alias.
                X_runs[run, step, agent_i] = X[agent_i]
                R[run, step, agent_i, actions[agent_i]] = rewards[agent_i]
                # Credit this reward to the preceding steps, discounted by how far
                # back they are: the step right before gets gamma**1, not gamma**0.
                for prev in range(max(0, step - credit_horizon), step):
                    R[run, prev, agent_i, action_history[prev, agent_i]] += (
                        rewards[agent_i] * gamma ** (step - prev)
                    )

            #update state
            X = X_new

    #epochs may be changed
    X_fit = X_runs.reshape((n_runs * n_steps * n_agents, n_channels, grid_size_w, grid_size_h))
    R_fit = R.reshape((n_runs * n_steps * n_agents, n_actions))
    model.fit(X_fit, R_fit, epochs=1, verbose=0)

In [None]:
# Number of windowed average-reward values logged by the training loop.
len(list_avg_rwd)

In [None]:
# Sum of the first 50 logged average-reward values (fewer if less were logged).
sum(list_avg_rwd[:50])