### CONCERNS
1. The lists of directions/positions are never emptied during an episode, which makes the simulation slower with every timestep.

### IDEAS
other possible features:
1. whether the robot has fallen down or not
2. distance to border (& which border?)

time optimization:
1. clear last item from history every 4 timesteps (we only use the current and previous state and the one before that)
2. interval of states to be interpreted: skip N frames before evaluating the next state
3. Since the rewards are so sparse, maybe use imitation learning instead of DQN --> we are "experts" since we know the tactics of the blue bot. We can use this to teach our bot how to defeat the other agent.
4. rewrite the random action function 

In [1]:
from VisualModule import AgentEnvironment
from DQN_Agent import NeurosmashAgent

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

import gym
import math
import random
from collections import namedtuple
from itertools import count
from PIL import Image
import os

from stopwatch import Stopwatch


Using TensorFlow backend.


In [2]:
# Directory that receives the periodic weight checkpoints saved during training.
model_output_dir = "output/model_output/"

# exist_ok=True creates the directory in one call, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(model_output_dir, exist_ok=True)

max_distance = 600   # normalisation constant for the distance term in compute_reward
show_images = False
skip_frames = 15     # frame interval the training loop uses for evaluating/storing steps
state_size = 11      # agent_pos, enemy_pos, vec_agent, vec_enemy, rel_pos_enemy, done
action_size = 3      # number of discrete actions handed to the agent
episode_count = 1000  # number of episodes the training loop runs
batch_size = 32      # batch size passed to agent.train
size = 768           # forwarded to AgentEnvironment(size=...)
timescale = 10       # forwarded to AgentEnvironment(timescale=...)

In [3]:
# Agent sized from the configured state vector and action count.
# action size: move in x or y direction, or do nothing
agent = NeurosmashAgent(action_size=action_size, state_size=state_size)

In [4]:
def compute_reward(standard_reward, distance):
    """Blend the environment reward with a distance-based bonus.

    The bonus is ``(max_distance - distance) / max_distance``; it is added to
    ``standard_reward`` and the sum is divided by 20.

    Args:
        standard_reward: Reward reported by the environment for this step.
        distance: Current distance value between the two robots.

    Returns:
        The combined, scaled reward.
    """
    proximity_bonus = (max_distance - distance) / max_distance
    return (proximity_bonus + standard_reward) / 20

In [5]:
def direction(agent_path, enemy_path):
    """Return the most recent displacement of the agent and of the enemy.

    Each displacement is the difference between the last two entries of the
    corresponding path, reported as ``[dx, -dy]`` (the y component is negated).

    Args:
        agent_path: Sequence of agent positions; needs at least two entries.
        enemy_path: Sequence of enemy positions; needs at least two entries.

    Returns:
        Two lists, ``[agent_dx, -agent_dy]`` and ``[enemy_dx, -enemy_dy]``.
    """
    def last_step(path):
        # Difference between the newest and the previous position.
        delta = path[-1] - np.array(path[-2])
        return [delta[0], -delta[1]]

    return last_step(agent_path), last_step(enemy_path)

def do_action(action, total_steps, eval_pic, environment):
    """Apply one action to the environment and derive the features for the agent.

    Args:
        action: Action forwarded to ``environment.actionLoop``.
        total_steps: Unused; kept so existing callers keep working.
        eval_pic: Forwarded to ``environment.actionLoop`` as its third argument.
        environment: Object providing ``actionLoop`` and the ``agent_path`` /
            ``enemy_path`` position histories.

    Returns:
        A 10-tuple ``(info, complete_reward, agent_coord, enemy_coord,
        agent_direction, enemy_direction, distance, rel_pos_enemy,
        following_state, result_game)``. Coordinates and ``rel_pos_enemy`` are
        NumPy arrays; ``result_game`` is 1 when the raw reward equals 10 and
        0 otherwise.
    """
    info, reward, agent_coord, enemy_coord, following_state = environment.actionLoop(action, 0, eval_pic)
    agent_coord = np.array(agent_coord)
    enemy_coord = np.array(enemy_coord)

    # A raw reward of exactly 10 is treated as a win.
    result_game = 1 if reward == 10 else 0

    # Offset of the enemy as seen from the agent.
    rel_pos_enemy = enemy_coord - agent_coord

    if len(environment.agent_path) < 2:
        distance = 500  # Initial distance, only for initialisation
        agent_direction = [1, 0]  # By definition of facing each other
        enemy_direction = [-1, 0]
    else:
        # Euclidean distance: square each component first, then sum, then
        # take the root. Summing before squaring would yield |dx + dy|, which
        # is 0 for e.g. an offset of (3, -3).
        distance = np.sqrt(np.square(rel_pos_enemy).sum())
        agent_direction, enemy_direction = direction(environment.agent_path, environment.enemy_path)

    complete_reward = compute_reward(reward, distance)

    return (info, complete_reward, agent_coord, enemy_coord, agent_direction,
            enemy_direction, distance, rel_pos_enemy, following_state, result_game)


In [6]:
# Shared environment instance; init_environment() resets it and hands it back
# to the training loop at the start of every episode.
env = AgentEnvironment(timescale=timescale, size=size)

def init_environment(agent_here):
    """Reset the shared environment and play the first action of an episode.

    Args:
        agent_here: Agent whose ``act`` method chooses the first action from an
            all-zero placeholder state.

    Returns:
        A 10-tuple ``(info, complete_reward, next_state, agent_trajectories,
        enemy_trajectories, agent_direction, relative_pos_enemy,
        enemy_direction, env, result_game)``. The two trajectory lists each
        hold the single position observed after the first action.
    """
    _, reset_reward, _ = env.reset()

    # Win flag taken from the reward returned by the reset itself.
    # NOTE(review): the flag returned by the first do_action() call is ignored
    # here, as in the original code — confirm that this is intended.
    result_game = 1 if reset_reward == 10 else 0

    # All-zero placeholder state, sized from state_size so it cannot drift
    # out of sync with the configured feature count.
    small_init_state = np.zeros((1, state_size), dtype=int)

    action = agent_here.act(small_init_state)  # get next action

    (info, complete_reward, agent_pos, enemy_pos, agent_direction,
     enemy_direction, _distance, relative_pos_enemy, next_state,
     _first_result) = do_action(action, 1, True, env)

    agent_trajectories = [list(agent_pos)]
    enemy_trajectories = [list(enemy_pos)]

    return (info, complete_reward, next_state, agent_trajectories,
            enemy_trajectories, agent_direction, relative_pos_enemy,
            enemy_direction, env, result_game)

In [7]:
def _make_state(agent_xy, enemy_xy, agent_heading, enemy_offset, enemy_heading, done_flag):
    """Pack the per-step features into a ``(1, state_size)`` array for the agent."""
    features = [
        agent_xy[0], agent_xy[1],
        enemy_xy[0], enemy_xy[1],
        agent_heading[0], agent_heading[1],
        enemy_offset[0], enemy_offset[1],
        enemy_heading[0], enemy_heading[1],
        done_flag,
    ]
    return np.reshape(features, [1, state_size])


# Total shaped reward of every episode, in play order.
complete_rewards = []

win_games = 0
lost_games = 0

for e in range(episode_count):
    (status, complete_reward, next_state, agent_trajectories, enemy_trajectories,
     agent_dir, relative_pos_enemy, enemy_dir, environment, won_lost) = init_environment(agent)
    small_state = _make_state(agent_trajectories[-1], enemy_trajectories[-1],
                              agent_dir, relative_pos_enemy, enemy_dir, 0)

    done = 0
    total_reward = 0
    total_timesteps = 1
    distances = []

    while not done:
        # Frames are evaluated on the storing step and on the step right before it.
        evaluate_frame = total_timesteps % skip_frames in (0, skip_frames - 1)

        action = agent.act(small_state)
        (status, complete_reward, agent_pos, enemy_pos, agent_dir, enemy_dir,
         distance, enemy_pos_rel, next_state, won_lost) = do_action(
            action, total_timesteps, evaluate_frame, environment)

        total_reward += complete_reward

        if status == 1:
            # Count the outcome once per finished episode; counting on every
            # timestep would inflate lost_games by the episode length.
            if won_lost:
                win_games += 1
            else:
                lost_games += 1
            print(f"Game nr. {e} is finished, \n your final reward is: {total_reward}, duration was {total_timesteps} timesteps")
            done = 1

        agent_trajectories.append(list(agent_pos))
        enemy_trajectories.append(list(enemy_pos))
        distances.append(distance)

        # Use the latest enemy position ([-1]); a fixed index would feed the
        # agent a stale position for the whole episode.
        next_small_state = _make_state(agent_trajectories[-1], enemy_trajectories[-1],
                                       agent_dir, enemy_pos_rel, enemy_dir, done)

        # Store every skip_frames-th transition, and always the terminal one so
        # the final reward and done flag are not dropped from replay memory.
        if total_timesteps % skip_frames == 0 or done:
            agent.remember(small_state, action, complete_reward, next_small_state, [done])

        small_state = next_small_state  # new small state
        total_timesteps += 1

    complete_rewards.append(total_reward)

    if len(agent.memory) > batch_size:
        agent.train(batch_size)
        print("train")

    # Checkpoint the weights every 50 episodes.
    if e % 50 == 0:
        agent.save(f"{model_output_dir}weights_{e:04d}.hdf5")


print(f"finished all episodes. \nTotal games won: {win_games} \nTotal games lost: {lost_games}")

ValueError: too many values to unpack (expected 9)