### CONCERNS
1. The lists of directions/positions are never emptied during an episode, which makes the simulation slower with every timestep.

### IDEAS
other possible features:
1. whether the robot has fallen down or not
2. distance to border (& which border?)

time optimization:
1. clear last item from history every 4 timesteps (we only use the current and previous state and the one before that)
2. interval of states to be interpreted: skip N frames before evaluating the next state
3. Since the rewards are so sparse, maybe use imitation learning instead of DQN --> we are "experts" since we know the tactic of the blue bot. We can use this to teach our bot how to defeat the other agent.


In [1]:
from VisualModule import AgentEnvironment
from DQN_Agent import NeurosmashAgent

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

import gym
import math
import random
from collections import namedtuple
from itertools import count
from PIL import Image
import os

from stopwatch import Stopwatch


Using TensorFlow backend.


In [2]:
# Directory that receives the periodic weight checkpoints saved during training.
model_output_dir = "output/model_output/"

# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, so no separate check-then-create step is needed.
os.makedirs(model_output_dir, exist_ok=True)

show_images = False
skip_frames = 10  # forwarded to environment.simpleCoord on every action
# Entries of the state vector, in order: agent x/y, enemy x/y,
# agent direction x/y, relative enemy position x/y, enemy direction x/y, done.
state_size = 11
action_size = 3
episode_count = 1000
batch_size = 32  # replay memory must exceed this before agent.train is called
size = 768  # forwarded to AgentEnvironment(size=...)
timescale = 5  # forwarded to AgentEnvironment(timescale=...)

In [3]:
# Simulation wrapper, configured with the window size and timescale set above.
environment = AgentEnvironment(size=size, timescale=timescale)

# action size: move in x or y direction, or do nothing
agent = NeurosmashAgent(state_size=state_size, action_size=action_size)

In [4]:
def direction(agent_path, enemy_path):
    """Return the most recent displacement of the agent and of the enemy.

    Each displacement is the difference between the last two entries of the
    corresponding path, with the second component negated.
    NOTE(review): the negation presumably converts an image-style y axis
    (growing downwards) into an upward-pointing one -- confirm.

    Args:
        agent_path: sequence of agent positions; needs at least two entries.
        enemy_path: sequence of enemy positions; needs at least two entries.

    Returns:
        A pair ``([agent_dx, agent_dy], [enemy_dx, enemy_dy])``.
    """
    agent_step = agent_path[-1] - np.array(agent_path[-2])
    enemy_step = enemy_path[-1] - np.array(enemy_path[-2])

    agent_vector = [agent_step[0], -agent_step[1]]
    enemy_vector = [enemy_step[0], -enemy_step[1]]
    return agent_vector, enemy_vector

def do_action(action, total_steps):
    """Execute one action in the global ``environment`` and derive features.

    The action is forwarded to ``environment.simpleCoord`` together with the
    module-level ``skip_frames`` setting; the call is timed and the duration
    is printed.

    Args:
        action: action passed through to ``environment.simpleCoord``.
        total_steps: step counter passed through to
            ``environment.simpleCoord``.

    Returns:
        A tuple ``(info, reward, agent_coord, enemy_coord, agent_direction,
        enemy_direction, distance, rel_pos_enemy, following_state)`` where the
        coordinates and ``rel_pos_enemy`` (enemy minus agent) are numpy
        arrays, the directions are two-element lists and ``distance`` is the
        Euclidean distance between agent and enemy.
    """
    stopwatch = Stopwatch()
    stopwatch.start()
    info, reward, agent_coord, enemy_coord, following_state = environment.simpleCoord(
        action, 0, skip_frames, total_steps
    )
    stopwatch.stop()
    print(f"Total time for do action: {stopwatch.duration}")

    agent_xy = np.array(agent_coord)
    enemy_xy = np.array(enemy_coord)
    rel_pos_enemy = enemy_xy - agent_xy

    if len(environment.agent_path) < 2:
        # Fewer than two recorded positions: no displacement can be computed
        # yet, so fall back to fixed start-up values.
        distance = 500  # Initial distance, only for initialisation
        agent_direction = [1, 0]  # By definition of facing each other
        enemy_direction = [-1, 0]
    else:
        # Euclidean norm of the offset. The previous formula squared the
        # *sum* of the components, which yields |dx + dy| instead.
        distance = np.linalg.norm(rel_pos_enemy)
        agent_direction, enemy_direction = direction(
            environment.agent_path, environment.enemy_path
        )

    return (
        info,
        reward,
        agent_xy,
        enemy_xy,
        agent_direction,
        enemy_direction,
        distance,
        rel_pos_enemy,
        following_state,
    )


In [5]:
def init_environment(env, agent_here):
    """Reset the environment and take three warm-up steps.

    After ``env.reset()`` the agent is asked for an action three times (always
    via ``agent_here.act(3)``) and each action is executed with ``do_action``
    using step numbers 1, 2 and 3. The agent and enemy positions of every
    warm-up step are collected.
    NOTE(review): ``do_action`` operates on the module-level ``environment``,
    not on the ``env`` argument -- confirm both always refer to the same
    object.

    Args:
        env: environment object providing ``reset()``.
        agent_here: agent object providing ``act()``.

    Returns:
        A tuple ``(info, reward, next_state, agent_trajectories,
        enemy_trajectories, agent_direction, relative_pos_enemy,
        enemy_direction)``; all values other than the two trajectory lists
        come from the last warm-up step.
    """
    info, reward, state = env.reset()
    agent_trajectories, enemy_trajectories = [], []

    for step_number_now in range(1, 4):
        action = agent_here.act(3)  # get next action
        (
            info,
            reward,
            agent_pos,
            enemy_pos,
            agent_direction,
            enemy_direction,
            distance,
            relative_pos_enemy,
            next_state,
        ) = do_action(action, step_number_now)

        agent_trajectories.append(list(agent_pos))
        enemy_trajectories.append(list(enemy_pos))

    return (
        info,
        reward,
        next_state,
        agent_trajectories,
        enemy_trajectories,
        agent_direction,
        relative_pos_enemy,
        enemy_direction,
    )

In [None]:
def _build_state(agent_xy, enemy_xy, agent_dir, rel_enemy, enemy_dir, done):
    """Pack the per-step features into a flat ``(1, state_size)`` array."""
    features = [
        agent_xy[0], agent_xy[1],
        enemy_xy[0], enemy_xy[1],
        agent_dir[0], agent_dir[1],
        rel_enemy[0], rel_enemy[1],
        enemy_dir[0], enemy_dir[1],
        done,
    ]
    return np.reshape(features, [1, state_size])


# Training loop: one pass per episode. Every episode starts with the warm-up
# steps of init_environment, then the agent acts until the environment reports
# status == 1. One training call is made per episode once the replay memory
# holds more than batch_size transitions; weights are saved every 50 episodes.
for e in range(episode_count):
    status, reward, next_state, agent_trajectories, enemy_trajectories, agent_dir, relative_pos_enemy, enemy_dir = init_environment(environment, agent)
    done = False
    total_reward = 0
    total_timesteps = 4  # init_environment already used step numbers 1-3
    # Build the first state with the same layout and shape as every later
    # state (previously it was a ragged, un-reshaped list).
    small_state = _build_state(
        agent_trajectories[-1], enemy_trajectories[-1],
        agent_dir, relative_pos_enemy, enemy_dir, done,
    )
    distances = []

    while not done:
        action = agent.act(small_state)
        stopwatch = Stopwatch()
        stopwatch.start()
        status, reward, agent_pos, enemy_pos, agent_dir, enemy_dir, distance, enemy_pos_rel, next_state = do_action(action, total_timesteps)
        stopwatch.stop()
        print(f"Total time for one step: {stopwatch.duration}")

        total_reward += reward

        if status == 1:
            print(f"Game is finished, \n your final reward is: {total_reward}, duration was {total_timesteps} timesteps")
            done = True

        agent_trajectories.append(list(agent_pos))
        enemy_trajectories.append(list(enemy_pos))
        distances.append(distance)

        done_list = [done]
        # Use the latest enemy position ([-1]); the previous code read index 1,
        # i.e. always the second recorded position of the episode.
        next_small_state = _build_state(
            agent_trajectories[-1], enemy_trajectories[-1],
            agent_dir, enemy_pos_rel, enemy_dir, done,
        )

        # NOTE(review): done is stored as a one-element list, which is always
        # truthy -- confirm NeurosmashAgent.remember/train expect that form.
        agent.remember(small_state, action, reward, next_small_state, list(done_list))
        small_state = next_small_state
        total_timesteps += 1

    if len(agent.memory) > batch_size:
        agent.train(batch_size)
        print("train")

    if e % 50 == 0:
        agent.save(f"{model_output_dir}weights_{e:04d}.hdf5")
            
    

Total time to get coords: 1.0212547030023416
Total time for coord init: 1.4936230969979079
Total time for do action: 1.4977690509986132
Total time to get coords: 0.8773263490002137
Total time for coord: 1.074275464998209
Total time for do action: 1.0751117559993872
Total time to get coords: 0.858101493002323
Total time for coord: 1.0465594699999201
Total time for do action: 1.0465863279969199
Total time to get coords: 0.8739427570035332
Total time for coord: 1.055617247002374
Total time for do action: 1.0570532339988858
Total time for one step: 1.0595288049953524
Total time to get coords: 0.9305196040004375
Total time for coord: 1.1430274250014918
Total time for do action: 1.1430497050023405
Total time for one step: 1.145148097006313
Total time to get coords: 0.869450368998514
Total time for coord: 1.0531627829986974
Total time for do action: 1.0531849969993345
Total time for one step: 1.0572397409996483
Total time to get coords: 0.868488909000007
Total time for coord: 1.06197572800010

Total time to get coords: 0.8609144799993373
Total time for coord: 1.0576136020026752
Total time for do action: 1.0576367640023818
Total time for one step: 1.0597390650000307
Total time to get coords: 0.8466958999997587
Total time for coord: 1.0279170729991165
Total time for do action: 1.0279397429985693
Total time for one step: 1.0300336419968517
Total time to get coords: 0.8674179709996679
Total time for coord: 1.0523973620001925
Total time for do action: 1.0524261800019303
Total time for one step: 1.0549091919965576
Total time to get coords: 0.8500447690021247
Total time for coord: 1.0361224990047049
Total time for do action: 1.0361483140004566
Total time for one step: 1.0386907020001672
Total time to get coords: 0.9100808420043904
Total time for coord: 1.0978311930011841
Total time for do action: 1.0978539110001293
Total time for one step: 1.0999771549977595
Total time to get coords: 0.9507927860031486
Total time for coord: 1.1427926769974874
Total time for do action: 1.14281429499