### CONCERNS
1. The lists of directions/positions are never emptied during an episode, which makes the simulation slower and slower with each timestep.

### IDEAS
Other possible features:
1. Whether the robot has fallen down or not
2. Distance to the border (and which border?)

Time optimization:
1. Clear the oldest item from the history every 4 timesteps (we only use the current state, the previous state, and the one before that).
2. Interval of states to be interpreted: skip N frames before evaluating the next state.
3. Since the rewards are so sparse, maybe use imitation learning instead of DQN: we are "experts" since we know the tactic of the blue bot, so we can use this knowledge to teach our bot how to defeat the other agent.


In [1]:
from VisualModule import AgentEnvironment
from DQN_Agent import NeurosmashAgent

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

import gym
import math
import random
from collections import namedtuple
from itertools import count
from PIL import Image
import os

from stopwatch import Stopwatch


Using TensorFlow backend.


In [2]:
# Directory that receives the weight checkpoints saved by the training loop.
model_output_dir = "output/model_output/"

# exist_ok=True makes this cell idempotent and avoids the check-then-create
# race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(model_output_dir, exist_ok=True)

show_images = False
skip_frames = 10     # forwarded to environment.simpleCoord by do_action
state_size = 11      # agent_pos, enemy_pos, vec_agent, vec_enemy, rel_pos_enemy, done
action_size = 3
episode_count = 1000
batch_size = 32      # minimum replay-memory size before agent.train is called
size = 768           # passed to AgentEnvironment(size=...); presumably the frame size in pixels - TODO confirm
timescale = 5        # passed to AgentEnvironment(timescale=...); presumably the simulation speed-up - TODO confirm

In [3]:
# Environment wrapper imported from VisualModule, configured from the settings cell.
environment = AgentEnvironment(
    size=size,
    timescale=timescale,
)

# action size: move in x or y direction, or do nothing
agent = NeurosmashAgent(
    state_size=state_size,
    action_size=action_size,
)

In [4]:
def direction(agent_path, enemy_path):
    """Return the most recent movement vector of the agent and of the enemy.

    Each vector is the difference between the last two entries of the
    corresponding path; the sign of its second component is flipped.

    Parameters
    ----------
    agent_path, enemy_path : sequence
        Position histories with at least two entries each; only the first
        two components of each entry are used.

    Returns
    -------
    tuple of two lists
        ``([agent_dx, -agent_dy], [enemy_dx, -enemy_dy])``.
    """
    def _last_step(path):
        # Difference between the newest and the second-newest position.
        delta = path[-1] - np.array(path[-2])
        return [delta[0], -delta[1]]

    return _last_step(agent_path), _last_step(enemy_path)

def do_action(action, total_steps):
    """Apply one action to the global ``environment`` and derive state features.

    Parameters
    ----------
    action
        Action forwarded unchanged to ``environment.simpleCoord``.
    total_steps
        Step counter forwarded unchanged to ``environment.simpleCoord``.

    Returns
    -------
    tuple
        ``(info, reward, agent_coord, enemy_coord, agent_direction,
        enemy_direction, distance, rel_pos_enemy, following_state)`` where the
        coordinates and ``rel_pos_enemy`` (enemy minus agent) are numpy arrays
        and ``distance`` is the Euclidean distance between the two positions.
        While the environment has recorded fewer than two agent positions,
        placeholder values are returned for ``distance`` and both directions.
    """
    stopwatch = Stopwatch()
    stopwatch.start()
    # The second positional argument is the literal 0, as in the original call;
    # its meaning is defined by simpleCoord and is not visible from here.
    info, reward, agent_coord, enemy_coord, following_state = environment.simpleCoord(action, 0, skip_frames, total_steps)
    stopwatch.stop()
    print(f"Total time for do action: {stopwatch.duration}")

    agent_xy = np.array(agent_coord)
    enemy_xy = np.array(enemy_coord)
    rel_pos_enemy = enemy_xy - agent_xy

    if len(environment.agent_path) < 2:
        distance = 500  # Initial distance, only for initialisation
        agent_direction = [1, 0]  # By definition of facing each other
        enemy_direction = [-1, 0]
    else:
        # Euclidean norm of the offset. The previous expression summed the
        # components *before* squaring, which yields |dx + dy| instead.
        distance = np.linalg.norm(rel_pos_enemy)
        agent_direction, enemy_direction = direction(environment.agent_path, environment.enemy_path)

    return info, reward, agent_xy, enemy_xy, agent_direction, enemy_direction, distance, rel_pos_enemy, following_state


In [5]:
def init_environment(env, agent_here):
    """Reset ``env`` and play three warm-up steps to seed the position history.

    Parameters
    ----------
    env
        Object whose ``reset()`` returns three values; they are discarded.
    agent_here
        Object whose ``act`` method supplies the action for each warm-up step
        (it is called with the constant ``3``).

    Returns
    -------
    tuple
        ``(info, reward, next_state, agent_trajectories, enemy_trajectories,
        agent_direction, relative_pos_enemy, enemy_direction)`` where the
        scalar entries come from the last warm-up step and the trajectories
        hold one position (as a list) per step.

    NOTE(review): the steps go through ``do_action``, which acts on the
    module-level ``environment`` rather than on ``env``.
    """
    _info, _reward, _state = env.reset()

    agent_trajectories, enemy_trajectories = [], []

    # Step numbers are 1-based: 1, 2, 3.
    for step_number_now in range(1, 4):
        action = agent_here.act(3)  # get next action
        (info, reward, agent_pos, enemy_pos, agent_direction, enemy_direction,
         _distance, relative_pos_enemy, next_state) = do_action(action, step_number_now)

        agent_trajectories.append(list(agent_pos))
        enemy_trajectories.append(list(enemy_pos))

    return (info, reward, next_state, agent_trajectories, enemy_trajectories,
            agent_direction, relative_pos_enemy, enemy_direction)

In [None]:
def _build_small_state(agent_pos, enemy_pos, agent_dir, rel_pos_enemy, enemy_dir, done):
    """Pack the hand-crafted features into an array of shape (1, state_size).

    Feature order: agent position (2), enemy position (2), agent direction (2),
    relative enemy position (2), enemy direction (2), done flag (1).
    """
    features = [
        agent_pos[0], agent_pos[1],
        enemy_pos[0], enemy_pos[1],
        agent_dir[0], agent_dir[1],
        rel_pos_enemy[0], rel_pos_enemy[1],
        enemy_dir[0], enemy_dir[1],
        done,
    ]
    return np.reshape(features, [1, state_size])


# Main training loop: one environment episode per iteration.
for e in range(episode_count):
    status, reward, next_state, agent_trajectories, enemy_trajectories, agent_dir, relative_pos_enemy, enemy_dir = init_environment(environment, agent)
    done = False
    total_reward = 0
    total_timesteps = 4  # init_environment already played steps 1-3
    # Build the first state exactly like every later state (flat, then reshaped);
    # previously it was a ragged list with nested one-element lists.
    small_state = _build_small_state(agent_trajectories[-1], enemy_trajectories[-1], agent_dir, relative_pos_enemy, enemy_dir, done)
    distances = []

    while not done:
        action = agent.act(small_state)
        stopwatch = Stopwatch()
        stopwatch.start()
        status, reward, agent_pos, enemy_pos, agent_dir, enemy_dir, distance, enemy_pos_rel, next_state = do_action(action, total_timesteps)
        stopwatch.stop()
        print(f"Total time for one step: {stopwatch.duration}")

        total_reward += reward

        if status == 1:
            print(f"Game is finished, \n your final reward is: {total_reward}, duration was {total_timesteps} timesteps")
            done = True

        agent_trajectories.append(list(agent_pos))
        enemy_trajectories.append(list(enemy_pos))
        distances.append(distance)

        # Use the *latest* enemy position ([-1]); the previous code read index 1,
        # i.e. a stale position recorded during the warm-up steps.
        next_small_state = _build_small_state(agent_trajectories[-1], enemy_trajectories[-1], agent_dir, enemy_pos_rel, enemy_dir, done)

        # NOTE(review): `done` is handed over wrapped in a one-element list, as in
        # the original code. A non-empty list is always truthy, so confirm that
        # NeurosmashAgent.remember/train unwrap it before testing the flag.
        agent.remember(small_state, action, reward, next_small_state, [done])
        small_state = next_small_state
        total_timesteps += 1

    # Train once per finished episode, as soon as enough transitions are stored.
    if len(agent.memory) > batch_size:
        agent.train(batch_size)
        print("train")

    # Checkpoint the weights every 50 episodes (including episode 0).
    if e % 50 == 0:
        agent.save(model_output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
            
    

623 165 33 63
enemy coordinates: (653, 195)
agent coordinates: (107, 157, 44, 95)
Total time to get coords: 0.01778029900015099
Total time for coord init: 0.5102901160062174
Total time for do action: 0.5132025799975963
620 165 23 81
enemy coordinates: (650, 195)
agent coordinates: (112, 157, 516, 93)
Total time to get coords: 0.019625412998721004
Total time for coord: 0.24962690100073814
Total time for do action: 0.2505034909991082
623 162 19 57
enemy coordinates: (653, 192)
agent coordinates: (114, 156, 510, 94)
Total time to get coords: 0.019516614003805444
Total time for coord: 0.25451511799474247
Total time for do action: 0.25464662100421265
613 163 31 84
enemy coordinates: (643, 193)
agent coordinates: (117, 155, 503, 94)
Total time to get coords: 0.02037150000251131
Total time for coord: 0.21191450100013753
Total time for do action: 0.21200331999716582
Total time for one step: 0.21635641700413544
620 163 25 64
enemy coordinates: (650, 193)
agent coordinates: (121, 155, 495, 93)
T

510 157 38 48
enemy coordinates: (540, 187)
agent coordinates: (194, 135, 54, 89)
Total time to get coords: 0.0165816540029482
Total time for coord: 0.21617419399990467
Total time for do action: 0.21627647599962074
Total time for one step: 0.21861816200544126
511 156 32 49
enemy coordinates: (541, 186)
agent coordinates: (196, 134, 54, 90)
Total time to get coords: 0.01580647700029658
Total time for coord: 0.2112194260043907
Total time for do action: 0.21130209600232774
Total time for one step: 0.21342758699756814
505 152 30 54
enemy coordinates: (535, 182)
agent coordinates: (198, 134, 56, 88)
Total time to get coords: 0.01639235400216421
Total time for coord: 0.22599028699914925
Total time for do action: 0.2277292579965433
Total time for one step: 0.231027734997042
501 155 37 50
enemy coordinates: (531, 185)
agent coordinates: (202, 134, 56, 86)
Total time to get coords: 0.01661093599977903
Total time for coord: 0.24246483199385693
Total time for do action: 0.24259078500472242
Total 

434 135 26 45
enemy coordinates: (464, 165)
agent coordinates: (272, 107, 71, 96)
Total time to get coords: 0.020897377995424904
Total time for coord: 0.22346798400394619
Total time for do action: 0.22357496600307059
Total time for one step: 0.22682918100326788
436 141 14 39
enemy coordinates: (466, 171)
agent coordinates: (274, 107, 70, 97)
Total time to get coords: 0.016821559001982678
Total time for coord: 0.21071055199718103
Total time for do action: 0.21081397899979493
Total time for one step: 0.21311508700455306
433 139 10 46
enemy coordinates: (463, 169)
agent coordinates: (275, 107, 71, 94)
Total time to get coords: 0.01601032100006705
Total time for coord: 0.23310907099948963
Total time for do action: 0.2356306619985844
Total time for one step: 0.23818706900055986
424 147 24 36
enemy coordinates: (454, 177)
agent coordinates: (276, 107, 73, 81)
Total time to get coords: 0.0164625339966733
Total time for coord: 0.21186718699755147
Total time for do action: 0.2119847890062374
To

380 127 5 4
enemy coordinates: (410, 157)
agent coordinates: (304, 94, 45, 84)
Total time to get coords: 0.016741888000979088
Total time for coord: 0.21617816600337392
Total time for do action: 0.21625852300348924
Total time for one step: 0.2196332160019665
333 128 36 17
enemy coordinates: (363, 158)
agent coordinates: (301, 93, 47, 83)
Total time to get coords: 0.018985345006512944
Total time for coord: 0.2288205399963772
Total time for do action: 0.2288854559956235
Total time for one step: 0.23220541200134903
362 114 7 27
enemy coordinates: (392, 144)
agent coordinates: (298, 92, 48, 83)
Total time to get coords: 0.016125146998092532
Total time for coord: 0.24061358800099697
Total time for do action: 0.24194899000576697
Total time for one step: 0.24530167199554853
325 113 35 17
enemy coordinates: (355, 143)
agent coordinates: (296, 92, 48, 82)
Total time to get coords: 0.019922198996937368
Total time for coord: 0.21324740000272868
Total time for do action: 0.21332642999914242
Total t

332 107 38 38
enemy coordinates: (362, 137)
agent coordinates: (255, 99, 99, 73)
Total time to get coords: 0.017656263000390027
Total time for coord: 0.24350077800045256
Total time for do action: 0.24372095899889246
Total time for one step: 0.24614354200457456
345 65 27 73
enemy coordinates: (375, 95)
agent coordinates: (255, 99, 102, 73)
Total time to get coords: 0.017251737001060974
Total time for coord: 0.2392874790020869
Total time for do action: 0.23937390700302785
Total time for one step: 0.2415028839968727
349 75 25 31
enemy coordinates: (379, 105)
agent coordinates: (256, 100, 102, 67)
Total time to get coords: 0.0167518710004515
Total time for coord: 0.22371310499875108
Total time for do action: 0.22550481499638408
Total time for one step: 0.22814262499741744
358 69 18 39
enemy coordinates: (388, 99)
agent coordinates: (260, 102, 96, 57)
Total time to get coords: 0.019796116997895297
Total time for coord: 0.21509413999592653
Total time for do action: 0.21521134399517905
Total 