In [1]:
%matplotlib tk

import argparse
import gym
import datetime
import os
import random
import tempfile
import numpy as np
import pickle

import ray
from ray import tune
from ray.tune.logger import Logger, UnifiedLogger, pretty_print
from ray.rllib.env.multi_agent_env import make_multi_agent
from ray.rllib.examples.models.shared_weights_model import TF2SharedWeightsModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib.agents.ppo import ppo, PPOTrainer, PPOTFPolicy
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.a3c import a3c
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import PolicySpec
from environment_rllib_3d3 import MyEnv
#from test_env_for_lstm import MyEnv
from settings.initial_settings import *
from settings.reset_conditions import reset_conditions

from tensorflow.keras.utils import plot_model
from modules.savers import save_conditions
from utility.result_env import render_env
from utility.terminate_uavsimproc import teminate_proc
from utility.latest_learned_file_path import latest_learned_file_path
from utility.read_wright_weights import save_weights
from utility.read_wright_weights import reload_weights
from utility.save_logs import save_logs
from utility.save_logs import save_hists
from utility.save_logs import save_env_info

import matplotlib.pyplot as plt
import matplotlib
import tensorflow as tf
import cv2
import ctypes
import warnings

#UCAV.exeが起動している場合、プロセスキルする。
teminate_proc.UAVsimprockill(proc_name="UCAV.exe")

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
warnings.filterwarnings('ignore', category=matplotlib.MatplotlibDeprecationWarning)
np.set_printoptions(precision=3, suppress=True)
PROJECT = "UCAV"
TRIAL_ID = 2
TRIAL = 'test_' + str(TRIAL_ID)
EVAL_FREQ = 1
CONTINUAL = False
NUM_EVAL = 1
def custom_log_creator(custom_path, custom_str):
    timestr = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    logdir_prefix = "{}_{}".format(custom_str, timestr)

    def logger_creator(config):
        if not os.path.exists(custom_path):
            os.makedirs(custom_path)
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=custom_path)
        return UnifiedLogger(config, logdir, loggers=None)

    return logger_creator

ray.shutdown()
ray.init(ignore_reinit_error=True, log_to_driver=False)

#ModelCatalog.register_custom_model('my_model', MyRNNUAVClass)
eval_env = MyEnv({"eval":True})
policy_own_config = (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
                 {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},
                  "exploration_config": {"type": "StochasticSampling","random_timesteps":0},"explore":True,})
policy_enem_config = (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
                 {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},
                  "exploration_config": {"type": "StochasticSampling","random_timesteps":0},"explore":False,})
policies_own = {}
policies_enem = {}
for i in range(eval_env.blue_num):
    policies_own["blue_"+str(i)] = policy_own_config
for i in range(eval_env.red_num):
    policies_enem["red_"+str(i)] = policy_enem_config

def policy_mapping_fn(agent_id, episode, **kwargs):
    #print(agent_id,episode)
    #pol_id = policy_ids[agent_id]

    pol_id = agent_id
    return pol_id

# Instanciate the evaluation env
config_own = ppo.DEFAULT_CONFIG.copy()
config_own = {"env": MyEnv,"env_config": {"eval":False},
              "num_gpus": 1,"num_workers": 0,"num_gpus_per_worker": 1,"num_envs_per_worker":eval_env.env_max,
          "train_batch_size": 1200*25,
          "batch_mode": "complete_episodes",
          "gamma":0.995, "lr": 2.5e-4,"shuffle_sequences": True,
          "observation_space":eval_env.observation_space,"action_space":eval_env.action_space,
          "sgd_minibatch_size": 600, "num_sgd_iter":20,
          "multiagent": {"policies": policies_own,  "policy_mapping_fn": policy_mapping_fn}
         }
#config_enem = ppo.DEFAULT_CONFIG.copy()
#config_enem = {"env": MyEnv,"num_gpus": 1,"num_workers": 0, "num_gpus_per_worker": 1,"num_envs_per_worker":1,
#          "train_batch_size": 600,
#          "batch_mode": "complete_episodes",
#          "gamma":0.995, "lr": 2.5e-4,"shuffle_sequences": True,
#          "observation_space":eval_env.observation_space,"action_space":eval_env.action_space,
#          "sgd_minibatch_size": 600, "num_sgd_iter":20,
#          "multiagent": {"policies": policies_enem,  "policy_mapping_fn": policy_mapping_fn}
#         }

res_name = "test"
conditions_dir = os.path.join('./' + PROJECT + '/conditions/')

if not os.path.exists(conditions_dir):
    os.makedirs(conditions_dir)
save_conditions(conditions_dir)

# PPOTrainer()は、try_import_tfを使うと、なぜかTensorflowのeager modeのエラーになる。

trainer = ppo.PPOTrainer(config=config_own,
                         logger_creator=custom_log_creator(
                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))

#adversary = ppo.PPOTrainer(config=config_enem,
#                         logger_creator=custom_log_creator(
#                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))

if CONTINUAL:
    # Continual learning: Need to specify the checkpoint
    # model_path = PROJECT + '/checkpoints/' + TRIAL + '/checkpoint_000197/checkpoint-197'
    model_path = latest_learned_file_path('./UCAV/checkpoints/test_2/*')
    
    #trainer.restore(checkpoint_path=model_path)
    #save_weights("blue_0",trainer)
    #save_weights("blue_1",trainer)

    #reload_weights(policy_id="red_0",trainer=trainer,set_policy_id="blue_0")
    #reload_weights(policy_id="red_1",trainer=trainer,set_policy_id="blue_1")
    for i in range(eval_env.blue_num):
        reload_weights(policy_id="blue_"+str(i),trainer=trainer,set_policy_id="blue_"+str(i))

    #save_weights("red_0",trainer)
    #save_weights("red_1",trainer)


models_dir = os.path.join('./' + PROJECT + '/models/')
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
hist_dir = os.path.join('./' + PROJECT + '/hist/')
if not os.path.exists(hist_dir):
    os.makedirs(hist_dir)

for j in range(2):
    text_name = models_dir + TRIAL + "blue_"+str(j) +'.txt'
    with open(text_name, "w") as fp:
        trainer.get_policy("blue_"+str(j)).model.base_model.summary(print_fn=lambda x: fp.write(x + "\r\n"))
    png_name = models_dir + TRIAL + '.png'
    plot_model(trainer.get_policy("blue_"+str(j)).model.base_model, to_file=png_name, show_shapes=True)



# Define checkpoint dir
check_point_dir = os.path.join('./' + PROJECT + '/checkpoints/', TRIAL)
if not os.path.exists(check_point_dir):
    os.makedirs(check_point_dir)

  for external in metadata.entry_points().get(self.group, []):

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Could not import from numba, which means that some
parts of this code may run MUCH more slowly.  You
may wish to install numba.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

2022-05-04 11:50:10,518	INFO trainer.py:2141 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-05-04 11:50:10,520	INFO ppo.py:250 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-05-04 11:50:10,520	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.




2022-05-04 11:50:23,922	INFO trainable.py:130 -- Trainable.setup took 13.405 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [2]:

eval_env.reset()
save_env_info(eval_env)
record_mode = 0
results_dir = os.path.join('./' + PROJECT + '/results/')
rewards = {}
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
results_file = results_dir + TRIAL + '.pkl'
for steps in range(10001):
    # Training
    print(f'\n----------------- Training at steps:{steps} start! -----------------')
    eval_env.eval = False
    eval_env.self_play = False
    eval_env.reset()
    proc_list = [0]*eval_env.env_max
    f = open("proc_list"+".pkl",'wb')
    pickle.dump(proc_list,f)
    f.close()
    results = trainer.train()
    save_logs(res_name,results,steps,CONTINUAL,eval_env.blue_num)
    for j in range(eval_env.blue_num):
        save_weights("blue_"+str(j),trainer)
    print(pretty_print(results))
    #check_point = trainer.save(checkpoint_dir=check_point_dir)
    # Evaluation
    if steps % EVAL_FREQ == 0:
        print(f'\n-------------- Evaluation at steps:{steps} starting ! --------------')

        check_point = trainer.save(checkpoint_dir=check_point_dir)
        for i in range(NUM_EVAL):
            # print(f'\nEvaluation {i}:')
            model_path = latest_learned_file_path('./UCAV/checkpoints/test_2/*')
            #trainer.restore(checkpoint_path=model_path)
            trainer = ppo.PPOTrainer(config=config_own,
                         logger_creator=custom_log_creator(
                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))
            for j in range(eval_env.blue_num):
                reload_weights(policy_id="blue_"+str(j),trainer=trainer,set_policy_id="blue_"+str(j))
            eval_env.eval = True
            obs = eval_env.reset()
            done = False
            
            step_num = 0
            #fig = plt.figure(1,figsize=(8.0, 6.0))
            ESC = 0x1B          # ESCキーの仮想キーコード
            trajectory_length = 100

            cell_size = 256
            state_0=[np.zeros(cell_size, np.float32),np.zeros(cell_size, np.float32)]
            action_dict0 = [0,0]
            
            state_0=[state_0]*eval_env.blue_num
            action_dict0 = [action_dict0]*eval_env.blue_num
            
            for j in range(eval_env.blue_num):
                rewards["blue_"+str(j)] = 0

            if record_mode == 0:
                file_name = "test_num" + str(steps) +str(i)
                #video = cv2.VideoWriter(file_name+'.mp4',0x00000020,20.0,(800,600))

            while True:
                action_dict = {}
                for j in range(eval_env.blue_num):
                    action_temp = trainer.compute_single_action(obs["blue_"+str(j)],
                                                                state=state_0[j],prev_action=None,prev_reward=None,
                                                                policy_id="blue_"+str(j),explore=False)
                    state_0[j] = action_temp[1]
                    action_dict["blue_"+str(j)] = action_temp[0]
                obs, rewards, dones, infos = eval_env.step(action_dict)
                env_blue_pos_temp_mod, env_red_pos_temp_mod, env_mrm_pos_temp_mod = render_env.copy_from_env_mod(eval_env)
                if eval_env.timer == 1:
                    env_blue_pos_mod = env_blue_pos_temp_mod
                    env_red_pos_mod = env_red_pos_temp_mod
                    env_mrm_pos_mod = env_mrm_pos_temp_mod
                else:
                    env_blue_pos_mod = np.vstack([env_blue_pos_mod,env_blue_pos_temp_mod])
                    env_red_pos_mod = np.vstack([env_red_pos_mod,env_red_pos_temp_mod])
                    env_mrm_pos_mod = np.vstack([env_mrm_pos_mod,env_mrm_pos_temp_mod])


                #if record_mode == 0:
                    #img = np.array(fig.canvas.renderer.buffer_rgba())
                    #img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
                    #video.write(img.astype('uint8'))

                
                step_num = step_num + 1
                
                done = dones["__all__"]
                
                #print(f'rewards:{rewards}')
                #if record_mode == 0:
                #    img = eval_env.render_movie(file_name,step_num)
                #    video.write(img.astype('unit8'))
                #elif record_mode == 1:
                #    eval_env.render()
                #elif record_mode == 2:
                #    eval_env.render()

                # エピソードの終了処理
                if dones['__all__']:
                    save_hists("blue_"+str(i),steps,env_blue_pos_mod,hist_dir)
                    save_hists("red_"+str(i),steps,env_red_pos_mod,hist_dir)
                    save_hists("mrm_"+str(i),steps,env_mrm_pos_mod,hist_dir)
                    for j in range(eval_env.blue_num):
                        save_weights("blue_"+str(j),trainer)
                    #save_weights("blue_1",trainer)
                    #save_weights("blue_2",trainer)
                    # print(f'all done at {env.steps}')
                    break
                

            
            #if record_mode == 0:
               # video.release()

ray.shutdown()

-------------------------- Scene: 0 --------------------------

----------------- Training at steps:0 start! -----------------
-------------------------- Scene: 0 --------------------------
-------------------------- Scene: 0 --------------------------
-------------------------- Scene: 0 --------------------------
-------------------------- Scene: 0 --------------------------
-------------------------- Scene: 0 --------------------------
2 88 blue_1 DOWN
0 91 blue_0 DOWN
2 200 blue_0 Shoot at red_1 launch distance : 55466.51930669528 True True
2 212 blue_0 Shoot at red_1 launch distance : 49289.084653704005 True True
2 246 red_0 Shoot at blue_0
2 257 red_0 Shoot at blue_0
0 295 blue_1 Shoot at red_1 launch distance : 59231.591385003325 True True
2 304 blue_0: Destroyed
2 TIME LIMIT LOSE
2 blue_0 False False 304 0.35833462666666666 78.43166143999997
2 blue_1 False False 304 -0.10133474 -1.5128951133333333
-------------------------- Scene: 0 --------------------------
0 Same tgt shoot
0 

1 192 blue_0 Shoot at red_0 launch distance : 59937.996771663966 True True
2 205 blue_0 Shoot at red_0 launch distance : 59558.18125161312 True True
1 203 blue_0 Shoot at red_0 launch distance : 55229.6067793353 True True
2 216 blue_1 Shoot at red_0 launch distance : 59841.1907468426 True True
2 Same tgt shoot
2 Same tgt shoot
2 217 blue_0 Shoot at red_1 launch distance : 59634.92639385078 True True
2 Same tgt shoot
2 Same tgt shoot
2 235 blue_1 Shoot at red_1 launch distance : 58037.03649394927 True True
1 230 red_1 Shoot at blue_0
1 241 red_1 Shoot at blue_0
1 300 blue_0: Destroyed
2 317 blue_0 Splash :red_0
1 315 blue_0 Splash :red_0
2 332 blue_0 Splash :red_1
2 WIN
2 blue_0 False True 435 12.760166332988506 151.37103260362125
2 blue_1 False True 435 12.760166332988506 115.1750887563217
-------------------------- Scene: 0 --------------------------
0 292 blue_1 Shoot at red_1 launch distance : 59193.95395646417 True True
0 304 blue_1 Shoot at red_1 launch distance : 54167.7708790014

0 Same tgt shoot
0 186 blue_0 Shoot at red_1 launch distance : 59730.43921653347 True True
2 WIN
2 blue_0 False True 405 12.964471126296296 87.93335636629644
2 blue_1 False True 405 12.964474526296295 154.54665194713118
-------------------------- Scene: 0 --------------------------
0 280 blue_0 Splash :red_0
0 298 blue_0 Splash :red_1
0 WIN
0 blue_0 False True 298 32.58207508608501 166.0846651441802
0 blue_1 False True 298 14.028384010917227 10.669589764250562
-------------------------- Scene: 0 --------------------------
3 189 blue_0 Shoot at red_1 launch distance : 52453.85584492335 True True
3 193 blue_1 Shoot at red_1 launch distance : 59745.39040796369 True True
3 200 blue_0 Shoot at red_1 launch distance : 46245.44757054471 True True
3 205 blue_1 Shoot at red_1 launch distance : 53771.70014979999 True True
3 212 red_1 Shoot at blue_0
2 93 blue_0 DOWN
3 234 red_0 Shoot at blue_0
3 245 red_0 Shoot at blue_0
0 84 blue_1 DOWN
0 98 blue_0 DOWN
0 TIME LIMIT LOSE
0 blue_0 False False 98

3 WIN
3 blue_0 False True 423 28.912184057872345 163.93334087185735
3 blue_1 False True 423 12.738425192624113 6.623200419290779
-------------------------- Scene: 0 --------------------------
0 270 blue_0 DOWN
0 336 blue_0 Splash :red_0
1 TIME LIMIT LOSE
1 blue_0 False False 1200 0.90052968 127.36684346986351
1 blue_1 False False 1200 0.89452968 68.12259976333408
-------------------------- Scene: 0 --------------------------
3 169 blue_1 Shoot at red_0 launch distance : 59298.02521163753 True True
3 Same tgt shoot
3 181 blue_1 Shoot at red_1 launch distance : 57786.43326941022 True True
3 Same tgt shoot
3 190 blue_0 Shoot at red_0 launch distance : 59927.155814371836 True True
3 Same tgt shoot
3 Same tgt shoot
3 201 blue_0 Shoot at red_1 launch distance : 58844.42809646467 True True
3 283 blue_1 Splash :red_0
3 293 blue_1 Splash :red_1
1 252 red_1 Shoot at blue_1
3 WIN
3 blue_0 False True 401 12.894066126575229 107.9888348999085
3 blue_1 False True 401 12.894066126575229 157.4554406507



agent_timesteps_total: 61064
custom_metrics: {}
date: 2022-05-04_12-38-16
done: false
episode_len_mean: 610.64
episode_media: {}
episode_reward_max: 276.8825757168569
episode_reward_mean: 107.90554330643816
episode_reward_min: -118.36618548666685
episodes_this_iter: 50
episodes_total: 50
experiment_id: a3ba759500134ec3bf493321bd802397
hostname: DESKTOP
info:
  learner:
    blue_0:
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 0.0002500000118743628
        entropy: 5.7469282150268555
        entropy_coeff: 0.0
        kl: 0.15119312703609467
        model: {}
        policy_loss: -0.03685041144490242
        total_loss: 937.9043579101562
        vf_explained_var: -0.056172873824834824
        vf_loss: 937.9109497070312
      train: null
    blue_1:
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 0.0002500000118743628
        entropy: 5.909292221069336
        entropy_coeff: 0.0
        kl: 0.12638401985168457
        model:





2022-05-04 12:38:30,381	INFO trainable.py:130 -- Trainable.setup took 13.285 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


ValueError: Cannot feed value of shape (256, 256) for Tensor blue_0/Placeholder_blue_0/fc_value_1/kernel:0, which has shape (157, 256)