In [1]:
%matplotlib tk
import argparse
import gym
import datetime
import os
import random
import tempfile
import numpy as np
import pickle

import ray
from ray import tune
from ray.tune.logger import Logger, UnifiedLogger, pretty_print
from ray.rllib.env.multi_agent_env import make_multi_agent
from ray.rllib.examples.models.shared_weights_model import TF2SharedWeightsModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib.agents.ppo import ppo, PPOTrainer, PPOTFPolicy
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.a3c import a3c
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import PolicySpec
from environment_rllib_3d1 import MyEnv
#from test_env_for_lstm import MyEnv
from settings.initial_settings import *
from settings.reset_conditions import reset_conditions

from tensorflow.keras.utils import plot_model
from modules.savers import save_conditions
from utility.result_env import render_env
from utility.terminate_uavsimproc import teminate_proc
from utility.latest_learned_file_path import latest_learned_file_path
from utility.read_wright_weights import save_weights
from utility.read_wright_weights import reload_weights
from utility.save_logs import save_logs
from utility.save_logs import save_hists
from utility.save_logs import save_env_info

import matplotlib.pyplot as plt
import matplotlib
import tensorflow as tf
import cv2
import ctypes
import warnings

# If UCAV.exe is already running, kill the process before starting.
teminate_proc.UAVsimprockill(proc_name="UCAV.exe")

# Silence NumPy / Matplotlib deprecation warnings and shorten array printing.
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
warnings.filterwarnings('ignore', category=matplotlib.MatplotlibDeprecationWarning)
np.set_printoptions(precision=3, suppress=True)
# Root directory name for every artifact written below
# (conditions, logs, models, hist, checkpoints, results).
PROJECT = "UCAV"
TRIAL_ID = 2
# Trial label: used as the log-directory prefix, the checkpoint sub-directory
# and part of the model/result file names.
TRIAL = 'test_' + str(TRIAL_ID)
# An evaluation phase runs whenever `steps % EVAL_FREQ == 0` in the training loop.
EVAL_FREQ = 1
# When True, the weight reload/save block after trainer construction is executed;
# the flag is also forwarded to save_logs().
CONTINUAL = True
# Number of evaluation episodes played per evaluation phase.
NUM_EVAL = 1
def custom_log_creator(custom_path, custom_str):
    """Build a logger-creator callback that logs beneath ``custom_path``.

    The timestamp is taken once, when this factory is called, and joined
    with ``custom_str`` to form the prefix of the log directory name.

    Args:
        custom_path: Directory under which the log directory is created.
        custom_str: Label placed at the start of the log directory name.

    Returns:
        A function that takes a config and returns a ``UnifiedLogger``
        writing into a freshly created directory inside ``custom_path``.
    """
    stamp = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = f"{custom_str}_{stamp}"

    def logger_creator(config):
        # The parent directory is only created once a logger is requested.
        if not os.path.exists(custom_path):
            os.makedirs(custom_path)
        return UnifiedLogger(
            config,
            tempfile.mkdtemp(prefix=prefix, dir=custom_path),
            loggers=None,
        )

    return logger_creator

# Stop any Ray instance left over from a previous run, then start a new one.
# log_to_driver=False: worker logs are not forwarded to this process's output.
ray.shutdown()
ray.init(ignore_reinit_error=True, log_to_driver=False)

#ModelCatalog.register_custom_model('my_model', MyRNNUAVClass)

# Local environment instance: supplies the observation/action spaces for the
# policy specs below and is stepped directly during the evaluation episodes.
eval_env = MyEnv()
# Policies of the main trainer, one per agent id. Every policy uses an LSTM
# model (max_seq_len 200) with vf_share_layers disabled.
# The blue policies explore (StochasticSampling); the red policies have
# explore=False.
policies_own = {
    "blue_0": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
               {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},
               "exploration_config": {"type": "StochasticSampling","random_timesteps":0},"explore":True,}),
    "blue_1": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
               {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},
               "exploration_config": {"type": "StochasticSampling","random_timesteps":0},"explore":True,}),
    "red_0": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
              {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},"explore":False,}),
    "red_1": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
              {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},"explore":False,}),
}
# Policies of the second ("adversary") trainer: the two red policies only,
# with the same settings as the red entries above.
policies_enem = {
    "red_0": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
              {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},"explore":False,}),
    "red_1": (PPOTFPolicy, eval_env.observation_space, eval_env.action_space,
              {"model":{"vf_share_layers": False,"use_lstm": True,"max_seq_len": 200},"explore":False,}),
}
# policy_ids = list(policies.keys())

def policy_mapping_fn(agent_id, episode, **kwargs):
    """Map an agent to the policy that carries the same identifier.

    Args:
        agent_id: Identifier of the agent being mapped.
        episode: Accepted for the callback signature; not used.
        **kwargs: Accepted for the callback signature; not used.

    Returns:
        ``agent_id`` unchanged, so policy ids and agent ids coincide.
    """
    return agent_id

# Trainer configurations.
# NOTE(review): the DEFAULT_CONFIG copy made on the next line is discarded
# immediately, because config_own is rebound to a brand-new dict literal right
# after it. Only the keys listed in the literal are passed to the trainer;
# presumably the trainer fills in the remaining defaults itself — confirm.
config_own = ppo.DEFAULT_CONFIG.copy()
config_own = {"env": MyEnv,"num_gpus": 1,"num_workers": 0, "num_cpus_per_worker": 0,"num_gpus_per_worker": 1,
          "train_batch_size": 1200*25,  # = 30000
          "batch_mode": "complete_episodes",
          "gamma":0.995, "lr": 2.5e-4,"shuffle_sequences": True,
          "observation_space":eval_env.observation_space,"action_space":eval_env.action_space,
          "sgd_minibatch_size": 600, "num_sgd_iter":20,
          "multiagent": {"policies": policies_own,  "policy_mapping_fn": policy_mapping_fn}
         }
# Same pattern (and the same discarded copy) for the adversary trainer; the
# only differences are the policy table and how the batch size is spelled.
config_enem = ppo.DEFAULT_CONFIG.copy()
config_enem = {"env": MyEnv,"num_gpus": 1,"num_workers": 0, "num_cpus_per_worker": 0,"num_gpus_per_worker": 1,
          "train_batch_size": 600*5*10,  # = 30000, same value as config_own
          "batch_mode": "complete_episodes",
          "gamma":0.995, "lr": 2.5e-4,"shuffle_sequences": True,
          "observation_space":eval_env.observation_space,"action_space":eval_env.action_space,
          "sgd_minibatch_size": 600, "num_sgd_iter":20,
          "multiagent": {"policies": policies_enem,  "policy_mapping_fn": policy_mapping_fn}
         }

# Label forwarded to save_logs() in the training loop.
res_name = "test"
# Directory handed to save_conditions(); created on first use.
conditions_dir = os.path.join('./' + PROJECT + '/conditions/')

if not os.path.exists(conditions_dir):
    os.makedirs(conditions_dir)
save_conditions(conditions_dir)

# When try_import_tf is used, PPOTrainer() fails with a TensorFlow eager-mode
# error for some reason.

# Main trainer holding all four policies. Its logs go to a timestamped
# directory under ./<PROJECT>/logs whose name is prefixed with TRIAL.
trainer = ppo.PPOTrainer(config=config_own,
                         logger_creator=custom_log_creator(
                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))

# Second trainer holding only the red policies.
# NOTE(review): `adversary` is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
adversary = ppo.PPOTrainer(config=config_enem,
                         logger_creator=custom_log_creator(
                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))

if CONTINUAL:
    # Continual learning: Need to specify the checkpoint
    # model_path = PROJECT + '/checkpoints/' + TRIAL + '/checkpoint_000197/checkpoint-197'
    # NOTE(review): model_path is computed but not used, because the
    # trainer.restore() call below is commented out.
    model_path = latest_learned_file_path('./UCAV/checkpoints/test_2/*')
    
    #trainer.restore(checkpoint_path=model_path)
    #save_weights("blue_0",trainer)
    #save_weights("blue_1",trainer)

    # Transfer previously stored weights between policies, then store the red
    # weights. NOTE(review): the meaning of policy_id vs. set_policy_id (which
    # one is the source and which the destination) is defined in
    # utility.read_wright_weights and is not visible here — confirm before
    # reordering these calls.
    reload_weights(policy_id="red_0",trainer=trainer,set_policy_id="blue_0")
    reload_weights(policy_id="red_1",trainer=trainer,set_policy_id="blue_1")
    reload_weights(policy_id="blue_0",trainer=trainer,set_policy_id="blue_0")
    reload_weights(policy_id="blue_1",trainer=trainer,set_policy_id="blue_1")
    save_weights("red_0",trainer)
    save_weights("red_1",trainer)


# Output directories for the model summaries/diagrams and for the evaluation
# trajectory histories. exist_ok=True makes the creation idempotent and avoids
# the race of a separate os.path.exists() check followed by makedirs().
models_dir = os.path.join('./' + PROJECT + '/models/')
os.makedirs(models_dir, exist_ok=True)
hist_dir = os.path.join('./' + PROJECT + '/hist/')
os.makedirs(hist_dir, exist_ok=True)

# Dump the network of each blue policy: a text summary and a diagram.
for j in range(2):
    policy_name = "blue_" + str(j)
    base_model = trainer.get_policy(policy_name).model.base_model
    text_name = models_dir + TRIAL + policy_name + '.txt'
    with open(text_name, "w") as fp:
        base_model.summary(print_fn=lambda x: fp.write(x + "\r\n"))
    # The diagram file name carries the policy name, exactly like the text
    # summary. Previously both iterations wrote to '<TRIAL>.png', so the
    # diagram of blue_1 overwrote the one of blue_0.
    png_name = models_dir + TRIAL + policy_name + '.png'
    plot_model(base_model, to_file=png_name, show_shapes=True)


# Directory that receives the checkpoints written by trainer.save().
check_point_dir = os.path.join('./' + PROJECT + '/checkpoints/', TRIAL)
os.makedirs(check_point_dir, exist_ok=True)

  for external in metadata.entry_points().get(self.group, []):

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Could not import from numba, which means that some
parts of this code may run MUCH more slowly.  You
may wish to install numba.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

2022-04-05 20:40:28,583	INFO trainer.py:2141 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-04-05 20:40:28,585	INFO ppo.py:250 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-04-05 20:40:28,585	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.




2022-04-05 20:40:53,403	INFO trainable.py:130 -- Trainable.setup took 24.822 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.




2022-04-05 20:41:05,437	INFO trainable.py:130 -- Trainable.setup took 11.988 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [None]:

# Reset the local environment once and store its information via save_env_info().
eval_env.reset()
save_env_info(eval_env)
# Recording mode selector; in the visible code it only guards the creation of
# a file name inside the evaluation loop (the video writing is commented out).
record_mode = 0
results_dir = os.path.join('./' + PROJECT + '/results/')

if not os.path.exists(results_dir):
    os.makedirs(results_dir)
# NOTE(review): results_file is not used in the visible part of the notebook.
results_file = results_dir + TRIAL + '.pkl'

# Best accumulated evaluation reward seen so far for each blue policy.
# Starting at 0 means weights are only saved once an evaluation yields a
# strictly positive reward.
best_reward = {"blue_0":0,"blue_1":0}
# Main loop: one trainer.train() call per iteration, a checkpoint after every
# iteration, and an evaluation phase whenever steps % EVAL_FREQ == 0.
for steps in range(10001):
    # Training
    print(f'\n----------------- Training at steps:{steps} start! -----------------')
    # NOTE(review): these flags are set on eval_env, the locally created
    # instance. Whether the environment used inside trainer.train() sees them
    # is not visible here — confirm.
    eval_env.eval = False
    eval_env.self_play = True
    eval_env.reset()
    results = trainer.train()
    save_logs(res_name,results,steps,CONTINUAL)
    print(pretty_print(results))
    check_point = trainer.save(checkpoint_dir=check_point_dir)
    # Evaluation
    if steps % EVAL_FREQ == 0:
        print(f'\n-------------- Evaluation at steps:{steps} starting ! --------------')
        # Accumulated reward of the blue agents for this evaluation phase.
        # NOTE(review): this is initialised once per phase, outside the episode
        # loop, so with NUM_EVAL > 1 it sums over episodes while the best-reward
        # comparison below runs after every episode.
        EVAL_reward = {"blue_0":0,"blue_1":0}
        #check_point = trainer.save(checkpoint_dir=check_point_dir)
        for i in range(NUM_EVAL):
            # print(f'\nEvaluation {i}:')
            #model_path = latest_learned_file_path('./UCAV/checkpoints/test_2/*')
            #trainer.restore(checkpoint_path=model_path)
            eval_env.eval = True
            obs = eval_env.reset()
            done = False
            
            step_num = 0
            #fig = plt.figure(1,figsize=(8.0, 6.0))
            ESC = 0x1B          # virtual key code of the ESC key
            trajectory_length = 100

            # Initial recurrent state for each of the four policies: two zero
            # vectors of length cell_size.
            # assumes the policies' LSTM cell size is 256 (it is not set in the
            # policy configs, so presumably the library default) — TODO confirm
            cell_size = 256
            state_0=[np.zeros(cell_size, np.float32),np.zeros(cell_size, np.float32)]
            state_1=[np.zeros(cell_size, np.float32),np.zeros(cell_size, np.float32)]
            state_2=[np.zeros(cell_size, np.float32),np.zeros(cell_size, np.float32)]
            state_3=[np.zeros(cell_size, np.float32),np.zeros(cell_size, np.float32)]
            # Placeholders; each is overwritten by compute_single_action() on
            # the first pass through the loop below.
            action_dict0 = [0,0]
            action_dict1 = [0,0]
            action_dict2 = [0,0]
            action_dict3 = [0,0]
            rewards = {"blue_0":0,"blue_1":0}
            if record_mode == 0:
                file_name = "test_num" + str(steps) +str(i)
                #video = cv2.VideoWriter(file_name+'.mp4',0x00000020,20.0,(800,600))

            # Play one evaluation episode until the env reports "__all__" done.
            while True:
                action_dict = {}
                # Query every policy without exploration. Element [0] of each
                # result is used as the action and element [1] as the next
                # recurrent state (see below).
                action_dict0 = trainer.compute_single_action(obs["blue_0"],
                                                             state=state_0,prev_action=None,prev_reward=None,
                                                             policy_id="blue_0",explore=False)
                action_dict1 = trainer.compute_single_action(obs["blue_1"],
                                                             state=state_1,prev_action=None,prev_reward=None,
                                                             policy_id="blue_1",explore=False)
                action_dict2 = trainer.compute_single_action(obs["red_0"],
                                                             state=state_2,prev_action=None,prev_reward=None,
                                                             policy_id="red_0",explore=False)
                action_dict3 = trainer.compute_single_action(obs["red_1"],
                                                             state=state_3,prev_action=None,prev_reward=None,
                                                             policy_id="red_1",explore=False)
                
                #action_dict0 = trainer.compute_single_action(obs["blue_0"],policy_id="blue_0")
                #action_dict1 = trainer.compute_single_action(obs["blue_1"],policy_id="blue_1")
                # Carry the recurrent state over to the next step.
                state_0 = action_dict0[1]
                state_1 = action_dict1[1]
                state_2 = action_dict2[1]
                state_3 = action_dict3[1]
                obs, rewards, dones, infos = eval_env.step({"blue_0": action_dict0[0],
                                                            "blue_1": action_dict1[0],
                                                            "red_0": action_dict2[0],
                                                            "red_1": action_dict3[0],})

                # Collect the per-step data returned by copy_from_env_mod():
                # start new arrays when the env timer reads 1, otherwise stack
                # the new rows below the existing ones.
                # NOTE(review): this relies on eval_env.timer being exactly 1
                # after the first step of an episode; otherwise the *_pos_mod
                # arrays are undefined or carry over from the previous episode.
                env_blue_pos_temp_mod, env_red_pos_temp_mod, env_mrm_pos_temp_mod = render_env.copy_from_env_mod(eval_env)
                if eval_env.timer == 1:
                    env_blue_pos_mod = env_blue_pos_temp_mod
                    env_red_pos_mod = env_red_pos_temp_mod
                    env_mrm_pos_mod = env_mrm_pos_temp_mod
                else:
                    env_blue_pos_mod = np.vstack([env_blue_pos_mod,env_blue_pos_temp_mod])
                    env_red_pos_mod = np.vstack([env_red_pos_mod,env_red_pos_temp_mod])
                    env_mrm_pos_mod = np.vstack([env_mrm_pos_mod,env_mrm_pos_temp_mod])
                EVAL_reward["blue_0"] += rewards["blue_0"]
                EVAL_reward["blue_1"] += rewards["blue_1"]
                # plt.clf()

                # plt.subplots_adjust(left=-0.1,right=1.1,bottom=-0.1,top=1.1)
                # fig.canvas.draw()
                # plt.pause(.01)

                #if record_mode == 0:
                    #img = np.array(fig.canvas.renderer.buffer_rgba())
                    #img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
                    #video.write(img.astype('uint8'))

                
                step_num = step_num + 1
                
                done = dones["__all__"]
                
                #print(f'rewards:{rewards}')
                #if record_mode == 0:
                #    img = eval_env.render_movie(file_name,step_num)
                #    video.write(img.astype('unit8'))
                #elif record_mode == 1:
                #    eval_env.render()
                #elif record_mode == 2:
                #    eval_env.render()
                    
                #env_blue_pos_temp, env_red_pos_temp, env_mrm_pos_temp = render_env.copy_from_env(eval_env)
                
                #env_blue_pos.append(env_blue_pos_temp)
                #env_red_pos.append(env_red_pos_temp)
                #env_mrm_pos.append(env_mrm_pos_temp)
                #step_num = step_num + 1
                # End-of-episode handling: store the collected histories and
                # leave the episode loop.
                if dones['__all__']:
                    save_hists("blue_"+str(i),steps,env_blue_pos_mod,hist_dir)
                    save_hists("red_"+str(i),steps,env_red_pos_mod,hist_dir)
                    save_hists("mrm_"+str(i),steps,env_mrm_pos_mod,hist_dir)
                    # print(f'all done at {env.steps}')
                    break
            # When a blue policy beats its best evaluation reward so far, save
            # its weights, call reload_weights() for the matching red policy
            # and record the new best.
            # NOTE(review): the exact effect of reload_weights() is defined in
            # utility.read_wright_weights and is not visible here.
            if EVAL_reward["blue_0"]> best_reward["blue_0"]:
                save_weights("blue_0",trainer)
                reload_weights(policy_id="red_0",trainer=trainer,set_policy_id="blue_0")
                best_reward["blue_0"] = EVAL_reward["blue_0"]
            if EVAL_reward["blue_1"]> best_reward["blue_1"]:
                save_weights("blue_1",trainer)
                reload_weights(policy_id="red_1",trainer=trainer,set_policy_id="blue_1")
                best_reward["blue_1"] = EVAL_reward["blue_1"]

            
            #if record_mode == 0:
               # video.release()

# Release Ray resources once the whole training run has finished.
ray.shutdown()

-------------------------- Scene: 0 --------------------------

----------------- Training at steps:0 start! -----------------
-------------------------- Scene: 0 --------------------------
-------------------------- Scene: 0 --------------------------
700 blue_1 Shoot at red_1 launch distance : 58925.86365711398 True True
723 blue_1 Shoot at red_1 launch distance : 49626.04795902315 True True
803 blue_1 Splash :red_1
TIME LIMIT LOSE
blue_0 False False 1200 0.899000535029211 -54.30947538676721
blue_1 False False 1200 0.9990005350292109 65.08631124312076
-------------------------- Scene: 0 --------------------------
292 blue_0 Shoot at red_1 launch distance : 59194.11408202614 True True
320 blue_0 Shoot at red_1 launch distance : 47532.873524210874 True True
339 red_1 Shoot at blue_0
350 red_1 Shoot at blue_0
365 blue_0 Splash :red_1
Same tgt shoot
Same tgt shoot
619 blue_1 Shoot at red_0 launch distance : 58485.9791042194 True True
Same tgt shoot
Same tgt shoot
630 blue_1 Shoot at red_

295 blue_1 Shoot at red_1 launch distance : 59556.55081438123 True True
309 blue_1 Shoot at red_1 launch distance : 53458.789531332484 True True
353 blue_0 Shoot at red_1 launch distance : 59845.5904790344 True True
371 blue_1 Splash :red_1
530 red_0 Shoot at blue_1
541 red_0 Shoot at blue_1
TIME LIMIT LOSE
blue_0 False False 1200 0.8989997452417609 -3.657025056358102
blue_1 False False 1200 0.8989997452417609 95.16997863905986
-------------------------- Scene: 0 --------------------------
225 blue_0 DOWN
298 blue_1 Shoot at red_1 launch distance : 59940.76051676992 True True
315 blue_1 Shoot at red_1 launch distance : 54293.56933489817 True True
361 red_1 Shoot at blue_1
372 red_1 Shoot at blue_1
437 blue_1: Destroyed
TIME LIMIT LOSE
blue_0 False False 437 -0.10099997139185694 -2.3226646779619
blue_1 False False 437 0.3589153619414764 72.70646016512937
-------------------------- Scene: 0 --------------------------
254 blue_1 DOWN
TIME LIMIT LOSE
blue_0 False False 1200 -0.100998962524

395 blue_1 Shoot at red_1 launch distance : 27622.694989399475 True True
423 blue_0 Splash :red_1
TIME LIMIT LOSE
blue_0 False False 1200 0.9990100685730191 80.66040242936
blue_1 False False 1200 0.8990015685730192 45.39393166411194
-------------------------- Scene: 0 --------------------------
658 blue_1 Shoot at red_0 launch distance : 57192.18478356483 True True
681 blue_1 Shoot at red_0 launch distance : 46659.73841544922 True True
694 blue_0 Shoot at red_0 launch distance : 59627.257689269485 True True
716 blue_0 Shoot at red_0 launch distance : 49965.399111439765 True True
733 blue_1 Splash :red_0
978 red_1 Shoot at blue_0
989 red_1 Shoot at blue_0
1056 blue_0: Destroyed
TIME LIMIT LOSE
blue_0 False False 1200 0.8989990149779956 39.50014509189346
blue_1 False False 1200 0.8989990149779956 19.499243460335766
-------------------------- Scene: 0 --------------------------
