In [1]:
"""
run anaconda prompt 
"""

import os

home_path = os.path.expanduser('~')
cur_path = os.getcwd()
conda_path = home_path + "\\anaconda3"
conda_script_path = home_path + "\\anaconda3\\Scripts\\activate.bat"
exc = ' '.join(['start', '%windir%\System32\cmd.exe "/K"', conda_script_path, conda_path])
!$exc

"""
run tensorboard server

conda activate py38-pytorch-gpu && tensorboard --port=6006 --logdir=runs
http://localhost:6006/
"""

'\nrun tensorboard server\n\nconda activate py38-pytorch-gpu && tensorboard --port=6006 --logdir=runs\nhttp://localhost:6006/\n'

In [1]:
from env import Env
from runner import RunnerParams
from logger import Logger
from remover import Remover
from algorithms.dqn import DQN, DQNParams
from algorithms.dqn_runner import DQNRunner
from algorithms.ddqn import DDQN, DDQNParams
from algorithms.ddqn_runner import DDQNRunner
from algorithms.reinforce import Reinforce, ReinforceParams
from algorithms.reinforce_runner import ReinforceRunner
from algorithms.actorcritic import ActorCritic, ActorCriticParams
from algorithms.actorcritic_runner import ActorCriticRunner

import itertools

class Manifest:
    """Static registry tying environments, algorithms, runners and parameter classes together.

    Also holds the two-way mapping between abbreviated parameter names and
    their full attribute names (``short_dic`` / ``full_dic``).
    """

    # Environments and algorithms known to the registry.
    envs = [Env.CARTPOLE, Env.LUNARLANDER]
    algos = [Reinforce, ActorCritic, DQN, DDQN]

    # Algorithm class -> runner class.
    algo_runners = {
        Reinforce: ReinforceRunner,
        ActorCritic: ActorCriticRunner,
        DQN: DQNRunner,
        DDQN: DDQNRunner,
    }

    # Algorithm class -> hyperparameter class.
    algo_params = {
        Reinforce: ReinforceParams,
        ActorCritic: ActorCriticParams,
        DQN: DQNParams,
        DDQN: DDQNParams,
    }

    # Algorithm name (string) -> algorithm class.
    algo_name_dic = {
        'Reinforce': Reinforce,
        'ActorCritic': ActorCritic,
        'DQN': DQN,
        'DDQN': DDQN,
    }

    # Abbreviated parameter name -> full parameter name.
    short_dic = dict(
        train='train',
        intvl='check_interval',
        rwdscl='reward_scale',
        node='n_node',
        lRate='learning_rate',
        gma='gamma',
        nRoll='n_rollout',
        nBuf='buffer_limit',
        nBat='batch_size',
        nStrt='n_train_start',
        updIntvl='update_interval',
        lmb='lmbda',
        epsclp='eps_clip',
        k='k_epoch',
        t='t_horizon',
    )
    # Full parameter name -> abbreviated parameter name (inverse of short_dic).
    full_dic = dict(zip(short_dic.values(), short_dic.keys()))

    @classmethod
    def get_algo_class(cls, algo_name):
        """Return the algorithm class registered under ``algo_name`` (KeyError if unknown)."""
        registry = cls.algo_name_dic
        return registry[algo_name]

    @classmethod
    def get_runner_class(cls, algo_class):
        """Return the runner class paired with ``algo_class`` (KeyError if unknown)."""
        registry = cls.algo_runners
        return registry[algo_class]

    @classmethod
    def get_param_class(cls, algo_class):
        """Return the hyperparameter class paired with ``algo_class`` (KeyError if unknown)."""
        registry = cls.algo_params
        return registry[algo_class]

    @classmethod
    def get_param_full(cls, short_param):
        """Expand an abbreviated parameter name to its full name (KeyError if unknown)."""
        mapping = cls.short_dic
        return mapping[short_param]

    @classmethod
    def get_param_short(cls, full_param):
        """Abbreviate a full parameter name (KeyError if unknown)."""
        mapping = cls.full_dic
        return mapping[full_param]


class Trainer:
    """Collects (environment, algorithm, hyperparameter) cases and trains them in sequence.

    Args:
        check_intervals: iterable of check-interval values; ``run`` repeats the
            whole case list once per value. May be left as ``None`` when the
            instance is only used to read ``allcases``.

    Attributes:
        allcases: every (env, algo) pair from ``Manifest.envs`` x ``Manifest.algos``.
    """

    def __init__(self, check_intervals=None):
        self._testcases = []
        self._check_intervals = check_intervals
        self.allcases = [*itertools.product(Manifest.envs, Manifest.algos)]

    def default_hyperparam(self, env, algo):
        """Build the default hyperparameter object for an (env, algo) pair.

        Args:
            env: ``Env.CARTPOLE`` or ``Env.LUNARLANDER``.
            algo: one of ``Reinforce``, ``ActorCritic``, ``DQN``, ``DDQN``.

        Returns:
            An instance of the parameter class registered for ``algo``.

        Raises:
            Exception: if ``env`` or ``algo`` is not one of the supported values
                (the environment is validated first).
        """
        # `base` applies to every algorithm; `replay` holds the extra settings
        # that only the DQN/DDQN parameter classes receive.
        if env == Env.CARTPOLE:
            base = dict(n_node=32, learning_rate=0.0005, gamma=0.98)
            replay = dict(buffer_limit=50000, batch_size=32, n_train_start=2000,
                          start_epsilon=0.1, update_interval=10)
        elif env == Env.LUNARLANDER:
            base = dict(n_node=128, learning_rate=0.0025, gamma=0.98)
            replay = dict(buffer_limit=100000, batch_size=64, n_train_start=10000,
                          start_epsilon=0.2, update_interval=20)
        else:
            raise Exception(f'env does not exist: {env}')

        if algo in (Reinforce, ActorCritic):
            kwargs = base
        elif algo in (DQN, DDQN):
            kwargs = {**base, **replay}
        else:
            raise Exception(f'algorithm does not exist: {algo}')

        return Manifest.get_param_class(algo)(**kwargs)

    def add_case(self, env, algo, algo_param_dic=None):
        """Queue one training case, overriding defaults with ``algo_param_dic``.

        Args:
            env: environment identifier accepted by ``default_hyperparam``.
            algo: algorithm class accepted by ``default_hyperparam``.
            algo_param_dic: overrides for the default hyperparameters. Either a
                mapping of parameter name to value, or a parameter object whose
                instance attributes are used as the overrides. ``None`` or an
                empty mapping keeps the defaults.
        """
        if not algo_param_dic:
            overrides = {}
        elif hasattr(algo_param_dic, 'keys'):
            overrides = algo_param_dic
        else:
            # A parameter object cannot be unpacked with `**`; use its attributes.
            overrides = vars(algo_param_dic)

        algo_param_class = Manifest.get_param_class(algo)
        merged = {**vars(self.default_hyperparam(env, algo)), **overrides}
        algo_param = algo_param_class(**merged)
        algo_runner = Manifest.get_runner_class(algo)

        self._testcases.append((env, algo_runner, algo_param))

    def run(self, runner_param_dic=None, debug=False):
        """Run every queued case once per configured check interval.

        Args:
            runner_param_dic: keyword arguments for ``RunnerParams``; ignored
                when ``debug`` is true. ``None`` selects the built-in defaults.
            debug: when true, enable video recording and printing at the
                current check interval.

        Raises:
            ValueError: if the trainer was created without ``check_intervals``.
        """
        if self._check_intervals is None:
            raise ValueError('Trainer.run() requires check_intervals to be set')

        for check_interval in self._check_intervals:
            if debug:
                # Use the interval of this iteration, not the whole list.
                runnerp = RunnerParams(save_net=True, max_video=1000,
                                       video_record_interval=check_interval,
                                       print_interval=check_interval)
            elif runner_param_dic is None:
                runnerp = RunnerParams(save_net=True, video_record_interval=0, print_interval=0)
            else:
                runnerp = RunnerParams(**runner_param_dic)

            runnerp.check_interval = check_interval

            # The same runnerp object is reused (and mutated) for every case.
            for env, runner, algop in self._testcases:
                runnerp.name_postfix = str(algop)

                if env == Env.CARTPOLE:
                    runnerp.target_score = 500.0
                    runnerp.reward_scale = 100.0
                elif env == Env.LUNARLANDER:
                    runnerp.target_score = 200.0
                    runnerp.reward_scale = 30.0

                runner(env.value, algop, runnerp).run()


In [2]:
class player:
    """Replays saved models whose hyperparameters are encoded in their file names.

    Each model name is split on ``_``. The first three tokens are read as
    algorithm name, environment name and last score, the next three as the
    train / check-interval / reward-scale tokens, and every remaining token
    except the final one as a ``short=value`` hyperparameter.

    Args:
        path: stored on the instance; not used by ``run``.
        model_names: iterable of model names without the ``.pt`` extension.
    """

    def __init__(self, path, model_names):
        self.path = path
        self.model_names = model_names

    @staticmethod
    def _parse_number(text):
        """Parse ``text`` as an int, falling back to float (ValueError if neither)."""
        try:
            return int(text)
        except ValueError:
            return float(text)

    def run(self, debug=False):
        """Load every model and run it in evaluation mode.

        Args:
            debug: when true, record up to 3 videos and print/record every
                episode; otherwise videos and printing are disabled.

        Raises:
            ValueError: if a name has too few tokens or a hyperparameter value
                is not numeric.
            KeyError: if an algorithm or abbreviated parameter name is unknown
                to ``Manifest``.
        """
        cases = []
        for name in self.model_names:
            tokens = name.split('_')
            load_name = name + ".pt"
            # The unpacking also validates that the leading six tokens exist.
            algo_name, env_name, _last_score = tokens[0:3]
            _train, _check_interval, _reward_scale = tokens[3:6]
            algo_params = dict()
            # The final token is skipped; it is not a `key=value` pair.
            for token in tokens[6:-1]:
                k, v = token.split('=')
                algo_params[Manifest.get_param_full(k)] = self._parse_number(v)
            algo = Manifest.get_algo_class(algo_name)
            algop = Manifest.get_param_class(algo)(**algo_params)
            runner = Manifest.get_runner_class(algo)
            cases.append((env_name, runner, algop, algo_params, load_name))

        # Debug mode differs only in how much is recorded and printed.
        max_video = 3 if debug else 0
        report_interval = 1 if debug else 0

        for env, runner, algop, algo_params, load_name in cases:
            print(f'\t[ {env}, {runner} ]\n parameters {algo_params.items()}\n')
            runnerp = RunnerParams(train=False, save_net=False, load_net=True, target_score=9999.0,
                                   load_name=load_name, name_postfix=str(algop),
                                   check_interval=1, max_video=max_video,
                                   save_check_log=False, save_step_log=True,
                                   print_interval=report_interval,
                                   video_record_interval=report_interval, max_episode=1000)

            runner(env, algop, runnerp).run()

        print('모두 종료됨')


def train():
    """Train every algorithm on CartPole, then on LunarLander.

    CartPole cases run with check intervals ``[10]`` and LunarLander cases
    with ``[5]``; each environment gets its own ``Trainer``.
    """
    every_case = Trainer().allcases

    # One phase per environment: (environment, check intervals for its Trainer).
    phases = (
        (Env.CARTPOLE, [10]),
        (Env.LUNARLANDER, [5]),
    )

    for target_env, intervals in phases:
        trainer = Trainer(intervals)
        for env, algo in every_case:
            if env == target_env:
                trainer.add_case(env, algo)
        trainer.run()

def replay():
    """Replay the four saved LunarLander models from ``weights`` in debug mode."""
    # Name fragment shared by all four models.
    shared = 'train=True_intvl=10_rwdscl=30.0_node=128_lRate=0.0025_gma=0.98'
    # The DQN-family models carry extra fields after the shared fragment.
    shared_dqn = shared + '_nBuf=100000_nBat=64_nStrt=10000_updIntvl=20'

    saved_models = [
        f'Reinforce_LunarLander-v2_204_{shared}_1635200309',
        f'ActorCritic_LunarLander-v2_217_{shared}_1635202089',
        f'DQN_LunarLander-v2_200_{shared_dqn}_1635208616',
        f'DDQN_LunarLander-v2_215_{shared_dqn}_1635209424',
    ]

    replayer = player('weights', saved_models)
    replayer.run(debug=True)

# Optional cleanup of earlier artifacts (disabled):
# Remover().remove_dirs(['runs', 'weights', 'videos'])
# Remover().remove_dirs(['logs'])
# Entry point of this cell: run the full training schedule.
train()

# Recorded timings:
# print_interval=10, max_epi=100 > 4 min 35 s
# print_interval=0, max_epi=100 > 4 min 25 s
# save_check_log=True, print_interval=0, max_epi=100 > 4 min 37 s

# Replay of saved models (disabled):
# for _ in range(10):
#     replay()

초기 설정
algorithm: Reinforce
env: LunarLander-v2
state space: (8,)
action space: Discrete(4)
시뮬레이션 시작


In [None]:
def _doubling(start, count):
    """Return ``count`` values starting at ``start``, each double the previous one."""
    return [start * 2 ** i for i in range(count)]


def _run_tuning(env, algo, check_interval, param_name, values):
    """Train one case per value of a single swept hyperparameter.

    Args:
        env: environment passed to ``Trainer.add_case``.
        algo: algorithm class passed to ``Trainer.add_case``.
        check_interval: the single check interval given to the ``Trainer``.
        param_name: name of the hyperparameter to override.
        values: values to try; all other hyperparameters keep their defaults.
    """
    tr = Trainer([check_interval])
    for value in values:
        # Pass the override as a dict: add_case unpacks its third argument with
        # `**` on top of the defaults, so a params object would not work there.
        tr.add_case(env, algo, {param_name: value})

    runner_param_dic = {'save_net': True, 'max_episode': 10000, 'print_interval': 0, 'video_record_interval': 0}
    tr.run(runner_param_dic)
    print('전체 테스트 종료')


def tuning_reinforce():
    """Sweep Reinforce's learning rate on LunarLander: 0.0005 to 0.0105 in 21 steps."""
    start = 5
    k = 5
    _run_tuning(Env.LUNARLANDER, Reinforce, 10, 'learning_rate',
                [(start + k*i)/10000 for i in range(21)])


def tuning_actorcritic():
    """Sweep ActorCritic's learning rate on LunarLander: 0.0001 to 0.0101 in 21 steps."""
    start = 1
    k = 5
    _run_tuning(Env.LUNARLANDER, ActorCritic, 10, 'learning_rate',
                [(start + k*i)/10000 for i in range(21)])


def tuning_dqn_node_cartpole():
    """Sweep DQN's ``n_node`` on CartPole over 2, 4, ..., 256.

    Original note: 8 (presumably the best node count observed — TODO confirm).
    """
    _run_tuning(Env.CARTPOLE, DQN, 20, 'n_node', _doubling(2, 8))


def tuning_dqn_node_lunarlander():
    """Sweep DQN's ``n_node`` on LunarLander over 2, 4, ..., 256.

    Original note: 64~256 (presumably the best node counts observed — TODO confirm).
    """
    _run_tuning(Env.LUNARLANDER, DQN, 10, 'n_node', _doubling(2, 8))


def tuning_ddqn_node_cartpole():
    """Sweep DDQN's ``n_node`` on CartPole over 64 and 128.

    Original note: 8~16 (presumably the best node counts observed — TODO confirm).
    """
    _run_tuning(Env.CARTPOLE, DDQN, 20, 'n_node', _doubling(64, 2))


def tuning_ddqn_node_lunarlander():
    """Sweep DDQN's ``n_node`` on LunarLander over 32, 64, 128, 256.

    Original note: 32, 128, 256 (presumably the best node counts observed — TODO confirm).
    """
    _run_tuning(Env.LUNARLANDER, DDQN, 10, 'n_node', _doubling(32, 4))

# Optional cleanup and alternative sweeps (disabled):
# Remover().remove_dirs(['runs', 'weights', 'videos'])
# tuning_dqn_node_cartpole()
# tuning_dqn_node_lunarlander()
# Entry point of this cell: run the DDQN node-count sweep on LunarLander.
tuning_ddqn_node_lunarlander()