In [None]:
import os

home_path = os.path.expanduser('~')
cur_path = os.getcwd()
conda_path = home_path + "\\anaconda3"
conda_script_path = home_path + "\\anaconda3\\Scripts\\activate.bat"
exc = ' '.join(['start', '%windir%\System32\cmd.exe "/K"', conda_script_path, conda_path])
!$exc

In [None]:
conda activate py38-pytorch-gpu && tensorboard --port=6006 --logdir=runs
http://localhost:6006/

In [None]:
import gym
from env import Env

# Build the environment named by Env.MOUNTAINCAR and dump its attributes for
# quick inspection.
env = gym.make(Env.MOUNTAINCAR.value)
print(dir(env))
# The f-string `=` specifier prints each expression's source text followed by
# the repr of its value, so every line below is self-labelling.
print(f'{env._max_episode_steps=}')
print(f'{env.action_space=}')
print(f'{env.metadata=}')
print(f'{env.observation_space.shape[0]=}')
print(f'{env.reward_range=}')
# NOTE(review): `env.seed` is not called here, so this prints the attribute
# itself (presumably a bound method) rather than a seed value — confirm intent.
print(f'{env.seed=}')
print(f'{env.spec=}')

In [1]:

from env import Env
from runner import RunnerParams

def dqn_pole_train():
    """Run the DQN runner on the CartPole environment with network saving on.

    Builds a `DQNParams` and a `RunnerParams` from the fixed settings below,
    passes `str()` of the algorithm parameters as `name_postfix`, and calls
    `DQNRunner(...).run()`. The algorithm modules are imported inside the
    function, so they are only loaded when it is called. Returns None.
    """
    from algorithms.dqn import DQNParams
    from algorithms.dqn_runner import DQNRunner

    hyper = DQNParams(
        buffer_limit=50000,
        n_train_start=4000,
        n_node=128,
        start_epsilon=0.1,
        learning_rate=0.0001,
        update_interval=40,
    )
    run_cfg = RunnerParams(
        save_net=True,
        name_postfix=str(hyper),
        target_score=500.0,
        interval=8,
        max_video=100,
        video_record_interval=200,
        reward_scale=100.0,
    )
    runner = DQNRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def dqn_pole_load():
    """Run the DQN runner on CartPole from a saved network, with `train=False`.

    Uses the same `DQNParams` settings as the training configuration and a
    `RunnerParams` with `load_net=True` pointing at the checkpoint file named
    below. Calls `DQNRunner(...).run()` and returns None.
    """
    from algorithms.dqn import DQNParams
    from algorithms.dqn_runner import DQNRunner

    checkpoint = 'DQN-CartPole-v1-500.0-train=True-intvl=8-rwdscl=100.0-node=128-lRate=0.0001-gma=0.98-nBuf=50000-nBat=32-nStrt=4000-updIntvl=40-1632225762.pt'
    hyper = DQNParams(
        buffer_limit=50000,
        n_train_start=4000,
        n_node=128,
        start_epsilon=0.1,
        learning_rate=0.0001,
        update_interval=40,
    )
    run_cfg = RunnerParams(
        train=False,
        load_net=True,
        load_name=checkpoint,
        name_postfix=str(hyper),
        target_score=999.0,
        max_video=100,
        interval=1,
        video_record_interval=1,
        reward_scale=100.0,
    )
    runner = DQNRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ddqn_pole_train():
    """Run the DDQN runner on the CartPole environment with network saving on.

    Builds a `DDQNParams` and a `RunnerParams` from the fixed settings below,
    passes `str()` of the algorithm parameters as `name_postfix`, and calls
    `DDQNRunner(...).run()`. Returns None.
    """
    from algorithms.ddqn import DDQNParams
    from algorithms.ddqn_runner import DDQNRunner

    hyper = DDQNParams(
        buffer_limit=50000,
        n_train_start=2000,
        batch_size=32,
        gamma=0.98,
        n_node=128,
        start_epsilon=0.08,
        learning_rate=0.0005,
        update_interval=20,
    )
    run_cfg = RunnerParams(
        save_net=True,
        name_postfix=str(hyper),
        target_score=500.0,
        interval=8,
        max_video=100,
        video_record_interval=200,
        reward_scale=100.0,
    )
    runner = DDQNRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ddqn_pole_load():
    """Run the DDQN runner on CartPole from a saved network, with `train=False`.

    Uses the same `DDQNParams` settings as the training configuration and a
    `RunnerParams` with `load_net=True` pointing at the checkpoint file named
    below. Calls `DDQNRunner(...).run()` and returns None.
    """
    from algorithms.ddqn import DDQNParams
    from algorithms.ddqn_runner import DDQNRunner

    checkpoint = 'DDQN-CartPole-v1-500.0-train=True-intvl=8-rwdscl=100.0-node=128-lRate=0.0005-gma=0.98-nBuf=50000-nBat=32-nStrt=2000-updIntvl=20-1632228354.pt'
    hyper = DDQNParams(
        buffer_limit=50000,
        n_train_start=2000,
        batch_size=32,
        gamma=0.98,
        n_node=128,
        start_epsilon=0.08,
        learning_rate=0.0005,
        update_interval=20,
    )
    run_cfg = RunnerParams(
        train=False,
        load_net=True,
        load_name=checkpoint,
        name_postfix=str(hyper),
        target_score=999.0,
        max_video=100,
        interval=1,
        video_record_interval=1,
        reward_scale=100.0,
    )
    runner = DDQNRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ac_pole_train():
    """Run the actor-critic runner on CartPole with network saving on.

    Builds an `ActorCriticParams` and a `RunnerParams` from the fixed settings
    below, passes `str()` of the algorithm parameters as `name_postfix`, and
    calls `ActorCriticRunner(...).run()`. Returns None.
    """
    from algorithms.actorcritic import ActorCriticParams
    from algorithms.actorcritic_runner import ActorCriticRunner

    hyper = ActorCriticParams(
        n_node=256,
        learning_rate=0.0002,
        gamma=0.98,
        n_rollout=10,
    )
    run_cfg = RunnerParams(
        save_net=True,
        name_postfix=str(hyper),
        target_score=500.0,
        interval=8,
        max_video=100,
        video_record_interval=200,
        reward_scale=100.0,
    )
    runner = ActorCriticRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ac_pole_load():
    """Run the actor-critic runner on CartPole from a saved network, with `train=False`.

    Uses the same `ActorCriticParams` settings as the training configuration
    and a `RunnerParams` with `load_net=True` pointing at the checkpoint file
    named below. Calls `ActorCriticRunner(...).run()` and returns None.
    """
    from algorithms.actorcritic import ActorCriticParams
    from algorithms.actorcritic_runner import ActorCriticRunner

    checkpoint = 'ActorCritic-CartPole-v1-500.0-train=True-intvl=8-rwdscl=100.0-node=256-lRate=0.0002-gma=0.98-nRoll=10-1632236701.pt'
    hyper = ActorCriticParams(
        n_node=256,
        learning_rate=0.0002,
        gamma=0.98,
        n_rollout=10,
    )
    run_cfg = RunnerParams(
        train=False,
        load_net=True,
        load_name=checkpoint,
        name_postfix=str(hyper),
        target_score=999.0,
        max_video=100,
        interval=1,
        video_record_interval=1,
        reward_scale=100.0,
    )
    runner = ActorCriticRunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ppo_pole_train():
    """Run the PPO runner on the CartPole environment with network saving on.

    Builds a `PPOParams` and a `RunnerParams` from the fixed settings below,
    passes `str()` of the algorithm parameters as `name_postfix`, and calls
    `PPORunner(...).run()`. Returns None.
    """
    from algorithms.ppo import PPOParams
    from algorithms.ppo_runner import PPORunner

    hyper = PPOParams(
        n_node=128,
        learning_rate=0.0001,
        gamma=0.98,
        lmbda=0.95,
        eps_clip=0.1,
        k_epoch=3,
        t_horizon=20,
    )
    run_cfg = RunnerParams(
        save_net=True,
        name_postfix=str(hyper),
        target_score=500.0,
        interval=100,
        max_video=100,
        video_record_interval=200,
        reward_scale=100.0,
    )
    runner = PPORunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


def ppo_pole_load():
    """Run the PPO runner on CartPole from a saved network, with `train=False`.

    Uses the same `PPOParams` settings as the training configuration and a
    `RunnerParams` with `load_net=True` pointing at the checkpoint file named
    below. Calls `PPORunner(...).run()` and returns None.
    """
    from algorithms.ppo import PPOParams
    from algorithms.ppo_runner import PPORunner

    checkpoint = 'PPO-CartPole-v1-500.0-train=True-intvl=100-rwdscl=100.0-node=128-lRate=0.0001-gma=0.98-lmb=0.95-epsclp=0.1-k=3-t=20-1632243563.pt'
    hyper = PPOParams(
        n_node=128,
        learning_rate=0.0001,
        gamma=0.98,
        lmbda=0.95,
        eps_clip=0.1,
        k_epoch=3,
        t_horizon=20,
    )
    run_cfg = RunnerParams(
        train=False,
        load_net=True,
        load_name=checkpoint,
        name_postfix=str(hyper),
        target_score=999.0,
        max_video=100,
        interval=1,
        video_record_interval=1,
        reward_scale=100.0,
    )
    runner = PPORunner(Env.CARTPOLE.value, hyper, run_cfg)
    runner.run()


# Entry point for this cell: run the PPO CartPole configuration that loads a
# saved network (train=False).
ppo_pole_load()


초기 설정
algorithm: PPO
env: CartPole-v1
state space: (4,)
action space: Discrete(2)
네트워크 불러오기
시뮬레이션 시작




n_epi=0, avg_score=308.0 비디오 저장
에피소드: 0, 평균 점수: 308.0
n_epi=1, avg_score=500.0 비디오 저장
에피소드: 1, 평균 점수: 500.0
n_epi=2, avg_score=500.0 비디오 저장
에피소드: 2, 평균 점수: 500.0
n_epi=3, avg_score=500.0 비디오 저장
에피소드: 3, 평균 점수: 500.0
n_epi=4, avg_score=500.0 비디오 저장
에피소드: 4, 평균 점수: 500.0
n_epi=5, avg_score=378.0 비디오 저장
에피소드: 5, 평균 점수: 378.0
n_epi=6, avg_score=500.0 비디오 저장
에피소드: 6, 평균 점수: 500.0


In [None]:
# CartPole experiments collection

from env import Env
from runner import RunnerParams


# PPO
# from algorithms.ppo import PPOParams
# from algorithms.ppo_runner import PPORunner

# algo_param = PPOParams()

# runner_param = RunnerParams(save_net=True, name_postfix=str(algo_param),
#                             target_score=500.0,
#                             interval=40, 
#                             max_video=100, video_record_interval=200,
#                             reward_scale=100.0)

# PPORunner(Env.CARTPOLE.value, algo_param, runner_param).run()

# PPOlstm
# from algorithms.ppolstm import PPOlstmParams
# from algorithms.ppolstm_runner import PPOlstmRunner
# runner_param = RunnerParams(save_net=True, target_score=500, reward_scale=100.0, max_video=100)
# PPOlstmRunner(Env.CARTPOLE.value, PPOlstmParams(), runner_param).run()

In [None]:
# Load a saved model
from env import Env
from runner import RunnerParams


# PPO
# from algorithms.ppo import PPOParams
# from algorithms.ppo_runner import PPORunner

# algo_param = PPOParams()
# runner_param = RunnerParams(train=False, load_net=True, load_name='PPO-CartPole-v1-500.0-train=True-intvl=40-rwdscl=100.0-node=128-lRate=0.0001-gma=0.98-lmb=0.95-epsclp=0.1-k=3-t=20-1632072908.pt',
#                             name_postfix=str(algo_param),
#                             target_score=999.0,
#                             max_video=100, 
#                             interval=1, video_record_interval=1,
#                             reward_scale=100.0)

# PPORunner(Env.CARTPOLE.value, algo_param, runner_param).run()

# PPOlstm
# from algorithms.ppolstm import PPOlstmParams
# from algorithms.ppolstm_runner import PPOlstmRunner
# runner_param = RunnerParams(save_net=True, target_score=500, reward_scale=100.0, max_video=100)
# PPOlstmRunner(Env.CARTPOLE.value, PPOlstmParams(), runner_param).run()

In [None]:
## DQN CartPole training

from env import Env
from runner import RunnerParams
from algorithms.dqn import DQNParams
from algorithms.dqn_runner import DQNRunner

# Runner settings are built first, then DQNParams() with no overrides.
runner_param = RunnerParams(
    reward_scale=100.0,
    max_video=30,
)
algo_param = DQNParams()
DQNRunner(
    Env.CARTPOLE.value,
    algo_param,
    runner_param,
).run()

In [None]:
## Load a saved DQN LunarLander network and run it with train=False

from env import Env
from runner import RunnerParams
from algorithms.dqn import DQNParams
from algorithms.dqn_runner import DQNRunner

runner_param = RunnerParams(
    load_net=True,
    load_name='node512-score165-DQN-LunarLander-v2-1631737560.pt',
    train=False,
    target_score=150,
    reward_scale=30.0,
    max_video=100,
    video_record_interval=1,
    # Keeps only the first four items of whatever is handed to the wrapper.
    step_wrapper=lambda x: (x[0], x[1], x[2], x[3]),
)
algo_param = DQNParams(
    n_node=512,
    batch_size=32,
    buffer_limit=100000,
    n_train_start=8000,
    start_epsilon=0.2,
    update_interval=40,
)
DQNRunner(
    Env.LUNARLANDER.value,
    algo_param,
    runner_param,
).run()

In [None]:
## DDQN LunarLander training (train=True, network saved)
from env import Env
from runner import RunnerParams
from algorithms.ddqn import DDQNParams
from algorithms.ddqn_runner import DDQNRunner

runner_param = RunnerParams(
    save_net=True,
    train=True,
    target_score=200,
    reward_scale=30.0,
    max_video=100,
    video_record_interval=200,
    # Identity wrapper: the value is passed through unchanged.
    step_wrapper=lambda x: x,
)
algo_param = DDQNParams(
    n_node=512,
    batch_size=64,
    buffer_limit=50000,
    n_train_start=4000,
    start_epsilon=0.2,
    update_interval=20,
)
DDQNRunner(
    Env.LUNARLANDER.value,
    algo_param,
    runner_param,
).run()

In [None]:
## Load a saved DDQN LunarLander network and run it with train=False

from env import Env
from runner import RunnerParams
from algorithms.ddqn import DDQNParams
from algorithms.ddqn_runner import DDQNRunner

runner_param = RunnerParams(
    save_net=False,
    load_net=True,
    load_name='node512-score213-DDQN-LunarLander-v2-1631750946.pt',
    train=False,
    target_score=200,
    reward_scale=30.0,
    max_video=100,
    video_record_interval=1,
    # Identity wrapper: the value is passed through unchanged.
    step_wrapper=lambda x: x,
)
algo_param = DDQNParams(
    n_node=512,
    batch_size=64,
    buffer_limit=50000,
    n_train_start=4000,
    start_epsilon=0.2,
    update_interval=20,
)
DDQNRunner(
    Env.LUNARLANDER.value,
    algo_param,
    runner_param,
).run()

In [None]:
# Algorithm tests

# ppo

# from tester import RunnerTester
# from algorithms.ppo_runner import PPORunner
# from algorithms.ppo import PPOParams
# from env import Env
# algo_params = PPOParams()
# result = RunnerTester(PPORunner, algo_params, [Env.MOUNTAINCAR]).test()
# print('통과' if result else '실패')


# ppolstm

# from tester import RunnerTester
# from algorithms.ppolstm_runner import PPOlstmRunner
# from algorithms.ppolstm import PPOlstmParams
# from env import Env
# algo_params = PPOlstmParams()
# result = RunnerTester(PPOlstmRunner, algo_params, [Env.MOUNTAINCAR]).test()
# print('통과' if result else '실패')


# actorcritic

# from tester import RunnerTester
# from algorithms.actorcritic_runner import ActorCriticRunner
# from algorithms.actorcritic import ActorCriticParams
# from env import Env
# algo_params = ActorCriticParams()
# result = RunnerTester(ActorCriticRunner, algo_params, [Env.CARTPOLE]).test()
# print('통과' if result else '실패')



# dqn
# from tester import RunnerTester
# from algorithms.dqn_runner import DQNRunner
# from algorithms.dqn import DQNParams
# from env import Env
# algo_params = DQNParams()
# result = RunnerTester(DQNRunner, algo_params, [Env.CARTPOLE]).test()
# print('통과' if result else '실패')



# ddqn
from tester import RunnerTester
from algorithms.ddqn_runner import DDQNRunner
from algorithms.ddqn import DDQNParams
from env import Env
# Run RunnerTester for DDQNRunner on Env.CARTPOLE and print the pass message
# when test() returns a truthy value, otherwise the fail message.
algo_params = DDQNParams(
    start_epsilon=0.2,
    n_node=128,
    n_train_start=20000,
    buffer_limit=100000,
)
result = RunnerTester(DDQNRunner, algo_params, [Env.CARTPOLE]).test()
if result:
    print('통과')
else:
    print('실패')