In [1]:
import sys, os
def ppath(x):
    """Return the directory that contains the absolute form of path ``x``."""
    return os.path.dirname(os.path.abspath(x))


# The notebook's working directory is used as the anchor for the lookup.
file_name = os.getcwd()
# Applying ppath twice climbs two levels above the working directory; that
# directory is appended to sys.path so the project import below resolves.
sys.path.append(ppath(ppath(file_name)))

from recorder.recorder import Recorder

In [2]:
import gym
from dqn import *
import torch.optim as optim

In [3]:
# conda install pytorch torchvision -c pytorch
# conda install -c conda-forge tensorboard

from torch.utils.tensorboard import SummaryWriter

In [4]:
def main():
    """Train a DQN agent on CartPole-v1 and log the scores to TensorBoard.

    Creates the recorder-wrapped environment and a ``SummaryWriter``, runs the
    training loop, and always releases both resources afterwards, even when
    training raises or the kernel is interrupted.
    """
    recorder = Recorder(gym.make('CartPole-v1'), False)
    env = recorder.wrapped_env()
    try:
        writer = SummaryWriter()
        try:
            _run_training(recorder, env, writer)
        finally:
            # Flush pending events so nothing logged so far is lost.
            writer.flush()
            writer.close()
    finally:
        env.close()


def _run_training(recorder, env, writer):
    """Run the episode loop: collect transitions, train ``q``, log averages.

    Args:
        recorder: Recorder whose ``update`` / ``recorded_epi`` methods drive
            video capture and the early-stop condition.
        env: Environment returned by ``recorder.wrapped_env()``.
        writer: TensorBoard ``SummaryWriter`` that receives ``score/train``.
    """
    # Two networks are defined: the online network and the target network.
    q = Qnet()
    q_target = Qnet()

    # Copy q into q_target. q.state_dict() holds the model weights as a
    # dictionary, which is loaded into q_target.
    q_target.load_state_dict(q.state_dict())

    # Memory used as the replay buffer.
    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0

    # Only q is handed to the optimizer; q_target is never updated by
    # gradients. It stays fixed and periodically receives q's weights.
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    # Episode simulation.
    for n_epi in range(10000):
        # Epsilon decays linearly with the episode index: it starts at 8%,
        # drops one percentage point every 200 episodes and is floored at 1%.
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))
        # Reset the environment and receive the initial state.
        s = env.reset()
        done = False

        while not done:
            # Pick one action with the epsilon-greedy policy.
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)
            # Apply it to the environment and receive the observation.
            s_prime, r, done, _ = env.step(a)
            # done_mask is 0 when the game ended and 1 otherwise. It is stored
            # so the TD target can mask out values after a terminal state
            # (presumably multiplied in inside train() -- confirm in dqn).
            done_mask = 0.0 if done else 1.0
            # The reward is scaled by 1/100 before being stored, and
            # done_mask goes into the replay buffer as well.
            memory.put((s, a, r/100.0, s_prime, done_mask))
            s = s_prime

            score += r

        # Start learning once enough transitions have been collected.
        if memory.size() > 2000:
            train(q, q_target, memory, optimizer)

        # NOTE(review): `score` is the running sum of the current interval, so
        # this compares a partial sum divided by print_interval with 200 after
        # every episode, not a completed-interval average.
        if score/print_interval > 200:
            # Presumably asks the recorder to capture the next episode --
            # confirm against Recorder.update.
            recorder.update([n_epi + 1])

        if len(recorder.recorded_epi()) >= 5:
            env.reset()  # finalize the video of the last episode
            break

        # Every print_interval episodes, report the average score of the
        # interval and sync the target network with q.
        # NOTE(review): episode 0 is excluded by `n_epi != 0`, so the first
        # report sums 21 episodes (0..20) but still divides by print_interval.
        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print(f"n_episode :{n_epi}, "\
                    + f"score : {score/print_interval:.1f}, "\
                    + f"n_buffer : {memory.size()}, "\
                    + f"eps : {epsilon*100:.1f}%")
            writer.add_scalar("score/train", score/print_interval, n_epi)
            score = 0.0

In [5]:
# Start training only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

n_episode :20, score : 10.3, n_buffer : 207, eps : 7.9%
n_episode :40, score : 9.9, n_buffer : 405, eps : 7.8%
n_episode :60, score : 10.1, n_buffer : 607, eps : 7.7%
n_episode :80, score : 9.6, n_buffer : 798, eps : 7.6%
n_episode :100, score : 9.3, n_buffer : 985, eps : 7.5%
n_episode :120, score : 10.1, n_buffer : 1187, eps : 7.4%
n_episode :140, score : 9.4, n_buffer : 1376, eps : 7.3%
n_episode :160, score : 9.8, n_buffer : 1573, eps : 7.2%
n_episode :180, score : 10.2, n_buffer : 1777, eps : 7.1%
n_episode :200, score : 10.2, n_buffer : 1982, eps : 7.0%
n_episode :220, score : 9.7, n_buffer : 2175, eps : 6.9%
n_episode :240, score : 10.3, n_buffer : 2382, eps : 6.8%
n_episode :260, score : 10.2, n_buffer : 2586, eps : 6.7%
n_episode :280, score : 11.2, n_buffer : 2811, eps : 6.6%
n_episode :300, score : 13.7, n_buffer : 3084, eps : 6.5%
n_episode :320, score : 29.0, n_buffer : 3664, eps : 6.4%
n_episode :340, score : 137.6, n_buffer : 6415, eps : 6.3%
n_episode :360, score : 132.



n_episode :420, score : 223.9, n_buffer : 20735, eps : 5.9%


In [6]:
home_path = os.path.expanduser('~')
cur_path = os.getcwd()
conda_path = home_path + "\\anaconda3"
conda_script_path = home_path + "\\anaconda3\\Scripts\\activate.bat"
exc = ' '.join(['start', '%windir%\System32\cmd.exe "/K"', conda_script_path, conda_path])
!$exc
print('conda activate py38-pytorch-gpu && tensorboard --port=6006 --logdir=runs http://localhost:6006/')

# conda environments:
#
base                     C:\Users\kuro1\anaconda3
py38-pytorch-gpu      *  C:\Users\kuro1\anaconda3\envs\py38-pytorch-gpu

^C
