In [1]:
#@title set up virtual display

from pyvirtualdisplay import Display

display = Display(visible=0, size=(1400, 900))
display.start()


<pyvirtualdisplay.display.Display at 0x7f7f3c33e668>

# LunarLander with DQN 실습



In [3]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI

def show_video():
  mp4list = glob.glob('./example/*.mp4')
  print(mp4list)
  if len(mp4list) > 0:
    for i in range(len(mp4list)):
        mp4 = mp4list[i]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 300px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

show_video()

['./example/fail.mp4', './example/success.mp4']


## TODO

 이 파일은 학습을 실행하기 위한 코드입니다. 
 코드를 실행하기 위해서는 아래의 파일에서 다음 함수 #TODO 부분을 수정하셔야 합니다.
 
 - dqn_agent.py
     step_env()
     train()
 - dqn.py
     update()
 - core/dqn_utils.py 
     ReplayBuffer.write() : multi-step
 

## Run DQN

In [21]:
#imports library

import os
import time

from core.dqn_utils import get_env_kwargs

from rl_trainer import RL_Trainer
from dqn_agent import DQNAgent

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
#arguments (hyperparameter)

class Args:

  def __getitem__(self, key):
    return getattr(self, key)

  def __setitem__(self, key, val):
    setattr(self, key, val)

  def __contains__(self, key):
    return hasattr(self, key)

  env_name = 'LunarLander-v3' 
  exp_name = 'q3_dqn' 

  ep_len = 200 
  num_timesteps = 150000 # 학습할 총 step 수

  #batches and steps
  batch_size = 128 # 학습에 사용할 batch_size

  num_agent_train_steps_per_iter = 1 # step 마다 network를 update하는 횟수

  # Advanced DQN
  double_q = False 
  n_step = 1

  save_params = False 
  no_gpu = False 
  which_gpu = 0 # 사용할 GPU number
  seed = 1 # 환경에 적용되는 seed

  video_log_freq =  -1 
  scalar_log_freq =  20000


args = Args()
args['train_batch_size'] = args['batch_size']

In [23]:
# 실험 결과 logging 설정
data_path = './data'

if not (os.path.exists(data_path)):
    os.makedirs(data_path)

logdir = 'day2_' + args.exp_name + '_LunarLander_seed'+ str(args.seed) + '_step' + str(args.n_step)
logdir = os.path.join(data_path, logdir)
args['logdir'] = logdir
if not(os.path.exists(logdir)):
    os.makedirs(logdir)

print("LOGGING TO: ", logdir)


LOGGING TO:  ./data/hw3_q3_dqn_LunarLander-v3


In [24]:
#@title Define Q-function trainer

class Q_Trainer(object):
    def __init__(self, params):
        self.params = params

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'train_batch_size': params['batch_size'],
            'double_q': params['double_q'],
        }

        env_args = get_env_kwargs(params['env_name'])
        env_args['num_timesteps'] = params['num_timesteps']

        for k, v in env_args.items():
          params[k] = v

        self.params['agent_class'] = DQNAgent
        self.params['agent_params'] = params
        self.params['train_batch_size'] = params['batch_size']
        self.params['env_wrappers'] = env_args['env_wrappers']

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['num_timesteps'],
            collect_policy = self.rl_trainer.agent.dqn,
            eval_policy = self.rl_trainer.agent.dqn,
            )

In [16]:
#run training

trainer = Q_Trainer(args)
trainer.run_training_loop()



########################
logging outputs to  ./data/hw3_q3_dqn_LunarLander-v3
########################
<__main__.Args object at 0x7fc2f74a9ad0>
GPU not detected. Defaulting to CPU.
if 'env_wrappers' in self.params:
ob_dim = Box(-inf, inf, (9,), float32), ac_dim = 6
fps =  30
[ cs285/envs/box2d/lunar_lander ] No display found; rendering is disabled


  0%|          | 0/200000 [00:00<?, ?it/s]


********** Iteration 0 ************

Training agent...

Beginning logging procedure...
Timestep 1
Num Episodes 0
mean reward (100 episodes) nan
best mean reward -inf
running time 0.005166
	Train_EnvstepsSoFar : 1
	Train_EpisodeSoFar : 0
	Num_Episode_Grounded : 0
	Num_Episode_Grounded_at_site : 0
	TimeSinceStart : 0.005166053771972656
Done logging...




  5%|▍         | 9966/200000 [00:29<07:09, 442.67it/s] 


********** Iteration 10000 ************

Training agent...

Beginning logging procedure...
Timestep 10001
Num Episodes 45
Grounded rate(%) = 17.78
Success rate(%) = 15.56
mean reward (100 episodes) -220.573558
best mean reward -inf
running time 29.422616


  5%|▌         | 10057/200000 [00:29<07:17, 434.17it/s]

	Train_EnvstepsSoFar : 10001
	Train_EpisodeSoFar : 45
	Num_Episode_Grounded : 8
	Num_Episode_Grounded_at_site : 7
	Train_AverageReturn : -220.57355795499765
	TimeSinceStart : 29.422616004943848
	Training Loss : 1.6048036813735962
Done logging...




 10%|▉         | 19985/200000 [01:09<11:03, 271.44it/s]


********** Iteration 20000 ************

Training agent...

Beginning logging procedure...
Timestep 20001
Num Episodes 60
Grounded rate(%) = 16.67
Success rate(%) = 13.33
mean reward (100 episodes) -208.844970
best mean reward -inf
running time 69.181116


 10%|█         | 20040/200000 [01:09<11:31, 260.17it/s]

	Train_EnvstepsSoFar : 20001
	Train_EpisodeSoFar : 60
	Num_Episode_Grounded : 10
	Num_Episode_Grounded_at_site : 8
	Train_AverageReturn : -208.84496967563675
	TimeSinceStart : 69.18111610412598
	Training Loss : 0.8981856107711792
Done logging...




 15%|█▌        | 30000/200000 [01:52<13:00, 217.93it/s]


********** Iteration 30000 ************

Training agent...

Beginning logging procedure...
Timestep 30001
Num Episodes 70
Grounded rate(%) = 14.29
Success rate(%) = 11.43
mean reward (100 episodes) -195.464334
best mean reward -inf
running time 112.746848


 15%|█▌        | 30022/200000 [01:52<14:34, 194.37it/s]

	Train_EnvstepsSoFar : 30001
	Train_EpisodeSoFar : 70
	Num_Episode_Grounded : 10
	Num_Episode_Grounded_at_site : 8
	Train_AverageReturn : -195.46433432438866
	TimeSinceStart : 112.7468478679657
	Training Loss : 0.35418593883514404
Done logging...




 20%|█▉        | 39980/200000 [02:37<08:53, 299.79it/s]


********** Iteration 40000 ************

Training agent...

Beginning logging procedure...
Timestep 40001
Num Episodes 82
Grounded rate(%) = 12.20
Success rate(%) = 9.76
mean reward (100 episodes) -186.624030
best mean reward -inf
running time 157.707301


 20%|██        | 40052/200000 [02:37<08:16, 322.12it/s]

	Train_EnvstepsSoFar : 40001
	Train_EpisodeSoFar : 82
	Num_Episode_Grounded : 10
	Num_Episode_Grounded_at_site : 8
	Train_AverageReturn : -186.6240303095751
	TimeSinceStart : 157.70730090141296
	Training Loss : 0.3949374258518219
Done logging...




 25%|██▍       | 49985/200000 [03:20<15:53, 157.25it/s]


********** Iteration 50000 ************

Training agent...

Beginning logging procedure...
Timestep 50001
Num Episodes 93
Grounded rate(%) = 10.75
Success rate(%) = 8.60
mean reward (100 episodes) -178.389536
best mean reward -inf
running time 200.868597


 25%|██▌       | 50018/200000 [03:20<17:02, 146.74it/s]

	Train_EnvstepsSoFar : 50001
	Train_EpisodeSoFar : 93
	Num_Episode_Grounded : 10
	Num_Episode_Grounded_at_site : 8
	Train_AverageReturn : -178.38953621204826
	TimeSinceStart : 200.86859679222107
	Training Loss : 1.2416772842407227
Done logging...




 27%|██▋       | 53759/200000 [03:35<09:46, 249.52it/s]


KeyboardInterrupt: 

In [18]:
#@markdown You can visualize your runs with tensorboard from within the notebook

## requires tensorflow==2.3.0
%load_ext tensorboard
%tensorboard --logdir .data/hw3_q3_dqn_LunarLander-v3/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [15]:
def show_video():
  mp4list = glob.glob(logdir + '/gym/*.mp4')
  print(logdir + '/gym/*.mp4')
  print(mp4list)
  if len(mp4list) > 0:
    for i in range(len(mp4list)):
        mp4 = mp4list[i]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 300px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

show_video()

./data/hw3_q3_dqn_LunarLander-v3/gym/*.mp4
[]
Could not find video


In [3]:
#@title apt install requirements

#@markdown Run each section with Shift+Enter

#@markdown Double-click on section headers to show code.

!apt update 
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python-opengl \
        ffmpeg > /dev/null 2>&1

#!pip install opencv-python==3.4.0.12

The operation couldn’t be completed. Unable to locate a Java Runtime that supports apt.
Please visit http://www.java.com for information on installing Java.

