*

In [1]:
# Colab server에 내 google drive를 mount. /content/drive/My Drive 로 mount됨

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Training된 Q-table 저장을 위해 temp directory를 생성해 둠
# 이미 만들어진 경우는 server가 만들 수 없다고 하나 신경쓸 필요 없음

!mkdir '/content/drive/My Drive/Colab Notebooks/temp'

mkdir: cannot create directory ‘/content/drive/My Drive/Colab Notebooks/temp’: File exists


In [0]:
from collections import deque  # deque는 처음과 끝 부분에서 모두 insert/delete가 가능한 data structure
import numpy as np
#import argparse      # console에서 실행할 때는 argparse를 쓰는 것이 편함 (Atienza book original source 참조)
import os
import time
import gym            # openAI gym package를 로딩
from gym import wrappers, logger   # gym package 중 wrappers와 logger 사용

In [0]:
# QAgent class

class QAgent():
    def __init__(self,
                 observation_space,  # 
                 action_space,
                 demo=False,
                 slippery=False,
                 episodes=40000):
        
        self.action_space = action_space
        # number of columns is equal to number of actions
        col = action_space.n
        # number of rows is equal to number of states
        row = observation_space.n
        # build Q Table with row x col dims
        self.q_table = np.zeros([row, col])

        # discount factor
        self.gamma = 0.9

        # initially 90% exploration, 10% exploitation
        self.epsilon = 0.9
        # iteratively applying decay til 10% exploration/90% exploitation (slide 12 p.24)
        self.epsilon_min = 0.1
        self.epsilon_decay = self.epsilon_min / self.epsilon  
        self.epsilon_decay = self.epsilon_decay ** (1. / float(episodes)) 

        # learning rate of Q-Learning: 새로운 Q value로 update하는 비율 (slide 12 p.25)
        self.learning_rate = 0.1
        
        # file where Q Table is saved on/restored fr
        if slippery:
            self.filename = 'q-frozenlake-slippery.npy'
        else:
            self.filename = 'q-frozenlake.npy'

        # demo or train mode 
        self.demo = demo
        # if demo mode, no exploration (즉 test 때는 Q-table만을 참조)
        if demo:
            self.epsilon = 0

    # determine the next action
    # if random, choose from random action space
    # else use the Q Table
    def act(self, state, is_explore=False):
        # 0 - left, 1 - Down, 2 - Right, 3 - Up
        if is_explore or np.random.rand() < self.epsilon:
            # explore: 미리 준비된 environment의 (0, 1, 2, 3) action 중 random으로 return
            return self.action_space.sample()  

        # exploit - choose action with max Q-value
        return np.argmax(self.q_table[state])

    # TD(0) learning (generalized Q-Learning) with learning rate
    def update_q_table(self, state, action, reward, next_state):
        # Q(s,a) = Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
        q_value = self.gamma * np.amax(self.q_table[next_state])
        q_value += reward
        q_value -= self.q_table[state, action]
        q_value *= self.learning_rate
        q_value += self.q_table[state, action]
        self.q_table[state, action] = q_value

    # dump Q Table
    def print_q_table(self):
        print(self.q_table)
        print("Epsilon : ", self.epsilon)

    # save trained Q Table
    def save_q_table(self):
        np.save(self.filename, self.q_table)

    # load trained Q Table
    def load_q_table(self):
        self.q_table = np.load(self.filename)

    # adjust epsilon
    def update_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [5]:
# Main Program Begin Here

# openAI gym의 print option을 INFO level로 함. 
# DEBUG, INFO, WARN, ERROR, DISABLED level이 있음. https://github.com/openai/gym/blob/master/gym/logger.py
logger.setLevel(logger.INFO)

# instantiate a gym environment (FrozenLake-v0)
# gym이 제공하는 'FrozenLake-v0' environment를 하나 생성
# 예를 들면 'CartPole-v0' environment도 그 이름을 주어서 만들 수 있음 
env = gym.make('FrozenLake-v0')

INFO: Making new env: FrozenLake-v0


In [6]:
# Training 된 Q-table을 저장할 output path 
outdir = "/content/drive/My Drive/Colab Notebooks/temp/q-learning-FrozenLake-v0"

# simple environment를 wrapper environment로 확장: N개의 이전 observation등을 buffering하여 이용하는 등의 확장 기능 제공
# Monitor: data save할 때 필요
env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)

# output path가 제대로 준비되었는지 server ls 명령으로 확인
!ls "/content/drive/My Drive/Colab Notebooks/temp"

INFO: Clearing 20 monitor files from previous run (because force=True was provided)
dqn-CartPole-v0  q-learning-FrozenLake-v0


In [0]:
#----------------------------------------------------------------------- 
# 실행 옵션은 여기서 수정
#-----------------------------------------------------------------------
opt_slippery = False      # True로 하면 slippery case
opt_delay = 0             # step 마다 몇 초씩 쉬게 할 수 있음. 1, 2, ... 
opt_demo = False          # training일 때 False, testing일 때 True
opt_explore = False       # pure random action으로만 learning할 때 True

# number of times the Goal state is reached
wins = 0
# number of episodes to train
episodes = 40000         # original Atienza source의 episodes 수, training에 5시간 소요
#episodes = 1000

env.is_slippery = opt_slippery

In [0]:
# instantiate a Q Learning agent
agent = QAgent(env.observation_space,  # 한번 실행 후 next state
               env.action_space,       # FrozenLake-v0 의 action space (Left, Down, Right, Up)
               demo=opt_demo,          # 지금 실행이 test mode이면 True
               slippery=opt_slippery,  # slippery option을 사용하면 True (non-deterministic Q-learning)
               episodes=episodes)      # episode의 수

In [0]:
# demo (test) mode 이면 미리 저장해 둔 Q-table을 upload한다
if opt_demo:
  agent.load_q_table()

In [15]:
# episode 수 많큼 for loop 반복
for episode in range(episodes):
    state = env.reset() # 새 episode로 준비
    done = False
    while not done:
        # 현재 state에서 취할 action 선택. pure random exploration인지를 명시
        action = agent.act(state, is_explore=opt_explore)
        # 한 step 실행하여 observable data 얻음: next_state, reward, 끝인지 여부 (H(hole)이나 G(gole)이면 done)
        next_state, reward, done, _ = env.step(action)
        # clear the screen before rendering the environment
        os.system('clear')
        # render the environment for human debugging
        env.render()
        # training of Q Table
        if done:
            # update exploration-exploitation ratio (epsilon)
            # reward > 0 only when Goal is reached
            # otherwise, it is a Hole
            if reward > 0:
                wins += 1   # Goal 도달 횟수 하나 증가

        if not opt_demo: # Training mode이면 q-table update
            agent.update_q_table(state, action, reward, next_state)
            agent.update_epsilon()

        state = next_state  # next_state를 현재 state로

        percent_wins = 100.0 * wins / (episode + 1)
        print("-------%0.2f%% Goals in %d Episodes---------"
              % (percent_wins, episode))
        
        # delay가 있을 경우 그 초만큼 쉼
        if done:
            time.sleep(5 * opt_delay)
        else:
            time.sleep(opt_delay)

# for loop done: 모든 episode가 끝나고 Goal에 도달한 비율을 print
print("Episodes: ", episode)
print("Goals/Holes: %d/%d" % (wins, episode - wins))

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
SFFF
FHFH
[41mF[0mFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
-------25.85% Goals in 39966 Episodes---------
  (Down)
SFFF
FHFH
[41m

In [16]:
# Q-table print 
agent.print_q_table()
# Training mode (pure exploration이 아닐 때) 일 때는 Q-table을 disk에 save
if not opt_demo and not opt_explore:
  agent.save_q_table()
# close the env and write monitor result info to disk
env.close() 

[[0.06247    0.0627891  0.06429007 0.05963725]
 [0.04814661 0.0419594  0.04821241 0.05615195]
 [0.06002315 0.05913036 0.0598527  0.05665275]
 [0.03037688 0.0374318  0.02291906 0.05194591]
 [0.08665303 0.06841302 0.06977697 0.06324405]
 [0.         0.         0.         0.        ]
 [0.06438906 0.06835158 0.09268841 0.02193476]
 [0.         0.         0.         0.        ]
 [0.07201845 0.11618501 0.09781351 0.11896406]
 [0.1729793  0.19881682 0.19237268 0.14520888]
 [0.22503316 0.18281014 0.19770409 0.12020883]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.20621685 0.26212234 0.36847125 0.27164115]
 [0.40115118 0.62551414 0.54643237 0.53130618]
 [0.         0.         0.         0.        ]]
Epsilon :  0.09999999999983186
