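# FrozenLake

Tabular Q-learning with an ε-greedy policy on the 4×4 `FrozenLake-v0` environment (non-slippery variant).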

- wiki: https://github.com/openai/gym/wiki/FrozenLake-v0
- link: https://gym.openai.com/envs/FrozenLake-v0/

In [4]:
# pip install gym

In [5]:
import gym

import numpy as np
import random

# Load environment

In [6]:
# Create the FrozenLake environment; is_slippery=False turns off the
# slippery-ice randomness so the agent moves in the direction it chooses.
env_name = 'FrozenLake-v0'
env = gym.make(env_name, is_slippery=False)

# Set up

In [7]:
# Number of discrete states in the environment (16: one per cell of the 4x4 map).
env.observation_space.n

16

In [8]:
# Number of discrete actions (4: left, down, right, up).
env.action_space.n

4

In [9]:
# Alternative (disabled): start from random Q-values in [0, 1) instead of zeros.
# Q_table = np.random.rand(env.observation_space.n, env.action_space.n)
# Q_table

In [10]:
# Q-table: one row per state and one column per action, every entry starting at 0.
Q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [11]:
# Q-learning hyperparameters used by the training loop.
number_of_episodes = 1000        # how many episodes the training loop plays
epsilon = 1                      # initial probability of taking a random action
epsilon_decay_factor = 0.99999   # epsilon is multiplied by this after every step
learning_rate = 0.1              # step size applied to the TD error
reward_discount_factor = 0.99    # weight of the best next-state value in the target

---

In [12]:
# Start a new episode; reset() returns the initial state (0 here).
state = env.reset()
state

0

In [13]:
# Q-values of the four actions in state 0 (all zero before any training).
Q_table[0]

array([0., 0., 0., 0.])

In [14]:
# Greedy (exploiting) action for the starting state: the column with the
# highest Q-value in that state's row.
state = env.reset()
optimal_action = Q_table[state].argmax()
optimal_action
# With the all-zero table every action ties and argmax returns the first index, 0.

0

---

In [15]:
# Size of the action space; the next cell uses it to bound the random action.
env.action_space.n

4

In [16]:
# Exploring action: random.randint includes both endpoints, so the upper
# bound is n - 1 to stay within the valid action indices 0..n-1.
random_action = random.randint(0,env.action_space.n-1)
random_action

2

---

In [17]:
# Epsilon-greedy choice between the two candidate actions.
action = np.random.choice(
    [random_action, optimal_action],
    p=[epsilon, 1 - epsilon],  # p: probability of picking each candidate
)
action

2

---

In [18]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# NOTE: this cell updates the notebook-level `Q_table` and `epsilon` in place,
# so re-running it continues training instead of starting from scratch.
# NOTE(review): the per-step prints produce thousands of output lines.
for i in range(number_of_episodes):
    state = env.reset()
    while True:
        print('curent state:', state)

        # Epsilon-greedy selection: take a uniformly random action with
        # probability epsilon, otherwise the action with the highest Q-value
        # for the current state.
        optimal_action = np.argmax(Q_table[state])
        random_action = random.randint(0, env.action_space.n - 1)
        action = np.random.choice([random_action, optimal_action], p=[epsilon, 1 - epsilon])
        print('action:', action)

        # Let the agent take the chosen action in the environment.
        next_state, reward, done, info = env.step(action)
        print('next_state:', next_state)

        # `done` is also raised by gym's TimeLimit wrapper when the episode's
        # step budget runs out. Such a cut-off is flagged in `info` and is not
        # a real terminal state, so it must not be punished like a hole.
        # gym versions that never set the flag keep the previous behaviour.
        truncated = info.get('TimeLimit.truncated', False)
        terminal = done and not truncated

        # Reward shaping: an episode that truly ends without any reward means
        # the agent fell into a hole, which is penalised with -1.
        if terminal and reward == 0:
            reward = -1
        print('reward:', reward)

        # Q-learning target. A terminal state has no future return, so the
        # best next-state value is only bootstrapped while the episode goes on.
        future_value = 0.0 if terminal else np.max(Q_table[next_state])
        target = reward + reward_discount_factor * future_value
        print('target:', target)

        # Temporal-difference error between the target and the current estimate.
        error = target - Q_table[state][action]
        print('error:', error, '\n')

        # Move the estimate a fraction (learning_rate) of the way to the target.
        Q_table[state][action] = Q_table[state][action] + learning_rate * error
        # epsilon shrinks after every environment step, not once per episode.
        epsilon = epsilon * epsilon_decay_factor

        # The bare string below is the author's original note on why
        # epsilon-greedy exploration is used; it is a no-op expression.
        '''
        เลือก action ด้วยการใช้ policy และ epsilon-greedy
        Epsilon-Greedy เป็นการสำรวจ environment กล่าวคือเราจะเลือก action แบบสุ่มด้วยความน่าจะเป็น ϵ
        และเลือก action ที่ดีที่สุดตาม policy ด้วยความน่าจะเป็น 1−ϵ
        โดยที่ค่า ϵ นั้นตอนแรกจะถูกตั้งไว้สูง ๆ ก่อน ก็คือประมาณ 1 และค่อย ๆ ลดลงมาเมื่อ train ไปเรื่อย ๆ
        สาเหตุที่เราต้องใช้ epsilon-greedy ก็เพราะว่าเราต้องการให้ agent เราได้รู้จักกับ environment ให้ได้กว้าง ๆ 
        เผื่อจะค้นพบ policy ที่ดีที่สุดจริง ๆ คือสมมติว่าเรามัวแต่เลือก action ที่เราคิดในตอนนี้ว่ามันดีที่สุดแล้ว 
        เราก็จะไม่เคยได้ไปทดลองในทางอื่นเลย และจะไม่มีทางรู้ได้ว่า policy ที่เรามีอยู่ตอนนี้มันดีที่สุดจริง ๆ แล้วหรือยัง
        '''

        state = next_state

        if done:
            print('-----done-----\n')
            break
            
'''
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

0	Move Left
1	Move Down
2	Move Right
3	Move Up
'''

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.0 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 2
next_state: 5
reward: -1
target: 

action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6

action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6
action: 2
next_state: 7
reward: -1
target: -1.0
error: -0.729 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6
action: 0
next_state: 5
reward: -1
target: -1.0
error: -0.6560999999999999 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
rewa

action: 1
next_state: 4
reward: 0.0
target: 0.00021774904761664795
error: 0.0001745789070823362 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 6.002175093011992e-05
error: 5.6605773448299864e-05 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 4.04423154456843e-07
error: 2.948244795990385e-07 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 4.04423154456843e-07
error: 2.948244795990385e-07 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.03090315438263258 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 6.002175093011992e-05
error: 5.701744646123237e-05 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 4.04423154456843e-07
error: 2.6534203163913467e-07 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.0
error: 0.0 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.009801
erro

reward: 0.0
target: 0.0008609571288403761
error: 0.00022335868774843882 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0019377827839612213
error: 0.001068129118465892 

curent state: 8
action: 0
next_state: 8
reward: 0.0
target: 0.0019377827839612213
error: 0.0015579454783403055 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.0009667019115684993
error: 0.0008596623736373051 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.0009667019115684993
error: 0.0004156050004426667 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -0.0041745579179292 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0009667019115684993
error: 0.0003067676017017182 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0019377827839612213
error: 0.0009613162066193028 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.0010618722160238103
error: 0.0008688664407288855 

curent state: 4
action: 3
n

action: 3
next_state: 0
reward: 0.0
target: 0.002384384757523116
error: 0.001653151527744218 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.002384384757523116
error: 0.0010945895431396287 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.002384384757523116
error: 0.0009851305888256658 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.004679616424334682
error: 0.0022711469722911303 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.01029323688725104
error: 0.005566351610145301 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.033235602064721106
error: 0.022838393087699853 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.3874204889999999 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.001009388967342094
error: 0.00037816720242137133 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.0041745579179292 

-----done-----

curent state: 0
a

error: 0.0040380556917528326 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.010796336838292493
error: 0.003712249766245409 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.014813262999385072
error: 0.00390787225363508 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.014813262999385072
error: 0.009347339884911193 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.021925502415130554
error: 0.006962610496559775 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.04710128697246241 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.011183216191402364
error: 0.00402112947568742 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.011183216191402364
error: 0.003619016528118678 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.011183216191402364
error: 0.0037279041427307393 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.01550256143854449
er

target: 0.0242054811792282
error: 0.00929696821336628 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.01664094430490183
error: 0.006547779211756489 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0242054811792282
error: 0.00836727139202965 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.0008595044557170306 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0191272680838951
error: 0.0023182334324791075 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.02815491746247034
error: 0.00883444465045509 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.038152042447694545 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.016870449414717263
error: 0.003635488502469246 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.020001878104290154
error: 0.0029610201096262508 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -

target: 0.03198284314843339
error: 0.0132478854744669 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.03198284314843339
error: 0.00404089704957732 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.03198284314843339
error: 0.0036368073446195887 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.03198284314843339
error: 0.006201641106111915 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.03198284314843339
error: 0.0032731266101576305 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.03682895018535935
error: 0.004523048015224615 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.032430624901940625
error: 0.013521948865460043 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.02656005744961917
error: 0.0037012952910709766 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.032430624901940625
error: 0.012370878680527449 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.07239790148645346
error: 0.016541756015851082 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.10708228329011012
error: 0.03395308986945006 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.21687083194302903
error: 0.10870690942776629 

curent state: 10
action: 1
next_state: 14
reward: 0.0
target: 0.516486069
error: 0.29742462259290003 

curent state: 14
action: 3
next_state: 10
reward: 0.0
target: 0.24631586957972615
error: 0.15394800160943214 

curent state: 10
action: 3
next_state: 6
reward: 0.0
target: 0.11784426732345897
error: 0.09798291233480996 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.24631586957972615
error: 0.12728125612168678 

curent state: 10
action: 0
next_state: 9
reward: 0.0
target: 0.09271386900210132
error: 0.058620153775746864 

curent state: 9
action: 1
next_state: 13
reward: 0.0
target: 0.20937557992923003
error: 0.11572520719983476 

curent state: 13

error: -4.048376602283632e-05 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.041106975041326134
error: 0.005536972803303689 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -4.048376602283632e-05 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.041106975041326134
error: 0.00498327552297332 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.041106975041326134
error: 0.01084584763944019 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.07693081360688421
error: 0.04083757291863369 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.07693081360688421
error: 0.014033315913614841 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.07693081360688421
error: 0.016723915030552768 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.041106975041326134
error: 0.004484947970675987 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.09778842630689477
error: 0.025969815755974873 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.09778842630689477
error: 0.006546141844412753 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.06606790809032853
error: 0.010980231734834194 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.4115816386217617e-05 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1068142955849861
error: 0.008038107396203503 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.09858419893911892
error: 0.024168606812601537 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1068142955849861
error: 0.00723429665658315 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.1068142955849861
error: 0.02419414730886811 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.13729205536327957
error: 0.029398827499657235 

curent state

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.17387234832440276
error: 0.07377733882868848 

curent state: 9
action: 2
next_state: 10
reward: 0.0
target: 0.41086374059744085
error: 0.2476715233180275 

curent state: 10
action: 0
next_state: 9
reward: 0.0
target: 0.18607977591510397
error: 0.10795238517683785 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.13508517176729917 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.08757180001232986
error: 0.010326441183946544 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.11834604259287872
error: 0.029889678944060666 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.09053087822779188
error: 0.012252875281013903 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.11834604259287872
error: 0.026900711049654594 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.11834604259287872
error: 0.010833380835833364 

curent state

next_state: 9
reward: 0.0
target: 0.2174286691067742
error: 0.08269713020437658 

curent state: 9
action: 2
next_state: 10
reward: 0.0
target: 0.41086374059744085
error: 0.20061393388760226 

curent state: 10
action: 2
next_state: 11
reward: -1
target: -1.0
error: -0.4782968999999999 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.14605180705209103
error: 0.008171752938507498 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.14605180705209103
error: 0.008624074067368814 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.11600175793709279
error: 0.01103082456622935 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -3.229246018143783e-06 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.14605180705209103
error: 0.007354577644656746 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.11600175793709279
error: 0.00992774210960641 

curent state: 4
action: 3
next_state:

target: 0.14839515677920018
error: 0.0358216300345684 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.7161537331222831e-06 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.17341240470533703
error: 0.015725544645112943 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.14839515677920018
error: 0.015585095597562437 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.14839515677920018
error: 0.03223946703111155 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.14839515677920018
error: 0.029015520328000394 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.164377723480938
error: 0.03706447765110488 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.22800808809761283
error: 0.06196998357141284 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.08862938119652497 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.1734


curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1995062356968329
error: 0.00682656140488197 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.1995062356968329
error: 0.01701672175455668 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -9.979388826408808e-08 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1995062356968329
error: 0.006143905264393762 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -8.981449939327035e-08 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1995062356968329
error: 0.005529514737954377 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.22090333391811223
error: 0.019381883719291115 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.2809133742521468
error: 0.057778693526780917 

curent state: 6
action: 2
next_state: 7
reward: -1
target: -1.0
error: -0.12157665459056921 

-----done-----

curent state

error: 0.008890025948074032 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.21295938256167674
error: 0.007457890347326163 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.22448516048391656
error: 0.00937467304787945 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.2458601180648927
error: 0.019107430707401202 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.1636617079082069
error: 0.05557658154780393 

curent state: 3
action: 2
next_state: 3
reward: 0.0
target: 0.1636617079082069
error: 0.09403715964206427 

curent state: 3
action: 1
next_state: 7
reward: -1
target: -1.0
error: -0.038152042447694545 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.2263767961239493
error: 0.010328841383124232 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.2263767961239493
error: 0.026175777881483897 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.2458601180648927
error: 0.017196687

action: 2
next_state: 2
reward: 0.0
target: 0.2971571066070232
error: 0.030596122190447994 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.3341463461260541
error: 0.03398765258360642 

curent state: 6
action: 2
next_state: 7
reward: -1
target: -1.0
error: -0.07976644307687253 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.26692439066926377
error: 0.0164667532359154 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.2495832696293705
error: 0.0340766751978826 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.2495832696293705
error: 0.020795430092700712 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.20940169690873084
error: 0.014561100549091766 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.23202331225869e-07 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.20940169690873084
error: 0.013104990494182595 

curent state: 4
action: 2
next_sta

reward: 0.0
target: 0.30411556010435814
error: 0.07551368469984682 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.30411556010435814
error: 0.032583018842704425 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -6.447764766548403e-09 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.28858210915193777
error: 0.02201162427098935 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.28858210915193777
error: 0.019810461843890403 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.28858210915193777
error: 0.017829415659501358 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.30411556010435814
error: 0.012618480152905842 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.33310539300628206
error: 0.02591795855743545 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.30668143800154424
error: 0.07052819412704822 

curent state: 1
action: 2
next_state: 2
reward: 0.0


target: 0.2709559217529035
error: 0.015291235554889704 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.5366267530912978e-08 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3210543567976091
error: 0.017011735782375748 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.2709559217529035
error: 0.01376211199940075 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.2709559217529035
error: 0.049369970398423224 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.282964073341276e-08 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3210543567976091
error: 0.015310562204138178 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3210543567976091
error: 0.01377950598372435 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3210543567976091
error: 0.012401555385351926 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.3210

target: 0.3897577154701915
error: 0.25946176260191245 

curent state: 9
action: 1
next_state: 13
reward: 0.0
target: 0.5165193003424663
error: 0.19042991543657906 

curent state: 13
action: 0
next_state: 12
reward: -1
target: -1.0
error: -0.4782968999999999 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.3333387855293404
error: 0.0077428477937656615 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.3041752861036738
error: 0.015612718795778247 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.24770691451358573
error: 0.01808701791808731 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.0014555783429306413 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.3333387855293404
error: 0.006968563014389084 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.34403595764714595
error: 0.007330113678115258 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0


action: 1
next_state: 4
reward: 0.0
target: 0.32517063835913884
error: 0.01880034909603845 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -5.222689503092681e-09 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.3683556795341778
error: 0.01859550137612559 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.3849819555141736
error: 0.012905511540256598 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.4128465778144862
error: 0.023975915678957294 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.4128465778144862
error: 0.04973213244719604 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.4128465778144862
error: 0.04475891920247643 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.38735557116639036
error: 0.05583234336118986 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.6139745095955504e-10 

-----done-----

curent state: 0
action: 0
next_

action: 1
next_state: 5
reward: -1
target: -1.0
error: -6.947620256880782e-11 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.4045335444460122
error: 0.023135665712212317 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.42215737236765827
error: 0.013537630502999498 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -6.252853790300605e-11 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.40587376986580914
error: 0.022162324560788038 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.35244327200763964
error: 0.013559682745833579 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.40587376986580914
error: 0.04987046480758728 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.40587376986580914
error: 0.019946092104709212 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.40587376986580914
error: 0.017951482894238313 

curent state: 0
action: 


curent state: 6
action: 0
next_state: 5
reward: -1
target: -1.0
error: -0.011972515182562149 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.4284392008552033
error: 0.0038895869264250482 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.42068918689520657
error: 0.028025965366550276 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.4284392008552033
error: 0.0035006282337825545 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.4466853286544433
error: 0.013918459103732905 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.4808301538577096
error: 0.029632852186554703 

curent state: 6
action: 3
next_state: 2
reward: 0.0
target: 0.44961898102091225
error: 0.07425399591462517 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.42981712830647284
error: 0.03559258749774824 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -2.1802226690681437e-11 

-----done-----

curent st

action: 3
next_state: 10
reward: 0.0
target: 0.6289817459735193
error: 0.3415330871390299 

curent state: 10
action: 2
next_state: 11
reward: -1
target: -1.0
error: -0.16677181699666566 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.43466065500848183
error: 0.008357359882984139 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.43466065500848183
error: 0.007521623894685747 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.41599782645085553
error: 0.006723953702053531 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.459952241906649e-10 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.43466065500848183
error: 0.007210257188057567 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.41599782645085553
error: 0.0060515583318481725 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.213956795671379e-10 

-----done-----

curent state: 0
actio

target: 0.45091690109909205
error: 0.006165220047813114 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.46410975856731607
error: 0.008638141295505886 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.4873075416393847
error: 0.018509805712802807 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.5177644164187303
error: 0.025534576378947738 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.6289817459735193
error: 0.10598738595459978 

curent state: 10
action: 2
next_state: 11
reward: -1
target: -1.0
error: -0.15009463529699907 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.43432249287759495
error: 0.008839509405552282 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.3632279859866841
error: 0.032836606506988386 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.00036998848503511095 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 

error: 0.03401264834663403 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.46850454080218507
error: 0.06930134101900676 

curent state: 9
action: 2
next_state: 10
reward: 0.0
target: 0.6289817459735193
error: 0.15574483607232226 

curent state: 10
action: 3
next_state: 6
reward: 0.0
target: 0.5377006437167906
error: 0.26698498211910815 

curent state: 6
action: 0
next_state: 5
reward: -1
target: -1.0
error: -0.00636268544113594 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.4836385550134937
error: 0.007945343963737472 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.4717228679916686
error: 0.021851296176867208 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.4836385550134937
error: 0.007150809567363747 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.5021669160709045
error: 0.013643123127981549 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.5021669160709045
error: 0.0409198439

action: 0
next_state: 0
reward: 0.0
target: 0.4911185476758638
error: 0.004986113354108479 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5014171481625972
error: 0.005337807075866063 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.3877787807814457e-13 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5014171481625972
error: 0.004804026368279479 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.249000902703301e-13 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.4921225891868342
error: 0.005491543529668019 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.4921225891868342
error: 0.004942389176701201 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.4921225891868342
error: 0.005358917259883911 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5014171481625972
error: 0.004323623731451509 

curent state: 1
action: 3
ne

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.4880336331426475
error: 0.007248258745661873 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.4880336331426475
error: 0.026914629769813236 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -7.601919094213372e-12 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.500502759821193
error: 0.0016989198976364306 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5074578046413065
error: 0.00189946138757624 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.5074578046413065
error: 0.014598623967126634 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.5235422772425029
error: 0.010958636190678162 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.5611401191581928
error: 0.03230953608495757 

curent state: 6
action: 2
next_state: 7
reward: -1
target: -1.0
error: -0.011972515182562149 

-----done-----

curent state: 0


curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.004259852558119781 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.4992219530530445
error: 0.005331486038014366 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.4992219530530445
error: 0.021997258437184197 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.011859945455385157 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.0038338673023078362 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.003450480572077086 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.0031054325148693884 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.5161245444988645
error: 0.0027948892633824496 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5255864091195958
error: 0.004248485383368972 

curent state: 1


target: 0.5070940772930809
error: 0.0039712544240627645 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.5311420321241677
error: 0.018925792434187017 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.5089677307440653
error: 0.005447782432640991 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.5651924201165457e-12 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.545733471569723
error: 0.009226368413998043 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.5715190377477543
error: 0.020273106869246216 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.5477405091497783
error: 0.03329106955383576 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.5715190377477543
error: 0.018245796182321583 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.4623086162567023
error: 0.05745551198601978 

curent state: 3
action: 0
next_state: 2
reward: 0.0
target: 0.5715190

action: 1
next_state: 6
reward: 0.0
target: 0.6353750135901705
error: 0.026763221701595286 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.7511622332159759
error: 0.10936929019560171 

curent state: 10
action: 1
next_state: 14
reward: 0.0
target: 0.9022569126154403
error: 0.1435071820942525 

curent state: 14
action: 3
next_state: 10
reward: 0.0
target: 0.7653694442433069
error: 0.3443566521158808 

curent state: 10
action: 3
next_state: 6
reward: 0.0
target: 0.646202573319535
error: 0.22295277377357142 

curent state: 6
action: 2
next_state: 7
reward: -1
target: -1.0
error: -0.004638397686588025 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.5512803731439473
error: 0.010301557265479655 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.5700915983029721
error: 0.013242736541409172 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.5700915983029721
error: 0.03535759489421386 

curent state: 1
action: 2
next_stat

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.5478759717396966
error: 0.007546162119643429 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.5864227626180877
error: 0.03301269015374764 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.6018757489374426
error: 0.0095295240706873 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.6289242590843885
error: 0.020968957127375898 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.49972122866034996
error: 0.061161453606176974 

curent state: 3
action: 3
next_state: 3
reward: 0.0
target: 0.49972122866034996
error: 0.13199224700908013 

curent state: 3
action: 3
next_state: 3
reward: 0.0
target: 0.49972122866034996
error: 0.11879302230817212 

curent state: 3
action: 1
next_state: 7
reward: -1
target: -1.0
error: -0.00041109831670571584 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.6039516756930527
error: 0.010652498419228773 

curent state: 1
a

'\nSFFF       (S: starting point, safe)\nFHFH       (F: frozen surface, safe)\nFFFH       (H: hole, fall to your doom)\nHFFG       (G: goal, where the frisbee is located)\n\n0\tMove Left\n1\tMove Down\n2\tMove Right\n3\tMove Up\n'

---

In [19]:
# Display the current contents of the Q-table: one row per state, one column
# per action (it was allocated as observation_space.n x action_space.n).
Q_table

array([[ 0.58907476,  0.55317883,  0.61409748,  0.59535079],
       [ 0.58452226, -1.        ,  0.63011839,  0.58775515],
       [ 0.57266837,  0.66487344,  0.46303479,  0.58688219],
       [ 0.52835843, -0.99966701,  0.37516346,  0.39280751],
       [ 0.52318977,  0.52670878, -1.        ,  0.57300494],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.99854442,  0.72219809, -0.99695675,  0.52757193],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.48766845, -0.9999383 ,  0.56076626,  0.50239892],
       [ 0.33282872,  0.54917079,  0.65527815, -0.99030226],
       [ 0.46754108,  0.79764018, -0.92023356,  0.52983778],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.81469798,  0.44481904,  0.73157492,  0.24240712],
       [ 0.46362172,  0.63474635,  0.91137062,  0.48771921],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [20]:
# Put the environment back at its initial state and keep the returned state index.
state = env.reset()
# Column index of the largest Q-value in that state's row, i.e. the greedy action.
Q_table[state].argmax()

2

In [21]:
# Act greedily: the action to take is the one with the highest Q-value for the
# current state, so `action` and `optimal_action` are bound to the same value.
action = optimal_action = np.argmax(Q_table[state])

'''
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

0	Move Left
1	Move Down
2	Move Right
3	Move Up
''';

In [22]:
# Print the action index that was selected for the current state.
print(action)

2


In [23]:
# Greedy action for state 0, read directly from row 0 of the Q-table.
best_action_state_zero = Q_table[0].argmax()
print(best_action_state_zero)

2


---

In [24]:
# Greedy rollout: play one test episode using only the Q-table (no random actions).
state = env.reset()
done = False
while not done:
    # Choose the optimal action by looking up the current state's row in the Q-table.
    optimal_action = Q_table[state].argmax()
    # Take that action in the environment.
    next_state, reward, done, _ = env.step(optimal_action)
    # Advance to the next time step: the next state becomes the current state.
    state = next_state
    # Draw the grid, showing where the agent is now and which move brought it there.
    env.render()
    # The test episode ends when the environment reports `done` (checked by the loop).

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
