# FrozenLake

- wiki: https://github.com/openai/gym/wiki/FrozenLake-v0
- link: https://gym.openai.com/envs/FrozenLake-v0/

In [1]:
pip install gym

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym

import numpy as np
import random

# Load environment

In [3]:
env_name = 'FrozenLake-v0'
env = gym.make(env_name, is_slippery = False)

# Set up

In [4]:
env.observation_space.n

16

In [5]:
env.action_space.n

4

In [6]:
# Q_table = np.random.rand(env.observation_space.n, env.action_space.n)
# Q_table

In [7]:
Q_table = np.zeros((env.observation_space.n, env.action_space.n))
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [8]:
number_of_episodes =1000
epsilon = 1
epsilon_decay_factor = 0.99999
learning_rate = 0.1
reward_discount_factor = 0.99

---

In [9]:
state = env.reset()
state

0

In [10]:
Q_table[0]

array([0., 0., 0., 0.])

In [11]:
state = env.reset()
optimal_action = np.argmax(Q_table[state])
optimal_action
#index ที่ 3

0

---

In [12]:
env.action_space.n

4

In [13]:
random_action = random.randint(0,env.action_space.n-1)
random_action

2

---

In [14]:
action = np.random.choice([random_action,optimal_action],p=[epsilon,1-epsilon]) #p โอกาสที่จะเกิดเหตุการณ์
action

2

---

In [15]:
for i in range(number_of_episodes):
    state = env.reset()
    while True:
        print('curent state:', state)
         # Interact with the Environment and Update the Q table
        optimal_action = np.argmax(Q_table[state])
        random_action = random.randint(0,env.action_space.n-1)
        action = np.random.choice([random_action,optimal_action],p=[epsilon,1-epsilon])
        print('action:', action)
        
        next_state,reward,done,info = env.step(action) #ให้ agent เดินตาม action
        print('next_state:', next_state)
        
        if done and reward == 0:
            reward = -1
        print('reward:', reward)

        target = reward+reward_discount_factor*np.max(Q_table[next_state])
        print('target:', target)
        
        error = target - Q_table[state][action]
        print('error:', error, '\n')
        
        Q_table[state][action] = Q_table[state][action] + learning_rate*error
        epsilon = epsilon*epsilon_decay_factor
        
        '''
        เลือก action ด้วยการใช้ policy และ epsilon-greedy
        Epsilon-Greedy เป็นการสำรวจ environment กล่าวคือเราจะเลือก action แบบสุ่มด้วยความน่าจะเป็น ϵ
        และเลือก action ที่ดีที่สุดตาม policy ด้วยความน่าจะเป็น 1−ϵ
        โดยที่ค่า ϵ นั้นตอนแรกจะถูกตั้งไว้สูง ๆ ก่อน ก็คือประมาณ 1 และค่อย ๆ ลดลงมาเมื่อ train ไปเรื่อย ๆ
        สาเหตุที่เราต้องใช้ epsilon-greedy ก็เพราะว่าเราต้องการให้ agent เราได้รู้จักกับ environment ให้ได้กว้าง ๆ 
        เผื่อจะค้นพบ policy ที่ดีที่สุดจริง ๆ คือสมมติว่าเรามัวแต่เลือก action ที่เราคิดในตอนนี้ว่ามันดีที่สุดแล้ว 
        เราก็จะไม่เคยได้ไปทดลองในทางอื่นเลย และจะไม่มีทางรู้ได้ว่า policy ที่เรามีอยู่ตอนนี้มันดีที่สุดจริง ๆ แล้วหรือยัง
        '''
        
        state = next_state
        
        if done:
            print('-----done-----\n')
            break
            
'''
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

0	Move Left
1	Move Down
2	Move Right
3	Move Up
'''

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 0
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.0 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target:

action: 0
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.4304672099999999 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.6560999999999999 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.3874204889999999 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0

target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.15009463529699907 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0
error: 0.0 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.3874204889999999 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.0
error: 0.0 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.0
error: 0.0 

curent state: 3
action: 2
next_state: 3
reward: 0.0
target: 0.0
error: 0.0 

curent state: 3
action: 1
next_state: 7
reward: -1
target: -1.0
error: -0.4304672099999999 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0
error: 0.0 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.05233476330273601 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.0
error: 0.0 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0
error: 0.0 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -0.05814973700304

reward: 0.0
target: 0.00014122735894696865
error: 0.0001221030168645822 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 8.791336493313003e-05 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 0.0001074823173049413 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 7.912202843981704e-05 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 9.673408557444716e-05 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 8.706067701700243e-05 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.00014122735894696865
error: 7.120982559583534e-05 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 7.9864144390602e-05
error: 3.916187436340039e-05 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 7.9864144390602e-05
error: 5.965672704749583e-05 

curent state: 1
acti

target: -1.0
error: -0.0033813919135226023 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0017264739763892553
error: 0.0007556958961325385 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0017264739763892553
error: 0.0006801263065192846 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0007756257553489202
error: 0.0002483599687488076 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.005726416897022357 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0023572596529907483
error: 0.000613346545526854 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.0017871952843964139
error: 0.0008987134818260603 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.0007756257553489202
error: 0.00022352397187392682 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.0007756257553489202
error: 0.00041318464557514676 

curent state: 1
action: 3
next_state

error: 0.003127346994660326 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -0.0006265787482178098 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.009551671293850227
error: 0.0028146122951942937 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.006948335025893609
error: 0.0040951584894051265 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.006948335025893609
error: 0.0022922408507735272 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.006948335025893609
error: 0.003056644314026501 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.003137745323286421
error: 0.0011775939332678204 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -0.0017970102999143034 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.006948335025893609
error: 0.002063016765696175 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.00694833502589

error: 0.0023636867961480037 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.001954188320877675
error: 0.0014277898116160693 

curent state: 3
action: 3
next_state: 3
reward: 0.0
target: 0.001954188320877675
error: 0.0015411597161235334 

curent state: 3
action: 0
next_state: 2
reward: 0.0
target: 0.006560868528495847
error: 0.00458694093164971 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.006560868528495847
error: 0.0037215645199652134 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.022316908049568002
error: 0.01568976812179442 

curent state: 6
action: 3
next_state: 2
reward: 0.0
target: 0.008114155572553496
error: 0.006817558729754144 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.0024082954731109965
error: 0.0017391179826877839 

curent state: 3
action: 2
next_state: 3
reward: 0.0
target: 0.0024082954731109965
error: 0.002029139445947135 

curent state: 3
action: 2
next_state: 3
reward: 0.0
target: 0.002408295473110

curent state: 13
action: 0
next_state: 12
reward: -1
target: -1.0
error: -0.4304672099999999 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.025581661881169983
error: 0.0024173573016018843 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.025581661881169983
error: 0.002986138615581037 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.02815875869704658
error: 0.0023186961908142727 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.03337003205647903
error: 0.0049268414534016786 

curent state: 8
action: 0
next_state: 8
reward: 0.0
target: 0.03337003205647903
error: 0.010890144168778228 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.028646516000933344
error: 0.016491860547790727 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.028646516000933344
error: 0.006421645415981923 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.03337003205647903
error: 0.004434157308061511 

cu

action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.25418658283289997 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.0355340883752008
error: 0.0023484872381747887 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.02810422438567545
error: 0.002581348185706909 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.03763683179632786
error: 0.012324312456546752 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.05483131540862371
error: 0.01681431359415112 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.11591369252100002
error: 0.06052852544158213 

curent state: 10
action: 0
next_state: 9
reward: 0.0
target: 0.06948338837991656
error: 0.056182010486910156 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.22876792454960992 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0385909351651242
error: 0.0026979166043153077 

curent state: 4
action: 1


next_state: 13
reward: 0.0
target: 0.09988118633790001
error: 0.029695945550105504 

curent state: 13
action: 2
next_state: 14
reward: 0.0
target: 0.4054149
error: 0.30452481279 

curent state: 14
action: 2
next_state: 15
reward: 1.0
target: 1.0
error: 0.59049 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0492505920295163
error: 0.0018145502423844925 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -1.544538359765646e-06 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.047141321843256555
error: 0.0035362982527518128 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.0492505920295163
error: 0.001633095218146044 

curent state: 4
action: 3
next_state: 0
reward: 0.0
target: 0.047302998269853014
error: 0.00690834933805845 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.047302998269853014
error: 0.004328971094645646 

curent state: 0
action: 0
next_state: 0
reward: 0.0
targe

error: 0.006725311349554862 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.00304325272217032 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.05571927345842838
error: 0.00486721331637982 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.05571927345842838
error: 0.008935694667891153 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -2.6561398887681165e-05 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.05446274772207016
error: 0.005076305507389037 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.05446274772207016
error: 0.004568674956650136 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.05446274772207016
error: 0.004331455963320713 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.05571927345842838
error: 0.004380491984741837 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.05571927345842838
error: 

target: -1.0
error: -1.5684240429192364e-05 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.06745431821237974
error: 0.0031190644882820168 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.4115816386217617e-05 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.06516190567770784
error: 0.004522698955432225 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.0749055304692849
error: 0.009085423724125463 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.0019966781110158927 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.06400068857119666
error: 0.004512062797830389 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.06745431821237974
error: 0.0028071580394538193 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.06427859721710259
error: 0.007417758301865543 

curent state: 0
action: 0
next_state: 0
reward: 0.0

error: 0.006225201647800618 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.09221867910147007
error: 0.014972440882595667 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.14532453012795854
error: 0.05217434921738272 

curent state: 9
action: 0
next_state: 8
reward: 0.0
target: 0.09738393967399096
error: 0.059973271067132704 

curent state: 8
action: 0
next_state: 8
reward: 0.0
target: 0.09738393967399096
error: 0.03847327515200442 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.14532453012795854
error: 0.04695691429564444 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.09847709021836104 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.09551182848613851
error: 0.01222134249960495 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.12348127102670355
error: 0.0270046765962606 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.12348127102670355
error: 0.0289918

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.01800277850961232 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.020726475709643394 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.01865382813867905 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.01678844532481115 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.016202500658651087 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.12689644169453076
error: 0.015109600792330036 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.13867914564844863
error: 0.01050092171457917 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.15784749004827356
error: 0.017767544948830483 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.09765828280054539
error: 0.023036533642936077 

curent state: 

reward: 0.0
target: 0.1782117594532223
error: 0.01806784283910834 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -3.926009277677167e-07 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.16033119388904454
error: 0.02055865756330602 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.16033119388904454
error: 0.01850279180697542 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.1782117594532223
error: 0.016261058555197494 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.2091526028842244
error: 0.029140724648646338 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.2091526028842244
error: 0.06596351504283951 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.13208860739445494
error: 0.03864418817775048 

curent state: 3
action: 0
next_state: 2
reward: 0.0
target: 0.2091526028842244
error: 0.07572976713224971 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target:

action: 3
next_state: 6
reward: 0.0
target: 0.27322734581513003
error: 0.17800468492267374 

curent state: 6
action: 3
next_state: 2
reward: 0.0
target: 0.23031611896270493
error: 0.11497586933774724 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.23031611896270493
error: 0.046170476168897634 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.27322734581513003
error: 0.04058480140835738 

curent state: 6
action: 0
next_state: 5
reward: -1
target: -1.0
error: -0.05233476330273601 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.20978704532392894
error: 0.006713374567175523 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.3689147915219024e-07 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.20978704532392894
error: 0.006042037110457976 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.20978704532392894
error: 0.029303583246821363 

curent state: 1
action: 2


target: 0.23450402321942737
error: 0.008833368727749286 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.23450402321942737
error: 0.01613194889540298 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.20333475152067715
error: 0.010098487203073547 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.20333475152067715
error: 0.03923316947650002 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.174481955292862
error: 0.021012535717362252 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -0.00019662705047562845 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.23450402321942737
error: 0.007950031854974365 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.23450402321942737
error: 0.014518754005862694 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.23450402321942737
error: 0.013066878605276416 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.


curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.23921474450329758
error: 0.013279489039439013 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.20401363449698956
error: 0.021364705365525843 

curent state: 8
action: 2
next_state: 9
reward: 0.0
target: 0.263951167407494
error: 0.05787678912770658 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.027812838944369367 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.2859264596302155
error: 0.009620581053726485 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -1.2132607141168705e-08 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.2859264596302155
error: 0.008658522948353842 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.2753524510869301
error: 0.023617407197135454 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.2859264596302155
error: 0.007792670653518463 

curent state

reward: 0.0
target: 0.3116109647472873
error: 0.007181193999120761 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -2.7755513443139535e-09 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3020964112465978
error: 0.010725106773115667 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.3116109647472873
error: 0.006463074599208696 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.30273625563191947
error: 0.027730764919539574 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.30273625563191947
error: 0.010292440481125764 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.30273625563191947
error: 0.00926319643301321 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.30273625563191947
error: 0.009174461846482906 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.3116109647472873
error: 0.005816767139287804 

curent state: 1
action: 2
next_state: 2
reward: 0.0


target: 0.32040924759071326
error: 0.006441328033957638 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.32040924759071326
error: 0.0070990793622029424 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.32040924759071326
error: 0.006389171425982665 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.3045374130261794
error: 0.006939104178086242 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.6916358031314758e-11 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.32040924759071326
error: 0.005750254283384382 

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.32040924759071326
error: 0.00579719523056188 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.32683115947261
error: 0.0031854548355259027 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -3.036978446502303e-10 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 

action: 2
next_state: 9
reward: 0.0
target: 0.349649428896325
error: 0.05249575732257139 

curent state: 9
action: 3
next_state: 5
reward: -1
target: -1.0
error: -0.018248003631400778 

-----done-----

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.3433041206111317
error: 0.003906183386834905 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.355677727348097
error: 0.008905888346953883 

curent state: 2
action: 1
next_state: 6
reward: 0.0
target: 0.3825983112880855
error: 0.023327879623341097 

curent state: 6
action: 1
next_state: 10
reward: 0.0
target: 0.4885547272655531
error: 0.1020917865705172 

curent state: 10
action: 0
next_state: 9
reward: 0.0
target: 0.349649428896325
error: 0.1835267951913007 

curent state: 9
action: 0
next_state: 8
reward: 0.0
target: 0.29937921483295066
error: 0.1588871428318304 

curent state: 8
action: 3
next_state: 4
reward: 0.0
target: 0.31895265706364323
error: 0.06097152502382125 

curent state: 4
action: 1
next_state: 8



curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.3119345880597198
error: 0.025503707540204534 

curent state: 3
action: 0
next_state: 2
reward: 0.0
target: 0.3820430255068609
error: 0.06695758302229549 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.36153371074862817
error: 0.03504739739195062 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.36153371074862817
error: 0.011642749013935694 

curent state: 1
action: 0
next_state: 0
reward: 0.0
target: 0.35187670449754044
error: 0.012038282686257118 

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.3353843318777282
error: 0.006357236518053866 

curent state: 4
action: 2
next_state: 5
reward: -1
target: -1.0
error: -2.1470603073225902e-12 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.3353843318777282
error: 0.0057215128662485015 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.3120548928918161
error: 0.01950512258792081 

curent state:

action: 0
next_state: 0
reward: 0.0
target: 0.374801282683608
error: 0.010701246982458834 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.374801282683608
error: 0.009631122284212923 

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.374801282683608
error: 0.008668010055791653 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.38520679781068706
error: 0.006619643584820367 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -2.9451996397256153e-12 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.3754566273985052
error: 0.005806879486230765 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.38520679781068706
error: 0.0059576792263383305 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.41027328441494076
error: 0.021175508848590208 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.3523655769979627
error: 0.03137518660235955 

curent state: 3
action: 1
next_

next_state: 4
reward: 0.0
target: 0.3710360638800977
error: 0.006944645083527101 

curent state: 4
action: 0
next_state: 4
reward: 0.0
target: 0.3710360638800977
error: 0.02005150610426415 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.35503326980816263
error: 0.025947816975970373 

curent state: 8
action: 1
next_state: 12
reward: -1
target: -1.0
error: -1.0290430145554197e-05 

-----done-----

curent state: 0
action: 3
next_state: 0
reward: 0.0
target: 0.39920059248094764
error: 0.010700693313319964 

curent state: 0
action: 2
next_state: 1
reward: 0.0
target: 0.41032689456594756
error: 0.007093972868020659 

curent state: 1
action: 1
next_state: 5
reward: -1
target: -1.0
error: -2.3503421431314564e-13 

-----done-----

curent state: 0
action: 1
next_state: 4
reward: 0.0
target: 0.3710360638800977
error: 0.006250180575174402 

curent state: 4
action: 1
next_state: 8
reward: 0.0
target: 0.35503326980816263
error: 0.023353035278373346 

curent state: 8
action: 2
next_st

action: 2
next_state: 2
reward: 0.0
target: 0.47332092282863386
error: 0.03016326109465578 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.47332092282863386
error: 0.041175611272328494 

curent state: 2
action: 0
next_state: 1
reward: 0.0
target: 0.4417122479650092
error: 0.05856005527304753 

curent state: 1
action: 3
next_state: 1
reward: 0.0
target: 0.4417122479650092
error: 0.03484370074241694 

curent state: 1
action: 2
next_state: 2
reward: 0.0
target: 0.47332092282863386
error: 0.02714693498519022 

curent state: 2
action: 3
next_state: 2
reward: 0.0
target: 0.47332092282863386
error: 0.03705805014509567 

curent state: 2
action: 2
next_state: 3
reward: 0.0
target: 0.40018189228293755
error: 0.036234657456460595 

curent state: 3
action: 1
next_state: 7
reward: -1
target: -1.0
error: -0.0006961986091308381 

-----done-----

curent state: 0
action: 0
next_state: 0
reward: 0.0
target: 0.42188669489581765
error: 0.014320636729463765 

curent state: 0
action: 2
next_

'\nSFFF       (S: starting point, safe)\nFHFH       (F: frozen surface, safe)\nFFFH       (H: hole, fall to your doom)\nHFFG       (G: goal, where the frisbee is located)\n\n0\tMove Left\n1\tMove Down\n2\tMove Right\n3\tMove Up\n'

---

In [16]:
Q_table

array([[ 0.45815879,  0.39978115,  0.47371043,  0.43894073],
       [ 0.44353663, -1.        ,  0.48584608,  0.45807306],
       [ 0.43979818,  0.50557846,  0.40087229,  0.46927642],
       [ 0.44701263, -0.99949247,  0.31626269,  0.34385567],
       [ 0.3891108 ,  0.42129224, -1.        ,  0.41954081],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.99582544,  0.53613022, -0.99363731,  0.40140531],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.30788344, -0.99999601,  0.47363024,  0.36304582],
       [ 0.28067991,  0.41361976,  0.59238634, -0.99030226],
       [ 0.33507908,  0.71639623, -0.9774716 ,  0.31270777],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.89058101,  0.18037427,  0.58035329,  0.17216441],
       [ 0.22745426,  0.5243513 ,  0.86491483,  0.47076483],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [17]:
state = env.reset()
np.argmax(Q_table[state])

2

In [18]:
optimal_action = np.argmax(Q_table[state])
action = optimal_action

'''
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

0	Move Left
1	Move Down
2	Move Right
3	Move Up
''';

In [19]:
print(action)

2


In [20]:
print(np.argmax(Q_table[0]))

2


---

In [21]:
state = env.reset()
while True:
    ## เลือก optimal action ด้วยการดูจากตาราง Q
    optimal_action = np.argmax(Q_table[state])
    ## เดินไปตาม optimal action
    next_state,reward,done,_ = env.step(optimal_action)
    ## ก้าวสู่ step ถัดไป ด้วยการก๊อป state ในเวลาถัดไปมาเป็น state ปัจจุบัน 
    state = next_state
    ## คำสั่งให้แสดงผลว่าตอนนี้ agent เราอยู่จุดไหน และมาจากทางไหน
    env.render()
    ## สิ้นสุด episode การเทส
    if done:  break

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
