## Q* Learning with FrozenLake 

The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoid holes (H).However, the ice is slippery, so you won't always move in the direction you intend (stochastic environment)

In [1]:
import numpy as np
import gym
import random

In [13]:
env = gym.make("FrozenLake-v0")
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
action_size = env.action_space.n
state_size = env.observation_space.n
print("Actions : ", action_size)
print("State Size : ",state_size)

Actions :  4
State Size :  16


In [84]:
total_episodes = 150000
learning_rate = 0.8
max_steps = 50
gamma = 0.98 #discount factor

epsilon = 0.35
# max_epsilon = 1.0
# min_epsilon = 0.01
# decay_rate = 0.005



In [91]:
%%time
q_table = np.zeros((state_size, action_size))

rewards = []

seen_final = 0
not_seen_final = 0
for episode in range(total_episodes):
    state = env.reset()
    total_rewards = 0

    
    for step in range(max_steps):
        exp_tradeoff = random.uniform(0,1)
        
        if exp_tradeoff > epsilon:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()
            
        obs, reward, done, info = env.step(action)

        if reward==0 :
            total_rewards = total_rewards -1
        
        if done and reward!=0:
            seen_final = seen_final + 1
            total_rewards = 1000
        
        elif done and reward==0:
            not_seen_final = not_seen_final +1
            total_rewards = -100
        
        q_table[state, action] = q_table[state, action] + learning_rate *(total_rewards + gamma * np.max(q_table[obs, :]) - q_table[state, action])
          
        state = obs
        
           
        
        if done:
            rewards.append(total_rewards)
            break
         
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp (-decay_rate * episode) 
        

        
print("Avg Score ", sum(rewards)/total_episodes)
print("Final ?", seen_final, not_seen_final, total_episodes)

Avg Score  -43.086
Final ? 7752 142149 150000
CPU times: user 25.4 s, sys: 428 ms, total: 25.8 s
Wall time: 25.1 s


In [92]:
q_table

array([[-132.56830201, -120.95884091,  -92.32612289, -147.70908425],
       [-170.23641561,  413.38716767, -175.85932768, -157.98028887],
       [-167.81414008,  822.17384343, -167.69361913, -166.31822488],
       [-103.27047418, -173.95531385, -182.36473413, -179.34098715],
       [-109.78399652, -100.13061743, -162.14047254, -120.87136768],
       [   0.        ,    0.        ,    0.        ,    0.        ],
       [-138.14243481,  100.77635185, -175.1856893 , -106.58447198],
       [   0.        ,    0.        ,    0.        ,    0.        ],
       [-140.93848928, -135.4869285 , -100.00409247, -136.05007349],
       [-115.85080431, -109.40887874,   77.03411806, -124.92340381],
       [-124.43889825,  806.81602372, -125.63651535, -129.2743877 ],
       [   0.        ,    0.        ,    0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        ,    0.        ],
       [-131.91329382, -105.27672231, -125.89349978,  510.65616615],
       [ -85.65184389, -110.682972

In [93]:
env.reset()

for episode in range(10):
    state = env.reset()
    
    for step in range(max_steps):
        action = np.argmax(q_table[state,:])
        obs, reward, done, info = env.step(action)
        
        if done:
            env.render()
            print("Steps taken", step)
            break
        
        state = obs
env.close()        

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Steps taken 2
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
Steps taken 4
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
Steps taken 3
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
Steps taken 4
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
Steps taken 3
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Steps taken 3
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
Steps taken 6
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
Steps taken 3
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Steps taken 3
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Steps taken 1
