# About the game

Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend. The surface is described using a grid like the following:



![alt text](lake.jpg "Lake")

Source: [DeepLizard](https://deeplizard.com/images/frozen%20lake%20winter.jpg)


|STATE    |DESCRIPTION                      |REWARD    |
|:--|:--|--:|
| S       |Agent’s starting point - safe    |   0      |
| F       |Frozen surface - safe            |   0      |
| H       |Hole - game over                 |   0      |
| G       |Goal - game over                 |   1      |



In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

# Setting up the environment

In [2]:
# Create the Frozen Lake environment registered in Gym under the id
# "FrozenLake-v0"; every later cell interacts with the game through `env`.
env = gym.make("FrozenLake-v0")

In [3]:
# Read the sizes of the environment's discrete state and action spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# The Q-table holds one row per state and one column per action,
# with every Q-value starting at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [19]:
# Training hyperparameters

# Episode budget
num_episodes = 10_000              # Number of episodes the agent plays during training.
max_steps_per_episode = 100        # Maximum number of steps the agent may take in one episode.

# Q-learning update parameters
learning_rate = 0.1                # Weight given to the newly observed value in each update.
discount_rate = 0.99               # Discount applied to the estimated future reward.

# Epsilon-greedy exploration parameters: the exploration rate starts at its
# maximum and is decayed towards the minimum as training progresses.
max_exploration_rate = 1
min_exploration_rate = 1e-4
exploration_decay_rate = 1e-3
exploration_rate = max_exploration_rate


In [17]:
# Total reward collected in each training episode, in episode order.
rewards_all_episodes = []

# Q-learning algorithm
for episode in range(num_episodes):

    # Every episode starts again from the environment's initial state.
    state = env.reset()
    done = False                     # Tracks whether the current episode has finished.
    rewards_current_episode = 0      # Reward accumulated during this episode.

    # Inner loop: one iteration per time step within the episode, capped at
    # max_steps_per_episode (the name defined with the training parameters).
    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: draw a uniform number in [0, 1];
        # if it exceeds the current exploration rate, exploit the best known
        # action for this state, otherwise explore with a random action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q(s, a): blend the old estimate with the learned value
        # (immediate reward plus the discounted best Q-value of the new
        # state), weighted by the learning rate.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay: exponential decay from the maximum towards the
    # minimum exploration rate as the episode index grows.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per thousand episodes.
# np.split needs an integer section count, hence the integer division; this
# assumes num_episodes is a multiple of 1000.
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes // 1000)
count = 1000

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r/1000)))
    count += 1000

# Print updated Q-table
print("\n\n********Q-table********\n")
print(q_table)

********Average reward per thousand episodes********

1000 :  0.05300000000000004
2000 :  0.18400000000000014
3000 :  0.4100000000000003
4000 :  0.5920000000000004
5000 :  0.6890000000000005
6000 :  0.7150000000000005
7000 :  0.7260000000000005
8000 :  0.7330000000000005
9000 :  0.7490000000000006
10000 :  0.7560000000000006


********Q-table********

[[0.53590214 0.48665914 0.49401431 0.48728632]
 [0.39186371 0.3739457  0.33973766 0.52584175]
 [0.40221006 0.39976444 0.39049296 0.51445095]
 [0.28942028 0.24499417 0.36213343 0.50147623]
 [0.55623894 0.30590888 0.36450878 0.39260613]
 [0.         0.         0.         0.        ]
 [0.30880851 0.13282312 0.15398423 0.12008024]
 [0.         0.         0.         0.        ]
 [0.44528971 0.45443821 0.41596886 0.57067387]
 [0.50403974 0.58037094 0.42658771 0.42684557]
 [0.56635877 0.37272724 0.34628043 0.26836233]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.47980086 0.60171351 0.77733383 

In [20]:
# Replay three episodes of Frozen Lake, always taking the action with the
# highest Q-value for the current state, and render every step.

for episode in range(3):
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Show the current state of the environment on screen.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Choose the action with the highest Q-value for the current state.
        action = np.argmax(q_table[state])

        # Take that action.
        new_state, reward, done, info = env.step(action)

        # Episode still running: move on to the next step from the new state.
        if not done:
            state = new_state
            continue

        # Episode finished: show the final frame and report the outcome.
        clear_output(wait=True)
        env.render()
        if reward != 1:
            print("****You fell through a hole!****")
            time.sleep(3)
            clear_output(wait=True)
        else:
            print("****You reached the goal!****")
            time.sleep(3)
        break

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
