In [2]:
import numpy as np
import gym
import random
from IPython.display import Image
import os

In [None]:
# Create the FrozenLake environment and draw its current grid.
env = gym.make("FrozenLake-v0")
env.render()

# Both spaces are discrete; `.n` is the number of actions / states.
action_size = env.action_space.n
state_size = env.observation_space.n
print('Action Size - ', action_size)
print('State Size - ', state_size)

# Tabular action-value estimates: one row per state, one column per action,
# all starting at zero.
qtable = np.zeros(shape=(state_size, action_size))

# Training-episode budgets to compare in the sweep below.
tuning_params = [
    2500,
    5000,
    10000,
    15000,
    25000,
    50000,
    70000,
]


# Sweep over training-episode budgets: for every entry of `tuning_params`,
# train a tabular Q-learning agent with an epsilon-greedy policy, print the
# training summary and the learned table, then evaluate the greedy policy.

# Settings shared by every run (loop-invariant, so defined once).
total_test_episodes = 100     # Total test episodes
max_steps = 99                # Max steps per episode

learning_rate = 0.7           # Learning rate
gamma = 0.8                   # Discounting rate

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.01             # Exponential decay rate for exploration prob

for param in tuning_params:
    total_episodes = param        # Total training episodes for this run

    # Start every run from an all-zero table. Without this reset each budget
    # would keep training the table left over from the previous budget, so
    # the reported episode count would not match the training actually done.
    qtable = np.zeros((state_size, action_size))
    epsilon = max_epsilon         # Exploration rate, decayed after each episode

    rewards = []                  # Total reward collected in each training episode
    avg_epsilon = []              # Epsilon of the episodes that ran out of steps
    print('*************************  Q-Learning  ********************************')
    for episode in range(total_episodes):
        state = env.reset()
        total_rewards = 0

        for step in range(max_steps):
            # Epsilon-greedy choice: exploit the best known action when the
            # random draw exceeds epsilon, otherwise explore randomly.
            if random.uniform(0, 1) > epsilon:
                action = np.argmax(qtable[state, :])
            else:
                action = env.action_space.sample()

            # Take the action (a) and observe the outcome state (s') and reward (r)
            new_state, reward, done, info = env.step(action)

            # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
            qtable[state, action] = qtable[state, action] + learning_rate * (
                reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
            )

            total_rewards += reward
            state = new_state

            if done:
                break
            # Reached only when the last allowed step did not end the episode.
            if step == max_steps - 1:
                avg_epsilon.append(epsilon)

        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        rewards.append(total_rewards)

    print("Number of Training Episodes - " + str(total_episodes))
    print("Training Score over time: " + str(sum(rewards) / total_episodes))
    # Explicit emptiness check instead of a bare `except:` around the division,
    # so unrelated errors are no longer silently swallowed.
    if avg_epsilon:
        print("Average Epsilon value when max steps is reached: " + str(sum(avg_epsilon) / len(avg_epsilon)))
    else:
        print("Average Epsilon value is 0, since Max steps are not reached")

    print(qtable)
    print(" ")

    rewards = []                  # Total reward collected in each test episode
    avg_steps = []                # Number of actions taken in each test episode

    print('*************************  Q-Testing  ********************************')

    for episode in range(total_test_episodes):
        state = env.reset()
        total_rewards = 0
        steps_taken = 0

        for step in range(max_steps):
            # Call env.render() here to watch the agent play.
            # Always act greedily with respect to the learned table.
            action = np.argmax(qtable[state, :])

            new_state, reward, done, info = env.step(action)

            total_rewards += reward
            # `step` is a zero-based index, so the count of actions is step + 1.
            steps_taken = step + 1

            if done:
                print("Episode - " + str(episode) + ",  Score - ", total_rewards)
                break
            state = new_state
        avg_steps.append(steps_taken)
        rewards.append(total_rewards)

    print("Learning Rate value - " + str(learning_rate))
    print("Number of Test Episodes - " + str(total_test_episodes))
    print("Testing Score over time: " + str(sum(rewards) / total_test_episodes))
    print("Average num of Steps Per Episode: " + str(sum(avg_steps) / total_test_episodes))

# Close the environment once, after the whole sweep; closing it inside the
# loop would leave later runs operating on an already-closed environment.
env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
Action Size -  4
State Size -  16
*************************  Q-Learning  ********************************
Number of Training Episodes - 2500
Training Score over time: 0.0
Average Epsilon value when max steps is reached: 0.010005037510060046
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
 
*************************  Q-Testing  ********************************
Episode - 0,  Score -  0.0
Episode - 1,  Score -  0.0
Episode - 2,  Score -  0.0
Episode - 3,  Score -  0.0
Episode - 4,  Score -  0.0
Episode - 5,  Score -  0.0
Episode - 6,  Score -  0.0
Episode - 7,  Score -  0.0
Episode - 8,  Score -  0.0
Episode - 9,  Score -  0.0
Episode - 10,  Score -  0.0
Episode - 11,  Score -  0.0
Episode - 12,  Score -  0.0
Episode - 13,  Score -  0.0
Episode - 14,  Score -  0.0
Episo

Number of Training Episodes - 15000
Training Score over time: 0.32986666666666664
Average Epsilon value when max steps is reached: 0.010128182946692313
[[1.08093499e-04 7.11536250e-03 2.22285040e-03 1.12834545e-04]
 [9.42473332e-05 6.36349654e-05 8.48947379e-05 1.17167654e-01]
 [7.53787048e-05 6.52966476e-04 2.16649850e-02 7.32088727e-05]
 [3.20060488e-05 5.02897001e-05 2.03281385e-05 7.00411583e-03]
 [1.19115161e-01 3.24401976e-03 1.31847679e-04 2.25069355e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.08637544e-03 3.51637644e-05 6.15026658e-05 2.60700326e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.16483842e-04 2.54738505e-01 3.08972649e-04 1.74557184e-02]
 [1.29359707e-03 4.07073611e-01 2.28005279e-03 1.83383633e-03]
 [3.85139729e-01 5.42055444e-04 1.68692340e-03 1.32453904e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.10927261e-02 2.70459234e-

Number of Training Episodes - 50000
Training Score over time: 0.3318
Average Epsilon value when max steps is reached: 0.010015597495585933
[[9.04745161e-04 8.67414728e-04 4.98382616e-03 4.75669839e-04]
 [2.35802545e-04 1.18389747e-04 3.55855918e-04 4.61475821e-03]
 [9.43554331e-05 1.60363717e-02 8.64958287e-05 9.64102130e-05]
 [2.88061973e-05 1.22900485e-05 3.42864614e-05 3.31905095e-03]
 [4.08627815e-03 6.00061977e-05 1.62735716e-04 6.05563181e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.77205658e-03 1.93734805e-06 8.54179139e-06 1.65019399e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.82275909e-04 3.32326731e-05 2.44342807e-03 2.09092652e-02]
 [7.41606960e-03 1.13574438e-02 2.21870446e-02 3.36780749e-03]
 [8.84979321e-02 1.89782571e-03 1.80223191e-03 7.56077369e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.98404142e-03 2.62751962e-02 3.99591786