In [38]:
"https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187"

# Frozen Lake
import numpy as np
import gym

In [14]:
# Create the FrozenLake environment the agent will be trained on.
env = gym.make("FrozenLake-v1")

# Number of elements in the observation (state) space and in the action space,
# read from each space's `.n` attribute; they size the Q-table.
n_observations, n_actions = env.observation_space.n, env.action_space.n

In [15]:
# Q-table: one row per state, one column per action, every estimate starting at 0.
q_table_shape = (n_observations, n_actions)
Q_table = np.zeros(q_table_shape)
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [34]:
# Training configuration for the Q-learning agent.

# Episode budget: number of training episodes and the step cap inside each one.
n_episodes, max_iter_episode = 10_000, 100

# Epsilon-greedy schedule: initial probability of picking a random action,
# the rate used in the exponential decay, and the floor it never drops below.
exploration_proba = 1
exploration_decay = 1e-3
min_exploration_proba = 1e-2

# Update rule: discount factor applied to the next state's value, and learning rate.
gamma, lr = 0.99, 0.1

In [41]:
# Reward bookkeeping. `rewards_per_episode` receives one total per training
# episode; `total_rewards_episode` is not referenced by the visible cells.
# Two separate empty lists are created (they must not alias each other).
total_rewards_episode, rewards_per_episode = [], []

In [42]:

# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Each episode: reset the environment, act for at most `max_iter_episode` steps,
# update `Q_table` after every transition, then decay the exploration probability
# and record the episode's total reward in `rewards_per_episode`.
for e in range(n_episodes):
    # Start a new episode. gym >= 0.26 returns (observation, info) from reset(),
    # while older releases return the observation alone; accept both shapes.
    reset_result = env.reset()
    current_state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    # Sum of the rewards the agent collects during this episode.
    total_episode_reward = 0

    for i in range(max_iter_episode):
        # Epsilon-greedy action selection: with probability `exploration_proba`
        # take a random action, otherwise take the action with the highest
        # current Q-value for this state (ties resolve to the lowest index).
        if np.random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[current_state, :])

        # Apply the action. gym >= 0.26 returns
        # (obs, reward, terminated, truncated, info); older releases return
        # (obs, reward, done, info). Either way `done` marks the end of the episode.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        # Q-learning update:
        #   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))
        # NOTE(review): the target bootstraps from `next_state` even when `done`
        # is True. A state reached on the final step never becomes
        # `current_state` here, so its row is never written by this loop.
        td_target = reward + gamma * np.max(Q_table[next_state, :])
        Q_table[current_state, action] = (1 - lr) * Q_table[current_state, action] + lr * td_target
        total_episode_reward = total_episode_reward + reward

        # Leave the step loop as soon as the environment reports the episode is over.
        if done:
            break
        current_state = next_state

    # Exponentially decay the exploration probability with the episode index,
    # never letting it fall below `min_exploration_proba`.
    exploration_proba = max(min_exploration_proba, np.exp(-exploration_decay * e))
    rewards_per_episode.append(total_episode_reward)

In [43]:
# Show the Q-table after training (rows are states, columns are actions).
Q_table

array([[0.53437561, 0.50575093, 0.50424347, 0.49891269],
       [0.37774105, 0.32679333, 0.16347998, 0.47596745],
       [0.41714317, 0.42612608, 0.40297041, 0.45379536],
       [0.19844046, 0.28950976, 0.36762011, 0.44897566],
       [0.55891589, 0.42411352, 0.3318498 , 0.33258503],
       [0.        , 0.        , 0.        , 0.        ],
       [0.379557  , 0.16199192, 0.24622596, 0.10870504],
       [0.        , 0.        , 0.        , 0.        ],
       [0.43371229, 0.41891474, 0.35715082, 0.59949297],
       [0.53010299, 0.64924963, 0.47163123, 0.38479404],
       [0.59780098, 0.39574902, 0.30199831, 0.41091886],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.5239626 , 0.60793458, 0.76447364, 0.42122991],
       [0.70122598, 0.91025434, 0.74886541, 0.73256202],
       [0.        , 0.        , 0.        , 0.        ]])

In [44]:
# Report the average episode reward over consecutive windows of 1000 episodes.
# The windows are derived from the number of recorded episodes instead of a
# hard-coded count of 10, so no empty slice is averaged when fewer episodes were
# run and no episodes are silently ignored when more were run.
episodes_per_window = 1000
print("Mean reward per thousand episodes")
for window_start in range(0, len(rewards_per_episode), episodes_per_window):
    window_rewards = rewards_per_episode[window_start:window_start + episodes_per_window]
    # Label each line with the index of the last episode it covers; for a
    # trailing partial window this is the true episode count, not a multiple of 1000.
    print(window_start + len(window_rewards), ": mean espiode reward:",
          np.mean(window_rewards))

Mean reward per thousand episodes
1000 : mean espiode reward: 0.05
2000 : mean espiode reward: 0.196
3000 : mean espiode reward: 0.457
4000 : mean espiode reward: 0.624
5000 : mean espiode reward: 0.69
6000 : mean espiode reward: 0.687
7000 : mean espiode reward: 0.7
8000 : mean espiode reward: 0.691
9000 : mean espiode reward: 0.67
10000 : mean espiode reward: 0.676
