In [1]:

pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import numpy as np
import gymnasium as gym

In [3]:
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True,render_mode='rgb_array')#rgb_array for recording video
n_observations = env.observation_space.n
n_actions = env.action_space.n


In [4]:
print('Number of States',n_observations)
print('Number of possible actions',n_actions)


Number of States 16
Number of possible actions 4


In [5]:
#Initialize the Q-table to 0
Q_table = np.zeros((n_observations,n_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [7]:
Q_table.shape #16 States with 4 actions each


(16, 4)

In [8]:
Q_table[9,:]

array([0., 0., 0., 0.])

In [9]:
#number of episode we will run
n_episodes = 10000

#maximum of iteration per episode- Or number of steps per episode
steps_allowed = 100

#initialize the exploration probability to 1
epsilon = 1

#exploartion decreasing decay for exponential decreasing
decay_rate = 0.001

# minimum of exploration proba
min_epsilon = 0.01

#max exploration rate
max_epsilon=1
#discounted factor
gamma = 0.99

#learning rate
lr = 0.1


In [10]:
#Storing rewards after each episode in a list
rewards_per_episode = list()

In [11]:
#we iterate over episodes
for e in range(n_episodes):
  #we initialize the first state of the episode
  state = env.reset()[0]
  done = False

  #sum the rewards that the agent gets from the environment
  total_reward = 0

  for i in range(steps_allowed):
    # epsilon greedy strategy
    # we initiate a random number between 0 and 1
    # if the random_number is less than the exploration proba(epsilon)
    #     the agent explores
    # else
    #     he exploits his knowledge

    if np.random.uniform(0,1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q_table[state,:])

    # The environment runs the chosen action and returns
    # the next state, a reward and true if the episode is ended.
    next_state, reward, done, truncated , info = env.step(action)

    # We update our Q-table using the Q-learning iteration
    Q_table[state, action] = (1-lr) * Q_table[state, action] \
                       + lr*(reward + gamma* np.max(Q_table[next_state,:]))
    total_reward = total_reward + reward

    state = next_state
    # If the episode is finished, we leave the for loop
    if done:
        break

  #We update the exploration proba using exponential decay formula
  epsilon = exploration_rate = min_epsilon + \
    (max_epsilon - min_epsilon) * np.exp(-decay_rate*e)
  rewards_per_episode.append(total_reward)

In [12]:
len(rewards_per_episode)

10000

In [13]:
rewards_per_thousand_episodes=np.split(np.array(rewards_per_episode),n_episodes/1000)

In [14]:
count=1000
print('-----Average reward per thousand episodes-------')
for r in rewards_per_thousand_episodes:
  print(count,':' ,str(sum(r/1000)))
  count+=1000


-----Average reward per thousand episodes-------
1000 : 0.060000000000000046
2000 : 0.19500000000000015
3000 : 0.4130000000000003
4000 : 0.5470000000000004
5000 : 0.6180000000000004
6000 : 0.6660000000000005
7000 : 0.6770000000000005
8000 : 0.6650000000000005
9000 : 0.7130000000000005
10000 : 0.7100000000000005


In [15]:
print('-----------Updated Q Table-----------')
print(Q_table)

-----------Updated Q Table-----------
[[0.59646968 0.53198373 0.53080963 0.53162936]
 [0.2703836  0.31812512 0.21244572 0.52823562]
 [0.44151758 0.42896256 0.41198212 0.49778574]
 [0.38964389 0.32355284 0.28064153 0.47223238]
 [0.6096343  0.35507845 0.49330061 0.45709043]
 [0.         0.         0.         0.        ]
 [0.38132596 0.16472694 0.23906849 0.20948008]
 [0.         0.         0.         0.        ]
 [0.35717397 0.50186378 0.46919519 0.63763545]
 [0.49999314 0.70708619 0.52585752 0.43563999]
 [0.76594965 0.40001225 0.49415698 0.26805682]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.39966632 0.6165925  0.77802589 0.36657031]
 [0.7742158  0.84924681 0.805856   0.79231952]
 [0.         0.         0.         0.        ]]


In [16]:
import time
from IPython.display import clear_output

In [17]:
#For recording video
%pip install gymnasium[classic_control] comet_ml
import comet_ml
comet_ml.init(project_name="frozen_lake")
env = gym.wrappers.RecordVideo(env, 'gameplay video')

Collecting comet_ml
  Downloading comet_ml-3.33.10-py3-none-any.whl (561 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.9/561.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-1.30.0-py2.py3-none-any.whl (218 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.8/21

[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /root/.comet.config (set COMET_CONFIG to change where it is saved).


In [18]:
#Visualising the game
for episode in range(3):
  state=env.reset()[0]
  done=False
  print('----------EPISODE:',episode+1,'---------\n\n\n\n')
  time.sleep(1)

  for step in range(steps_allowed):
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    action = np.argmax(Q_table[state,:])
    new_state, reward, done, truncated, info = env.step(action)

    if done:
      clear_output(wait=True)
      print(env.render())
      if reward == 1:
          print("****You reached the goal!****")
          time.sleep(3)
      else:
          print("****You fell through a hole!****")
          time.sleep(3)
      clear_output(wait=True)
      break

    state=new_state

env.close()

[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
****You reached the goal!****
